In [1]:
# Import library needed
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [2]:
# Display data
data = pd.read_csv('imo.csv')
data = data[data["label"].notna()]

X, y = data["post_canonical"], data["label"]

display(data.sample(5))
print(data.shape)


Unnamed: 0,id_key,no,contest_category,contest_name,year,link,pdf,post_rendered,post_canonical,label
5413,5414,6,International Contest,centroamerican,2009_centroamerican,https://artofproblemsolving.com/community/c456...,https://artofproblemsolving.com/downloads/prin...,"Find all prime numbers <img src=""//latex.artof...",Find all prime numbers $ p$ and $ q$ such that...,Number Theory
547,548,N2,International Contest,imo_shortlist,2019_isl,https://artofproblemsolving.com/community/c130...,https://artofproblemsolving.com/downloads/prin...,"Find all triples <img src=""//latex.artofproble...","Find all triples $(a, b, c)$ of positive integ...",Number Theory
7534,7535,6,International Contest,jbmo_shortlists,2011_jbmo_shortlist,https://artofproblemsolving.com/community/c542...,https://artofproblemsolving.com/downloads/prin...,"Let <img src=""//latex.artofproblemsolving.com/...",Let $n>3$ be a positive integer. Equilateral t...,Combinatorics
912,913,1,International Contest,imo_shortlist,2006_imo_shortlist,https://artofproblemsolving.com/community/c395...,https://artofproblemsolving.com/downloads/prin...,"We have <img src=""//latex.artofproblemsolving....","We have $ n \geq 2$ lamps $ L_{1}, . . . ,L_{n...",Combinatorics
3302,3303,14,International Contest,imo_longlists,1970_imo_longlists,https://artofproblemsolving.com/community/c400...,https://artofproblemsolving.com/downloads/prin...,"Let <span style=""white-space:nowrap;""><img src...",Let $\alpha + \beta +\gamma = \pi$. Prove that...,Algebra


(2871, 10)


In [3]:
# Preprocess text data
vectorizer = TfidfVectorizer()
text_features = vectorizer.fit_transform(X)

# Split training and test 
X_train, X_test, y_train, y_test = train_test_split(text_features, y, test_size=0.2, random_state=42)

# Train the Naive Bayes model with train dataset
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# confusion_matrix_data = confusion_matrix(data["label"], y_pred)

# # Calculate accuracy for each label
# unique_labels = ["Algebra", "Combinatorics", "Geometry", "Number Theory"]
# label_accuracies = {}
# true_predictions = {}
# total_samples = {}
# for label in unique_labels:
#     label_index = np.where(model.classes_ == label)[0][0]
#     true_positives = confusion_matrix_data[label_index, label_index]
#     total_samples[label] = np.sum(confusion_matrix_data[label_index, :])
#     label_accuracy = true_positives / total_samples[label]
#     label_accuracies[label] = label_accuracy
#     true_predictions[label] = true_positives

# Print the results
print(f"Accuracy:  {accuracy*100:<.5f}%")
print(f"Precision: {precision*100:<.5f}%")
print(f"Recall:    {recall*100:<.5f}%")
print(f"F1-score:  {f1*100:<.5f}%")
print()

# print("Accuracy for each label:")
# for label, acc in label_accuracies.items():
#     print(f"{label}: {acc:<.5f}")

# print("\nNumber of true predictions for each label:")
# for label, true_count in true_predictions.items():
#     print(f"{label}: {true_count}")

# print("\nTotal number of occurrences for each label in the data:")
# for label, total_count in total_samples.items():
#     print(f"{label}: {total_count}")

# # Create a heatmap for the confusion matrix
# plt.figure(figsize=(10, 8))
# sns.heatmap(confusion_matrix_data, annot=True, cmap='Blues', fmt='g', xticklabels=model.classes_, yticklabels=model.classes_)
# plt.xlabel('Predicted labels')
# plt.ylabel('True labels')
# plt.title('Confusion Matrix')
# plt.show()


Accuracy:  88.86957%
Precision: 88.88559%
Recall:    88.86957%
F1-score:  88.84629%



In [14]:
# Get the confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)

# Get the unique labels
labels = np.unique(y_test)

# Calculate accuracy for each label
for label in labels:
    idx = np.where(model.classes_ == label)[0][0]
    label_accuracy = conf_mat[idx, idx] / np.sum(conf_mat[idx, :])
    print(f"Accuracy for {label:<14s} {label_accuracy*100:.4f}%")


Accuracy for Algebra        83.8028%
Accuracy for Combinatorics  82.6087%
Accuracy for Geometry       98.4615%
Accuracy for Number Theory  85.3659%


In [13]:
# For new predict
sample = ['In a country every 2 cities are connected either by a direct bus route or a direct plane flight. A $clique$ is a set of cities such that every 2 cities in the set are connected by a direct flight. A $cluque$ is a set of cities such that every 2 cities in the set are connected by a direct flight, and every 2 cities in the set are connected to the same number of cities by a bus route. A $claque$ is a set of cities such that every 2 cities in the set are connected by a direct flight, and every 2 numbers of bus routes from a city in the set are different. Prove that the number of cities of any clique is at most the product of the biggest possible number of cities in a cluque and the the biggest possible number of cities in a claque.  Tuymaada 2017 Q3 Juniors']
processed = vectorizer.transform(sample)
pred = model.predict_proba(processed)

class_labels = model.classes_
class_probabilities = {class_labels[i]: pred[0][i] * 100 for i in range(len(class_labels))}
print(class_probabilities)

sorted_class_probabilities = {label: class_probabilities[label] for label in ['Algebra', 'Combinatorics', 'Geometry', 'Number Theory']}
print(sorted_class_probabilities)

print()
for label, prob in zip(class_labels, pred[0]):
    print(f"{label:<14s}= {prob*100:.2f}%")
    

{'Algebra': 10.456458031136547, 'Combinatorics': 64.82155191962697, 'Geometry': 7.558976440978409, 'Number Theory': 17.163013608257927}
{'Algebra': 10.456458031136547, 'Combinatorics': 64.82155191962697, 'Geometry': 7.558976440978409, 'Number Theory': 17.163013608257927}

Algebra       = 10.46%
Combinatorics = 64.82%
Geometry      = 7.56%
Number Theory = 17.16%


In [26]:
import pandas as pd
data = pd.read_csv('imo.csv')
temp = data[data['label'].isnull()]
temp = temp.sample(5)

In [27]:
temp

Unnamed: 0,id_key,no,contest_category,contest_name,year,link,pdf,post_rendered,post_canonical,label
1540,1541,24,International Contest,imo_shortlist,1983_imo_shortlist,https://artofproblemsolving.com/community/c393...,https://artofproblemsolving.com/downloads/prin...,"Let <img src=""//latex.artofproblemsolving.com/...",Let $d_n$ be the last nonzero digit of the dec...,
10703,10704,3,International Contest,tuymaada_olympiad,2017_tuymaada_olympiad,https://artofproblemsolving.com/community/c534...,https://artofproblemsolving.com/downloads/prin...,In a country every 2 cities are connected eith...,In a country every 2 cities are connected eith...,
4983,4984,5,International Contest,baltic_way,1997_baltic_way,https://artofproblemsolving.com/community/c513...,https://artofproblemsolving.com/downloads/prin...,"In a sequence <img src=""//latex.artofproblemso...","In a sequence $u_0,u_1,\ldots $ of positive in...",
7016,7017,4,International Contest,international_olympiad_of_metropolises,2018_iom,https://artofproblemsolving.com/community/c719...,https://artofproblemsolving.com/downloads/prin...,"Let <img src=""//latex.artofproblemsolving.com/...",Let $1 = d_0 < d_1 < \dots < d_m = 4k$ be all ...,
8914,8915,3,International Contest,pan_african,2010_pan_african,https://artofproblemsolving.com/community/c452...,https://artofproblemsolving.com/downloads/prin...,"Does there exist a function <img src=""//latex....",Does there exist a function $f:\mathbb{Z}\to\m...,


In [28]:
temp.iloc[1, :].post_canonical

'In a country every 2 cities are connected either by a direct bus route or a direct plane flight. A $clique$ is a set of cities such that every 2 cities in the set are connected by a direct flight. A $cluque$ is a set of cities such that every 2 cities in the set are connected by a direct flight, and every 2 cities in the set are connected to the same number of cities by a bus route. A $claque$ is a set of cities such that every 2 cities in the set are connected by a direct flight, and every 2 numbers of bus routes from a city in the set are different. Prove that the number of cities of any clique is at most the product of the biggest possible number of cities in a cluque and the the biggest possible number of cities in a claque.  Tuymaada 2017 Q3 Juniors'