In [1]:
import pandas as pd
import numpy as np
from utils import tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary, getTrainingAndTestingData2, plot_wordclouds_per_class, display_results_table
from modelUtils import run_model2
from naive_bayes_custom import NaiveBayesCustom

# Load the datasets
# Both CSV files are read from the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Ground-truth labels as arrays. Note that `y_val` holds the labels of test.csv.
y_train = df_train["label"].to_numpy()
y_val = df_test["label"].to_numpy()

# Class count is taken from the training labels, sample count from the test labels.
num_classes = df_train["label"].unique().size
num_samples = y_val.shape[0]

print(f"Number of classes: {num_classes}")
print(f"Number of validation samples: {num_samples}")

Number of classes: 14
Number of validation samples: 35000


In [12]:
# Question no 7
# Compares a hard-coded model accuracy against two trivial baselines evaluated on `y_val`:
# (a) uniformly random predictions and (b) always predicting the most frequent training class.

# Fixed seed so the simulated random baseline prints the same number on every run.
BASELINE_SEED = 42

# Sorted distinct label values and their training frequencies; used by both baselines.
unique_classes, counts = np.unique(y_train, return_counts=True)

# ----------------------------
# (a) Random prediction baseline
# Analytical expectation for a uniform guess over `num_classes` labels.
expected_random_accuracy = 1 / num_classes
print(f"Expected Random Accuracy (analytical): {expected_random_accuracy*100:.2f}%")

# Simulation-based random prediction
# Draw from the label values actually observed in training rather than from
# range(num_classes): the two only coincide when labels are exactly 0..num_classes-1.
rng = np.random.default_rng(BASELINE_SEED)
random_pred = rng.choice(unique_classes, size=num_samples)
accuracy_random_sim = np.mean(random_pred == y_val)
print(f"Random Prediction Accuracy (simulation): {accuracy_random_sim*100:.2f}%")

# ----------------------------
# (b) Positive-class baseline (most frequent class in training)
# np.argmax returns the first maximum, so ties resolve to the smallest label value.
most_common_class = unique_classes[np.argmax(counts)]
positive_pred = np.full_like(y_val, most_common_class)
accuracy_positive = np.mean(positive_pred == y_val)
print(f"Positive-Class Accuracy: {accuracy_positive*100:.2f}%")

# ----------------------------
# (c) Model improvement
# NOTE(review): this value is hard-coded and is not read from any model run in this
# notebook; keep it in sync manually with the reported test accuracy.
accuracy_model = 0.9728  # replace with your model's test accuracy
# Both improvements are absolute differences in accuracy (percentage points once scaled by 100).
improvement_random = accuracy_model - expected_random_accuracy
improvement_positive = accuracy_model - accuracy_positive

print(f"Improvement over random baseline: {improvement_random*100:.2f}%")
print(f"Improvement over positive-class baseline: {improvement_positive*100:.2f}%")

Expected Random Accuracy (analytical): 7.14%
Random Prediction Accuracy (simulation): 7.12%
Positive-Class Accuracy: 7.14%
Improvement over random baseline: 90.14%
Improvement over positive-class baseline: 90.14%


In [3]:
# Question 9
# Preview the first five rows of the training frame as loaded from CSV.
df_train.head(n=5)

Unnamed: 0,label,title,content
0,3,Ernie Cox,Ernest Ernie Cox (February 17 1894 – February...
1,10,Holosteum,Holosteum is a genus of plants in the Pink fa...
2,9,Pestarella tyrrhena,Pestarella tyrrhena (formerly Callianassa tyr...
3,1,MidSun Junior High School,MidSun Junior High School is a Canadian middl...
4,6,St James' Church Wrightington Bar,St James' Church Wrightington Bar is in Churc...


In [4]:
def _tokenize_column(frame, source_col, target_col):
    """Tokenize `source_col` of `frame` into `target_col` with the settings shared by every call here.

    Stop-word removal and stemming are both disabled and `window=[1, 2]` is passed through
    (presumably unigrams plus bigrams, judging by the title tokens shown in the output - TODO confirm).
    Returns whatever the project helper returns, which callers unpack as (frame, vocabulary).
    """
    return tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
        frame,
        source_col,
        target_col=target_col,
        remove_stop_words=False,
        with_stemming=False,
        window=[1, 2],
    )


# Titles: keep the vocabulary returned for the training split, discard the one for the test split.
df_train, vocabularyTitle = _tokenize_column(df_train, "title", "Tokenized Title")
df_test, _ = _tokenize_column(df_test, "title", "Tokenized Title")

# Descriptions: same treatment for the "content" column.
df_train, vocabularyContent = _tokenize_column(df_train, "content", "Tokenized Description")
df_test, _ = _tokenize_column(df_test, "content", "Tokenized Description")

In [5]:
# Preview the training frame again, now that the tokenized columns have been added.
df_train.head(n=5)

Unnamed: 0,label,title,content,Tokenized Title,Tokenized Description
0,3,Ernie Cox,Ernest Ernie Cox (February 17 1894 – February...,"[ernie, cox, ernie cox]","[ernest, ernie, cox, (february, 17, 1894, –, f..."
1,10,Holosteum,Holosteum is a genus of plants in the Pink fa...,[holosteum],"[holosteum, is, a, genus, of, plants, in, the,..."
2,9,Pestarella tyrrhena,Pestarella tyrrhena (formerly Callianassa tyr...,"[pestarella, tyrrhena, pestarella tyrrhena]","[pestarella, tyrrhena, (formerly, callianassa,..."
3,1,MidSun Junior High School,MidSun Junior High School is a Canadian middl...,"[midsun, junior, high, school, midsun junior, ...","[midsun, junior, high, school, is, a, canadian..."
4,6,St James' Church Wrightington Bar,St James' Church Wrightington Bar is in Churc...,"[st, james', church, wrightington, bar, st jam...","[st, james', church, wrightington, bar, is, in..."


In [6]:
# Build the modelling frames from the two tokenized columns.
trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# adding new features - "Text Length"
# Each length is the number of entries in the corresponding token list.
for frame in (trainingData, testingData):
    frame["Description Length"] = frame["Tokenized Description"].apply(len)
    frame["Title Length"] = frame["Tokenized Title"].apply(len)

print("after transformation")
trainingData.head()


Training Data Sample:
                                     Tokenized Title  \
0                            [ernie, cox, ernie cox]   
1                                        [holosteum]   
2        [pestarella, tyrrhena, pestarella tyrrhena]   
3  [midsun, junior, high, school, midsun junior, ...   
4  [st, james', church, wrightington, bar, st jam...   

                               Tokenized Description  Class Index  
0  [ernest, ernie, cox, (february, 17, 1894, –, f...            3  
1  [holosteum, is, a, genus, of, plants, in, the,...           10  
2  [pestarella, tyrrhena, (formerly, callianassa,...            9  
3  [midsun, junior, high, school, is, a, canadian...            1  
4  [st, james', church, wrightington, bar, is, in...            6  

Testing Data Sample:
                                     Tokenized Title  \
0                      [lajos, drahos, lajos drahos]   
1  [uss, huntsville, (1857), uss huntsville, hunt...   
2                                         

Unnamed: 0,Tokenized Title,Tokenized Description,Class Index,Description Length,Title Length
0,"[ernie, cox, ernie cox]","[ernest, ernie, cox, (february, 17, 1894, –, f...",3,127,3
1,[holosteum],"[holosteum, is, a, genus, of, plants, in, the,...",10,107,1
2,"[pestarella, tyrrhena, pestarella tyrrhena]","[pestarella, tyrrhena, (formerly, callianassa,...",9,133,3
3,"[midsun, junior, high, school, midsun junior, ...","[midsun, junior, high, school, is, a, canadian...",1,153,7
4,"[st, james', church, wrightington, bar, st jam...","[st, james', church, wrightington, bar, is, in...",6,81,9


In [7]:
# Collects the value returned by run_model2 for each experiment, keyed by its description.
# Re-running this cell resets the collection.
results = {}

# Experiment: title and description token counts passed as two separate extra features.
model = NaiveBayesCustom()
modelName = "taking token length of description and title separately"
print(f"--------{modelName}---------")
results[modelName] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=0.1,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
    new_feature_cols=["Title Length", "Description Length"],
)

--------taking token length of description and title separately---------
Training with 14 classes, 140000 examples.
Title vocab size: 309323, Content vocab size: 2405125
Shape of phi_y: (14,)
Shape of phi_j_given_y: (14, 309323)
Shape of phi_y: (14,)
Shape of phi_j_given_y: (14, 2405125)
Predictions added to column 'Predicted'.
Evaluating on train data...
Evaluating on 140000 examples
Overall Accuracy: 99.90%

Class 0 -> Precision: 0.9988, Recall: 0.9975, F1-Score: 0.9981
Class 1 -> Precision: 0.9975, Recall: 0.9994, F1-Score: 0.9985
Class 2 -> Precision: 0.9992, Recall: 0.9980, F1-Score: 0.9986
Class 3 -> Precision: 0.9992, Recall: 0.9996, F1-Score: 0.9994
Class 4 -> Precision: 0.9982, Recall: 0.9990, F1-Score: 0.9986
Class 5 -> Precision: 0.9996, Recall: 0.9996, F1-Score: 0.9996
Class 6 -> Precision: 0.9980, Recall: 0.9981, F1-Score: 0.9981
Class 7 -> Precision: 0.9986, Recall: 1.0000, F1-Score: 0.9993
Class 8 -> Precision: 0.9999, Recall: 0.9986, F1-Score: 0.9992
Class 9 -> Precisio

In [8]:
# Rebuild the modelling frames so this experiment starts without earlier feature columns.
trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# adding new features - "Text Length"
# Single feature: mean of the description and title token counts.
for frame in (trainingData, testingData):
    description_tokens = frame["Tokenized Description"].apply(len)
    title_tokens = frame["Tokenized Title"].apply(len)
    frame["Avg Length"] = (description_tokens + title_tokens) / 2

print("after transformation")
print(trainingData.head())

# Experiment: the averaged token count as the only extra feature.
model = NaiveBayesCustom()
modelName = "taking avg token length of description and title"
print(f"--------{modelName}---------")
results[modelName] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=0.1,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
    new_feature_cols=["Avg Length"],
)


Training Data Sample:
                                     Tokenized Title  \
0                            [ernie, cox, ernie cox]   
1                                        [holosteum]   
2        [pestarella, tyrrhena, pestarella tyrrhena]   
3  [midsun, junior, high, school, midsun junior, ...   
4  [st, james', church, wrightington, bar, st jam...   

                               Tokenized Description  Class Index  
0  [ernest, ernie, cox, (february, 17, 1894, –, f...            3  
1  [holosteum, is, a, genus, of, plants, in, the,...           10  
2  [pestarella, tyrrhena, (formerly, callianassa,...            9  
3  [midsun, junior, high, school, is, a, canadian...            1  
4  [st, james', church, wrightington, bar, is, in...            6  

Testing Data Sample:
                                     Tokenized Title  \
0                      [lajos, drahos, lajos drahos]   
1  [uss, huntsville, (1857), uss huntsville, hunt...   
2                                         

In [9]:
# Rebuild the modelling frames so this experiment starts without earlier feature columns.
trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# adding new features - "Text Length"
# Lengths here are character counts of the raw strings, not token counts.
# NOTE(review): the assignment aligns on index, so this assumes the modelling frames
# share their index with df_train / df_test - confirm against getTrainingAndTestingData2.
for features, raw in ((trainingData, df_train), (testingData, df_test)):
    features["Description Length"] = raw["content"].apply(len)
    features["Title Length"] = raw["title"].apply(len)

print("after transformation")
print(trainingData.head())

# Experiment: raw string lengths of description and title as two separate extra features.
model = NaiveBayesCustom()
modelName = "taking string length of description and title separately"
print(f"--------{modelName}---------")
results[modelName] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=0.1,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
    new_feature_cols=["Description Length", "Title Length"],
)


Training Data Sample:
                                     Tokenized Title  \
0                            [ernie, cox, ernie cox]   
1                                        [holosteum]   
2        [pestarella, tyrrhena, pestarella tyrrhena]   
3  [midsun, junior, high, school, midsun junior, ...   
4  [st, james', church, wrightington, bar, st jam...   

                               Tokenized Description  Class Index  
0  [ernest, ernie, cox, (february, 17, 1894, –, f...            3  
1  [holosteum, is, a, genus, of, plants, in, the,...           10  
2  [pestarella, tyrrhena, (formerly, callianassa,...            9  
3  [midsun, junior, high, school, is, a, canadian...            1  
4  [st, james', church, wrightington, bar, is, in...            6  

Testing Data Sample:
                                     Tokenized Title  \
0                      [lajos, drahos, lajos drahos]   
1  [uss, huntsville, (1857), uss huntsville, hunt...   
2                                         

In [10]:
# Rebuild the modelling frames so this experiment starts without earlier feature columns.
trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# adding new features - "Text Length"
# Single feature: mean of the raw description and title character counts.
# NOTE(review): the assignment aligns on index, so this assumes the modelling frames
# share their index with df_train / df_test - confirm against getTrainingAndTestingData2.
for features, raw in ((trainingData, df_train), (testingData, df_test)):
    content_chars = raw["content"].apply(len)
    title_chars = raw["title"].apply(len)
    features["Avg Length"] = (content_chars + title_chars) / 2

print("after transformation")
print(trainingData.head())

# Experiment: the averaged string length as the only extra feature.
model = NaiveBayesCustom()
modelName = "taking avg string length of description and title"
print(f"--------{modelName}---------")
results[modelName] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=0.1,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
    new_feature_cols=["Avg Length"],
)


Training Data Sample:
                                     Tokenized Title  \
0                            [ernie, cox, ernie cox]   
1                                        [holosteum]   
2        [pestarella, tyrrhena, pestarella tyrrhena]   
3  [midsun, junior, high, school, midsun junior, ...   
4  [st, james', church, wrightington, bar, st jam...   

                               Tokenized Description  Class Index  
0  [ernest, ernie, cox, (february, 17, 1894, –, f...            3  
1  [holosteum, is, a, genus, of, plants, in, the,...           10  
2  [pestarella, tyrrhena, (formerly, callianassa,...            9  
3  [midsun, junior, high, school, is, a, canadian...            1  
4  [st, james', church, wrightington, bar, is, in...            6  

Testing Data Sample:
                                     Tokenized Title  \
0                      [lajos, drahos, lajos drahos]   
1  [uss, huntsville, (1857), uss huntsville, hunt...   
2                                         

In [11]:
# Render one Train/Test metric table per experiment stored in `results`
# (keys are the experiment descriptions assigned to `modelName` in the cells above).
display_results_table(results)


Model: taking token length of description and title separately
╒══════════════════════╤═════════╤═════════╕
│ Metric               │   Train │    Test │
╞══════════════════════╪═════════╪═════════╡
│ Overall Accuracy (%) │  99.9   │ 97.31   │
├──────────────────────┼─────────┼─────────┤
│ Overall Precision    │   0.999 │  0.9731 │
├──────────────────────┼─────────┼─────────┤
│ Overall Recall       │   0.999 │  0.9731 │
├──────────────────────┼─────────┼─────────┤
│ Overall F1 Score     │   0.999 │  0.9731 │
├──────────────────────┼─────────┼─────────┤
│ Macro F1 Score       │   0.999 │  0.973  │
╘══════════════════════╧═════════╧═════════╛

Model: taking avg token length of description and title
╒══════════════════════╤═════════╤═════════╕
│ Metric               │   Train │    Test │
╞══════════════════════╪═════════╪═════════╡
│ Overall Accuracy (%) │  99.9   │ 97.28   │
├──────────────────────┼─────────┼─────────┤
│ Overall Precision    │   0.999 │  0.9728 │
├──────────────────────┼