In [1]:
import pandas as pd
from naive_bayes import NaiveBayes
from utils import tokenizeAndGetTrainingAndTestingData, run_model, plot_wordclouds_per_class, removeStopWordsAndStem

In [2]:
# Training split: read it, then report its dimensions followed by a preview of the first rows.
df_train = pd.read_csv("train.csv")
print(f"df_train.shape : {df_train.shape}", df_train.head(), sep="\n")

# Test split: same report.
df_test = pd.read_csv("test.csv")
print(f"df_test.shape : {df_test.shape}", df_test.head(), sep="\n")

# Collects whatever run_model returns for each experiment, keyed by a descriptive label.
results = dict()

df_train.shape : (140000, 3)
   label                              title  \
0      3                          Ernie Cox   
1     10                          Holosteum   
2      9                Pestarella tyrrhena   
3      1          MidSun Junior High School   
4      6  St James' Church Wrightington Bar   

                                             content  
0   Ernest Ernie Cox (February 17 1894 – February...  
1   Holosteum is a genus of plants in the Pink fa...  
2   Pestarella tyrrhena (formerly Callianassa tyr...  
3   MidSun Junior High School is a Canadian middl...  
4   St James' Church Wrightington Bar is in Churc...  
df_test.shape : (35000, 3)
   label                          title  \
0      4                   Lajos Drahos   
1      5          USS Huntsville (1857)   
2      0                         SCAFCO   
3      6               McLean's Mansion   
4      5  Avioane Craiova IAR-93 Vultur   

                                             content  
0   Lajos Drahos 

In [3]:
# Question 1, part 1 :: Unigram features, no stemming and no stop-word removal.
# window = 1 yields single-word tokens; the previews printed when this cell runs show each
# row as a "Tokenized Description" token list next to its "Class Index" label.
# NOTE(review): the tokens appear to come from the `content` column only (the title is not
# prepended in the previews) -- confirm against utils.tokenizeAndGetTrainingAndTestingData.
vocabulary, trainingData, testingData = tokenizeAndGetTrainingAndTestingData(df_train, df_test, window = 1)

trainingData.head()
                               Tokenized Description  Class Index
0  [Ernest, Ernie, Cox, (February, 17, 1894, –, F...            3
1  [Holosteum, is, a, genus, of, plants, in, the,...           10
2  [Pestarella, tyrrhena, (formerly, Callianassa,...            9
3  [MidSun, Junior, High, School, is, a, Canadian...            1
4  [St, James', Church, Wrightington, Bar, is, in...            6
testingData.head()
                               Tokenized Description  Class Index
0  [Lajos, Drahos, (7, March, 1895, -, 2, June, 1...            4
1  [USS, Huntsville, was, a, steamer, acquired, b...            5
2  [Founded, in, 1954, by, Ben, G., Stone, SCAFCO...            0
3  [McLean's, Mansion, (originally, Holly, Lea), ...            6
4  [The, Avioane, Craiova, IAR-93, Vultur, (Eagle...            5


In [4]:
# Experiment 1: unigram tokens, no stemming, stop words kept.
# A fresh NaiveBayes instance is handed to run_model together with the vocabulary and both
# splits; whatever run_model returns is stored in `results` under this experiment's label.
# NOTE(review): smoothening = 1.0 is presumably Laplace (add-one) smoothing -- confirm in
# naive_bayes.py.
model = NaiveBayes() 
print("--------UNIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results["UNIGRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0)

--------UNIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 453340
Shape of phi_y: (14,)
Shape of phi_j_given_y: (14, 453340)
Evauating on train data...
Evaluating on 140000 examples
Overall Accuracy: 97.89%

Class 0 -> Precision: 0.9712, Recall: 0.9282, F1-Score: 0.9492
Class 1 -> Precision: 0.9732, Recall: 0.9896, F1-Score: 0.9814
Class 2 -> Precision: 0.9807, Recall: 0.9566, F1-Score: 0.9685
Class 3 -> Precision: 0.9849, Recall: 0.9938, F1-Score: 0.9893
Class 4 -> Precision: 0.9801, Recall: 0.9884, F1-Score: 0.9842
Class 5 -> Precision: 0.9823, Recall: 0.9941, F1-Score: 0.9882
Class 6 -> Precision: 0.9591, Recall: 0.9747, F1-Score: 0.9668
Class 7 -> Precision: 0.9761, Recall: 0.9918, F1-Score: 0.9839
Class 8 -> Precision: 0.9995, Recall: 0.9587, F1-Score: 0.9787
Class 9 -> Precision: 0.9989, Recall: 0.9731, F1-Score: 0.9858
Class 10 -> Precision: 0.9875, Recall: 0.9962, F1-Score: 0.9918
Class 11 -> Precision: 0.9670

In [5]:
# plot_wordclouds_per_class(trainingData, maxWords = 200, width = 800, height = 400)

In [6]:
# Question 1, part 2 :: Unigram with stemming and stop-word removal

# (a) Perform stemming and remove the stop words in the training as well as the validation data.
# removeStopWordsAndStem is called with its default arguments; the printed heads show
# lower-cased, stemmed tokens with common words such as "is"/"a" dropped.
# NOTE(review): both frames are overwritten in place, so re-running this cell feeds already
# processed tokens through the function a second time -- re-run the tokenization cell first.
testingData["Tokenized Description"] = testingData["Tokenized Description"].apply(removeStopWordsAndStem)
print(testingData["Tokenized Description"].head())

trainingData["Tokenized Description"] = trainingData["Tokenized Description"].apply(removeStopWordsAndStem)
print(trainingData["Tokenized Description"].head())

0    [lajo, draho, (7, march, 1895, -, 2, june, 198...
1    [uss, huntsvil, steamer, acquir, union, navi, ...
2    [found, 1954, ben, g., stone, scafco, corpor, ...
3    [mclean', mansion, (origin, holli, lea), homes...
4    [avioan, craiova, iar-93, vultur, (eagle), twi...
Name: Tokenized Description, dtype: object
0    [ernest, erni, cox, (februari, 17, 1894, –, fe...
1    [holosteum, genu, plant, pink, famili, (caryop...
2    [pestarella, tyrrhena, (formerli, callianassa,...
3    [midsun, junior, high, school, canadian, middl...
4    [st, james', church, wrightington, bar, church...
Name: Tokenized Description, dtype: object


In [7]:
# (b) Construct word clouds for each class on the transformed data.
# NOTE: both plot_wordclouds_per_class calls below are commented out, so this cell currently
# only prints the two headings; uncomment them to actually render the clouds.
print("Word cloud on training data")
# plot_wordclouds_per_class(trainingData, data_type = "Training Set", maxWords = 200, width = 800, height = 400)

print("Word cloud on testing data")
# plot_wordclouds_per_class(testingData, data_type = "Testing Set", maxWords = 200, width = 800, height = 400)

Word cloud on training data
Word cloud on testing data


In [8]:
# (c) Learn a new model on the transformed data. Report the validation set accuracy.
# Experiment 2: unigram tokens, with stemming, with stop-word removal.
# NOTE(review): `vocabulary` is still the one built from the raw tokens, before the
# stemming/stop-word cell ran (the log reports the same vocab size, 453340, as experiment 1).
# If run_model looks tokens up in this vocabulary, stemmed/lower-cased tokens may be missed --
# confirm, and rebuild the vocabulary from the transformed tokens if so.
model = NaiveBayes()
print("--------UNIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
results["UNIGRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0)

--------UNIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 453340
Shape of phi_y: (14,)
Shape of phi_j_given_y: (14, 453340)
Evauating on train data...
Evaluating on 140000 examples
Overall Accuracy: 93.98%

Class 0 -> Precision: 0.8319, Recall: 0.8306, F1-Score: 0.8313
Class 1 -> Precision: 0.9392, Recall: 0.9379, F1-Score: 0.9386
Class 2 -> Precision: 0.9278, Recall: 0.9016, F1-Score: 0.9145
Class 3 -> Precision: 0.9746, Recall: 0.9894, F1-Score: 0.9819
Class 4 -> Precision: 0.9560, Recall: 0.9695, F1-Score: 0.9627
Class 5 -> Precision: 0.9456, Recall: 0.9780, F1-Score: 0.9615
Class 6 -> Precision: 0.9114, Recall: 0.9211, F1-Score: 0.9162
Class 7 -> Precision: 0.9395, Recall: 0.9880, F1-Score: 0.9632
Class 8 -> Precision: 0.9952, Recall: 0.8791, F1-Score: 0.9336
Class 9 -> Precision: 0.9914, Recall: 0.9155, F1-Score: 0.9520
Class 10 -> Precision: 0.9678, Recall: 0.9717, F1-Score: 0.9698
Class 11 -> Precision: 0.9246, Reca

In [9]:
#(d) How does your accuracy change over the validation set? Comment on your observations.

In [10]:
# Question 3 :: Bigram with stemming and stop-word removal
# Experiment 3: bigram tokens (window = 2), with stemming, with stop-word removal.
vocabulary, trainingData, testingData = tokenizeAndGetTrainingAndTestingData(df_train, df_test, window = 2)

# NOTE(review): removeStopWordsAndStem receives whole bigram strings here, not single words.
# The printed heads show the effect: only the tail of each pair is stemmed
# ("uss huntsvil", "huntsville wa") and stop-word pairs such as "was a" survive.
# Stemming/stop-word removal probably needs to run on the unigram tokens before the bigrams
# are formed, with `vocabulary` (built from the raw bigrams) rebuilt afterwards -- confirm
# against utils before trusting this experiment's numbers.
testingData["Tokenized Description"] = testingData["Tokenized Description"].apply(removeStopWordsAndStem)
print(testingData["Tokenized Description"].head())

trainingData["Tokenized Description"] = trainingData["Tokenized Description"].apply(removeStopWordsAndStem)
print(trainingData["Tokenized Description"].head())

# Fresh model per experiment; the return value of run_model is recorded under this label.
model = NaiveBayes()
print("--------BIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
results["BIGRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0)

trainingData.head()
                               Tokenized Description  Class Index
0  [Ernest Ernie, Ernie Cox, Cox (February, (Febr...            3
1  [Holosteum is, is a, a genus, genus of, of pla...           10
2  [Pestarella tyrrhena, tyrrhena (formerly, (for...            9
3  [MidSun Junior, Junior High, High School, Scho...            1
4  [St James', James' Church, Church Wrightington...            6
testingData.head()
                               Tokenized Description  Class Index
0  [Lajos Drahos, Drahos (7, (7 March, March 1895...            4
1  [USS Huntsville, Huntsville was, was a, a stea...            5
2  [Founded in, in 1954, 1954 by, by Ben, Ben G.,...            0
3  [McLean's Mansion, Mansion (originally, (origi...            6
4  [The Avioane, Avioane Craiova, Craiova IAR-93,...            5
0    [lajos draho, drahos (7, (7 march, march 1895,...
1    [uss huntsvil, huntsville wa, was a, a steam, ...
2    [founded in, in 1954, 1954 bi, by ben, ben g.,...
3   

In [11]:
# Question 3 :: Bigram without stemming or stop-word removal
# Experiment 4: bigram tokens (window = 2) used exactly as returned by the tokenizer.
# Re-tokenizing rebinds vocabulary/trainingData/testingData, so the in-place changes made by
# the previous bigram cell do not carry over into this run.
vocabulary, trainingData, testingData = tokenizeAndGetTrainingAndTestingData(df_train, df_test, window = 2)

model = NaiveBayes()
print("--------BIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results["BIGRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0)

trainingData.head()
                               Tokenized Description  Class Index
0  [Ernest Ernie, Ernie Cox, Cox (February, (Febr...            3
1  [Holosteum is, is a, a genus, genus of, of pla...           10
2  [Pestarella tyrrhena, tyrrhena (formerly, (for...            9
3  [MidSun Junior, Junior High, High School, Scho...            1
4  [St James', James' Church, Church Wrightington...            6
testingData.head()
                               Tokenized Description  Class Index
0  [Lajos Drahos, Drahos (7, (7 March, March 1895...            4
1  [USS Huntsville, Huntsville was, was a, a stea...            5
2  [Founded in, in 1954, 1954 by, by Ben, Ben G.,...            0
3  [McLean's Mansion, Mansion (originally, (origi...            6
4  [The Avioane, Avioane Craiova, Craiova IAR-93,...            5
--------BIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 2064355
Shape of phi_y: (14,)
Shape of phi

In [12]:
# Question 4 :: unigram tokens, with stemming, stop words kept.
# Start again from freshly tokenized single-word data so earlier in-place edits do not leak in.
vocabulary, trainingData, testingData = tokenizeAndGetTrainingAndTestingData(
    df_train, df_test, window=1
)


def _stem_unigram_tokens(tokens):
    """Forward to removeStopWordsAndStem with stemming on and stop-word removal off."""
    return removeStopWordsAndStem(tokens, remove_stop_words=False, with_stemming=True)


# Test split first, then training split; each token column is replaced in place.
# NOTE(review): `vocabulary` above was built before stemming (the log reports the raw
# unigram vocab size, 453340) -- confirm run_model does not need it rebuilt.
testingData["Tokenized Description"] = testingData["Tokenized Description"].apply(_stem_unigram_tokens)
trainingData["Tokenized Description"] = trainingData["Tokenized Description"].apply(_stem_unigram_tokens)

# Fresh model per experiment; the return value of run_model is recorded under this label.
model = NaiveBayes()
print("--------UNIGRAM -- WITH STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results["UNIGRAM-WITH STEMMING-WITHOUT STOP WORDS REMOVAL"] = run_model(
    model, vocabulary, trainingData, testingData, smoothening=1.0
)

trainingData.head()
                               Tokenized Description  Class Index
0  [Ernest, Ernie, Cox, (February, 17, 1894, –, F...            3
1  [Holosteum, is, a, genus, of, plants, in, the,...           10
2  [Pestarella, tyrrhena, (formerly, Callianassa,...            9
3  [MidSun, Junior, High, School, is, a, Canadian...            1
4  [St, James', Church, Wrightington, Bar, is, in...            6
testingData.head()
                               Tokenized Description  Class Index
0  [Lajos, Drahos, (7, March, 1895, -, 2, June, 1...            4
1  [USS, Huntsville, was, a, steamer, acquired, b...            5
2  [Founded, in, 1954, by, Ben, G., Stone, SCAFCO...            0
3  [McLean's, Mansion, (originally, Holly, Lea), ...            6
4  [The, Avioane, Craiova, IAR-93, Vultur, (Eagle...            5
--------UNIGRAM -- WITH STEMMING -- WITHOUT STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 453340
Shape of phi_y: (14,)
Shape of phi_j_

In [13]:
# Question 4 :: unigram tokens, no stemming, stop words removed.
# Start again from freshly tokenized single-word data so earlier in-place edits do not leak in.
vocabulary, trainingData, testingData = tokenizeAndGetTrainingAndTestingData(
    df_train, df_test, window=1
)


def _drop_unigram_stop_words(tokens):
    """Forward to removeStopWordsAndStem with stop-word removal on and stemming off."""
    return removeStopWordsAndStem(tokens, remove_stop_words=True, with_stemming=False)


# Test split first, then training split; each token column is replaced in place.
testingData["Tokenized Description"] = testingData["Tokenized Description"].apply(_drop_unigram_stop_words)
trainingData["Tokenized Description"] = trainingData["Tokenized Description"].apply(_drop_unigram_stop_words)

# Fresh model per experiment; the return value of run_model is recorded under this label.
model = NaiveBayes()
print("--------UNIGRAM -- WITHOUT STEMMING -- WITH STOP WORDS REMOVAL---------")
results["UNIGRAM-WITHOUT STEMMING-WITH STOP WORDS REMOVAL"] = run_model(
    model, vocabulary, trainingData, testingData, smoothening=1.0
)

trainingData.head()
                               Tokenized Description  Class Index
0  [Ernest, Ernie, Cox, (February, 17, 1894, –, F...            3
1  [Holosteum, is, a, genus, of, plants, in, the,...           10
2  [Pestarella, tyrrhena, (formerly, Callianassa,...            9
3  [MidSun, Junior, High, School, is, a, Canadian...            1
4  [St, James', Church, Wrightington, Bar, is, in...            6
testingData.head()
                               Tokenized Description  Class Index
0  [Lajos, Drahos, (7, March, 1895, -, 2, June, 1...            4
1  [USS, Huntsville, was, a, steamer, acquired, b...            5
2  [Founded, in, 1954, by, Ben, G., Stone, SCAFCO...            0
3  [McLean's, Mansion, (originally, Holly, Lea), ...            6
4  [The, Avioane, Craiova, IAR-93, Vultur, (Eagle...            5
--------UNIGRAM -- WITHOUT STEMMING -- WITH STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 453340
Shape of phi_y: (14,)
Shape of phi_j_

In [14]:
# Question 4 :: bigram tokens (window = 2), with stemming, stop words kept.
vocabulary, trainingData, testingData = tokenizeAndGetTrainingAndTestingData(
    df_train, df_test, window=2
)


def _stem_bigram_tokens(tokens):
    """Forward to removeStopWordsAndStem with stemming on and stop-word removal off."""
    return removeStopWordsAndStem(tokens, remove_stop_words=False, with_stemming=True)


# Test split first, then training split; each token column is replaced in place.
# NOTE(review): every token here is a two-word string. The earlier bigram cell's output
# ("huntsville wa") suggests the stemmer only touches the end of each pair -- confirm whether
# stemming should instead happen on single words before the bigrams are built.
testingData["Tokenized Description"] = testingData["Tokenized Description"].apply(_stem_bigram_tokens)
trainingData["Tokenized Description"] = trainingData["Tokenized Description"].apply(_stem_bigram_tokens)

# Fresh model per experiment; the return value of run_model is recorded under this label.
model = NaiveBayes()
print("--------BIGRAM -- WITH STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results["BIGRAM-WITH STEMMING-WITHOUT STOP WORDS REMOVAL"] = run_model(
    model, vocabulary, trainingData, testingData, smoothening=1.0
)

trainingData.head()
                               Tokenized Description  Class Index
0  [Ernest Ernie, Ernie Cox, Cox (February, (Febr...            3
1  [Holosteum is, is a, a genus, genus of, of pla...           10
2  [Pestarella tyrrhena, tyrrhena (formerly, (for...            9
3  [MidSun Junior, Junior High, High School, Scho...            1
4  [St James', James' Church, Church Wrightington...            6
testingData.head()
                               Tokenized Description  Class Index
0  [Lajos Drahos, Drahos (7, (7 March, March 1895...            4
1  [USS Huntsville, Huntsville was, was a, a stea...            5
2  [Founded in, in 1954, 1954 by, by Ben, Ben G.,...            0
3  [McLean's Mansion, Mansion (originally, (origi...            6
4  [The Avioane, Avioane Craiova, Craiova IAR-93,...            5
--------BIGRAM -- WITH STEMMING -- WITHOUT STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 2064355
Shape of phi_y: (14,)
Shape of phi_j_

In [15]:
# Question 4 :: bigram tokens (window = 2), no stemming, stop words removed.
vocabulary, trainingData, testingData = tokenizeAndGetTrainingAndTestingData(
    df_train, df_test, window=2
)


def _drop_bigram_stop_words(tokens):
    """Forward to removeStopWordsAndStem with stop-word removal on and stemming off."""
    return removeStopWordsAndStem(tokens, remove_stop_words=True, with_stemming=False)


# Test split first, then training split; each token column is replaced in place.
# NOTE(review): every token here is a two-word string. The earlier bigram cell's output keeps
# pairs like "was a", so stop-word removal may filter nothing at this stage -- confirm whether
# it should instead run on single words before the bigrams are built.
testingData["Tokenized Description"] = testingData["Tokenized Description"].apply(_drop_bigram_stop_words)
trainingData["Tokenized Description"] = trainingData["Tokenized Description"].apply(_drop_bigram_stop_words)

# Fresh model per experiment; the return value of run_model is recorded under this label.
model = NaiveBayes()
print("--------BIGRAM -- WITHOUT STEMMING -- WITH STOP WORDS REMOVAL---------")
results["BIGRAM-WITHOUT STEMMING-WITH STOP WORDS REMOVAL"] = run_model(
    model, vocabulary, trainingData, testingData, smoothening=1.0
)

trainingData.head()
                               Tokenized Description  Class Index
0  [Ernest Ernie, Ernie Cox, Cox (February, (Febr...            3
1  [Holosteum is, is a, a genus, genus of, of pla...           10
2  [Pestarella tyrrhena, tyrrhena (formerly, (for...            9
3  [MidSun Junior, Junior High, High School, Scho...            1
4  [St James', James' Church, Church Wrightington...            6
testingData.head()
                               Tokenized Description  Class Index
0  [Lajos Drahos, Drahos (7, (7 March, March 1895...            4
1  [USS Huntsville, Huntsville was, was a, a stea...            5
2  [Founded in, in 1954, 1954 by, by Ben, Ben G.,...            0
3  [McLean's Mansion, Mansion (originally, (origi...            6
4  [The Avioane, Avioane Craiova, Craiova IAR-93,...            5
--------BIGRAM -- WITHOUT STEMMING -- WITH STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 2064355
Shape of phi_y: (14,)
Shape of phi_j_