In [1]:
import pandas as pd
from naive_bayes import NaiveBayes
from utils import tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary, getTrainingAndTestingData, run_model, plot_wordclouds_per_class

In [2]:
df_train = pd.read_csv("train.csv")
print(f"df_train.shape : {df_train.shape}")

print(df_train.head())

df_test = pd.read_csv("test.csv")
print(f"df_test.shape : {df_test.shape}")

print(df_test.head())

results = {}

df_train.shape : (140000, 3)
   label                              title  \
0      3                          Ernie Cox   
1     10                          Holosteum   
2      9                Pestarella tyrrhena   
3      1          MidSun Junior High School   
4      6  St James' Church Wrightington Bar   

                                             content  
0   Ernest Ernie Cox (February 17 1894 – February...  
1   Holosteum is a genus of plants in the Pink fa...  
2   Pestarella tyrrhena (formerly Callianassa tyr...  
3   MidSun Junior High School is a Canadian middl...  
4   St James' Church Wrightington Bar is in Churc...  
df_test.shape : (35000, 3)
   label                          title  \
0      4                   Lajos Drahos   
1      5          USS Huntsville (1857)   
2      0                         SCAFCO   
3      6               McLean's Mansion   
4      5  Avioane Craiova IAR-93 Vultur   

                                             content  
0   Lajos Drahos 

In [3]:
# Question Part 1 :: Unigram without stemming and removing stop words
# Train the model with context only corresponding to the labels
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_train, "content", target_col = "Tokenized Description", remove_stop_words = False, with_stemming = False, window = 1)
df_test, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_test, "content", target_col = "Tokenized Description", remove_stop_words = False, with_stemming = False, window = 1)

trainingData, testingData = getTrainingAndTestingData(df_train, df_test, target_field = "Tokenized Description")

# 1. unigram - without stem - without stop words - done
model = NaiveBayes() 
print("--------UNIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results["UNIGRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0, text_col = "Tokenized Description")

trainingData.head()
                               Tokenized Description  Class Index
0  [ernest, ernie, cox, (february, 17, 1894, –, f...            3
1  [holosteum, is, a, genus, of, plants, in, the,...           10
2  [pestarella, tyrrhena, (formerly, callianassa,...            9
3  [midsun, junior, high, school, is, a, canadian...            1
4  [st, james', church, wrightington, bar, is, in...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajos, drahos, (7, march, 1895, -, 2, june, 1...            4
1  [uss, huntsville, was, a, steamer, acquired, b...            5
2  [founded, in, 1954, by, ben, g., stone, scafco...            0
3  [mclean's, mansion, (originally, holly, lea), ...            6
4  [the, avioane, craiova, iar-93, vultur, (eagle...            5
--------UNIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 158757
Shape of phi_y: (14,)
Shape of phi

In [4]:
# plot_wordclouds_per_class(trainingData, maxWords = 200, width = 800, height = 400)

In [5]:
# Question 1, part 2 :: Unigram with stemming and removing stop words

# # (a) Perform stemming and remove the stop-words in the training as well as the validation data.
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_train, "content", target_col = "Tokenized Description", remove_stop_words = True, with_stemming = True, window = 1)
df_test, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_test, "content", target_col = "Tokenized Description", remove_stop_words = True, with_stemming = True, window = 1)

# (b) Construct word clouds for both classes on the transformed data
print("Word cloud on training data")
# plot_wordclouds_per_class(trainingData, data_type = "Training Set", maxWords = 200, width = 800, height = 400)

print("Word cloud on testing data")
# plot_wordclouds_per_class(testingData, data_type = "Testing Set", maxWords = 200, width = 800, height = 400)

# (c) Learn a new model on the transformed data. Report the validation set accuracy
# 2. unigram - with stem - with stop words       - done
trainingData, testingData = getTrainingAndTestingData(df_train, df_test, target_field = "Tokenized Description")

model = NaiveBayes()
print("--------UNIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
results["UNIGRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0, text_col = "Tokenized Description")

Word cloud on training data
Word cloud on testing data
trainingData.head()
                               Tokenized Description  Class Index
0  [ernest, erni, cox, (februari, 17, 1894, –, fe...            3
1  [holosteum, genu, plant, pink, famili, (caryop...           10
2  [pestarella, tyrrhena, (formerli, callianassa,...            9
3  [midsun, junior, high, school, canadian, middl...            1
4  [st, james', church, wrightington, bar, church...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajo, draho, (7, march, 1895, -, 2, june, 198...            4
1  [uss, huntsvil, steamer, acquir, union, navi, ...            5
2  [found, 1954, ben, g., stone, scafco, corpor, ...            0
3  [mclean', mansion, (origin, holli, lea), homes...            6
4  [avioan, craiova, iar-93, vultur, (eagle), twi...            5
--------UNIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, voca

In [6]:
#(d) How does your accuracy change over the validation set? Comment on your observations.

In [7]:
# Question 3 :: Bigram with stemming and removing stop words
# 3. bigram - with stem - with stop words        -done
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_train, "content", target_col = "Tokenized Description", remove_stop_words = True, with_stemming = True, window = 2)
df_test, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_test, "content", target_col = "Tokenized Description", remove_stop_words = True, with_stemming = True, window = 2)

trainingData, testingData = getTrainingAndTestingData(df_train, df_test, target_field = "Tokenized Description")
model = NaiveBayes()
print("--------BIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
results["BIGRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0, text_col = "Tokenized Description")

trainingData.head()
                               Tokenized Description  Class Index
0  [ernest erni, erni cox, cox (februari, (februa...            3
1  [holosteum genu, genu plant, plant pink, pink ...           10
2  [pestarella tyrrhena, tyrrhena (formerli, (for...            9
3  [midsun junior, junior high, high school, scho...            1
4  [st james', james' church, church wrightington...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajo draho, draho (7, (7 march, march 1895, 1...            4
1  [uss huntsvil, huntsvil steamer, steamer acqui...            5
2  [found 1954, 1954 ben, ben g., g. stone, stone...            0
3  [mclean' mansion, mansion (origin, (origin hol...            6
4  [avioan craiova, craiova iar-93, iar-93 vultur...            5
--------BIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 666694
Shape of phi_y: (14,)
Shape of phi_j_give

In [8]:
# Question 3 :: Bigram without stemming or removing stop words
# bigram - without stem - without stop words  - done
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_train, "content", target_col = "Tokenized Description", remove_stop_words = False, with_stemming = False, window = 2)
df_test, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_test, "content", target_col = "Tokenized Description", remove_stop_words = False, with_stemming = False, window = 2)

trainingData, testingData = getTrainingAndTestingData(df_train, df_test, target_field = "Tokenized Description")

model = NaiveBayes()
print("--------BIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results["BIGRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0,  text_col = "Tokenized Description")

trainingData.head()
                               Tokenized Description  Class Index
0  [ernest ernie, ernie cox, cox (february, (febr...            3
1  [holosteum is, is a, a genus, genus of, of pla...           10
2  [pestarella tyrrhena, tyrrhena (formerly, (for...            9
3  [midsun junior, junior high, high school, scho...            1
4  [st james', james' church, church wrightington...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajos drahos, drahos (7, (7 march, march 1895...            4
1  [uss huntsville, huntsville was, was a, a stea...            5
2  [founded in, in 1954, 1954 by, by ben, ben g.,...            0
3  [mclean's mansion, mansion (originally, (origi...            6
4  [the avioane, avioane craiova, craiova iar-93,...            5
--------BIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 649720
Shape of phi_y: (14,)
Shape of phi_

In [9]:
# unigram - with stem - without stop words
# Question 4 :: uigram with stemming and removing stop words
# 4. uigram - with stem - with stop words        -done
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_train, "content", target_col = "Tokenized Description", remove_stop_words = False, with_stemming = True, window = 1)
df_test, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_test, "content", target_col = "Tokenized Description", remove_stop_words = False, with_stemming = True, window = 1)

trainingData, testingData = getTrainingAndTestingData(df_train, df_test, target_field = "Tokenized Description")

model = NaiveBayes()
print("--------UNIGRAM -- WITH STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results["UNIGRAM-WITH STEMMING-WITHOUT STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0, text_col = "Tokenized Description")

trainingData.head()
                               Tokenized Description  Class Index
0  [ernest, erni, cox, (februari, 17, 1894, –, fe...            3
1  [holosteum, is, a, genu, of, plant, in, the, p...           10
2  [pestarella, tyrrhena, (formerli, callianassa,...            9
3  [midsun, junior, high, school, is, a, canadian...            1
4  [st, james', church, wrightington, bar, is, in...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajo, draho, (7, march, 1895, -, 2, june, 198...            4
1  [uss, huntsvil, wa, a, steamer, acquir, by, th...            5
2  [found, in, 1954, by, ben, g., stone, scafco, ...            0
3  [mclean', mansion, (origin, holli, lea), is, a...            6
4  [the, avioan, craiova, iar-93, vultur, (eagle)...            5
--------UNIGRAM -- WITH STEMMING -- WITHOUT STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 146761
Shape of phi_y: (14,)
Shape of phi_j_

In [10]:
# 4. unigram - without stem - with stop words
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_train, "content", target_col = "Tokenized Description", remove_stop_words = True, with_stemming = False, window = 1)
df_test, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_test, "content", target_col = "Tokenized Description", remove_stop_words = True, with_stemming = False, window = 1)

trainingData, testingData = getTrainingAndTestingData(df_train, df_test, target_field = "Tokenized Description")

model = NaiveBayes()
print("--------UNIGRAM -- WITHOUT STEMMING -- WITH STOP WORDS REMOVAL---------")
results["UNIGRAM-WITHOUT STEMMING-WITH STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0, text_col = "Tokenized Description")

trainingData.head()
                               Tokenized Description  Class Index
0  [ernest, ernie, cox, (february, 17, 1894, –, f...            3
1  [holosteum, genus, plants, pink, family, (cary...           10
2  [pestarella, tyrrhena, (formerly, callianassa,...            9
3  [midsun, junior, high, school, canadian, middl...            1
4  [st, james', church, wrightington, bar, church...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajos, drahos, (7, march, 1895, -, 2, june, 1...            4
1  [uss, huntsville, steamer, acquired, union, na...            5
2  [founded, 1954, ben, g., stone, scafco, corpor...            0
3  [mclean's, mansion, (originally, holly, lea), ...            6
4  [avioane, craiova, iar-93, vultur, (eagle), tw...            5
--------UNIGRAM -- WITHOUT STEMMING -- WITH STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 158589
Shape of phi_y: (14,)
Shape of phi_j_

In [11]:
# 4. bigram - with stem - without stop words
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_train, "content", target_col = "Tokenized Description", remove_stop_words = False, with_stemming = True, window = 2)
df_test, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_test, "content", target_col = "Tokenized Description", remove_stop_words = False, with_stemming = True, window = 2)

trainingData, testingData = getTrainingAndTestingData(df_train, df_test, target_field = "Tokenized Description")

model = NaiveBayes()
print("--------BIGRAM -- WITH STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results["BIGRAM-WITH STEMMING-WITHOUT STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0, text_col = "Tokenized Description")

trainingData.head()
                               Tokenized Description  Class Index
0  [ernest erni, erni cox, cox (februari, (februa...            3
1  [holosteum is, is a, a genu, genu of, of plant...           10
2  [pestarella tyrrhena, tyrrhena (formerli, (for...            9
3  [midsun junior, junior high, high school, scho...            1
4  [st james', james' church, church wrightington...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajo draho, draho (7, (7 march, march 1895, 1...            4
1  [uss huntsvil, huntsvil wa, wa a, a steamer, s...            5
2  [found in, in 1954, 1954 by, by ben, ben g., g...            0
3  [mclean' mansion, mansion (origin, (origin hol...            6
4  [the avioan, avioan craiova, craiova iar-93, i...            5
--------BIGRAM -- WITH STEMMING -- WITHOUT STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 621022
Shape of phi_y: (14,)
Shape of phi_j_g

In [12]:
# 4. bigram - without stem - with stop words
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_train, "content", target_col = "Tokenized Description", remove_stop_words = True, with_stemming = False, window = 2)
df_test, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(df_test, "content", target_col = "Tokenized Description", remove_stop_words = True, with_stemming = False, window = 2)

trainingData, testingData = getTrainingAndTestingData(df_train, df_test, target_field = "Tokenized Description")

model = NaiveBayes()
print("--------BIGRAM -- WITHOUT STEMMING -- WITH STOP WORDS REMOVAL---------")
results["BIGRAM-WITHOUT STEMMING-WITH STOP WORDS REMOVAL"] = run_model(model, vocabulary, trainingData, testingData, smoothening = 1.0, text_col = "Tokenized Description")

trainingData.head()
                               Tokenized Description  Class Index
0  [ernest ernie, ernie cox, cox (february, (febr...            3
1  [holosteum genus, genus plants, plants pink, p...           10
2  [pestarella tyrrhena, tyrrhena (formerly, (for...            9
3  [midsun junior, junior high, high school, scho...            1
4  [st james', james' church, church wrightington...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajos drahos, drahos (7, (7 march, march 1895...            4
1  [uss huntsville, huntsville steamer, steamer a...            5
2  [founded 1954, 1954 ben, ben g., g. stone, sto...            0
3  [mclean's mansion, mansion (originally, (origi...            6
4  [avioane craiova, craiova iar-93, iar-93 vultu...            5
--------BIGRAM -- WITHOUT STEMMING -- WITH STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 687657
Shape of phi_y: (14,)
Shape of phi_j_g