In [1]:
import pandas as pd
from naive_bayes import NaiveBayes
from utils import tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary, getTrainingAndTestingData, plot_wordclouds_per_class
from modelUtils import run_model

In [2]:
# Load the pre-split training and test sets and show their size plus a small preview.
df_train = pd.read_csv("train.csv")
print(f"df_train.shape : {df_train.shape}")

print(df_train.head())

df_test = pd.read_csv("test.csv")
print(f"df_test.shape : {df_test.shape}")

print(df_test.head())

# One entry per experiment, keyed by a description of its configuration.
results = {}

# Text used by every experiment: title and content joined by a single space.
# read_csv turns empty fields (and NA-like tokens such as "NA" or "null") into NaN, and
# NaN + str stays NaN, so without fillna a single missing title or content would discard
# the whole row's text. Rows with no missing values produce exactly the same string as before.
df_train["combined"] = df_train["title"].fillna("") + " " + df_train["content"].fillna("")
df_test["combined"] = df_test["title"].fillna("") + " " + df_test["content"].fillna("")

df_train.shape : (140000, 3)
   label                              title  \
0      3                          Ernie Cox   
1     10                          Holosteum   
2      9                Pestarella tyrrhena   
3      1          MidSun Junior High School   
4      6  St James' Church Wrightington Bar   

                                             content  
0   Ernest Ernie Cox (February 17 1894 – February...  
1   Holosteum is a genus of plants in the Pink fa...  
2   Pestarella tyrrhena (formerly Callianassa tyr...  
3   MidSun Junior High School is a Canadian middl...  
4   St James' Church Wrightington Bar is in Churc...  
df_test.shape : (35000, 3)
   label                          title  \
0      4                   Lajos Drahos   
1      5          USS Huntsville (1857)   
2      0                         SCAFCO   
3      6               McLean's Mansion   
4      5  Avioane Craiova IAR-93 Vultur   

                                             content  
0   Lajos Drahos 

In [3]:
# Question 1, part 1 :: unigram features on the raw tokens (no stemming, stop words kept).
# The text being tokenized is the "combined" column (title and content joined per row).
token_col = "Tokenized Description"
preprocessing = {
    "target_col": token_col,
    "remove_stop_words": False,
    "with_stemming": False,
}

# The vocabulary comes from the training set only; the one returned for the test set is discarded.
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "combined", **preprocessing, window=[1]
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "combined", **preprocessing, window=[1]
)

trainingData, testingData = getTrainingAndTestingData(
    df_train, df_test, target_field=token_col
)

# Fresh model for this configuration; the outcome is stored under a descriptive key.
model = NaiveBayes()
print("--------UNIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
experiment_key = "UNIGRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"
results[experiment_key] = run_model(
    model,
    vocabulary,
    trainingData,
    testingData,
    smoothening=1.0,  # presumably add-one smoothing; confirm in run_model
    text_col=token_col,
)

trainingData.head()
                               Tokenized Description  Class Index
0  [ernie, cox, ernest, ernie, cox, (february, 17...            3
1  [holosteum, holosteum, is, a, genus, of, plant...           10
2  [pestarella, tyrrhena, pestarella, tyrrhena, (...            9
3  [midsun, junior, high, school, midsun, junior,...            1
4  [st, james', church, wrightington, bar, st, ja...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajos, drahos, lajos, drahos, (7, march, 1895...            4
1  [uss, huntsville, (1857), uss, huntsville, was...            5
2  [scafco, founded, in, 1954, by, ben, g., stone...            0
3  [mclean's, mansion, mclean's, mansion, (origin...            6
4  [avioane, craiova, iar-93, vultur, the, avioan...            5
--------UNIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 430904
Shape of phi_y: (14,)
Shape of phi

In [4]:
# Question 1, part 2 :: unigram features after stop-word removal and stemming.
token_col = "Tokenized Description"
preprocessing = {
    "target_col": token_col,
    "remove_stop_words": True,
    "with_stemming": True,
}

# Re-tokenize the "combined" text; only the training vocabulary is kept.
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "combined", **preprocessing, window=[1]
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "combined", **preprocessing, window=[1]
)

trainingData, testingData = getTrainingAndTestingData(
    df_train, df_test, target_field=token_col
)

# Fresh model for this configuration; the outcome is stored under a descriptive key.
model = NaiveBayes()
print("--------UNIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
experiment_key = "UNIGRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"
results[experiment_key] = run_model(
    model,
    vocabulary,
    trainingData,
    testingData,
    smoothening=1.0,  # presumably add-one smoothing; confirm in run_model
    text_col=token_col,
)

trainingData.head()
                               Tokenized Description  Class Index
0  [erni, cox, ernest, erni, cox, (februari, 17, ...            3
1  [holosteum, holosteum, genu, plant, pink, fami...           10
2  [pestarella, tyrrhena, pestarella, tyrrhena, (...            9
3  [midsun, junior, high, school, midsun, junior,...            1
4  [st, james', church, wrightington, bar, st, ja...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajo, draho, lajo, draho, (7, march, 1895, -,...            4
1  [uss, huntsvil, (1857), uss, huntsvil, steamer...            5
2  [scafco, found, 1954, ben, g., stone, scafco, ...            0
3  [mclean', mansion, mclean', mansion, (origin, ...            6
4  [avioan, craiova, iar-93, vultur, avioan, crai...            5
--------UNIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 405288
Shape of phi_y: (14,)
Shape of phi_j_giv

In [None]:
# Question 3 :: bigram features after stop-word removal and stemming.
token_col = "Tokenized Description"
preprocessing = {
    "target_col": token_col,
    "remove_stop_words": True,
    "with_stemming": True,
}

# window=[2] requests bigrams; only the training vocabulary is kept.
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "combined", **preprocessing, window=[2]
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "combined", **preprocessing, window=[2]
)

trainingData, testingData = getTrainingAndTestingData(
    df_train, df_test, target_field=token_col
)

# Fresh model for this configuration; the outcome is stored under a descriptive key.
model = NaiveBayes()
print("--------BIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
experiment_key = "BIGRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"
results[experiment_key] = run_model(
    model,
    vocabulary,
    trainingData,
    testingData,
    smoothening=1.0,  # presumably add-one smoothing; confirm in run_model
    text_col=token_col,
)

trainingData.head()
                               Tokenized Description  Class Index
0  [erni cox, cox ernest, ernest erni, erni cox, ...            3
1  [holosteum holosteum, holosteum genu, genu pla...           10
2  [pestarella tyrrhena, tyrrhena pestarella, pes...            9
3  [midsun junior, junior high, high school, scho...            1
4  [st james', james' church, church wrightington...            6
testingData.head()
                               Tokenized Description  Class Index
0  [lajo draho, draho lajo, lajo draho, draho (7,...            4
1  [uss huntsvil, huntsvil (1857), (1857) uss, us...            5
2  [scafco found, found 1954, 1954 ben, ben g., g...            0
3  [mclean' mansion, mansion mclean', mclean' man...            6
4  [avioan craiova, craiova iar-93, iar-93 vultur...            5
--------BIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------
Number of classes: 14, examples: 140000, vocab size: 2348651
Shape of phi_y: (14,)
Shape of phi_j_giv

In [None]:
# Question 3 :: bigram features on the raw tokens (no stemming, stop words kept).
token_col = "Tokenized Description"
preprocessing = {
    "target_col": token_col,
    "remove_stop_words": False,
    "with_stemming": False,
}

# window=[2] requests bigrams; only the training vocabulary is kept.
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "combined", **preprocessing, window=[2]
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "combined", **preprocessing, window=[2]
)

trainingData, testingData = getTrainingAndTestingData(
    df_train, df_test, target_field=token_col
)

# Fresh model for this configuration; the outcome is stored under a descriptive key.
model = NaiveBayes()
print("--------BIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
experiment_key = "BIGRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"
results[experiment_key] = run_model(
    model,
    vocabulary,
    trainingData,
    testingData,
    smoothening=1.0,  # presumably add-one smoothing; confirm in run_model
    text_col=token_col,
)

In [None]:
# Combined unigram + bigram features on the raw tokens (no stemming, stop words kept).
token_col = "Tokenized Description"
preprocessing = {
    "target_col": token_col,
    "remove_stop_words": False,
    "with_stemming": False,
}

# window=[1, 2] requests both n-gram sizes; only the training vocabulary is kept.
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "combined", **preprocessing, window=[1, 2]
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "combined", **preprocessing, window=[1, 2]
)

trainingData, testingData = getTrainingAndTestingData(
    df_train, df_test, target_field=token_col
)

# Fresh model for this configuration; the outcome is stored under a descriptive key.
model = NaiveBayes()
print("--------UNI+BI+GRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
experiment_key = "UNI+BI+GRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"
results[experiment_key] = run_model(
    model,
    vocabulary,
    trainingData,
    testingData,
    smoothening=1.0,  # presumably add-one smoothing; confirm in run_model
    text_col=token_col,
)

In [None]:
# Combined unigram + bigram features after stop-word removal and stemming.
token_col = "Tokenized Description"
preprocessing = {
    "target_col": token_col,
    "remove_stop_words": True,
    "with_stemming": True,
}

# window=[1, 2] requests both n-gram sizes; only the training vocabulary is kept.
df_train, vocabulary = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "combined", **preprocessing, window=[1, 2]
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "combined", **preprocessing, window=[1, 2]
)

trainingData, testingData = getTrainingAndTestingData(
    df_train, df_test, target_field=token_col
)

# Fresh model for this configuration; the outcome is stored under a descriptive key.
model = NaiveBayes()
print("--------UNI+BI+GRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
experiment_key = "UNI+BI+GRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"
results[experiment_key] = run_model(
    model,
    vocabulary,
    trainingData,
    testingData,
    smoothening=1.0,  # presumably add-one smoothing; confirm in run_model
    text_col=token_col,
)