In [1]:
import pandas as pd
from naive_bayes_custom import NaiveBayesCustom
from utils import tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary, getTrainingAndTestingData2, plot_wordclouds_per_class
from modelUtils import run_model2

In [2]:
# Read the training split, then report its shape followed by its first rows.
df_train = pd.read_csv("train.csv")
print(f"df_train.shape : {df_train.shape}", df_train.head(), sep="\n")

# Same report for the test split.
df_test = pd.read_csv("test.csv")
print(f"df_test.shape : {df_test.shape}", df_test.head(), sep="\n")

# Filled in by the experiment cells below: one run_model2 return value per
# configuration label.
results = dict()

df_train.shape : (140000, 3)
   label                              title  \
0      3                          Ernie Cox   
1     10                          Holosteum   
2      9                Pestarella tyrrhena   
3      1          MidSun Junior High School   
4      6  St James' Church Wrightington Bar   

                                             content  
0   Ernest Ernie Cox (February 17 1894 – February...  
1   Holosteum is a genus of plants in the Pink fa...  
2   Pestarella tyrrhena (formerly Callianassa tyr...  
3   MidSun Junior High School is a Canadian middl...  
4   St James' Church Wrightington Bar is in Churc...  
df_test.shape : (35000, 3)
   label                          title  \
0      4                   Lajos Drahos   
1      5          USS Huntsville (1857)   
2      0                         SCAFCO   
3      6               McLean's Mansion   
4      5  Avioane Craiova IAR-93 Vultur   

                                             content  
0   Lajos Drahos 

In [3]:
# Question 1, part 1 :: unigrams (window = [1]), no stemming, stop words kept.
# Title and content are tokenized into two separate columns; both columns and the
# two training vocabularies are then handed to run_model2.
preprocessing = {"remove_stop_words": False, "with_stemming": False}
run_label = "UNIGRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"

# Titles -- the vocabulary returned for the test split is discarded.
df_train, vocabularyTitle = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "title", target_col="Tokenized Title", window=[1], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "title", target_col="Tokenized Title", window=[1], **preprocessing
)

# Content -- again only the training vocabulary is kept.
df_train, vocabularyContent = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "content", target_col="Tokenized Description", window=[1], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "content", target_col="Tokenized Description", window=[1], **preprocessing
)

trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# Fresh classifier for this configuration; the outcome is stored under run_label.
model = NaiveBayesCustom()
print("--------UNIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results[run_label] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=1.0,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
)


Training Data Sample:
                           Tokenized Title  \
0                             [ernie, cox]   
1                              [holosteum]   
2                   [pestarella, tyrrhena]   
3           [midsun, junior, high, school]   
4  [st, james', church, wrightington, bar]   

                               Tokenized Description  Class Index  
0  [ernest, ernie, cox, (february, 17, 1894, –, f...            3  
1  [holosteum, is, a, genus, of, plants, in, the,...           10  
2  [pestarella, tyrrhena, (formerly, callianassa,...            9  
3  [midsun, junior, high, school, is, a, canadian...            1  
4  [st, james', church, wrightington, bar, is, in...            6  

Testing Data Sample:
                      Tokenized Title  \
0                     [lajos, drahos]   
1           [uss, huntsville, (1857)]   
2                            [scafco]   
3                 [mclean's, mansion]   
4  [avioane, craiova, iar-93, vultur]   

                       

In [4]:
# Question 1, part 2 :: unigrams (window = [1]), with stemming and stop-word removal.
# Title and content are re-tokenized from the raw "title" / "content" columns,
# overwriting the token columns written by the previous configuration.
preprocessing = {"remove_stop_words": True, "with_stemming": True}
run_label = "UNIGRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"

# Titles -- the vocabulary returned for the test split is discarded.
df_train, vocabularyTitle = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "title", target_col="Tokenized Title", window=[1], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "title", target_col="Tokenized Title", window=[1], **preprocessing
)

# Content -- again only the training vocabulary is kept.
df_train, vocabularyContent = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "content", target_col="Tokenized Description", window=[1], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "content", target_col="Tokenized Description", window=[1], **preprocessing
)

trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# Fresh classifier for this configuration; the outcome is stored under run_label.
model = NaiveBayesCustom()
print("--------UNIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
results[run_label] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=1.0,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
)


Training Data Sample:
                           Tokenized Title  \
0                              [erni, cox]   
1                              [holosteum]   
2                   [pestarella, tyrrhena]   
3           [midsun, junior, high, school]   
4  [st, james', church, wrightington, bar]   

                               Tokenized Description  Class Index  
0  [ernest, erni, cox, (februari, 17, 1894, –, fe...            3  
1  [holosteum, genu, plant, pink, famili, (caryop...           10  
2  [pestarella, tyrrhena, (formerli, callianassa,...            9  
3  [midsun, junior, high, school, canadian, middl...            1  
4  [st, james', church, wrightington, bar, church...            6  

Testing Data Sample:
                     Tokenized Title  \
0                      [lajo, draho]   
1            [uss, huntsvil, (1857)]   
2                           [scafco]   
3                 [mclean', mansion]   
4  [avioan, craiova, iar-93, vultur]   

                             

In [None]:
# Question 1, part 4 :: bigrams (window = [2]), no stemming, stop words kept.
# Title and content are re-tokenized from the raw "title" / "content" columns,
# overwriting whatever token columns an earlier configuration wrote.
preprocessing = {"remove_stop_words": False, "with_stemming": False}
run_label = "BIGRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"

# Titles -- the vocabulary returned for the test split is discarded.
df_train, vocabularyTitle = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "title", target_col="Tokenized Title", window=[2], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "title", target_col="Tokenized Title", window=[2], **preprocessing
)

# Content -- again only the training vocabulary is kept.
df_train, vocabularyContent = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "content", target_col="Tokenized Description", window=[2], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "content", target_col="Tokenized Description", window=[2], **preprocessing
)

trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# Fresh classifier for this configuration; the outcome is stored under run_label.
model = NaiveBayesCustom()
print("--------BIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results[run_label] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=1.0,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
)


Training Data Sample:
                                     Tokenized Title  \
0                                        [ernie cox]   
1                                                 []   
2                              [pestarella tyrrhena]   
3          [midsun junior, junior high, high school]   
4  [st james', james' church, church wrightington...   

                               Tokenized Description  Class Index  
0  [ernest ernie, ernie cox, cox (february, (febr...            3  
1  [holosteum is, is a, a genus, genus of, of pla...           10  
2  [pestarella tyrrhena, tyrrhena (formerly, (for...            9  
3  [midsun junior, junior high, high school, scho...            1  
4  [st james', james' church, church wrightington...            6  

Testing Data Sample:
                                    Tokenized Title  \
0                                    [lajos drahos]   
1               [uss huntsville, huntsville (1857)]   
2                                            

In [None]:
# Question 1, part 3 :: bigrams (window = [2]), with stemming and stop-word removal.
# Title and content are re-tokenized from the raw "title" / "content" columns,
# overwriting whatever token columns an earlier configuration wrote.
preprocessing = {"remove_stop_words": True, "with_stemming": True}
run_label = "BIGRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"

# Titles -- the vocabulary returned for the test split is discarded.
df_train, vocabularyTitle = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "title", target_col="Tokenized Title", window=[2], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "title", target_col="Tokenized Title", window=[2], **preprocessing
)

# Content -- again only the training vocabulary is kept.
df_train, vocabularyContent = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "content", target_col="Tokenized Description", window=[2], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "content", target_col="Tokenized Description", window=[2], **preprocessing
)

trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# Fresh classifier for this configuration; the outcome is stored under run_label.
model = NaiveBayesCustom()
print("--------BIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
results[run_label] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=1.0,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
)

In [None]:
# Question 1, part 5 :: unigrams + bigrams (window = [1, 2]), no stemming, stop words kept.
# Title and content are re-tokenized from the raw "title" / "content" columns,
# overwriting whatever token columns an earlier configuration wrote.
preprocessing = {"remove_stop_words": False, "with_stemming": False}
run_label = "UNI+BIGRAM-WITHOUT STEMMING-WITHOUT STOP WORDS REMOVAL"

# Titles -- the vocabulary returned for the test split is discarded.
df_train, vocabularyTitle = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "title", target_col="Tokenized Title", window=[1, 2], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "title", target_col="Tokenized Title", window=[1, 2], **preprocessing
)

# Content -- again only the training vocabulary is kept.
df_train, vocabularyContent = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "content", target_col="Tokenized Description", window=[1, 2], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "content", target_col="Tokenized Description", window=[1, 2], **preprocessing
)

trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# Fresh classifier for this configuration; the outcome is stored under run_label.
model = NaiveBayesCustom()
print("--------UNI+BIGRAM -- WITHOUT STEMMING -- WITHOUT STOP WORDS REMOVAL---------")
results[run_label] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=1.0,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
)

In [None]:
# Question 1, part 6 :: unigrams + bigrams (window = [1, 2]), with stemming and
# stop-word removal.
# Title and content are re-tokenized from the raw "title" / "content" columns,
# overwriting whatever token columns an earlier configuration wrote.
preprocessing = {"remove_stop_words": True, "with_stemming": True}
run_label = "UNI+BIGRAM-WITH STEMMING-WITH STOP WORDS REMOVAL"

# Titles -- the vocabulary returned for the test split is discarded.
df_train, vocabularyTitle = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "title", target_col="Tokenized Title", window=[1, 2], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "title", target_col="Tokenized Title", window=[1, 2], **preprocessing
)

# Content -- again only the training vocabulary is kept.
df_train, vocabularyContent = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_train, "content", target_col="Tokenized Description", window=[1, 2], **preprocessing
)
df_test, _ = tokenizeAndRemoveStopWordsOrStemAndReturnVocabulary(
    df_test, "content", target_col="Tokenized Description", window=[1, 2], **preprocessing
)

trainingData, testingData = getTrainingAndTestingData2(
    df_train,
    df_test,
    target_field1="Tokenized Title",
    target_field2="Tokenized Description",
)

# Fresh classifier for this configuration; the outcome is stored under run_label.
model = NaiveBayesCustom()
print("--------UNI+BIGRAM -- WITH STEMMING -- WITH STOP WORDS REMOVAL---------")
results[run_label] = run_model2(
    model,
    vocabularyTitle,
    vocabularyContent,
    trainingData,
    testingData,
    smoothening=1.0,
    text_col1="Tokenized Title",
    text_col2="Tokenized Description",
)