# Import Data and NLTK


In [1]:
import nltk 
nltk.download('all') 

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

In [2]:
# store data as a panda dataframe 
import pandas as pd 

review_df = pd.read_csv("/content/drive/MyDrive/train.csv",
                        names = ["Label", "Review"] )
print(review_df.head())

   Label                                             Review
0      1  Unfortunately, the frustration of being Dr. Go...
1      2  Been going to Dr. Goldberg for over 10 years. ...
2      1  I don't know what Dr. Goldberg was like before...
3      1  I'm writing this review to give you a heads up...
4      2  All the food is great here. But the best thing...


In [3]:
print(review_df.loc[review_df["Label"] == 1].count())
print(review_df.loc[review_df["Label"] == 2].count())

Label     280000
Review    280000
dtype: int64
Label     280000
Review    280000
dtype: int64


# Data Pre-Processing

In [4]:
# develop a custom tokeniser 

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import string

stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)
l = WordNetLemmatizer() 

# this tokenizer cleans the data 

def clean(v):
    tokens = word_tokenize(v) #tokenize
    lower_case = [i.lower() for i in tokens] #make letters lowercase
    strip = [i.translate(table) for i in lower_case] # remove punctuations 
    words = [i for i in strip if i.isalpha()] #remove non alphabetic terms
    main_words = [i for i in words if not i in stop_words] #remove stop words
    final_words = [l.lemmatize(i) for i in main_words] # lemmatize the words
    return (final_words)



In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# convert the reviews into sparse vectors 
vectoriser = CountVectorizer(tokenizer= clean)
Count = vectoriser.fit_transform(review_df.Review)


In [29]:
print("Length of Vocabulary:", len(vectoriser.get_feature_names()))
print("20 words from the vocabulary:", vectoriser.get_feature_names()[35015:35020] )


Length of Vocabulary: 509137
20 words from the vocabulary: ['batching', 'batchlorette', 'batchn', 'batchni', 'batchnnand']


In [6]:
# labels of all the reviews
label = review_df.Label

In [7]:
# split data into training and testing data 
import sklearn

x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(Count,label, test_size=0.25, random_state=10, shuffle=True)


In [8]:
# 420,000 training reviews and 140,000 testing reviews
print(x_train.shape)
print(x_test.shape)


(420000, 509137)
(140000, 509137)


# Naive Bayes Evaluation

In [None]:
#evaluating the multinomial naive bayes model

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics 

model = MultinomialNB()
model.fit(x_train, y_train)

test = model.predict(x_test)
accuracy = metrics.accuracy_score(test, y_test)

print("Multinomial Naive Bayes test accuaracy: ", accuracy)


Multinomial Naive Bayes test accuaracy:  0.8706142857142857


In [None]:
#evaluating the Bernoulli Naive bayes model 
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
model.fit(x_train, y_train)

test = model.predict(x_test)
accuracy = metrics.accuracy_score(test, y_test)

print(" Bernoulli Naive Bayes test accuaracy: ", accuracy)


 Bernoulli Naive Bayes test accuaracy:  0.7928642857142857


In [None]:
#evaluating the Complement Naive bayes model 
from sklearn.naive_bayes import ComplementNB
model = ComplementNB()
model.fit(x_train, y_train)

test = model.predict(x_test)
accuracy = metrics.accuracy_score(test, y_test)

print(" Complement Naive Bayes test accuaracy: ", accuracy)

 Complement Naive Bayes test accuaracy:  0.87065
