In [41]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /home/yann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [42]:
# Macro variables
thresh = 3
stop_words = stopwords.words('english')

def clean_review(string, stop_words=None, numbers=False):
    string = string.lower()
    tokens = word_tokenize(string)

    # remove useless words
    if stop_words:
        tokens = [w for w in tokens if not w in stop_words]

        # remove tokens containing numbers
    if numbers:
        numbers = [str(i) for i in range(10)]
        kill_list = []
        for number in numbers:
            kill_list = kill_list + [w for w in tokens if number in w]
        # removing selected tokens
        tokens = [w for w in tokens if not w in kill_list]
    
    # merge tokens
    string = ' '
    string = string.join( tokens)
    
    return string

In [43]:
df_train = pd.read_csv("../data/lab_train.txt", engine="python")
df_train.drop(columns="Unnamed: 0", inplace=True)
df_train.head()

Unnamed: 0,review,score
0,Before I begin I'd just like point out that I ...,1.0
1,I love all 4 of the movies. The way the storyl...,5.0
2,i love this movie. it is something i would cal...,5.0
3,"I really enjoy this movie so much,that I told ...",5.0
4,Having been a fan of Walt Disney movies for ye...,5.0


In [44]:
for i in range(len(df_train.review)):
    string = df_train.review.iloc[i]
    df_train.review.iloc[i] = clean_review(string, stop_words=stop_words, numbers=True)

df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,review,score
0,begin 'd like point reviewing film work `` art...,1.0
1,love movies . way storyline follows experince ...,5.0
2,love movie . something would call chain . efec...,5.0
3,"really enjoy movie much , told friends movie.i...",5.0
4,"fan walt disney movies years , extremely pleas...",5.0


In [45]:
df_test = pd.read_csv("../data/lab_test.txt", engine="python")
df_test.drop(columns="Unnamed: 0", inplace=True)
df_test.head()

Unnamed: 0,review,score
0,How I thank the Lord for this DVD and movie......,5.0
1,A chance meeting on a train changes the life o...,5.0
2,Although I bought this box set only a week or ...,5.0
3,This film has tons of highlights. The waxing s...,4.0
4,"""Shaun"" is supposed to be a comedic 'spoof' an...",4.0


In [46]:
for i in range(len(df_test.review)):
    string = df_test.review.iloc[i]
    df_test.review.iloc[i] = clean_review(string, stop_words=stop_words, numbers=True)

df_test.head()

Unnamed: 0,review,score
0,thank lord dvd movie ... touched life years..i...,5.0
1,chance meeting train changes life tennis playe...,5.0
2,although bought box set week two decided beatl...,5.0
3,film tons highlights . waxing scene < br / > <...,4.0
4,`` shaun '' supposed comedic 'spoof ' 'homage ...,4.0


In [47]:
vectorizer = TfidfVectorizer(min_df=5)

X_train = vectorizer.fit_transform(df_train.review)
X_train = X_train.toarray()

X_test = vectorizer.transform(df_test.review)
X_test = X_test.toarray()

X_train.shape, X_test.shape

((200, 629), (100, 629))

In [None]:
# Define the threshold for which a review is considered as positive
Y_train = (df_train.score.values>=thresh).astype(dtype=np.float)
Y_test = (df_test.score.values>=thresh).astype(dtype=np.float)

In [None]:
vectorizer.get_feature_names()

In [None]:
# Compute the predictions
classifier = ComplementNB()
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)

In [None]:
score = ((Y_pred == Y_test).sum())/Y_pred.shape[0]
print('Score accuracy is ', score)