In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /home/yann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/yann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Macro variables
thresh = 3
stop_words = stopwords.words('english')

def clean_review(string, stop_words=None, numbers=False):
    string = string.lower()
    tokens = word_tokenize(string)

    # remove useless words
    if stop_words:
        tokens = [w for w in tokens if not w in stop_words]

        # remove tokens containing numbers
    if numbers:
        numbers = [str(i) for i in range(10)]
        kill_list = []
        for number in numbers:
            kill_list = kill_list + [w for w in tokens if number in w]
        # removing selected tokens
        tokens = [w for w in tokens if not w in kill_list]
    
    # merge tokens
    string = ' '
    string = string.join( tokens)
    
    return string

In [3]:
df_train = pd.read_csv("../data/lab_train.txt", engine="python")
df_train.drop(columns="Unnamed: 0", inplace=True)
df_train.head()

Unnamed: 0,review,score
0,Before I begin I'd just like point out that I ...,1.0
1,I love all 4 of the movies. The way the storyl...,5.0
2,i love this movie. it is something i would cal...,5.0
3,"I really enjoy this movie so much,that I told ...",5.0
4,Having been a fan of Walt Disney movies for ye...,5.0


In [4]:
for i in range(len(df_train.review)):
    string = df_train.review.iloc[i]
    df_train.review.iloc[i] = clean_review(string, stop_words=stop_words, numbers=True)

df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,review,score
0,begin 'd like point reviewing film work `` art...,1.0
1,love movies . way storyline follows experince ...,5.0
2,love movie . something would call chain . efec...,5.0
3,"really enjoy movie much , told friends movie.i...",5.0
4,"fan walt disney movies years , extremely pleas...",5.0


In [5]:
df_test = pd.read_excel("../data/evaluation_dataset.xlsx", names=["review"], header=None)
for i in range(len(df_test.review)):
    string = df_test.review.iloc[i]
    df_test.review.iloc[i] = clean_review(string, stop_words=stop_words, numbers=True)
df_test.head()

Unnamed: 0,review
0,check staff friendly could enough help asked c...
1,room great - modern & clean . robes & slippers...
2,great hotel . staff friendly professional will...
3,price room stayed ( without breakfast ! ! ) n'...
4,parking facilities excellent find charges litt...


In [6]:
reviews = np.concatenate((df_train.review, df_test.review), axis=0)
vectorizer = TfidfVectorizer()
#X = vectorizer.fit_transform(reviews)
#X = X.toarray()
#X_train, X_test = X[:200], X[200:]

X_train = vectorizer.fit_transform(df_train.review)
X_train = X_train.toarray()

X_test = vectorizer.transform(df_test.review)
X_test = X_test.toarray()

# Define the threshold for which a review is considered as positive
Y_train = (df_train.score.values>=thresh).astype(dtype=np.float)

In [7]:
vectorizer.get_feature_names()

['abba',
 'abbe',
 'abc',
 'abducted',
 'ability',
 'able',
 'abnormal',
 'about',
 'abraham',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absorbing',
 'absurd',
 'abults',
 'abuse',
 'abuses',
 'academy',
 'accent',
 'accents',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accident',
 'accidental',
 'accidentally',
 'acclaim',
 'acclaimed',
 'accompanies',
 'accomplish',
 'accuarate',
 'accuracies',
 'accuracy',
 'accurate',
 'accurately',
 'accused',
 'achieved',
 'achievement',
 'aching',
 'acid',
 'acknowledging',
 'acording',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'activities',
 'actor',
 'actors',
 'actress',
 'actresses',
 'acts',
 'actual',
 'actually',
 'adam',
 'adapted',
 'adapting',
 'adaption',
 'add',
 'added',
 'addict',
 'addiction',
 'adding',
 'addition',
 'additional',
 'additonal',
 'address',
 'adds',
 'adequate',
 'administration',
 'admired',
 'admirers',
 'admit',
 'admittedly',
 'adolescents',
 'adrian',
 'ads',
 'adult',
 'adult

In [8]:
vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [9]:
# Compute the predictions
classifier = ComplementNB()
Y_pred = classifier.fit(X_train, Y_train).predict(X_test)

In [10]:
score = np.zeros(Y_pred.shape, dtype='O')
# score[Y_pred>=(thresh/5)]='positive'
# score[Y_pred<(thresh/5)]='negative'
df_test.insert(loc=1, column='prediction', value=Y_pred, allow_duplicates=True)
df_test.head()

Unnamed: 0,review,prediction
0,check staff friendly could enough help asked c...,1.0
1,room great - modern & clean . robes & slippers...,1.0
2,great hotel . staff friendly professional will...,1.0
3,price room stayed ( without breakfast ! ! ) n'...,1.0
4,parking facilities excellent find charges litt...,1.0


In [11]:
df_test.to_csv(path_or_buf ='../data/evaluation.csv', index=False)
print(Y_pred.sum())


185.0
