## Sentiment analysis of reviews, based on the following tutorial
### https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

In [27]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk

# Fetch the NLTK resources this notebook relies on (downloads are skipped
# by NLTK when the package is already up to date).
for package in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(package)

from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re, string

[nltk_data] Downloading package punkt to /home/yann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yann/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/yann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yann/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Load the training reviews and discard the leftover "Unnamed: 0" column.
df_train = (
    pd.read_csv("../data/lab_train.txt", engine="python")
    .drop(columns="Unnamed: 0")
)
df_train.head()

Unnamed: 0,review,score
0,Before I begin I'd just like point out that I ...,1.0
1,I love all 4 of the movies. The way the storyl...,5.0
2,i love this movie. it is something i would cal...,5.0
3,"I really enjoy this movie so much,that I told ...",5.0
4,Having been a fan of Walt Disney movies for ye...,5.0


In [5]:
def lemmatize_sentence(tokens):
    """Return the lemma of every token in ``tokens``, in order.

    The tokens are POS-tagged with ``pos_tag``; tags starting with ``NN`` are
    lemmatized as nouns, tags starting with ``VB`` as verbs, and every other
    tag as an adjective.
    """
    wordnet = WordNetLemmatizer()

    def _wordnet_pos(treebank_tag):
        # Map the tagger's tag onto the POS letter passed to the lemmatizer.
        if treebank_tag.startswith('NN'):
            return 'n'
        if treebank_tag.startswith('VB'):
            return 'v'
        return 'a'

    return [
        wordnet.lemmatize(word, _wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

['i', 'love', 'this', 'movie', '.', 'it', 'be', 'something', 'i', 'would', 'call', 'off', 'the', 'chain', '.', 'the', 'efects', 'and', 'every', 'thing', 'be', 'off', 'the', 'chain', '.', 'if', 'u', 'havent', 'see', 'this', 'movie', 'then', 'u', 'should', 'go', 'and', 'get', 'it', '.', 'i', 'bet', 'u', 'would', 'like', 'it', '.']


In [30]:
stop_words = stopwords.words('english')

def clean_review(tokens, stop_words = stop_words, numbers=True):
    """Normalise a tokenised review and return it as one space-separated string.

    Each token has URLs and @-mentions stripped, is lemmatized according to
    its part-of-speech tag and is lower-cased. Empty tokens, tokens that are
    a substring of ``string.punctuation`` and stop words are dropped.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single review, e.g. the output of ``word_tokenize``.
    stop_words : iterable of str, optional
        Lower-case words to discard. Defaults to the module-level
        ``stop_words`` list.
    numbers : bool, optional
        When true (the default), tokens containing any of the digits 0-9 are
        removed before tagging.

    Returns
    -------
    str
        The kept tokens joined by single spaces. An empty string is returned
        when no token survives, including when ``tokens`` is empty.
    """
    if numbers:
        # Drop every token that contains at least one ASCII digit.
        tokens = [w for w in tokens if not any(d in w for d in '0123456789')]

    # Built once per call: set membership is cheaper than scanning a list for
    # every token, and the lemmatizer does not need to be recreated per token.
    stop_set = frozenset(stop_words)
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tokens):
        # Removing links (raw strings avoid the invalid "\(" escape warning;
        # the pattern itself is unchanged).
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        # Removing @
        token = re.sub(r"(@[A-Za-z0-9_]+)", "", token)

        # The tag was computed on the original token, before the substitutions.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # `in string.punctuation` is a substring test, so multi-character
        # tokens such as '...' are not treated as punctuation and are kept.
        if token and token not in string.punctuation and lowered not in stop_set:
            cleaned_tokens.append(lowered)

    # Join once, after the loop, so an empty token list yields '' instead of
    # raising UnboundLocalError.
    return ' '.join(cleaned_tokens)

# Sanity check: show one review followed by a blank line and its cleaned form.
print(df_train.review[1], end="\n\n")
print(clean_review(word_tokenize(df_train.review[1])))

love movie way storyline follow experince one brave woman 's great action alien stuff glue tv would tell everybody like syfy action get movies. br sheila

love movie way storyline follow experince one brave woman 's great action alien stuff glue tv would tell everybody like syfy action get movie br sheila


In [9]:
# Reload the raw training reviews so that re-running this cell always starts
# from uncleaned text.
df_train = (
    pd.read_csv("../data/lab_train.txt", engine="python")
    .drop(columns="Unnamed: 0")
)

# Tokenize and clean every review. The column is assigned as a whole:
# `df_train.review.iloc[i] = ...` is chained assignment, which raises
# SettingWithCopyWarning and does not update the frame under pandas
# Copy-on-Write.
df_train["review"] = df_train["review"].apply(
    lambda text: clean_review(word_tokenize(text), stop_words=stop_words)
)

df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,review,score
0,begin 'd like point review film work `` art ''...,1.0
1,love movie way storyline follow experince one ...,5.0
2,love movie something would call chain efects e...,5.0
3,really enjoy movie much tell friend movie.its ...,5.0
4,fan walt disney movie year extremely pleased f...,5.0


In [10]:
# Load the raw test reviews and drop the leftover "Unnamed: 0" column.
df_test = (
    pd.read_csv("../data/lab_test.txt", engine="python")
    .drop(columns="Unnamed: 0")
)

# Tokenize and clean every review. The column is assigned as a whole:
# `df_test.review.iloc[i] = ...` is chained assignment, which raises
# SettingWithCopyWarning and does not update the frame under pandas
# Copy-on-Write.
df_test["review"] = df_test["review"].apply(
    lambda text: clean_review(word_tokenize(text), stop_words=stop_words)
)

df_test.head()

Unnamed: 0,review,score
0,thank lord dvd movie ... touch life years..in ...,5.0
1,chance meeting train change life tennis player...,5.0
2,although buy box set week two decide beatles f...,5.0
3,film ton highlight wax scene br br personal fa...,4.0
4,`` shaun '' suppose comedic 'spoof 'homage `` ...,4.0


In [22]:
# Fit the TF-IDF vocabulary on the training reviews only, then reuse the
# fitted vectorizer for the test reviews. Both matrices are made dense.
vectorizer = TfidfVectorizer(min_df=2)
X_train = vectorizer.fit_transform(df_train.review).toarray()
X_test = vectorizer.transform(df_test.review).toarray()

X_train.shape, X_test.shape

((200, 1906), (100, 1906))

In [24]:
# Threshold at or above which a review score is considered positive.
thresh = 3
# Binary labels: 1.0 for positive reviews, 0.0 otherwise. The builtin `float`
# replaces `np.float`, an alias deprecated in NumPy 1.20 and removed in 1.24.
Y_train = (df_train.score.values >= thresh).astype(float)
Y_test = (df_test.score.values >= thresh).astype(float)

In [25]:
# Train a multinomial naive Bayes model on the training features, then
# predict the labels of the test reviews.
classifier = MultinomialNB()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

In [26]:
# Accuracy: number of test predictions equal to the true label, divided by
# the number of predictions.
score = (Y_pred == Y_test).sum() / len(Y_pred)
print('Score accuracy is ', score)

Score accuracy is  0.88
