In [1]:
import numpy as np
from numpy.random import choice
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re, string

[nltk_data] Downloading package punkt to /home/yann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yann/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/yann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yann/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
df_train = pd.read_csv("../data/lab_train.txt", engine="python")
df_train.drop(columns="Unnamed: 0", inplace=True)
df_train.head() # 173 positive evaluations

Unnamed: 0,review,score
0,Before I begin I'd just like point out that I ...,1.0
1,I love all 4 of the movies. The way the storyl...,5.0
2,i love this movie. it is something i would cal...,5.0
3,"I really enjoy this movie so much,that I told ...",5.0
4,Having been a fan of Walt Disney movies for ye...,5.0


In [3]:
df_test = pd.read_csv("../data/lab_test.txt", engine="python")
df_test.drop(columns="Unnamed: 0", inplace=True)
df_test.head() # 88 positive evaluations

Unnamed: 0,review,score
0,How I thank the Lord for this DVD and movie......,5.0
1,A chance meeting on a train changes the life o...,5.0
2,Although I bought this box set only a week or ...,5.0
3,This film has tons of highlights. The waxing s...,4.0
4,"""Shaun"" is supposed to be a comedic 'spoof' an...",4.0


In [4]:
df_eval = pd.read_excel('../data/evaluation_dataset.xlsx', header=None, names=['review'])
df_eval.head()

Unnamed: 0,review
0,The check in staff were very friendly and coul...
1,The room was great - modern & clean. Robes & s...
2,This is a great hotel. The staff are very frie...
3,The price of the room which we stayed in was £...
4,The parking facilities are excellent but did f...


In [5]:
train_reviews = df_train.review.values
test_reviews = df_test.review.values

In [6]:
stop_words = stopwords.words('english')

def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

def clean_review(tokens, stop_words = stop_words, numbers=True):

    cleaned_tokens = []
        
    for token, tag in pos_tag(tokens):
        # Removing links
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        # Removing @
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
            
            # remove tokens containing numbers
    if numbers:
        numbers = [str(i) for i in range(10)]
        kill_list = []
        for number in numbers:
            kill_list = kill_list + [w for w in cleaned_tokens if number in w]
        # removing selected tokens
        cleaned_tokens = [w for w in cleaned_tokens if not w in kill_list]
        
        # merge tokens
        merged = ' '
        merged = merged.join(cleaned_tokens)
        
    return merged

def clean(array):
    for i, phrase in enumerate(array):
        array[i] = clean_review(word_tokenize(phrase))
    return array

In [7]:
train_reviews = clean(train_reviews)
test_reviews = clean(test_reviews)

In [8]:
vectorizer = TfidfVectorizer(min_df=3,
                             stop_words=stopwords.words('english'),
                             strip_accents='ascii')

X_train = vectorizer.fit_transform(train_reviews).toarray()
X_test = vectorizer.transform(test_reviews).toarray()

X_train.shape

(200, 1119)

In [9]:
def targets(score, thresh=3):
    targets = np.ones(score.shape, dtype=np.int)
    targets = targets - 2*((score<thresh).astype(dtype=np.int))
    return targets

In [10]:
Y_train = targets(df_train.score.values)
Y_test = targets(df_test.score.values)

In [11]:
# Trying to reduce influence of positive influence
neg = Y_train == -1
pos = Y_train == 1

classifiers=[]
scores = []
accuracy = []
for k in range(2000):
    idxs = choice(173, 30, replace=False)
    
    x_array = [X_train[neg]]+[X_train[idxs]]
    y_array = [Y_train[neg]]+[Y_train[idxs]]
    
    new_x_train = np.concatenate(x_array)
    new_y_train = np.concatenate(y_array)
    
    classifier = MultinomialNB()
    classifier.fit(new_x_train, new_y_train)
    Y_pred = classifier.predict(X_test)
    
    X_eval = vectorizer.transform(df_eval.review.values)
    score = classifier.predict(X_eval)
    
    classifiers.append(classifier)
    scores.append(f1_score(Y_test,Y_pred))
    accuracy.append(np.mean(Y_pred==Y_test))
    
print(scores[scores.index(max(scores))])
print(accuracy[scores.index(max(scores))])
print(accuracy[accuracy.index(max(accuracy))])

0.9257142857142858
0.87
0.87


In [12]:
classifier = classifiers[scores.index(max(scores))]

# Compute the predictions
Y_pred = classifier.predict(X_test)
print('Accuracy on test data is:', np.mean(Y_pred==Y_test))

Accuracy on test data is: 0.87


In [13]:
np.sum(Y_pred==-1)

13

### Evaluation

In [14]:
X_eval = vectorizer.transform(df_eval.review.values)
pred_eval = classifier.predict(X_eval)
np.sum(pred_eval==1)

171

In [15]:
predictions = np.zeros(pred_eval.shape, dtype='O')
predictions[pred_eval==1] = 'positive'
predictions[pred_eval==-1] = 'negative'

In [16]:
df_eval = pd.read_excel('../data/evaluation_dataset.xlsx', header=None, names=['review'])
df_eval.insert(1, "prediction", predictions, True)
df_eval.head()

Unnamed: 0,review,prediction
0,The check in staff were very friendly and coul...,negative
1,The room was great - modern & clean. Robes & s...,positive
2,This is a great hotel. The staff are very frie...,positive
3,The price of the room which we stayed in was £...,positive
4,The parking facilities are excellent but did f...,positive


In [17]:
df_eval.to_csv('../data/evaluation_cleaning.csv', index=False)