In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np
import random

from keras.models import Sequential
from keras.layers import Dense

**Improvements**

1. More models
2. Look at the Kaggle notebooks and take inspo

# Crunch
Only run when adding new stuff to cleaner, otherwise use the CSV in the next section

In [None]:
# reading original
original = pd.read_csv('labeledTrainData.tsv', sep="\t")
original.head()

In [None]:
stops = stopwords.words("english")
porter = PorterStemmer()

def clean_text(text):
    words = word_tokenize(text)
    words_no_punc = [word.lower() for word in words if word.isalpha()]
    no_stop = [word for word in words_no_punc if word not in stops]
    stems = [porter.stem(word) for word in no_stop]
    clean = ' '.join(stems)

    return clean

original["review_clean"] = original["review"].apply(lambda text: clean_text(text))
original.to_csv('train_clean.csv', index=False)

# Exploration

In [2]:
df = pd.read_csv('train_clean.csv', usecols=["sentiment", "review", "review_clean"])

In [3]:
random_nr = random.randint(0, len(df))

old = df["review"][random_nr]
new = df["review_clean"][random_nr]

print(f'Sentiment: {df["sentiment"][random_nr]}')
print(f'\nBefore ({len(old)} chars):')
print(old[:200])
print(f'\nAfter ({len(new)} chars):')
print(new[:200])

Sentiment: 0

Before (1127 chars):
Steve Martin should quit trying to do remakes of classic comedy. He absolutely does not fit this part. Like the woeful remake of the Out Of Towners, this movie falls flat on it's face. How anybody eve

After (639 chars):
steve martin quit tri remak classic comedi absolut fit part like woeful remak towner movi fall flat face anybodi ever thought steve martin could even come close jack lemmon wonder perform beyond true 


# Train and test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df['review_clean'], df['sentiment'], test_size=0.2)

In [5]:
# Bag of Words
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Logistic regression
~88% accuracy

In [6]:
# Fitting
logreg = LogisticRegression(max_iter=1000, verbose=2)

logreg.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.3s finished


LogisticRegression(max_iter=1000, verbose=2)

In [7]:
pred_logreg = logreg.predict(X_test)
confusion_matrix(pred_logreg, y_test)

array([[2154,  295],
       [ 320, 2231]])

In [8]:
print(classification_report(pred_logreg, y_test))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88      2449
           1       0.88      0.87      0.88      2551

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



# KNN
~63% accuracy

In [None]:
# getting optimal nr of neighbors (takes ~2 mins)
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(), 
    param_grid={'n_neighbors': np.arange(3,11)}, # surely not less than 3, 11 as max due to time consumption
    verbose=2,
    cv=3
)
knn_grid.fit(X_train, y_train)
optimal_neighbors = knn_grid.best_params_['n_neighbors']
optimal_neighbors # Will return 9

In [9]:
knn = KNeighborsClassifier(n_neighbors=9) # optimal_neighbors w/o having to run it
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=9)

In [10]:
pred_knn = knn.predict(X_test)
confusion_matrix(pred_knn, y_test)

array([[1670, 1040],
       [ 804, 1486]])

In [11]:
print(classification_report(pred_knn, y_test))

              precision    recall  f1-score   support

           0       0.68      0.62      0.64      2710
           1       0.59      0.65      0.62      2290

    accuracy                           0.63      5000
   macro avg       0.63      0.63      0.63      5000
weighted avg       0.64      0.63      0.63      5000



# Naive Bayes
~65% accuracy

In [12]:
nb = GaussianNB()

nb.fit(X_train.toarray(), y_train)

GaussianNB()

In [13]:
pred_nb = nb.predict(X_test.toarray())
confusion_matrix(pred_nb, y_test)

array([[2022, 1231],
       [ 452, 1295]])

In [14]:
print(classification_report(pred_nb, y_test))

              precision    recall  f1-score   support

           0       0.82      0.62      0.71      3253
           1       0.51      0.74      0.61      1747

    accuracy                           0.66      5000
   macro avg       0.66      0.68      0.66      5000
weighted avg       0.71      0.66      0.67      5000



# Random Forest
~85% accuracy

In [15]:
forest = RandomForestClassifier(max_depth=50, n_estimators=100)

forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=50)

In [16]:
pred_forest = forest.predict(X_test)
confusion_matrix(pred_forest, y_test)

array([[2059,  322],
       [ 415, 2204]])

In [17]:
print(classification_report(pred_forest, y_test))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85      2381
           1       0.87      0.84      0.86      2619

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



# ANN
~87% accuracy

In [18]:
ann = Sequential()
ann.add(Dense(10, activation = 'relu'))
ann.add(Dense(10, activation = 'relu'))
ann.add(Dense(1, activation = 'sigmoid'))
ann.compile(optimizer = 'Adam', loss ='binary_crossentropy', metrics = ['accuracy'])
ann.fit(X_train.toarray(), y_train, batch_size=32, epochs=5, verbose=2)

Epoch 1/5
625/625 - 4s - loss: 0.3561 - accuracy: 0.8562
Epoch 2/5
625/625 - 3s - loss: 0.1610 - accuracy: 0.9431
Epoch 3/5
625/625 - 3s - loss: 0.0885 - accuracy: 0.9707
Epoch 4/5
625/625 - 3s - loss: 0.0525 - accuracy: 0.9844
Epoch 5/5
625/625 - 4s - loss: 0.0317 - accuracy: 0.9905


<tensorflow.python.keras.callbacks.History at 0x15089b220>

In [19]:
pred_ann = ann.predict(X_test) > 0.5
confusion_matrix(pred_ann, y_test)

array([[2177,  363],
       [ 297, 2163]])

In [20]:
print(classification_report(pred_ann, y_test))

              precision    recall  f1-score   support

       False       0.88      0.86      0.87      2540
        True       0.86      0.88      0.87      2460

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000

