In [9]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import random

**Improvements**

1. More models
2. Look at the Kaggle notebooks and take inspo

# Crunch
Only run when adding new stuff to cleaner, otherwise use the CSV in the next section

In [10]:
# reading original
original = pd.read_csv('labeledTrainData.tsv', sep="\t")
original.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [25]:
stops = stopwords.words("english")
porter = PorterStemmer()

def clean_text(text):
    words = word_tokenize(text)
    words_no_punc = [word.lower() for word in words if word.isalpha()]
    no_stop = [word for word in words_no_punc if word not in stops]
    stems = [porter.stem(word) for word in no_stop]
    clean = ' '.join(stems)

    return clean

original["review_clean"] = original["review"].apply(lambda text: clean_text(text))
original.to_csv('train_clean.csv', index=False)

# Exploration

In [26]:
df = pd.read_csv('train_clean.csv', usecols=["sentiment", "review", "review_clean"])

In [27]:
random_nr = random.randint(0, len(df))

old = df["review"][random_nr]
new = df["review_clean"][random_nr]

print(f'Sentiment: {df["sentiment"][random_nr]}')
print(f'\nBefore ({len(old)} chars):')
print(old[:200])
print(f'\nAfter ({len(new)} chars):')
print(new[:200])

Sentiment: 1

Before (333 chars):
This movie is just too funny, a totally non-PC gangster romp. If Mel Brooks made a picture about the Mob in the 30's, it would probably look like this. Too many great one-liners to to remember, and wh

After (171 chars):
movi funni total gangster romp mel brook made pictur mob would probabl look like mani great rememb everyon anyon doesnt laugh whole bunch time doesnt puls put laugh icehol


# Train and test

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['review_clean'], df['sentiment'], test_size=0.2)

In [15]:
# Bag of Words
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Logistic regression
~87% accuracy

In [16]:
# Fitting
logreg = LogisticRegression(max_iter=1000, verbose=2)

logreg.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.2s finished


LogisticRegression(max_iter=1000, verbose=2)

In [17]:
pred_logreg = logreg.predict(X_test)
confusion_matrix(pred_logreg, y_test)

array([[2187,  281],
       [ 338, 2194]])

In [18]:
print(classification_report(pred_logreg, y_test))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88      2468
           1       0.89      0.87      0.88      2532

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



# KNN
~63% accuracy

In [None]:
# getting optimal nr of neighbors (takes ~2 mins)
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(), 
    param_grid={'n_neighbors': np.arange(3,11)}, # surely not less than 3, 11 as max due to time consumption
    verbose=2,
    cv=3
)
knn_grid.fit(X_train, y_train)
optimal_neighbors = knn_grid.best_params_['n_neighbors']
optimal_neighbors # Will return 9

In [19]:
knn = KNeighborsClassifier(n_neighbors=9) # optimal_neighbors w/o having to run it
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=9)

In [20]:
pred_knn = knn.predict(X_test)
confusion_matrix(pred_knn, y_test)

array([[1746, 1168],
       [ 779, 1307]])

In [21]:
print(classification_report(pred_knn, y_test))

              precision    recall  f1-score   support

           0       0.69      0.60      0.64      2914
           1       0.53      0.63      0.57      2086

    accuracy                           0.61      5000
   macro avg       0.61      0.61      0.61      5000
weighted avg       0.62      0.61      0.61      5000



# Naive Bayes
~65% accuracy

In [22]:
nb = GaussianNB()

nb.fit(X_train.toarray(), y_train)

KeyboardInterrupt: 

In [None]:
pred_nb = nb.predict(X_test.toarray())
confusion_matrix(pred_nb, y_test)

In [None]:
print(classification_report(pred_nb, y_test))

# Random Forest
~85% accuracy

In [6]:
forest = RandomForestClassifier(max_depth=50, n_estimators=100)

forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=50, n_estimators=10)

In [7]:
pred_forest = forest.predict(X_test)
confusion_matrix(pred_forest, y_test)

array([[1868,  559],
       [ 619, 1954]])

In [8]:
print(classification_report(pred_forest, y_test))

              precision    recall  f1-score   support

           0       0.75      0.77      0.76      2427
           1       0.78      0.76      0.77      2573

    accuracy                           0.76      5000
   macro avg       0.76      0.76      0.76      5000
weighted avg       0.76      0.76      0.76      5000

