In [17]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import random

**Improvements**

- Name Entity Recognition to remove those
- PoS, only look at e.g. adjectives?
- Other ML-modell than logreg?

# Crunch
Only run when adding new stuff to cleaner, otherwise use the CSV in the next section

In [None]:
# reading original
original = pd.read_csv('labeledTrainData.tsv', sep="\t")
original.head()

In [None]:
stops = stopwords.words("english")
porter = PorterStemmer()

def clean_text(text):
    words = word_tokenize(text)
    words_no_punc = [word.lower() for word in words if word.isalpha()]
    no_stop = [word for word in words_no_punc if word not in stops]
    stems = [porter.stem(word) for word in no_stop]
    clean = ' '.join(stems)

    return clean

original["review_clean"] = original["review"].apply(lambda text: clean_text(text))
original.to_csv('train_clean.csv', index=False)

# Exploration

In [3]:
df = pd.read_csv('train_clean.csv', usecols=["sentiment", "review", "review_clean"])

In [4]:
random_nr = random.randint(0, len(df))

old = df["review"][random_nr]
new = df["review_clean"][random_nr]

print(f'Sentiment: {df["sentiment"][random_nr]}')
print(f'\nBefore ({len(old)} chars):')
print(old[:200])
print(f'\nAfter ({len(new)} chars):')
print(new[:200])

Sentiment: 0

Before (2196 chars):
Painfully bad Christmas film that has an equally painfully bad performance by Vince Vaughn, who is paying his usual frat boy self but this time for a children's movie but with out the wit or charm tha

After (1072 chars):
pain bad christma film equal pain bad perform vinc vaughn pay usual frat boy self time children movi wit charm r rate film vaughn seem like autopilot though film keep run wall lacklust perform minut f


# Train and test

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df['review_clean'], df['sentiment'])

In [6]:
# Bag of Words
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Logistic regression
~87% accuracy

In [7]:
# Fitting
logreg = LogisticRegression(max_iter=1000, verbose=2)

logreg.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s finished


LogisticRegression(max_iter=1000, verbose=2)

In [8]:
pred_logreg = logreg.predict(X_test)
confusion_matrix(pred_logreg, y_test)

array([[2682,  348],
       [ 449, 2771]])

In [9]:
print(classification_report(pred_logreg, y_test))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87      3030
           1       0.89      0.86      0.87      3220

    accuracy                           0.87      6250
   macro avg       0.87      0.87      0.87      6250
weighted avg       0.87      0.87      0.87      6250



# KNN
~63% accuracy

In [None]:
# getting optimal nr of neighbors (takes ~2 mins)
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(), 
    param_grid={'n_neighbors': np.arange(3,11)}, # surely not less than 3, 11 as max due to time consumption
    verbose=2,
    cv=3
)
knn_grid.fit(X_train, y_train)
optimal_neighbors = knn_grid.best_params_['n_neighbors']
optimal_neighbors # Will return 9

In [10]:
knn = KNeighborsClassifier(n_neighbors=9) # optimal_neighbors w/o having to run it
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=9)

In [11]:
pred_knn = knn.predict(X_test)
confusion_matrix(pred_knn, y_test)

array([[2287, 1552],
       [ 844, 1567]])

In [12]:
print(classification_report(pred_knn, y_test))

              precision    recall  f1-score   support

           0       0.73      0.60      0.66      3839
           1       0.50      0.65      0.57      2411

    accuracy                           0.62      6250
   macro avg       0.62      0.62      0.61      6250
weighted avg       0.64      0.62      0.62      6250



# Naive Bayes
~65% accuracy

In [13]:
nb = GaussianNB()

nb.fit(X_train.toarray(), y_train)

GaussianNB()

In [14]:
pred_nb = nb.predict(X_test.toarray())
confusion_matrix(pred_nb, y_test)

array([[2562, 1587],
       [ 569, 1532]])

In [15]:
print(classification_report(pred_nb, y_test))

              precision    recall  f1-score   support

           0       0.82      0.62      0.70      4149
           1       0.49      0.73      0.59      2101

    accuracy                           0.66      6250
   macro avg       0.65      0.67      0.65      6250
weighted avg       0.71      0.66      0.66      6250



# Random Forest
~85% accuracy  
Need to read up on this though, just guessed stuff

In [34]:
forest = RandomForestClassifier(max_depth=50)

forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=100)

In [35]:
pred_forest = forest.predict(X_test)
confusion_matrix(pred_forest, y_test)

array([[2604,  422],
       [ 527, 2697]])

In [36]:
print(classification_report(pred_forest, y_test))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85      3026
           1       0.86      0.84      0.85      3224

    accuracy                           0.85      6250
   macro avg       0.85      0.85      0.85      6250
weighted avg       0.85      0.85      0.85      6250

