In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import random

**Improvements**

- Name Entity Recognition to remove those
- PoS, only look at e.g. adjectives?
- Other ML-modell than logreg?

# Crunch
Only run when adding new stuff to cleaner, otherwise use the CSV in the next section

In [None]:
# reading original
original = pd.read_csv('labeledTrainData.tsv', sep="\t")
original.head()

In [None]:
stops = stopwords.words("english")
porter = PorterStemmer()

def clean_text(text):
    words = word_tokenize(text)
    words_no_punc = [word.lower() for word in words if word.isalpha()]
    no_stop = [word for word in words_no_punc if word not in stops]
    stems = [porter.stem(word) for word in no_stop]
    clean = ' '.join(stems)

    return clean

original["review_clean"] = original["review"].apply(lambda text: clean_text(text))
original.to_csv('train_clean.csv', index=False)

# Exploration

In [2]:
df = pd.read_csv('train_clean.csv', usecols=["sentiment", "review", "review_clean"])

In [3]:
random_nr = random.randint(0, len(df))

old = df["review"][random_nr]
new = df["review_clean"][random_nr]

print(f'Sentiment: {df["sentiment"][random_nr]}')
print(f'\nBefore ({len(old)} chars):')
print(old[:200])
print(f'\nAfter ({len(new)} chars):')
print(new[:200])

Sentiment: 0

Before (1214 chars):
Seriously, the fact that this show is so popular just boggles the mind. This show isn't funny, it isn't clever, it isn't original, it's just a steaming pile of bull crap. Let me start with the charact

After (627 chars):
serious fact show popular boggl mind show funni clever origin steam pile bull crap let start charact charact moron loud exagger voic sound like fingernail blackboard voic act could better anim god hur


# Train and test

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df['review_clean'], df['sentiment'], test_size=0.2)

In [5]:
# Bag of Words
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Logistic regression
~87% accuracy

In [6]:
# Fitting
logreg = LogisticRegression(max_iter=1000, verbose=2)

logreg.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.1s finished


LogisticRegression(max_iter=1000, verbose=2)

In [7]:
pred_logreg = logreg.predict(X_test)
confusion_matrix(pred_logreg, y_test)

array([[2111,  301],
       [ 381, 2207]])

In [8]:
print(classification_report(pred_logreg, y_test))

              precision    recall  f1-score   support

           0       0.85      0.88      0.86      2412
           1       0.88      0.85      0.87      2588

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



# KNN
~63% accuracy

In [None]:
# getting optimal nr of neighbors (takes ~2 mins)
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(), 
    param_grid={'n_neighbors': np.arange(3,11)}, # surely not less than 3, 11 as max due to time consumption
    verbose=2,
    cv=3
)
knn_grid.fit(X_train, y_train)
optimal_neighbors = knn_grid.best_params_['n_neighbors']
optimal_neighbors # Will return 9

In [9]:
knn = KNeighborsClassifier(n_neighbors=9) # optimal_neighbors w/o having to run it
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=9)

In [10]:
pred_knn = knn.predict(X_test)
confusion_matrix(pred_knn, y_test)

array([[1503,  817],
       [ 989, 1691]])

In [11]:
print(classification_report(pred_knn, y_test))

              precision    recall  f1-score   support

           0       0.60      0.65      0.62      2320
           1       0.67      0.63      0.65      2680

    accuracy                           0.64      5000
   macro avg       0.64      0.64      0.64      5000
weighted avg       0.64      0.64      0.64      5000



# Naive Bayes
~65% accuracy

In [12]:
nb = GaussianNB()

nb.fit(X_train.toarray(), y_train)

GaussianNB()

In [13]:
pred_nb = nb.predict(X_test.toarray())
confusion_matrix(pred_nb, y_test)

array([[2004, 1265],
       [ 488, 1243]])

In [14]:
print(classification_report(pred_nb, y_test))

              precision    recall  f1-score   support

           0       0.80      0.61      0.70      3269
           1       0.50      0.72      0.59      1731

    accuracy                           0.65      5000
   macro avg       0.65      0.67      0.64      5000
weighted avg       0.70      0.65      0.66      5000



# Random Forest
~85% accuracy

In [15]:
forest = RandomForestClassifier(max_depth=50)

forest.fit(X_train, y_train)

RandomForestClassifier(max_depth=50)

In [16]:
pred_forest = forest.predict(X_test)
confusion_matrix(pred_forest, y_test)

array([[2070,  343],
       [ 422, 2165]])

In [17]:
print(classification_report(pred_forest, y_test))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84      2413
           1       0.86      0.84      0.85      2587

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

