In [29]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import random

**Improvements**

- Name Entity Recognition to remove those
- PoS, only look at e.g. adjectives?
- Other ML-modell than logreg?

# Crunch
Only run when adding new stuff to cleaner, otherwise use the CSV in the next section

In [None]:
# reading original
original = pd.read_csv('labeledTrainData.tsv', sep="\t")
original.head()

In [None]:
stops = stopwords.words("english")
porter = PorterStemmer()

def clean_text(text):
    words = word_tokenize(text)
    words_no_punc = [word.lower() for word in words if word.isalpha()]
    no_stop = [word for word in words_no_punc if word not in stops]
    stems = [porter.stem(word) for word in no_stop]
    clean = ' '.join(stems)

    return clean

original["review_clean"] = original["review"].apply(lambda text: clean_text(text))
original.to_csv('train_clean.csv', index=False)

# Exploration

In [None]:
df = pd.read_csv('train_clean.csv', usecols=["sentiment", "review", "review_clean"])
df.head(3)

In [None]:
random_nr = random.randint(0, len(df))

old = df["review"][random_nr]
new = df["review_clean"][random_nr]

print(f'Sentiment: {df["sentiment"][random_nr]}')
print(f'\nBefore ({len(old)} chars):')
print(old)
print(f'\nAfter ({len(new)} chars):')
print(new)

# Train and test

In [None]:
# Train and test data
df['random_number'] = np.random.randn(len(df.index))

train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]

len(train), len(test)

In [None]:
# Bag of Words
vectorizer = CountVectorizer()
train_matrix = vectorizer.fit_transform(train['review_clean'])
test_matrix = vectorizer.transform(test['review_clean'])

In [None]:
X_train = train_matrix
X_test = test_matrix
y_train = train['sentiment']
y_test = test['sentiment']

# Logistic regression
~87% accuracy

In [None]:
# Fitting
logreg = LogisticRegression(max_iter=1000, verbose=2)

logreg.fit(X_train,y_train)

In [None]:
pred_logreg = logreg.predict(X_test)
confusion_matrix(pred_logreg, y_test)

In [None]:
print(classification_report(pred_logreg, y_test))

# KNN
~63% accuracy

In [None]:
# getting optimal nr of neighbors (takes ~3 mins)
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(), 
    param_grid={'n_neighbors': np.arange(10,20)}, 
    verbose=2,
    cv=3
)
knn_grid.fit(X_train, y_train)
optimal_neighbors = knn_grid.best_params_['n_neighbors']
optimal_neighbors # FYI will return 19

In [None]:
knn = KNeighborsClassifier(n_neighbors=optimal_neighbors)
knn.fit(X_train, y_train)

In [None]:
pred_knn = knn.predict(X_test)
confusion_matrix(pred_knn, y_test)

In [None]:
print(classification_report(pred_knn, y_test))

# Naive Bayes
~70% accuracy

In [31]:
nb = GaussianNB()

nb.fit(X_train.toarray(), y_train)

GaussianNB()

In [34]:
pred_nb = nb.predict(X_test.toarray())
confusion_matrix(pred_nb, y_test)

array([[2130, 1316],
       [ 480, 1334]])

In [35]:
print(classification_report(pred_nb, y_test))

              precision    recall  f1-score   support

           0       0.82      0.62      0.70      3446
           1       0.50      0.74      0.60      1814

    accuracy                           0.66      5260
   macro avg       0.66      0.68      0.65      5260
weighted avg       0.71      0.66      0.67      5260

