In [113]:
import os
import math
import re
import json

import random

import gensim
from gensim.models import Word2Vec

import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.datasets import make_classification
# from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import matplotlib.pyplot as plt

import numpy as np

In [2]:
%matplotlib inline
random.seed(1)
np.random.seed(1)

In [9]:
# w2v_model_file = 'imdb_review_w2v.model' # change each time
train_csv = 'train_df.csv'
test_csv = 'test_df.csv'
df_csv = 'df.csv'

neg_bound = 4
pos_bound = 7

train_size = 0.80

min_occ = 5 # The minimum number of occurrences for a word to be considered

In [4]:
def get_vector(word):
    """Get the vector for a word"""
    try:
        return model.wv[word]
    except:
        print(word)
        raise
        
def filter_tokens(tokens):
    return [token for token in tokens if token in vocab]

In [5]:
en_stop = set(nltk.corpus.stopwords.words('english'))
stemmer = WordNetLemmatizer()

def tokenize(text):
    text = text.lower()
    # Remove non-word characters
    text = re.sub(r'[^a-z]', ' ', text)
    # Remove single letters
    text = re.sub(r'\b[a-z]{0,3}\b', ' ', text)
    # Merge multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Lemmatization
    tokens = text.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    
    return tokens

# Run once

In [49]:
def load_train_or_test(dir):
    """
    Return the negative and positive train or test data
    """
    def load_neg_or_pos(sub):
        res = []
        for file_name in os.listdir(sub):
            with open(sub + file_name, encoding='utf8') as file:
                underscore_ind = file_name.index('_')
                period_ind = file_name.index('.')
                id = int(file_name[:underscore_ind])
                rating = int(file_name[underscore_ind + 1:period_ind])
                text = next(file)
                res.append([rating, text])
        return res
    # Only choose more polar ratings
    neg = [[rating, text] for rating, text in load_neg_or_pos(dir + '/neg/') if rating <= neg_bound]
    pos = [[rating, text] for rating, text in load_neg_or_pos(dir + '/pos/') if rating >= pos_bound]
    random.shuffle(neg)
    random.shuffle(pos)
    both = neg[:8000] + pos[:8000]
    random.shuffle(both)
    return pd.DataFrame(both, columns=['Rating', 'Text'])

In [50]:
df = load_train_or_test('./train') #.append(load_train_or_test('./test'))

In [51]:
df

Unnamed: 0,Rating,Text
0,1,Now i have never ever seen a bad movie in all ...
1,10,Clint Eastwood returns as Dirty Harry Calahan ...
2,9,RKO studios decided to borrow both William Pow...
3,1,"The subject notwithstanding, this is an amateu..."
4,2,I watched this movie purely for the setting. I...
...,...,...
15995,9,A riotous farce set in the world of glamorous ...
15996,8,Having first achieved fame with Drunken Master...
15997,1,...but a lousy film. As Maltin says this was C...
15998,2,"just watched it, me and my better half could n..."


In [52]:
len(df[df['Rating'] <= neg_bound]), len(df[df['Rating'] >= pos_bound])

(8000, 8000)

In [53]:
df['Tokens'] = df['Text'].apply(tokenize)
# Clean up the text too
df['Text'] = df['Tokens'].apply(" ".join)

In [54]:
# # Train and save model
# model = Word2Vec(sentences=train_df['Tokens'])
# model.save(w2v_model_file)

In [55]:
# vocab = set(model.wv.key_to_index.keys())
# vocab_ord = np.array(list(model.wv.key_to_index.keys()))

In [56]:
# # Keep only tokens that showed up the required number of times
# train_df['Tokens'] = train_df['Tokens'].apply(filter_tokens)

# test_df['Tokens'] = test_df['Text'].apply(lambda text: filter_tokens(tokenize(text)))
# # Process test text too
# test_df['Text'] = test_df['Tokens'].apply(" ".join)

In [57]:
# # The vectors corresponding to each reviews' words
# df['Vectors'] = df['Tokens'].apply(get_vector)

In [58]:
# Save processed data
df.to_csv(df_csv)

# Load stuff done already

In [6]:
# model = Word2Vec.load(w2v_model_file)

In [6]:
df = pd.read_csv(df_csv)

# Common stuff

In [10]:
# vocab = set(model.wv.key_to_index.keys())
# vocab_ord = np.array(list(model.wv.key_to_index.keys()))

In [7]:
y_bi = df['Rating'] > 5
y_train_bi, y_test_bi = train_test_split(y_bi, train_size=train_size, random_state=1)

y_bin2 = df['Rating'] // 2
y_train_bin2, y_test_bin2 = train_test_split(y_bin2, train_size=train_size, random_state=1)

In [57]:
def test_classifier(predicted, big_table=True):
    cm = confusion_matrix(predicted, y_test_bi)
    print(f"TP: {cm[0][0]}, FN: {cm[0][1]}\nFP: {cm[1][0]}, TN: {cm[1][1]}")
    if big_table:
        print(classification_report(predicted, y_test_bi))

# Logistic Regression + Bag of Words

In [32]:
def try_bow(**kwargs):
    cnt_vectorizer = CountVectorizer(stop_words=en_stop, **kwargs) # en_stop because the default has problems
    X_bow = cnt_vectorizer.fit_transform(df['Text'])
    X_train_bow, X_test_bow = train_test_split(X_bow, train_size=train_size, random_state=1)

    # Scale data
    scaler_bow = StandardScaler(with_mean=False).fit(X_train_bow)
    X_train_bow_scaled = scaler_bow.transform(X_train_bow)
    X_test_bow_scaled = scaler_bow.transform(X_test_bow)
    print(X_train_bow_scaled.shape, len(cnt_vectorizer.vocabulary_.keys()))
    
    lr_bow = LogisticRegression()
    lr_bow.fit(X_train_bow_scaled, y_train_bi)
    
    test_classifier(lr_bow.predict(X_test_bow_scaled))

In [33]:
try_bow(min_df=5, ngram_range=(1, 1)) # Just unigrams

(12800, 18626) 18626
TP: 1284, FN: 264
FP: 269, TN: 1383
              precision    recall  f1-score   support

       False       0.83      0.83      0.83      1548
        True       0.84      0.84      0.84      1652

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [34]:
try_bow(min_df=5, ngram_range=(1, 2)) # Unigrams and bigrams

(12800, 52772) 52772
TP: 1308, FN: 219
FP: 245, TN: 1428
              precision    recall  f1-score   support

       False       0.84      0.86      0.85      1527
        True       0.87      0.85      0.86      1673

    accuracy                           0.85      3200
   macro avg       0.85      0.86      0.85      3200
weighted avg       0.86      0.85      0.86      3200



In [35]:
try_bow(min_df=5, ngram_range=(2, 2)) # Just bigrams

(12800, 34146) 34146
TP: 1176, FN: 318
FP: 377, TN: 1329
              precision    recall  f1-score   support

       False       0.76      0.79      0.77      1494
        True       0.81      0.78      0.79      1706

    accuracy                           0.78      3200
   macro avg       0.78      0.78      0.78      3200
weighted avg       0.78      0.78      0.78      3200



In [36]:
try_bow(min_df=5, ngram_range=(1, 3)) # Unigrams, bigrams, and trigrams

(12800, 55247) 55247
TP: 1313, FN: 218
FP: 240, TN: 1429
              precision    recall  f1-score   support

       False       0.85      0.86      0.85      1531
        True       0.87      0.86      0.86      1669

    accuracy                           0.86      3200
   macro avg       0.86      0.86      0.86      3200
weighted avg       0.86      0.86      0.86      3200



# Logistic Regression + TFIDF

In [28]:
def make_tfidf(**kwargs):
    # en_stop because the default apparently has problems
    tfidf_vectorizer = TfidfVectorizer(stop_words=en_stop, min_df=min_occ, **kwargs)
    X_tfidf = tfidf_vectorizer.fit_transform(df['Text'])
    X_train_tfidf, X_test_tfidf = train_test_split(X_tfidf, train_size=train_size, random_state=1)
    
    print(X_train_tfidf.shape, len(tfidf_vectorizer.vocabulary_.keys()))
    
    return X_train_tfidf, X_test_tfidf

def try_tfidf(X_train_tfidf, X_test_tfidf):    
    lr_tfidf = LogisticRegression()
    lr_tfidf.fit(X_train_tfidf, y_train_bi)
    
    test_classifier(lr_tfidf.predict(X_test_tfidf))

In [14]:
X_train_tfidf_1, X_test_tfidf_1 = make_tfidf(ngram_range=(1, 1))
X_train_tfidf_1_2, X_test_tfidf_1_2 = make_tfidf(ngram_range=(1, 2))
X_train_tfidf_2, X_test_tfidf_2 = make_tfidf(ngram_range=(2, 2))

(12800, 18626) 18626
(12800, 52772) 52772
(12800, 34146) 34146


In [29]:
try_tfidf(X_train_tfidf_1, X_test_tfidf_1)

TP: 1366, FN: 196
FP: 187, TN: 1451
              precision    recall  f1-score   support

       False       0.88      0.87      0.88      1562
        True       0.88      0.89      0.88      1638

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [30]:
try_tfidf(X_train_tfidf_1_2, X_test_tfidf_1_2)

TP: 1357, FN: 194
FP: 196, TN: 1453
              precision    recall  f1-score   support

       False       0.87      0.87      0.87      1551
        True       0.88      0.88      0.88      1649

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [31]:
try_tfidf(X_train_tfidf_2, X_test_tfidf_2)

TP: 1243, FN: 243
FP: 310, TN: 1404
              precision    recall  f1-score   support

       False       0.80      0.84      0.82      1486
        True       0.85      0.82      0.84      1714

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [116]:
def try_classifier(classifier, X_train=X_train_tfidf_1, X_test=X_test_tfidf_1, big_table=True):
    classifier.fit(X_train, y_train_bi)
    
    print(classifier.score(X_test, y_test_bi))
    test_classifier(classifier.predict(X_test), big_table=big_table)

# Random forests + tf-idf

In [119]:
from sklearn.ensemble import RandomForestClassifier

In [120]:
try_classifier(RandomForestClassifier(max_depth=2, random_state=1))

0.7725
TP: 1251, FN: 426
FP: 302, TN: 1221
              precision    recall  f1-score   support

       False       0.81      0.75      0.77      1677
        True       0.74      0.80      0.77      1523

    accuracy                           0.77      3200
   macro avg       0.77      0.77      0.77      3200
weighted avg       0.77      0.77      0.77      3200



In [121]:
try_classifier(RandomForestClassifier(max_depth=3, random_state=1))

0.7865625
TP: 1244, FN: 374
FP: 309, TN: 1273
              precision    recall  f1-score   support

       False       0.80      0.77      0.78      1618
        True       0.77      0.80      0.79      1582

    accuracy                           0.79      3200
   macro avg       0.79      0.79      0.79      3200
weighted avg       0.79      0.79      0.79      3200



In [122]:
try_classifier(RandomForestClassifier(max_depth=5, random_state=1))

0.8071875
TP: 1219, FN: 283
FP: 334, TN: 1364
              precision    recall  f1-score   support

       False       0.78      0.81      0.80      1502
        True       0.83      0.80      0.82      1698

    accuracy                           0.81      3200
   macro avg       0.81      0.81      0.81      3200
weighted avg       0.81      0.81      0.81      3200



In [123]:
try_classifier(RandomForestClassifier(max_depth=5, min_samples_leaf=5, random_state=1))

0.804375
TP: 1208, FN: 281
FP: 345, TN: 1366
              precision    recall  f1-score   support

       False       0.78      0.81      0.79      1489
        True       0.83      0.80      0.81      1711

    accuracy                           0.80      3200
   macro avg       0.80      0.80      0.80      3200
weighted avg       0.81      0.80      0.80      3200



In [124]:
try_classifier(RandomForestClassifier(max_depth=7, min_samples_leaf=5, n_estimators=200, random_state=1))

0.8234375
TP: 1241, FN: 253
FP: 312, TN: 1394
              precision    recall  f1-score   support

       False       0.80      0.83      0.81      1494
        True       0.85      0.82      0.83      1706

    accuracy                           0.82      3200
   macro avg       0.82      0.82      0.82      3200
weighted avg       0.82      0.82      0.82      3200



In [125]:
try_classifier(RandomForestClassifier(max_depth=9, min_samples_leaf=5, n_estimators=200, random_state=1))

0.8303125
TP: 1238, FN: 228
FP: 315, TN: 1419
              precision    recall  f1-score   support

       False       0.80      0.84      0.82      1466
        True       0.86      0.82      0.84      1734

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [126]:
try_classifier(RandomForestClassifier(max_depth=15, min_samples_leaf=5, n_estimators=300, random_state=1))

0.84125
TP: 1259, FN: 214
FP: 294, TN: 1433
              precision    recall  f1-score   support

       False       0.81      0.85      0.83      1473
        True       0.87      0.83      0.85      1727

    accuracy                           0.84      3200
   macro avg       0.84      0.84      0.84      3200
weighted avg       0.84      0.84      0.84      3200



In [127]:
try_classifier(RandomForestClassifier(max_depth=15, min_samples_leaf=10, n_estimators=300, random_state=2))

0.8440625
TP: 1274, FN: 220
FP: 279, TN: 1427
              precision    recall  f1-score   support

       False       0.82      0.85      0.84      1494
        True       0.87      0.84      0.85      1706

    accuracy                           0.84      3200
   macro avg       0.84      0.84      0.84      3200
weighted avg       0.84      0.84      0.84      3200



In [128]:
try_classifier(RandomForestClassifier(max_depth=15, min_samples_leaf=15, n_estimators=300, random_state=2))

0.8428125
TP: 1275, FN: 225
FP: 278, TN: 1422
              precision    recall  f1-score   support

       False       0.82      0.85      0.84      1500
        True       0.86      0.84      0.85      1700

    accuracy                           0.84      3200
   macro avg       0.84      0.84      0.84      3200
weighted avg       0.84      0.84      0.84      3200



In [142]:
try_classifier(RandomForestClassifier(max_depth=16, n_estimators=400, random_state=2), big_table=False)

0.845
TP: 1281, FN: 224
FP: 272, TN: 1423


In [143]:
try_classifier(RandomForestClassifier(max_depth=20, n_estimators=600, random_state=1), big_table=False)

0.8496875
TP: 1299, FN: 227
FP: 254, TN: 1420


In [141]:
try_classifier(RandomForestClassifier(max_depth=20, n_estimators=800, random_state=1), big_table=False)

0.8471875
TP: 1288, FN: 224
FP: 265, TN: 1423


# SVM + tf-idf

In [85]:
def try_svm(svm, X_train=X_train_tfidf_1, X_test=X_test_tfidf_1):
    svm.fit(X_train, y_train_bi)
    
    test_classifier(svm.predict(X_test))

In [86]:
try_svm(SVC(kernel='linear', random_state=1))

TP: 1370, FN: 209
FP: 183, TN: 1438
              precision    recall  f1-score   support

       False       0.88      0.87      0.87      1579
        True       0.87      0.89      0.88      1621

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [87]:
try_svm(SVC(kernel='rbf', random_state=1))

TP: 1368, FN: 189
FP: 185, TN: 1458
              precision    recall  f1-score   support

       False       0.88      0.88      0.88      1557
        True       0.89      0.89      0.89      1643

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [111]:
try_svm(SVC(kernel='poly', degree=2, random_state=1))

TP: 1348, FN: 188
FP: 205, TN: 1459
              precision    recall  f1-score   support

       False       0.87      0.88      0.87      1536
        True       0.89      0.88      0.88      1664

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [110]:
# try_svm(SVC(kernel='poly', degree=3, max_iter=10, random_state=1))

In [112]:
try_svm(SVC(kernel='sigmoid', random_state=1))

TP: 1371, FN: 211
FP: 182, TN: 1436
              precision    recall  f1-score   support

       False       0.88      0.87      0.87      1582
        True       0.87      0.89      0.88      1618

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [105]:
try_svm(SVC(kernel='linear', max_iter=1000, random_state=1))



TP: 1272, FN: 277
FP: 281, TN: 1370
              precision    recall  f1-score   support

       False       0.82      0.82      0.82      1549
        True       0.83      0.83      0.83      1651

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [106]:
try_svm(SVC(kernel='rbf', max_iter=1000, random_state=1))



TP: 1301, FN: 318
FP: 252, TN: 1329
              precision    recall  f1-score   support

       False       0.84      0.80      0.82      1619
        True       0.81      0.84      0.82      1581

    accuracy                           0.82      3200
   macro avg       0.82      0.82      0.82      3200
weighted avg       0.82      0.82      0.82      3200



In [107]:
try_svm(SVC(kernel='poly', degree=2, max_iter=1000, random_state=1))



TP: 1302, FN: 334
FP: 251, TN: 1313
              precision    recall  f1-score   support

       False       0.84      0.80      0.82      1636
        True       0.80      0.84      0.82      1564

    accuracy                           0.82      3200
   macro avg       0.82      0.82      0.82      3200
weighted avg       0.82      0.82      0.82      3200



In [108]:
try_svm(SVC(kernel='poly', degree=3, max_iter=1000, random_state=1))



TP: 928, FN: 162
FP: 625, TN: 1485
              precision    recall  f1-score   support

       False       0.60      0.85      0.70      1090
        True       0.90      0.70      0.79      2110

    accuracy                           0.75      3200
   macro avg       0.75      0.78      0.75      3200
weighted avg       0.80      0.75      0.76      3200



In [109]:
try_svm(SVC(kernel='sigmoid', max_iter=1000, random_state=1))



TP: 1264, FN: 267
FP: 289, TN: 1380
              precision    recall  f1-score   support

       False       0.81      0.83      0.82      1531
        True       0.84      0.83      0.83      1669

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



# Extra Trees + tf-idf

In [130]:
from sklearn.ensemble import ExtraTreesClassifier

In [131]:
try_classifier(ExtraTreesClassifier(max_depth=5, n_estimators=100, random_state=1))

0.78125
TP: 1355, FN: 502
FP: 198, TN: 1145
              precision    recall  f1-score   support

       False       0.87      0.73      0.79      1857
        True       0.70      0.85      0.77      1343

    accuracy                           0.78      3200
   macro avg       0.78      0.79      0.78      3200
weighted avg       0.80      0.78      0.78      3200



In [132]:
try_classifier(ExtraTreesClassifier(max_depth=10, n_estimators=100, random_state=1))

0.825
TP: 1301, FN: 308
FP: 252, TN: 1339
              precision    recall  f1-score   support

       False       0.84      0.81      0.82      1609
        True       0.81      0.84      0.83      1591

    accuracy                           0.82      3200
   macro avg       0.83      0.83      0.82      3200
weighted avg       0.83      0.82      0.82      3200



In [133]:
try_classifier(ExtraTreesClassifier(max_depth=15, n_estimators=100, random_state=1))

0.829375
TP: 1284, FN: 277
FP: 269, TN: 1370
              precision    recall  f1-score   support

       False       0.83      0.82      0.82      1561
        True       0.83      0.84      0.83      1639

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [134]:
try_classifier(ExtraTreesClassifier(max_depth=5, n_estimators=200, random_state=1))

0.805625
TP: 1394, FN: 463
FP: 159, TN: 1184
              precision    recall  f1-score   support

       False       0.90      0.75      0.82      1857
        True       0.72      0.88      0.79      1343

    accuracy                           0.81      3200
   macro avg       0.81      0.82      0.80      3200
weighted avg       0.82      0.81      0.81      3200



In [135]:
try_classifier(ExtraTreesClassifier(max_depth=5, n_estimators=500, random_state=1))

0.8178125
TP: 1409, FN: 439
FP: 144, TN: 1208
              precision    recall  f1-score   support

       False       0.91      0.76      0.83      1848
        True       0.73      0.89      0.81      1352

    accuracy                           0.82      3200
   macro avg       0.82      0.83      0.82      3200
weighted avg       0.83      0.82      0.82      3200



In [137]:
try_classifier(ExtraTreesClassifier(max_depth=10, n_estimators=200, random_state=1))

0.8375
TP: 1339, FN: 306
FP: 214, TN: 1341
              precision    recall  f1-score   support

       False       0.86      0.81      0.84      1645
        True       0.81      0.86      0.84      1555

    accuracy                           0.84      3200
   macro avg       0.84      0.84      0.84      3200
weighted avg       0.84      0.84      0.84      3200



In [138]:
try_classifier(ExtraTreesClassifier(max_depth=15, n_estimators=400, random_state=1))

0.8521875
TP: 1333, FN: 253
FP: 220, TN: 1394
              precision    recall  f1-score   support

       False       0.86      0.84      0.85      1586
        True       0.85      0.86      0.85      1614

    accuracy                           0.85      3200
   macro avg       0.85      0.85      0.85      3200
weighted avg       0.85      0.85      0.85      3200



In [139]:
try_classifier(ExtraTreesClassifier(max_depth=15, n_estimators=500, random_state=1))

0.8559375
TP: 1339, FN: 247
FP: 214, TN: 1400
              precision    recall  f1-score   support

       False       0.86      0.84      0.85      1586
        True       0.85      0.87      0.86      1614

    accuracy                           0.86      3200
   macro avg       0.86      0.86      0.86      3200
weighted avg       0.86      0.86      0.86      3200



In [140]:
try_classifier(ExtraTreesClassifier(max_depth=20, n_estimators=800, random_state=1))

0.861875
TP: 1335, FN: 224
FP: 218, TN: 1423
              precision    recall  f1-score   support

       False       0.86      0.86      0.86      1559
        True       0.86      0.87      0.87      1641

    accuracy                           0.86      3200
   macro avg       0.86      0.86      0.86      3200
weighted avg       0.86      0.86      0.86      3200

