In [1]:
import os
import math
import re
import json

import random

import gensim
from gensim.models import Word2Vec

import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer

# from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import numpy as np



In [2]:
%matplotlib inline
random.seed(1)
np.random.seed(1)

In [20]:
# w2v_model_file = 'imdb_review_w2v.model' # change each time
train_csv = 'train_df.csv'
test_csv = 'test_df.csv'
df_csv = 'df.csv'

neg_bound = 4
pos_bound = 7

train_size = 0.80

In [4]:
def get_vector(word):
    """Get the vector for a word"""
    try:
        return model.wv[word]
    except:
        print(word)
        raise
        
def filter_tokens(tokens):
    return [token for token in tokens if token in vocab]

In [5]:
en_stop = set(nltk.corpus.stopwords.words('english'))
stemmer = WordNetLemmatizer()

def tokenize(text):
    text = text.lower()
    # Remove non-word characters
    text = re.sub(r'[^a-z]', ' ', text)
    # Remove single letters
    text = re.sub(r'\b[a-z]{0,3}\b', ' ', text)
    # Merge multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Lemmatization
    tokens = text.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    
    return tokens

# Run once

In [49]:
def load_train_or_test(dir):
    """
    Return the negative and positive train or test data
    """
    def load_neg_or_pos(sub):
        res = []
        for file_name in os.listdir(sub):
            with open(sub + file_name, encoding='utf8') as file:
                underscore_ind = file_name.index('_')
                period_ind = file_name.index('.')
                id = int(file_name[:underscore_ind])
                rating = int(file_name[underscore_ind + 1:period_ind])
                text = next(file)
                res.append([rating, text])
        return res
    # Only choose more polar ratings
    neg = [[rating, text] for rating, text in load_neg_or_pos(dir + '/neg/') if rating <= neg_bound]
    pos = [[rating, text] for rating, text in load_neg_or_pos(dir + '/pos/') if rating >= pos_bound]
    random.shuffle(neg)
    random.shuffle(pos)
    both = neg[:8000] + pos[:8000]
    random.shuffle(both)
    return pd.DataFrame(both, columns=['Rating', 'Text'])

In [50]:
df = load_train_or_test('./train') #.append(load_train_or_test('./test'))

In [51]:
df

Unnamed: 0,Rating,Text
0,1,Now i have never ever seen a bad movie in all ...
1,10,Clint Eastwood returns as Dirty Harry Calahan ...
2,9,RKO studios decided to borrow both William Pow...
3,1,"The subject notwithstanding, this is an amateu..."
4,2,I watched this movie purely for the setting. I...
...,...,...
15995,9,A riotous farce set in the world of glamorous ...
15996,8,Having first achieved fame with Drunken Master...
15997,1,...but a lousy film. As Maltin says this was C...
15998,2,"just watched it, me and my better half could n..."


In [52]:
len(df[df['Rating'] <= neg_bound]), len(df[df['Rating'] >= pos_bound])

(8000, 8000)

## Process text

In [53]:
df['Tokens'] = df['Text'].apply(tokenize)
# Clean up the text too
df['Text'] = df['Tokens'].apply(" ".join)

In [54]:
# # Train and save model
# model = Word2Vec(sentences=train_df['Tokens'])
# model.save(w2v_model_file)

In [55]:
# vocab = set(model.wv.key_to_index.keys())
# vocab_ord = np.array(list(model.wv.key_to_index.keys()))

In [56]:
# # Keep only tokens that showed up the required number of times
# train_df['Tokens'] = train_df['Tokens'].apply(filter_tokens)

# test_df['Tokens'] = test_df['Text'].apply(lambda text: filter_tokens(tokenize(text)))
# # Process test text too
# test_df['Text'] = test_df['Tokens'].apply(" ".join)

In [57]:
# # The vectors corresponding to each reviews' words
# df['Vectors'] = df['Tokens'].apply(get_vector)

In [58]:
# Save processed data
df.to_csv(df_csv)

# Load stuff done already

In [6]:
# model = Word2Vec.load(w2v_model_file)

In [8]:
df = pd.read_csv(df_csv)

# Common stuff

In [10]:
# vocab = set(model.wv.key_to_index.keys())
# vocab_ord = np.array(list(model.wv.key_to_index.keys()))

In [59]:
y_bi = df['Rating'] > 5
y_train_bi, y_test_bi = train_test_split(y_bi, train_size=train_size, random_state=1)

# Logistic Regression + Bag of Words

In [72]:
def try_bow(**kwargs):
    cnt_vectorizer = CountVectorizer(stop_words=en_stop, **kwargs) # en_stop because the default has problems
    X_bow = cnt_vectorizer.fit_transform(df['Text'])
    X_train_bow, X_test_bow = train_test_split(X_bow, train_size=train_size, random_state=1)

    # Scale data
    scaler_bow = StandardScaler(with_mean=False).fit(X_train_bow)
    X_train_bow_scaled = scaler_bow.transform(X_train_bow)
    X_test_bow_scaled = scaler_bow.transform(X_test_bow)
    print(X_train_bow_scaled.shape, len(cnt_vectorizer.vocabulary_.keys()))
    
    lr_bow = LogisticRegression()
    lr_bow.fit(X_train_bow_scaled, y_train_bi)

    bow_predicted = lr_bow.predict(X_test_bow_scaled)
    cm = confusion_matrix(bow_predicted, y_test_bi)
    print(f"TP: {cm[0][0]}, FN: {cm[0][1]}\nFP: {cm[1][0]}, TN: {cm[1][1]}")
    print(classification_report(bow_predicted, y_test_bi))

In [73]:
try_bow(min_df=5, ngram_range=(1, 1)) # Just unigrams

(12800, 18626) 18626
TP: 1284, FN: 264
FP: 269, TN: 1383
              precision    recall  f1-score   support

       False       0.83      0.83      0.83      1548
        True       0.84      0.84      0.84      1652

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [74]:
try_bow(min_df=5, ngram_range=(2, 2)) # Just bigrams

(12800, 34146) 34146
TP: 1176, FN: 318
FP: 377, TN: 1329
              precision    recall  f1-score   support

       False       0.76      0.79      0.77      1494
        True       0.81      0.78      0.79      1706

    accuracy                           0.78      3200
   macro avg       0.78      0.78      0.78      3200
weighted avg       0.78      0.78      0.78      3200



In [75]:
try_bow(min_df=5, ngram_range=(1, 2)) # Unigrams and bigrams

(12800, 52772) 52772
TP: 1308, FN: 219
FP: 245, TN: 1428
              precision    recall  f1-score   support

       False       0.84      0.86      0.85      1527
        True       0.87      0.85      0.86      1673

    accuracy                           0.85      3200
   macro avg       0.85      0.86      0.85      3200
weighted avg       0.86      0.85      0.86      3200



In [76]:
try_bow(min_df=5, ngram_range=(1, 3)) # Unigrams, bigrams, and trigrams

(12800, 55247) 55247
TP: 1313, FN: 218
FP: 240, TN: 1429
              precision    recall  f1-score   support

       False       0.85      0.86      0.85      1531
        True       0.87      0.86      0.86      1669

    accuracy                           0.86      3200
   macro avg       0.86      0.86      0.86      3200
weighted avg       0.86      0.86      0.86      3200



# Logistic Regression + TFIDF

In [78]:
def try_tfidf(**kwargs):
    tfidf_vectorizer = TfidfVectorizer(stop_words=en_stop, **kwargs) # en_stop because the default has problems
    X_tfidf = tfidf_vectorizer.fit_transform(df['Text'])
    X_train_tfidf, X_test_tfidf = train_test_split(X_tfidf, train_size=train_size, random_state=1)

    print(X_train_tfidf.shape, len(tfidf_vectorizer.vocabulary_.keys()))
    
    lr_tfidf = LogisticRegression()
    lr_tfidf.fit(X_train_tfidf, y_train_bi)
    
    # Test
    tfidf_predicted = lr_tfidf.predict(X_test_tfidf)
    cm = confusion_matrix(tfidf_predicted, y_test_bi)
    print(f"TP: {cm[0][0]}, FN: {cm[0][1]}\nFP: {cm[1][0]}, TN: {cm[1][1]}")
    print(classification_report(tfidf_predicted, y_test_bi))

In [79]:
try_tfidf(ngram_range=(1, 1))

(12800, 53018) 53018
TP: 1365, FN: 194
FP: 188, TN: 1453
              precision    recall  f1-score   support

       False       0.88      0.88      0.88      1559
        True       0.88      0.89      0.88      1641

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [80]:
try_tfidf(ngram_range=(1, 2))

(12800, 1144585) 1144585
TP: 1341, FN: 195
FP: 212, TN: 1452
              precision    recall  f1-score   support

       False       0.86      0.87      0.87      1536
        True       0.88      0.87      0.88      1664

    accuracy                           0.87      3200
   macro avg       0.87      0.87      0.87      3200
weighted avg       0.87      0.87      0.87      3200



In [81]:
try_tfidf(ngram_range=(2, 2))

(12800, 1091567) 1091567
TP: 1207, FN: 202
FP: 346, TN: 1445
              precision    recall  f1-score   support

       False       0.78      0.86      0.81      1409
        True       0.88      0.81      0.84      1791

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



# Random forests