In [4]:
import os
import math
import re
import json

import random

import gensim
from gensim.models import Word2Vec

import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer

# from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import numpy as np



In [5]:
%matplotlib inline
random.seed(1)
np.random.seed(1)

In [6]:
# w2v_model_file = 'imdb_review_w2v.model' # change each time
train_csv = 'train_df.csv'
test_csv = 'test_df.csv'

neg_bound = 4
pos_bound = 7

In [7]:
def get_vector(word):
    """Get the vector for a word"""
    try:
        return model.wv[word]
    except:
        print(word)
        raise
        
def filter_tokens(tokens):
    return [token for token in tokens if token in vocab]

In [12]:
en_stop = set(nltk.corpus.stopwords.words('english'))
stemmer = WordNetLemmatizer()

def tokenize(text):
    text = text.lower()
    # Remove non-word characters
    text = re.sub(r'[^a-z]', ' ', text)
    # Remove single letters
    text = re.sub(r'\b[a-z]{0,3}\b', ' ', text)
    # Merge multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Lemmatization
    tokens = text.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    
    return tokens

# Run once

In [91]:
def load_train_or_test(dir):
    """
    Return the negative and positive train or test data
    """
    def load_neg_or_pos(sub):
        res = []
        for file_name in os.listdir(sub):
            with open(sub + file_name, encoding='utf8') as file:
                underscore_ind = file_name.index('_')
                period_ind = file_name.index('.')
                id = int(file_name[:underscore_ind])
                rating = int(file_name[underscore_ind + 1:period_ind])
                text = next(file)
                res.append([id, rating, text])
        return res
    # Only choose more polar ratings
    neg = [[id, rating, text] for id, rating, text in load_neg_or_pos(dir + '/neg/') if rating <= neg_bound]
    pos = [[id, rating, text] for id, rating, text in load_neg_or_pos(dir + '/pos/') if rating >= pos_bound]
    random.shuffle(neg)
    random.shuffle(pos)
    both = neg[:4000] + pos[:4000]
    random.shuffle(both)
    return pd.DataFrame(both, columns=['Id', 'Rating', 'Text'])

In [92]:
train_df = load_train_or_test('./train')
test_df = load_train_or_test('./test')

In [93]:
train_df

Unnamed: 0,Id,Rating,Text
0,5597,3,I will say that at least the movie makes sense...
1,156,1,Zu Warriors most definitely should've been an ...
2,6374,10,Atlantis was much better than I had anticipate...
3,4675,4,"""Washington Square"" is a flat, shabby adaptati..."
4,11382,3,Anyone notice that Tommy only has 3 facial exp...
...,...,...,...
7995,11970,4,"...for the Lt to have chosen this one. First, ..."
7996,4646,7,I didn't know what to make of this film. I gue...
7997,10433,4,The fight scenes were great. Loved the old and...
7998,10812,8,Japanese Tomo Akiyama's Keko Mask (1993) is ex...


In [98]:
len(train_df[train_df['Rating'] <= neg_bound]), len(train_df[train_df['Rating'] >= pos_bound])

(4000, 4000)

## Process text

In [108]:
train_df['Tokens'] = train_df['Text'].apply(tokenize)

train_df['Text'] = train_df['Tokens'].apply(" ".join)

In [100]:
# # Train and save model
# model = Word2Vec(sentences=train_df['Tokens'])
# model.save(w2v_model_file)

In [101]:
# vocab = set(model.wv.key_to_index.keys())
# vocab_ord = np.array(list(model.wv.key_to_index.keys()))

In [109]:
# # Keep only tokens that showed up the required number of times
# train_df['Tokens'] = train_df['Tokens'].apply(filter_tokens)

# test_df['Tokens'] = test_df['Text'].apply(lambda text: filter_tokens(tokenize(text)))
# # Process test text too
# test_df['Text'] = test_df['Tokens'].apply(" ".join)

In [103]:
# The vectors corresponding to each reviews' words
train_df['Vectors'] = train_df['Tokens'].apply(get_vector)

In [110]:
# Save processed data
train_df.to_csv(train_csv)
test_df.to_csv(test_csv)

# Load stuff done already

In [6]:
# model = Word2Vec.load(w2v_model_file)

In [8]:
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Common stuff

In [10]:
# vocab = set(model.wv.key_to_index.keys())
# vocab_ord = np.array(list(model.wv.key_to_index.keys()))

In [41]:
y_train_bi = train_df['Rating'] > 5
y_test_bi = test_df['Rating'] > 5

# Bag of Words

In [27]:
cnt_vectorizer = CountVectorizer(min_df=5, stop_words=en_stop) # en_stop because the default has problems
X_train_bow = cnt_vectorizer.fit_transform(train_df['Text'])
X_test_bow = cnt_vectorizer.transform(test_df['Text'])

# Scale data
scaler_bow = StandardScaler(with_mean=False).fit(X_train_bow)
X_train_bow_scaled = scaler_bow.transform(X_train_bow)

In [15]:
X_train_bow.shape, len(cnt_vectorizer.vocabulary_.keys())

((8000, 12665), 12665)

In [48]:
lr_bow = LogisticRegression()
lr_bow.fit(X_train_bow_scaled, y_train_bi)

LogisticRegression()

In [54]:
bow_predicted = lr_bow.predict(X_test_bow)
cm = confusion_matrix(bow_predicted, y_test_bi)
print(f"TP: {cm[0][0]}, FN: {cm[0][1]}\nFP: {cm[1][0]}, TN: {cm[1][1]}")
print(classification_report(bow_predicted, y_test_bi))

TP: 2563, FN: 235
FP: 1437, TN: 3765
              precision    recall  f1-score   support

       False       0.64      0.92      0.75      2798
        True       0.94      0.72      0.82      5202

    accuracy                           0.79      8000
   macro avg       0.79      0.82      0.79      8000
weighted avg       0.84      0.79      0.80      8000



# TFIDF

In [29]:
tfidf_vectorizer = TfidfVectorizer(min_df=5, stop_words=en_stop) # en_stop because the default has problems
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['Text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['Text'])

In [30]:
X_train_tfidf.shape, len(tfidf_vectorizer.vocabulary_.keys())

((8000, 12665), 12665)

In [32]:
lr_tfidf = LogisticRegression()
lr_tfidf.fit(X_train_tfidf, y_train_bi)

LogisticRegression()

In [55]:
tfidf_predicted = lr_tfidf.predict(X_test_tfidf)
cm = confusion_matrix(tfidf_predicted, y_test_bi)
print(f"TP: {cm[0][0]}, FN: {cm[0][1]}\nFP: {cm[1][0]}, TN: {cm[1][1]}")
print(classification_report(tfidf_predicted, y_test_bi))

TP: 3437, FN: 562
FP: 563, TN: 3438
              precision    recall  f1-score   support

       False       0.86      0.86      0.86      3999
        True       0.86      0.86      0.86      4001

    accuracy                           0.86      8000
   macro avg       0.86      0.86      0.86      8000
weighted avg       0.86      0.86      0.86      8000

