In [None]:
# Data obtained from https://ai.stanford.edu/~amaas/data/sentiment/

# Setup

In [80]:
import os
import math
import re
import random
import json

import gensim
from gensim.models import Word2Vec

import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.decomposition import PCA, KernelPCA

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

In [2]:
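# One-time NLTK resource downloads; uncomment and run on a fresh environment.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')  # needed by WordNetLemmatizer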

# Loading data

In [50]:
def load_train_or_test(dir, limit=6000):
    """
    Load the negative and positive reviews of one dataset split.

    ``dir`` must contain ``neg`` and ``pos`` sub-directories whose files are
    named ``<id>_<rating>.<ext>`` (for example ``123_7.txt``).

    Parameters
    ----------
    dir : str
        Path of the split directory, e.g. ``'./train'``.
    limit : int or None, default 6000
        Maximum number of reviews taken from each of ``neg`` and ``pos``.
        ``None`` loads every file.

    Returns
    -------
    pandas.DataFrame
        One row per review with columns ``Id``, ``Rating`` and ``Text``;
        negative reviews come first, then positive ones.

    Raises
    ------
    ValueError
        If a file name does not follow the ``<id>_<rating>.<ext>`` pattern.
    """
    def load_neg_or_pos(sub):
        # os.listdir returns names in arbitrary order; sorting makes the
        # `limit` cut-off keep the same files on every run. Slicing before
        # reading avoids opening files that would be discarded anyway.
        selected = sorted(os.listdir(sub))[:limit]
        rows = []
        for file_name in selected:
            underscore_ind = file_name.index('_')
            period_ind = file_name.index('.')
            review_id = int(file_name[:underscore_ind])
            rating = int(file_name[underscore_ind + 1:period_ind])
            with open(os.path.join(sub, file_name), encoding='utf8') as file:
                # Read the whole review; an empty file yields '' instead of
                # raising StopIteration.
                text = file.read()
            rows.append([review_id, rating, text])
        return rows

    rows = (load_neg_or_pos(os.path.join(dir, 'neg'))
            + load_neg_or_pos(os.path.join(dir, 'pos')))
    return pd.DataFrame(rows, columns=['Id', 'Rating', 'Text'])
        

In [51]:
# Load both dataset splits with the same loader.
train_df = load_train_or_test('./train')
test_df = load_train_or_test('./test')

In [52]:
# Display the loaded training frame (columns Id, Rating, Text).
train_df

Unnamed: 0,Id,Rating,Text
0,0,3,Story of a man who has unnatural feelings for ...
1,10000,4,Airport '77 starts as a brand new luxury 747 p...
2,10001,4,This film lacked something I couldn't put my f...
3,10002,1,"Sorry everyone,,, I know this is supposed to b..."
4,10003,1,When I was little my parents took me along to ...
...,...,...,...
11995,4146,10,Anyone who lived through the ages of Revenge o...
11996,4147,8,I don't think this movie is for everyone. But ...
11997,4148,7,Let's set one thing straight: this movie does ...
11998,4149,10,Reading some of the comments on the message bo...


# Creating Word2Vec model

In [53]:
# English stop words held in a set for fast membership tests in tokenize().
en_stop = set(nltk.corpus.stopwords.words('english'))
# NOTE: despite its name, this object is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

def tokenize(text):
    """
    Turn raw review text into a list of lemmatized tokens.

    The text is lower-cased, every character outside ``a-z`` becomes a space,
    stand-alone single letters are dropped, and each remaining word is
    lemmatized with the module-level ``stemmer``. A lemma is kept only when it
    has at least four characters and is not in ``en_stop``.
    """
    lowered = text.lower()
    # Anything that is not a lowercase ASCII letter acts as a separator
    letters_only = re.sub(r'[^a-z]', ' ', lowered)
    # Drop words made of a single letter
    without_singles = re.sub(r'\b[a-z]\b', ' ', letters_only)
    collapsed = re.sub(r'\s+', ' ', without_singles)

    kept = []
    for raw_word in collapsed.split():
        # Lemmatize first: the length and stop-word checks apply to the lemma
        lemma = stemmer.lemmatize(raw_word)
        if len(lemma) < 4:
            continue
        if lemma in en_stop:
            continue
        kept.append(lemma)

    return kept

In [54]:
# Tokenize every review text and store the token lists in a 'Tokens' column.
train_df['Tokens'] = train_df['Text'].apply(tokenize)

In [55]:
# Show each review's rating next to its token list.
train_df[['Rating', 'Tokens']]

Unnamed: 0,Rating,Tokens
0,3,"[story, unnatural, feeling, start, opening, sc..."
1,4,"[airport, start, brand, luxury, plane, loaded,..."
2,4,"[film, lacked, something, finger, first, chari..."
3,1,"[sorry, everyone, know, supposed, film, handed..."
4,1,"[little, parent, took, along, theater, interio..."
...,...,...
11995,10,"[anyone, lived, revenge, nerd, girlpower, appr..."
11996,8,"[think, movie, everyone, weekend, seattle, tho..."
11997,7,"[thing, straight, movie, seek, redefine, genre..."
11998,10,"[reading, comment, message, board, expecting, ..."


In [26]:
# File the Word2Vec model is loaded from (and saved to by the training cell).
model_file = 'imdb_review_w2v.model'

In [56]:
# One-off training step: uncomment to retrain the model and overwrite model_file.
# model = Word2Vec(sentences=train_df['Tokens'])
# model.save(model_file)

In [None]:
# Reuse the saved model when it exists; otherwise train one on the tokenized
# training reviews and save it, so the notebook also runs end-to-end when
# model_file is not present yet.
if os.path.exists(model_file):
    model = Word2Vec.load(model_file)
else:
    model = Word2Vec(sentences=train_df['Tokens'].tolist())
    model.save(model_file)

In [66]:
# Words known to the model, as a set for fast membership tests.
vocab = set(model.wv.key_to_index)
# The same words as an array, in key_to_index iteration order.
vocab_ord = np.array(list(model.wv.key_to_index))

In [74]:
def get_vector(word):
    """
    Look up ``word`` in the trained model's word vectors (``model.wv``).

    Parameters
    ----------
    word : str or list of str
        Key passed straight to ``model.wv[...]``. The notebook also calls
        this function with a whole token list.

    Returns
    -------
    Whatever ``model.wv[word]`` returns for the given key(s).

    Raises
    ------
    Exception
        Any error raised by the lookup is re-raised unchanged after the
        offending key has been printed.
    """
    try:
        return model.wv[word]
    except Exception:
        # Print the key that failed to help debugging, then propagate the
        # original error. Catching Exception (not a bare except) keeps
        # KeyboardInterrupt/SystemExit from triggering the debug print.
        print(word)
        raise

In [72]:
# Keep only tokens present in the model vocabulary, i.e. those that showed up
# the required number of times
train_df['Tokens'] = train_df['Tokens'].apply(
    lambda tokens: [token for token in tokens if token in vocab]
)
# The vectors corresponding to each review's words
train_df['Vectors'] = train_df['Tokens'].apply(get_vector)

# Training