# Movie Review Classification with Word Embeddings (Feb 2018)

In [1]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import glob
import seaborn as sns
import spacy
import string
import re
import nltk
from spacy.symbols import ORTH
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

## First look at the data

In [2]:
# Root folder of the dataset, relative to the working directory. The trailing
# slash matters: the loading cells append the split name directly (f'{path}train').
path = "aclImdb/"
# Class sub-folder names; a review's integer label is its folder's index in this list.
names = ['neg','pos']

In [3]:
!head $path/train/pos/0_9.txt

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!

In [79]:
# Read every training review into memory. The label of a review is the index
# of its class folder in `names`.
texts,labels = [],[]
for idx,label in enumerate(names):
    # glob returns matches in arbitrary order; sorting makes the row order
    # (and therefore anything indexed or subsampled by row) reproducible.
    for fname in sorted(glob.glob(os.path.join(f'{path}train', label, '*.*'))):
        # The context manager closes each handle promptly instead of leaking
        # thousands of open files; UTF-8 is given explicitly so decoding does
        # not depend on the platform's default encoding.
        with open(fname, 'r', encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(idx)
# trn: list of raw review strings; trn_y: int64 label array aligned with trn.
trn,trn_y= texts, np.array(labels).astype(np.int64)

In [80]:
# Read every test review into memory. The label of a review is the index of
# its class folder in `names`.
texts,labels = [],[]
for idx,label in enumerate(names):
    # glob returns matches in arbitrary order; sorting makes the row order
    # reproducible across machines and runs.
    for fname in sorted(glob.glob(os.path.join(f'{path}test', label, '*.*'))):
        # The context manager closes each handle promptly; UTF-8 is given
        # explicitly so decoding does not depend on the platform default.
        with open(fname, 'r', encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(idx)
# test: list of raw review strings; test_y: int64 label array aligned with test.
test,test_y= texts, np.array(labels).astype(np.int64)

In [6]:
trn[0]

"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly."

In [7]:
trn_y[0]

0

## 1. Use the spaCy library to tokenize the data

In [24]:
# borrowed from fast.ai (https://github.com/fastai/fastai/blob/master/fastai/nlp.py)
# Case-insensitive pattern for HTML line-break tags such as <br>, <BR/> or < br />.
re_br = re.compile(r'<\s*br\s*/?>', flags=re.IGNORECASE)

def sub_br(x):
    """Return `x` with every HTML <br> tag replaced by a newline character."""
    newline = "\n"
    return re_br.sub(newline, x)

# Load the English pipeline by its full package name: the 'en' shortcut link
# was removed in spaCy 3, whereas 'en_core_web_sm' loads under spaCy 2 and 3.
my_tok = spacy.load('en_core_web_sm')
def spacy_tok(x):
    """Tokenize `x` with spaCy's tokenizer and return the token strings.

    HTML <br> tags are first replaced with newlines via `sub_br`. Only the
    tokenizer component of the loaded pipeline is invoked.
    """
    return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [25]:
row = trn[2]
spacy_tok(row.lower())

['this',
 'film',
 'lacked',
 'something',
 'i',
 'could',
 "n't",
 'put',
 'my',
 'finger',
 'on',
 'at',
 'first',
 ':',
 'charisma',
 'on',
 'the',
 'part',
 'of',
 'the',
 'leading',
 'actress',
 '.',
 'this',
 'inevitably',
 'translated',
 'to',
 'lack',
 'of',
 'chemistry',
 'when',
 'she',
 'shared',
 'the',
 'screen',
 'with',
 'her',
 'leading',
 'man',
 '.',
 'even',
 'the',
 'romantic',
 'scenes',
 'came',
 'across',
 'as',
 'being',
 'merely',
 'the',
 'actors',
 'at',
 'play',
 '.',
 'it',
 'could',
 'very',
 'well',
 'have',
 'been',
 'the',
 'director',
 'who',
 'miscalculated',
 'what',
 'he',
 'needed',
 'from',
 'the',
 'actors',
 '.',
 'i',
 'just',
 'do',
 "n't",
 'know',
 '.',
 '\n\n',
 'but',
 'could',
 'it',
 'have',
 'been',
 'the',
 'screenplay',
 '?',
 'just',
 'exactly',
 'who',
 'was',
 'the',
 'chef',
 'in',
 'love',
 'with',
 '?',
 'he',
 'seemed',
 'more',
 'enamored',
 'of',
 'his',
 'culinary',
 'skills',
 'and',
 'restaurant',
 ',',
 'and',
 'ultimatel

## 2. Get Non-Stopwords

In [26]:
# get stop words
# Make sure NLTK's stopword corpus is available locally (the download call
# reports "already up-to-date" when it is), then build a set of the English
# stopwords so membership tests are O(1).
nltk.download('stopwords')
from nltk.corpus import stopwords
stops=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/ty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# modified from https://www.kaggle.com/anokas/data-analysis-xgboost-starter-0-35460-lb
def get_non_stopwords(review):
    """Return the distinct non-stopword tokens of `review`.

    The review is converted to `str`, lower-cased and tokenized with
    `spacy_tok`; tokens present in `stops` are dropped. Each remaining token
    appears once, in first-seen order (dict insertion order). The result is a
    `dict_keys` view, not a list. Punctuation tokens are not removed here.
    """
    tokens = spacy_tok(str(review).lower())
    unique_kept = dict.fromkeys(tok for tok in tokens if tok not in stops)
    return unique_kept.keys()

In [31]:
get_non_stopwords(trn[0])

dict_keys(['story', 'man', 'unnatural', 'feelings', 'pig', '.', 'starts', 'opening', 'scene', 'terrific', 'example', 'absurd', 'comedy', 'formal', 'orchestra', 'audience', 'turned', 'insane', ',', 'violent', 'mob', 'crazy', 'chantings', "'s", 'singers', 'unfortunately', 'stays', 'whole', 'time', 'general', 'narrative', 'eventually', 'making', 'putting', 'even', 'era', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'third', 'grader', 'technical', 'level', 'better', 'might', 'think', 'good', 'cinematography', 'future', 'great', 'vilmos', 'zsigmond', 'stars', 'sally', 'kirkland', 'frederic', 'forrest', 'seen', 'briefly'])

## 3. Read the 300-dimensional GloVe embeddings into a dictionary.
GloVe embeddings: https://nlp.stanford.edu/projects/glove/

In [70]:
# After downloading:
# Path to the 300-dimensional GloVe vectors text file, relative to the working directory.
# NOTE(review): the name is spelled "globe" but refers to GloVe; it is kept
# as-is because load_word_embeddings uses it as its default argument.
globe_path = "glove.6B.300d.txt"

In [35]:
def load_word_embeddings(file=globe_path):
    """Read a GloVe-style text file into a dict mapping word -> vector.

    Each non-empty line is expected to hold a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    file : str
        Path to the embeddings text file. Defaults to `globe_path`.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D `np.ndarray` of dtype float32. If a
        token occurs on several lines, the last occurrence wins.
    """
    embeddings={}
    # Decode explicitly as UTF-8 instead of the platform default encoding,
    # which can raise or mis-decode non-ASCII tokens on some systems.
    with open(file,'r',encoding='utf-8') as infile:
        for line in infile:
            values=line.split()
            if not values:
                # Blank line (e.g. a stray trailing newline): nothing to parse.
                continue
            embeddings[values[0]]=np.asarray(values[1:],dtype='float32')
    return embeddings

In [36]:
embeddings = load_word_embeddings()

In [37]:
len(embeddings.keys())

400000

## 4. Create average feature embedding for each sentence (stopwords ignored).

In [71]:
def sentence_features_v2(s, embeddings=embeddings,emb_size=300):
    """Average the embedding vectors of the usable words in `s`.

    A word is usable when it survives `get_non_stopwords`, is purely
    alphabetic (`str.isalpha`) and has an entry in `embeddings`. Because
    `get_non_stopwords` yields each token once, repeated words contribute a
    single vector to the mean.

    Parameters
    ----------
    s : text to featurize (passed straight to `get_non_stopwords`).
    embeddings : mapping of word -> vector.
    emb_size : length of the zero vector returned when no word is usable.

    Returns
    -------
    np.ndarray
        The element-wise mean of the selected vectors, or `np.zeros(emb_size)`
        when there are none.
    """
    # ignore stop words
    vectors = [
        embeddings[tok]
        for tok in get_non_stopwords(s)
        if tok.isalpha() and tok in embeddings
    ]
    if not vectors:
        return np.zeros(emb_size)
    return np.array(vectors).mean(axis=0)

In [40]:
w = sentence_features_v2(trn[0])
w.shape

(300,)

In [66]:
# create sentence vectors
x_train = np.array([sentence_features_v2(x) for x in trn])
x_test = np.array([sentence_features_v2(x) for x in test])

In [67]:
x_train.shape, x_test.shape

((25000, 300), (25000, 300))

In [95]:
trn_y.shape, test_y.shape

((25000,), (25000,))

## 5. Fit an XGBoost classifier

In [69]:
# Booster hyper-parameters: binary logistic objective evaluated with log-loss.
xgb_pars = {"min_child_weight": 50, "eta": 0.05, "max_depth": 8,
            "subsample": 0.8, "silent" : 1, "nthread": 4,
            "eval_metric": "logloss", "objective": "binary:logistic"}

# Wrap the averaged-embedding features and labels in XGBoost's DMatrix container.
d_train = xgb.DMatrix(x_train, label=trn_y)
# NOTE(review): the "validation" matrix is built from the test split
# (x_test/test_y), so early stopping is tuned on the same data the final
# valid-logloss is reported on -- it is not an untouched hold-out estimate.
d_val = xgb.DMatrix(x_test, label=test_y)

# Both sets are evaluated each round; per the training log, 'valid-logloss'
# is the metric used for early stopping.
watchlist = [(d_train, 'train'), (d_val, 'valid')]

# Up to 400 boosting rounds, logging every 50 and stopping if the validation
# log-loss has not improved for 50 consecutive rounds.
bst = xgb.train(xgb_pars, d_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.679532	valid-logloss:0.681121
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.410523	valid-logloss:0.471239
[100]	train-logloss:0.327481	valid-logloss:0.421985
[150]	train-logloss:0.282652	valid-logloss:0.401021
[200]	train-logloss:0.252722	valid-logloss:0.390152
[250]	train-logloss:0.231107	valid-logloss:0.383067
[300]	train-logloss:0.213928	valid-logloss:0.378596
[350]	train-logloss:0.199011	valid-logloss:0.375209
[399]	train-logloss:0.187425	valid-logloss:0.373045


## 6. Fitting XGBoost to a one-hot encoding representation of the data with bag of words.

In [87]:
# Bag-of-words vectorizer: tokenizes with NLTK's word_tokenize and filters
# scikit-learn's built-in English stop-word list.
# NOTE(review): with CountVectorizer's default binary=False the features are
# term counts, not 0/1 indicators -- confirm this matches the "one-hot" wording
# of this section, or pass binary=True.
veczr = CountVectorizer(tokenizer=nltk.word_tokenize,stop_words='english')

In [107]:
# Learn the vocabulary from the training reviews only, then reuse that same
# vocabulary for the test reviews so both matrices share the same columns.
trn_term_doc=veczr.fit_transform(trn)
test_term_doc=veczr.transform(test)

In [108]:
trn_term_doc.shape,test_term_doc.shape, trn_y.shape, test_y.shape

((25000, 114215), (25000, 114215), (25000,), (25000,))

In [109]:
# Same hyper-parameters as the embedding run, so the two feature sets are
# compared under identical booster settings.
# NOTE(review): this cell rebinds xgb_pars, d_train, d_val, watchlist and bst,
# so the booster trained on the embedding features is no longer reachable afterwards.
xgb_pars = {"min_child_weight": 50, "eta": 0.05, "max_depth": 8,
            "subsample": 0.8, "silent" : 1, "nthread": 4,
            "eval_metric": "logloss", "objective": "binary:logistic"}

# Wrap the term-document matrices and labels in XGBoost's DMatrix container.
d_train = xgb.DMatrix(trn_term_doc, label=trn_y)
# NOTE(review): the "validation" matrix is built from the test split, so early
# stopping is tuned on the same data the final valid-logloss is reported on.
d_val = xgb.DMatrix(test_term_doc, label=test_y)

# Both sets are evaluated each round; per the training log, 'valid-logloss'
# is the metric used for early stopping.
watchlist = [(d_train, 'train'), (d_val, 'valid')]

# Up to 400 boosting rounds, logging every 50 and stopping if the validation
# log-loss has not improved for 50 consecutive rounds.
bst = xgb.train(xgb_pars, d_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.681228	valid-logloss:0.68133
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.4872	valid-logloss:0.494361
[100]	train-logloss:0.427427	valid-logloss:0.441414
[150]	train-logloss:0.392734	valid-logloss:0.411832
[200]	train-logloss:0.369056	valid-logloss:0.392905
[250]	train-logloss:0.351235	valid-logloss:0.380086
[300]	train-logloss:0.337678	valid-logloss:0.37019
[350]	train-logloss:0.326536	valid-logloss:0.362763
[399]	train-logloss:0.316882	valid-logloss:0.357108


## Conclusion

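On training log-loss, the embedding method gives the better result (0.187 vs. 0.317 after 400 rounds); on validation log-loss, however, the one-hot-encoding (bag-of-words) method is slightly better (0.357 vs. 0.373). The much larger gap between training and validation loss for the embedding model suggests it overfits more. In both runs the validation loss was still decreasing at the final round, so neither model had fully converged.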