In [38]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

from gensim.models import Word2Vec

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout,GRU
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam

from keras import regularizers

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer


#nltk.download('punkt')
#nltk.download('wordnet')

## Loading the data

Here positive, negative and unsupervised reviews are extracted, labeled and stored in a dataframe. 

In [3]:
origin = 'data/'
pfile = 'positive_reviews.txt'
nfile = 'negative_reviews.txt'
unfile = 'unsupervised_reviews.txt'

with open(origin+pfile, encoding="latin1") as f:
        positiveReviews = f.read().splitlines()
with open(origin+nfile, encoding="latin1") as f:
        negativeReviews = f.read().splitlines()
with open(origin+unfile, encoding="latin1") as f:
        unsupervisedReviews = f.read().splitlines()

In [4]:
reviews = pd.concat([
    pd.DataFrame({"review":positiveReviews, "label":1}),
    pd.DataFrame({"review":negativeReviews, "label":0}),
    pd.DataFrame({"review":unsupervisedReviews, "label":-1})
], ignore_index=True).sample(frac=1, random_state=10)
reviews.head()

Unnamed: 0,review,label
33226,portly nice guy falls for a luscious blonde sh...,0
64804,new york minute is a summer movie for to year...,-1
39763,some movies you watch and you say well that ma...,0
51270,a pretty good film i really loved the cast rat...,-1
9698,although i was born in the year that this movi...,1


## Splitting data into train, validation and test sets

Here sentences are split into training, validation and testing datasets. Additionally, the proportions of positive/negative reviews in the dataset are checked for balance.

In [5]:
reviews = reviews[["review", "label"]].sample(frac=1, random_state=1)

#training set
train = reviews[reviews.label!=-1].sample(frac=0.6, random_state=1)

non_train = reviews[reviews.label!=-1].drop(train.index)

#validation set
valid = non_train.sample(frac=0.5, random_state=1)

#test set
test = non_train.drop(valid.index)

unsu = reviews[reviews.label==-1]

In [6]:
#  Checking shapes
print(train.shape)
print(valid.shape)
print(test.shape)

(30000, 2)
(10000, 2)
(10000, 2)


In [7]:
# Checking class balance, seems balanced enough
print(train["label"].mean(), valid["label"].mean(), test["label"].mean())

0.5016 0.5009 0.4943


In [8]:
train.head()

Unnamed: 0,review,label
34644,it s interesting to see what shape pierce bros...,0
28999,this film was choppy incoherent and contrived ...,0
18124,let s start from this point this is not a movi...,1
37706,im warning you people out there this is just a...,0
45566,this film is basically a poor take on the old ...,0


In [9]:
train.review.iloc[10]

'this is the worst movie i have ever seen the story line is a joke the effects are terrible the cinematography doesn t fit the tone of the movie the dialogue is cheesy and the actors do a good job at screwing up the rest people just don t act that way in real life situations my question is who would fund such crap the movie starts where some miners fall down a mine shaft after a fireman fails to save them next we join some bikers in a forest who ride around doing stunts on their bikes one guy falls and breaks his leg or something the fireman arrives to help them meanwhile somebody starts a fire some more bike stunts bla bla bla i wasted my time do not watch this movie '

## Text Normalisation (Preprocessing)

I carried out the following tech normalisation techniques:


* Tokenized the reviews
* Removed stopwords, due to their lack of semantic content
* Lemmatisation using wordnet to undo word inflections and map words back to their roots

In [10]:
#  Combining DFs for cleaning
df = train.append(valid).append(test)
df['review'].head()

34644    it s interesting to see what shape pierce bros...
28999    this film was choppy incoherent and contrived ...
18124    let s start from this point this is not a movi...
45566    this film is basically a poor take on the old ...
Name: review, dtype: object

In [11]:
review_lines = []
review_lines_strings = []

In [12]:
lemmatizer = WordNetLemmatizer() 
stop_words = set(stopwords.words('english'))

for line in df['review'].values.tolist():
    words = word_tokenize(line)
    words = [lemmatizer.lemmatize(w) for w in words if not w in stop_words] # lemmatizing and removing stopwords
    review_lines.append(words)
    review_lines_strings.append(" ".join(words))

df['review'] = review_lines_strings # Updating df

In [18]:
len(review_lines_strings[650])

385

In [19]:
#  splitting up the dataframe again after cleaning
x_train = df["review"][:train.shape[0]]
y_train = df["label"][:train.shape[0]]

x_valid = df["review"][train.shape[0]:train.shape[0] + valid.shape[0]]
y_valid = df["label"][train.shape[0]:train.shape[0] + valid.shape[0]]

x_test = df["review"][train.shape[0] + valid.shape[0]:]
y_test = df["label"][train.shape[0] + valid.shape[0]:]

## Simple Multinomial Naive Bayes Baseline

This is based on a simple generative classifier that is using counts for each word in each review (Bag of words representations) and modelling this as multinomially ditributed iid data and then fitting these conditional distributions on each class. 

In [20]:
vectorizer = CountVectorizer()
X_all = vectorizer.fit_transform( df["review"])

In [21]:
#  splitting up the count vectorizer data a
x_train_c = X_all[:train.shape[0], :]

x_valid_c = X_all[train.shape[0]:train.shape[0] + valid.shape[0], :]

x_test_c = X_all[train.shape[0] + valid.shape[0]:, :]

print(x_train_c.shape, x_valid_c.shape, x_test_c.shape)

(30000, 89978) (10000, 89978) (10000, 89978)


In [22]:
model_nb = MultinomialNB()

model_nb.fit(x_train_c, y_train)

test_accuracy = model_nb.score(x_test_c, y_test)

print(f'Test accuracy for multinomial NB {test_accuracy}')

Test accuracy for multinomial NB 0.8571


## Word2Vec Embedding and LSTM

Here I employ an LSTM taking in a sequence (time-series) of word embeddings as input, since the  LSTM is a model with very low bias it is prone to overfitting thus  regularisation techniques such as dropout, L1, L2, and early stopping are needed, fitting the hyperparameters of these techniques require a validation set. I did not pursue any methods such as grid searches, random search or bayesian optimisation to tune the regularisers due to lack of time, instead I just eyeballed it. 

In [23]:
EMB_DIM = 52

In [24]:
word_model = Word2Vec(review_lines, size=EMB_DIM, min_count = 1, workers=5, window = 5, sg=0, negative=5)

In [26]:
print("Number of word vectors: {}".format(len(word_model.wv.vocab)))

Number of word vectors: 90032


In [27]:
print(word_model.wv.most_similar('good')) # Quick sanity check

[('decent', 0.8301441669464111), ('great', 0.8031619787216187), ('bad', 0.7798143625259399), ('ok', 0.7188183069229126), ('nice', 0.7141984701156616), ('fine', 0.7071343660354614), ('okay', 0.703923761844635), ('cool', 0.702067494392395), ('alright', 0.6900888085365295), ('awesome', 0.6599547863006592)]


In [28]:
tokenizer = Tokenizer( char_level=False)
tokenizer.fit_on_texts(x_train)
seq_train = tokenizer.texts_to_sequences(x_train)
seq_valid = tokenizer.texts_to_sequences(x_valid)
seq_test = tokenizer.texts_to_sequences(x_test)

In [29]:
SEQUENCE_LENGTH = 500

In [30]:
review_train = pad_sequences(seq_train, maxlen=SEQUENCE_LENGTH, 
                     padding="pre", truncating="post")

review_valid = pad_sequences(seq_valid, maxlen=SEQUENCE_LENGTH, 
                     padding="pre", truncating="post")

review_test = pad_sequences(seq_test, maxlen=SEQUENCE_LENGTH,
                     padding="pre", truncating="post")

In [31]:
print('Train tensor:     ', review_train.shape)
print('Validation tensor:', review_valid.shape)
print('Test tensor:      ', review_test.shape)

Train tensor:      (30000, 500)
Validation tensor: (10000, 500)
Test tensor:       (10000, 500)


In [32]:
print('Number of words', len(tokenizer.word_index))

Number of words 72474


In [33]:
#  Initialising the embedding layer to use word2vec

num_words = len(tokenizer.word_index)+1
# we initialize the matrix with zeros
word_vect_matrix = np.zeros((num_words, EMB_DIM))
num_zeros = 0 

for word, i in tokenizer.word_index.items():
    try:
        # words not found in embedding index will be initialised to zero (the model can learn to ignore these)
        word_vect_matrix[i] = word_model.wv[word]
    except KeyError:
        # If a key exception happens word embedding will be a zero vector
        num_zeros += 1
        pass

percent_loss = num_zeros * 100.0 / num_words 
print(f"did not find {num_zeros} words out of {num_words} thats {percent_loss} percent gone")

did not find 50 words out of 72475 thats 0.0689893066574681 percent gone


In [37]:
model = Sequential()
model.add(Embedding(num_words,
                     EMB_DIM,
                     mask_zero=False,
                     weights=[word_vect_matrix],
                     input_length=SEQUENCE_LENGTH,
                     trainable=False))
model.add(LSTM(128, kernel_regularizer=regularizers.l2(0.001), dropout=0.2, recurrent_dropout=0.2,input_shape=(1,)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 52)           3768700   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               92672     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 3,861,501
Trainable params: 92,801
Non-trainable params: 3,768,700
_________________________________________________________________


In [35]:
batch_size = 500 # tweaked a bit but mostly picked due to speed limitations
model.fit(review_train, y_train,
          batch_size=batch_size,
          epochs=5,
          validation_data=(review_valid, y_valid))
          

Train on 30000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fea5cdf34a8>

In [36]:
score, acc = model.evaluate(review_test, y_test,
                            batch_size=batch_size)
print(f'Test accuracy: {acc}')

Test accuracy: 0.8786999970674515
