In [None]:
import os
import re
import tarfile
import pandas as pd
import numpy as np

In [None]:
# Download location of the pretrained GoogleNews word2vec archive.
# NOTE(review): no visible cell downloads from this URL; the vectors are
# loaded from a local file with the same base name, so the archive
# presumably has to be fetched manually beforehand - confirm.
w2v =  'https://www.dropbox.com/s/965dir4dje0hfi4/GoogleNews-vectors-negative300.bin.gz'

In [None]:
# Load the reviews table from a CSV addressed by a relative path, i.e. the
# file must sit in the notebook's working directory.
datasettest = pd.read_csv("testsetreviews.csv")

In [None]:
# Keep only the id and review-text columns, and only the rows whose id is
# one of the two ids of interest; then preview the last rows.
csave = ['id', 'description']
idsave = ['8UmzC1ZGGE', 'yqR4PtpO8X']
datasettest = datasettest.loc[datasettest['id'].isin(idsave), csave]
datasettest.tail()

In [None]:
# Shuffle the rows so that the later positional 80/20 slice is a random
# train/test split. A fixed random_state makes the split (and therefore the
# reported accuracy) reproducible across kernel restarts, and sample()
# permutes row positions, so it also works when index labels repeat
# (reindex raises in that case).
dataset = datasettest.sample(frac=1, random_state=42)

In [None]:
# Spot-check a single review: the description stored under index label 165.
# NOTE(review): presumably label 165 is one of the rows kept by the id
# filter; if it is not, this lookup fails - confirm.
datasettest.description[165]

In [None]:
# Map every distinct id to an integer class label, numbered from 0 in order
# of first appearance in the shuffled frame.
dic = {review_id: label for label, review_id in enumerate(dataset.id.unique())}
count = len(dic)  # number of distinct ids, as the manual counter ended up
dic

In [None]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
# Load the pretrained word2vec vectors from the gzipped binary file in the
# working directory. `limit=200000` is passed to the loader;
# NOTE(review): presumably this caps loading at the first 200000 vectors to
# bound memory use - confirm against the gensim documentation.
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)

In [None]:
def tokenize_and_vectorize(dataset):
    """Convert every review text of a DataFrame into a list of word vectors.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'description' column holding the review texts.

    Returns
    -------
    list
        One inner list per row, in row order. Each inner list holds the
        `word_vectors` entry of every token that has one; tokens missing
        from `word_vectors` are skipped, so an inner list may be empty.

    Notes
    -----
    Reads the module-level `word_vectors` mapping and tokenizes with
    `TreebankWordTokenizer`.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    # Walk the column values directly instead of looking each row up by its
    # index label: with repeated labels `.loc[label, 'description']` yields a
    # Series rather than a single string and tokenization breaks.
    for text in dataset['description']:
        sample_vecs = []
        for token in tokenizer.tokenize(text):
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass  # token is not in the word-vector vocabulary
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [None]:
def collect_expected(dataset):
    """Peel off the target values from the dataset.

    Looks up the 'id' of every row, in index order, in the module-level
    `dic` mapping and returns the resulting list of class labels.
    """
    return [dic[dataset.loc[row_label, 'id']] for row_label in dataset.index]

In [None]:
def pad_trunc(data, maxlen, vector_size=None):
    """Pad with zero vectors or truncate every sample to exactly `maxlen`.

    Parameters
    ----------
    data : sequence of sequences of vectors
        One sequence of token vectors per sample.
    maxlen : int
        Number of vectors every returned sample must contain.
    vector_size : int, optional
        Length of the zero vector used for padding. When omitted it is taken
        from the first vector of the first non-empty sample.

    Returns
    -------
    list
        A new list with one new list per sample. The input samples are
        never modified.

    Raises
    ------
    ValueError
        If padding is required but the vector length is neither given nor
        inferable (every sample is empty).
    """
    if vector_size is None:
        # The first sample may be empty (no token had a vector), so search
        # for the first sample that actually contains a vector.
        vector_size = next(
            (len(sample[0]) for sample in data if len(sample) > 0), None
        )
    zero_vector = None if vector_size is None else [0.0] * vector_size

    new_data = []
    for sample in data:
        # Slicing copies, so padding below never touches the caller's list.
        fitted = list(sample[:maxlen])
        missing = maxlen - len(fitted)
        if missing > 0:
            if zero_vector is None:
                raise ValueError(
                    "cannot pad: every sample is empty and vector_size was not given"
                )
            fitted.extend([zero_vector] * missing)
        new_data.append(fitted)
    return new_data

In [None]:
# Vectorize the shuffled reviews and fetch their integer class labels.
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

# The first 80% of the samples are used for training, the rest for testing.
split_point = int(len(vectorized_data)*.8)

x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]

In [None]:
# Hyperparameters
maxlen = 300            # Number of token vectors kept per sample
batch_size = 32         # Samples shown to the net before each weight update
embedding_dims = 300    # Length of each token vector fed to the network
epochs = 15

# Bring every sample to exactly `maxlen` vectors.
x_train, x_test = pad_trunc(x_train, maxlen), pad_trunc(x_test, maxlen)

# Convert to arrays of shape (samples, maxlen, embedding_dims) plus label vectors.
x_train = np.asarray(x_train).reshape(len(x_train), maxlen, embedding_dims)
x_test = np.asarray(x_test).reshape(len(x_test), maxlen, embedding_dims)
y_train, y_test = np.array(y_train), np.array(y_test)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM

In [None]:
num_neurons = 100

print('Build model...')
model = Sequential()

# Recurrent layer that returns its output at every time step, followed by
# dropout applied to those outputs.
model.add(LSTM(num_neurons, return_sequences=True, input_shape=(maxlen, embedding_dims)))
model.add(Dropout(.2))

# Flatten the per-step outputs and map them to one sigmoid unit (binary label).
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile('rmsprop', 'binary_crossentropy',  metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Train on the training split and validate on the held-out split after every
# epoch. The epoch count comes from the `epochs` hyperparameter declared with
# the other settings instead of a second, diverging hard-coded value.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
# Serialize the architecture (not the weights) to a JSON string.
model_structure = model.to_json()

In [None]:
# Persist the trained model in two files: the architecture as JSON text and
# the weights as HDF5.
with open("lstm_model3.json", "w") as structure_file:
    structure_file.write(model_structure)
model.save_weights("lstm_weights3.h5")

print('Model saved.')

In [None]:
from keras.models import model_from_json

# Rebuild the model from the artifacts this notebook saved
# (lstm_model3.json / lstm_weights3.h5). The weights must be loaded as well:
# a model created from the JSON alone has freshly initialized weights, so any
# prediction made with it would be meaningless.
with open("lstm_model3.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights("lstm_weights3.h5")

# summary() prints the table itself and returns None; no print() needed.
model.summary()

In [None]:
def tokenize_and_vectorize_original(dataset):
    """Vectorize an iterable of samples whose text sits at position 1.

    Each sample is indexed with `[1]` to obtain its text. The text is
    tokenized and every token present in the module-level `word_vectors`
    is replaced by its vector; tokens without a vector are dropped.
    Returns one list of vectors per sample, in input order.
    """
    tokenizer = TreebankWordTokenizer()

    def known_vectors(text):
        # Yield the vector of each token that has one, in token order.
        for token in tokenizer.tokenize(text):
            try:
                yield word_vectors[token]
            except KeyError:
                continue  # No matching token in the Google w2v vocab

    return [list(known_vectors(entry[1])) for entry in dataset]

In [None]:
sample_1 = "Scythe was a game I was unsure if I'd be interested in. I'd heard amazing things about it, and thought it seemed interesting enough. Luckily it was purchased for me as a gift, and since then this game has been a huge hit. I was afraid the game wouldn't hit the table much at all but it continuously receives play anywhere from 2 to 3/4 players fairly frequently and a few 5+ player games. Everyone loves the game and we've reached an efficiency level so that 2-3 player games only last 40-45 minutes. While there are plenty of folks who have a number of qualms with the game, most of which I've heard and totally understand, this whole experience just clicks right for me and my group and we absolutely adore it"
# Run the single review through the same pipeline as the training data:
# vectorize, pad/truncate to maxlen, reshape to (1, maxlen, embedding_dims).
vec_list = tokenize_and_vectorize_original([(1, sample_1)])
maxlen = 300
embedding_dims = 300
vec_list = pad_trunc(vec_list, maxlen)
vec_list = np.reshape(vec_list, (len(vec_list), maxlen, embedding_dims))
# Sequential.predict_classes no longer exists in current Keras/TensorFlow;
# for a single sigmoid output the equivalent is thresholding the predicted
# probability at 0.5.
(model.predict(vec_list) > 0.5).astype("int32")

#print("Raw output of sigmoid function: {}".format(model.predict(vec_list)))