In [None]:
import glob
import os

from random import shuffle
from nltk.tokenize import TreebankWordTokenizer

In [None]:
!unzip /content/pos2.zip

Archive:  /content/pos2.zip
  inflating: pos2/0_9.txt            
  inflating: pos2/1_7.txt            
  inflating: pos2/10_9.txt           
  inflating: pos2/100_7.txt          
  inflating: pos2/101_8.txt          
  inflating: pos2/102_10.txt         
  inflating: pos2/103_7.txt          
  inflating: pos2/104_10.txt         
  inflating: pos2/105_7.txt          
  inflating: pos2/106_10.txt         
  inflating: pos2/107_10.txt         
  inflating: pos2/108_10.txt         
  inflating: pos2/109_10.txt         
  inflating: pos2/11_9.txt           
  inflating: pos2/110_10.txt         
  inflating: pos2/111_10.txt         
  inflating: pos2/112_10.txt         
  inflating: pos2/113_10.txt         
  inflating: pos2/114_10.txt         
  inflating: pos2/115_10.txt         
  inflating: pos2/116_10.txt         
  inflating: pos2/117_10.txt         
  inflating: pos2/118_8.txt          
  inflating: pos2/119_10.txt         
  inflating: pos2/12_9.txt           
  inflating: pos2/120_

In [None]:
!unzip /content/neg2.zip

Archive:  /content/neg2.zip
  inflating: neg2/0_3.txt            
  inflating: neg2/1_1.txt            
  inflating: neg2/10_2.txt           
  inflating: neg2/100_3.txt          
  inflating: neg2/101_1.txt          
  inflating: neg2/102_1.txt          
  inflating: neg2/103_1.txt          
  inflating: neg2/104_3.txt          
  inflating: neg2/105_2.txt          
  inflating: neg2/106_2.txt          
  inflating: neg2/107_2.txt          
  inflating: neg2/108_1.txt          
  inflating: neg2/109_2.txt          
  inflating: neg2/11_3.txt           
  inflating: neg2/110_1.txt          
  inflating: neg2/111_4.txt          
  inflating: neg2/112_1.txt          
  inflating: neg2/113_4.txt          
  inflating: neg2/114_4.txt          
  inflating: neg2/115_2.txt          
  inflating: neg2/116_1.txt          
  inflating: neg2/117_3.txt          
  inflating: neg2/118_2.txt          
  inflating: neg2/119_4.txt          
  inflating: neg2/12_1.txt           
  inflating: neg2/120_

In [None]:
def pre_process_data(filepath):

  """
    Load pos and neg examples from separate dirs then shuffle them together

  """

  positive_path = os.path.join(filepath, 'pos2')
  negative_path = os.path.join(filepath, 'neg2')

  pos_label = 1
  neg_label = 0

  dataset = []

  for filename in glob.glob(os.path.join(positive_path, '*.txt')):
    with open(filename, 'r') as f:
      dataset.append((pos_label, f.read()))

  for filename in glob.glob(os.path.join(negative_path, '*.txt')):
    with open(filename, 'r') as f:
      dataset.append((neg_label, f.read()))

  shuffle(dataset)

  return dataset

In [None]:
dataset = pre_process_data("/content/")

In [None]:
print(dataset[0])

(0, "I watched this movie for a project on love. please tell Nicolas Cage to learn what it would feel like to be his character, and then re-read the lines he's saying. My life cannot go on... i accidentally cut off my own hand...my brother was close by. Obviously his fault. And since when have happy endings included the nice guy who takes care of Mom sad and alone. No closure, bad script, and doesn't have enough extension of minor characters. Save yourself, unless your up for a good laugh. Costumes were done appropriately, and extras did a fabulous job. I'm sure it would have been a fun movie to make, but keep it more genre specific, I can't recommend this movie to anyone I know, because it is not an intellectual movie. It is not a chick flick. It is not a strict romantic. And I can't show kids because of the sex and questions to follow. All in all, just not a good flick.")


In [None]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors

In [None]:
import gensim.downloader as api

In [None]:
word_vecs = api.load("word2vec-google-news-300", return_path=True)



In [None]:
print(word_vecs)

/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [None]:
word_vectors = KeyedVectors.load_word2vec_format(word_vecs, binary=True, limit=200000)

In [None]:
def tokenize_and_vectorize(dataset):
  tokenizer = TreebankWordTokenizer()
  vectorized_data = []
  for sample in dataset:
    tokens = tokenizer.tokenize(sample[1])
    sample_vecs = []
    for token in tokens:
      try:
        sample_vecs.append(word_vectors[token])
      except:
        pass
      
    vectorized_data.append(sample_vecs)

  return vectorized_data

In [None]:
def collect_expected(dataset):
  """Peel off the target values from the dataset"""
  expected=[]
  for sample in dataset:
    expected.append(sample[0])
  return expected

In [None]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [None]:
split_point = int(len(vectorized_data) * .8)

In [None]:
X_train = vectorized_data[:split_point]
y_train = expected[:split_point]
X_test = vectorized_data[split_point:]
y_test = expected[split_point:]

In [None]:
maxlen = 400
batch_size = 8
embedding_dims = 300
epochs = 3

In [None]:
def pad_trunc(data, maxlen):

  """
    For a given dataset pad with zero vectors or truncate to maxlen
  """

  new_data = []

  # Create a vector of 0s the lenght of our word vectors

  zero_vector = []
  for _ in range(len(data[0][0])):
    zero_vector.append(0.0)

  for sample in data:
    if len(sample) > maxlen:
      temp = sample[:maxlen]
    elif len(sample) < maxlen:
      temp = sample
      # Append the appropriate number 0 vectors to the list
      additional_elems = maxlen - len(sample)
      for _ in range(additional_elems):
        temp.append(zero_vector)

    else:
      temp = sample

    new_data.append(temp)

  return new_data

In [None]:
len(X_train[0][0])

300

In [None]:
expected[0]

0

In [None]:
import numpy as np

X_train = pad_trunc(X_train, maxlen)
X_test = pad_trunc(X_test, maxlen)

In [None]:
len(X_train[0])

400

In [None]:
len(X_test[0])

400

In [None]:
X_train = np.reshape(X_train, (len(X_train), maxlen, embedding_dims))

In [None]:

y_train = np.array(y_train)

In [None]:
X_test = np.reshape(X_test, (len(X_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [None]:
X_train.shape

(801, 400, 300)

In [None]:
X_test.shape

(201, 400, 300)

In [None]:
from keras.models import Sequential 
from keras.layers import Dense, Dropout, Flatten, LSTM 
num_neurons = 50
model = Sequential()

In [None]:
model.add(LSTM(num_neurons, return_sequences=True, input_shape=(maxlen, embedding_dims)))
model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 400, 50)           70200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 20001     
Total params: 90,201
Trainable params: 90,201
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f3234c48a58>

In [None]:
model_structure = model.to_json()
with open("lstm_model1.json", "w") as json_file:
  json_file.write(model_structure)

In [None]:
model.save_weights("lstm_weights1.h5")

In [None]:
from keras.models import model_from_json
with open("lstm_model1.json", "r") as json_file:
  json_string = json_file.read()

In [None]:
model = model_from_json(json_string)

In [None]:
model.load_weights("/content/lstm_weights1.h5")

In [None]:
sample_1 = "I'm hate that the dismal weather that had me down for so long, when will it break! Ugh, when does happiness return?  The sun is blinding and the puffy clouds are too thin.  I can't wait for the weekend."

In [None]:
vec_list = tokenize_and_vectorize([(1, sample_1)])

In [None]:
test_vec_list = pad_trunc(vec_list, maxlen)

In [None]:
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

In [None]:
print("Sample's sentiment, 1-pos, 2-neg: {}".format(model.predict_classes(test_vec)))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Sample's sentiment, 1-pos, 2-neg: [[0]]


In [None]:
print("Raw output of sigmoid function: {}".format(model.predict(test_vec)))

Raw output of sigmoid function: [[0.17224404]]


In [None]:
def test_lan(data, maxlen):
  total_len = truncated = exact = padded = 0
  for sample in data:
    total_len = total_len + len(sample)
    if len(sample) > maxlen:
      truncated = truncated + 1
    elif len(sample) < maxlen:
      padded = padded + 1
    else:
      exact = exact + 1

  print('Padded: {}'.format(padded))
  print('Equal: {}'.format(exact))
  print('Truncated: {}'.format(truncated))
  print('Avg length: {}'.format(total_len/len(data)))
  print('Total length: {}'.format(total_len))

In [None]:
test_lan(vectorized_data, 400)

Padded: 0
Equal: 917
Truncated: 85
Avg length: 414.0878243512974
Total length: 414916


In [None]:
len(vectorized_data)

1002

In [None]:
414916/1002

414.0878243512974

In [None]:
def avg_len(data):
  total_len = 0
  for sample in data:
    total_len = total_len + len(sample[1])
  return total_len/len(data)

In [None]:
avg_len(dataset)

1227.0439121756488

In [None]:
def clean_data(data):
  """Shift to lower case, replace unknowns with UNK, and listify"""
  new_data = []
  VALID = 'abcdefghijklmnopqrstuvwxyz123456789"\'?!.,:; '
  for sample in data:
    new_sample = []
    for char in sample[1].lower(): # Just grab the string, not the label
       if char in VALID:
         new_sample.append(char)
       else:
         new_sample.append('UNK')

    new_data.append(new_sample)

  return new_data



In [None]:
listified_data = clean_data(dataset)

In [None]:
def char_pad_trunc(data, maxlen=1500):
  """We truncate to maxlen or add in PAD tokens"""
  new_dataset = []
  for sample in data:
    if len(sample) > maxlen:
      new_data = sample[:maxlen]
    elif len(sample) < maxlen:
      pads = maxlen - len(sample)
      new_data = sample + ['PAD'] * pads
    else:
      new_data = sample
    new_dataset.append(new_data)
  return new_dataset

In [None]:
def create_dicts(data):
  """Modified from Keras LSTM example"""
  chars = set()
  for sample in data:
    chars.update(set(sample))
  char_indices = dict((c, i) for i, c in enumerate(chars))
  indices_char = dict((i, c) for i, c in enumerate(chars))
  return char_indices, indices_char

In [None]:
import numpy as np 

def onehot_encode(dataset, char_indicies, maxlen=1500):
  
  """
    One hot encode the tokens
  

  Args:
       dataset list of lists of tokens
       char_indicies dictionary of {key=character, value=index to use encoding vector}
       maxlen int Length of each sample
  Return:
      np array of shape (samples, tokens, encoding length)

  """

  X = np.zeros((len(dataset), maxlen, len(char_indices.keys())))
  for i, sentence in enumerate(dataset):
    for t, char in enumerate(sentence):
      X[i, t, char_indicies[char]] = 1

  return X
  

In [None]:
maxlen = 1500
common_length_data = char_pad_trunc(listified_data, maxlen)
char_indices, indices_char = create_dicts(common_length_data)


In [None]:
char_indices

{' ': 37,
 '!': 35,
 '"': 30,
 "'": 25,
 ',': 17,
 '.': 20,
 '1': 19,
 '2': 9,
 '3': 8,
 '4': 36,
 '5': 41,
 '6': 18,
 '7': 6,
 '8': 1,
 '9': 21,
 ':': 15,
 ';': 31,
 '?': 42,
 'PAD': 5,
 'UNK': 32,
 'a': 22,
 'b': 29,
 'c': 23,
 'd': 14,
 'e': 28,
 'f': 2,
 'g': 4,
 'h': 16,
 'i': 24,
 'j': 10,
 'k': 7,
 'l': 0,
 'm': 26,
 'n': 43,
 'o': 40,
 'p': 11,
 'q': 12,
 'r': 38,
 's': 34,
 't': 45,
 'u': 3,
 'v': 27,
 'w': 33,
 'x': 44,
 'y': 13,
 'z': 39}

In [None]:
indices_char

{0: 'l',
 1: '8',
 2: 'f',
 3: 'u',
 4: 'g',
 5: 'PAD',
 6: '7',
 7: 'k',
 8: '3',
 9: '2',
 10: 'j',
 11: 'p',
 12: 'q',
 13: 'y',
 14: 'd',
 15: ':',
 16: 'h',
 17: ',',
 18: '6',
 19: '1',
 20: '.',
 21: '9',
 22: 'a',
 23: 'c',
 24: 'i',
 25: "'",
 26: 'm',
 27: 'v',
 28: 'e',
 29: 'b',
 30: '"',
 31: ';',
 32: 'UNK',
 33: 'w',
 34: 's',
 35: '!',
 36: '4',
 37: ' ',
 38: 'r',
 39: 'z',
 40: 'o',
 41: '5',
 42: '?',
 43: 'n',
 44: 'x',
 45: 't'}

In [None]:
encoded_data = onehot_encode(common_length_data, char_indices, maxlen)

In [None]:
len(listified_data)

1002

In [None]:
encoded_data[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
split_point = int(len(encoded_data) * .8)

X_train = encoded_data[:split_point]
y_train = expected[:split_point]
X_test = encoded_data[split_point:]
y_test = expected[split_point:]

In [None]:
X_train = np.array(X_train)
y_train = np.array(y_train)

In [None]:
X_test = np.array(X_test)
y_test = np.array(y_test)

In [None]:
len(X_train)

801

In [None]:
len(X_test)

201

In [None]:
from keras.models import Sequential 
from keras.layers import Dense, Dropout, Embedding, Flatten, LSTM 

In [None]:
num_neurons = 40
maxlen = 1500
model = Sequential()

model.add(LSTM(num_neurons, return_sequences=True, input_shape=(maxlen, len(char_indices.keys()))))
model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 1500, 40)          13920     
_________________________________________________________________
dropout_4 (Dropout)          (None, 1500, 40)          0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 60000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 60001     
Total params: 73,921
Trainable params: 73,921
Non-trainable params: 0
_________________________________________________________________


In [None]:
batch_size = 32
epochs = 10
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3233338b70>

In [None]:
model_structure = model.to_json()
with open("char_lstm_model3.json", "w") as json_file:
  json_file.write(model_structure)


In [None]:
model.save_weights("char_lstm_weights3.h5")