In [None]:
import glob
import os

from random import shuffle
from nltk.tokenize import TreebankWordTokenizer

In [None]:
# Extract the positive reviews into ./pos2. -o overwrites existing files without
# asking, so re-running this cell never stalls on unzip's "replace?" prompt.
!unzip -o /content/pos2.zip

Archive:  /content/pos2.zip
  inflating: pos2/0_9.txt            
  inflating: pos2/1_7.txt            
  inflating: pos2/10_9.txt           
  inflating: pos2/100_7.txt          
  inflating: pos2/101_8.txt          
  inflating: pos2/102_10.txt         
  inflating: pos2/103_7.txt          
  inflating: pos2/104_10.txt         
  inflating: pos2/105_7.txt          
  inflating: pos2/106_10.txt         
  inflating: pos2/107_10.txt         
  inflating: pos2/108_10.txt         
  inflating: pos2/109_10.txt         
  inflating: pos2/11_9.txt           
  inflating: pos2/110_10.txt         
  inflating: pos2/111_10.txt         
  inflating: pos2/112_10.txt         
  inflating: pos2/113_10.txt         
  inflating: pos2/114_10.txt         
  inflating: pos2/115_10.txt         
  inflating: pos2/116_10.txt         
  inflating: pos2/117_10.txt         
  inflating: pos2/118_8.txt          
  inflating: pos2/119_10.txt         
  inflating: pos2/12_9.txt           
  inflating: pos2/120_

In [None]:
# Extract the negative reviews into ./neg2. -o overwrites existing files without
# asking, so re-running this cell never stalls on unzip's "replace?" prompt.
!unzip -o /content/neg2.zip

Archive:  /content/neg2.zip
  inflating: neg2/0_3.txt            
  inflating: neg2/1_1.txt            
  inflating: neg2/10_2.txt           
  inflating: neg2/100_3.txt          
  inflating: neg2/101_1.txt          
  inflating: neg2/102_1.txt          
  inflating: neg2/103_1.txt          
  inflating: neg2/104_3.txt          
  inflating: neg2/105_2.txt          
  inflating: neg2/106_2.txt          
  inflating: neg2/107_2.txt          
  inflating: neg2/108_1.txt          
  inflating: neg2/109_2.txt          
  inflating: neg2/11_3.txt           
  inflating: neg2/110_1.txt          
  inflating: neg2/111_4.txt          
  inflating: neg2/112_1.txt          
  inflating: neg2/113_4.txt          
  inflating: neg2/114_4.txt          
  inflating: neg2/115_2.txt          
  inflating: neg2/116_1.txt          
  inflating: neg2/117_3.txt          
  inflating: neg2/118_2.txt          
  inflating: neg2/119_4.txt          
  inflating: neg2/12_1.txt           
  inflating: neg2/120_

In [None]:
def pre_process_data(filepath, seed=None):
  """
    Load pos and neg examples from separate dirs then shuffle them together.

    Args:
      filepath: directory that contains the 'pos2' and 'neg2' sub-directories,
        each holding one '*.txt' file per review.
      seed: optional seed for the shuffle. When None (the default) the
        module-level random state is used, exactly as before; pass a value
        to get the same ordering on every run.

    Returns:
      A shuffled list of (label, text) tuples, where label is 1 for files read
      from 'pos2' and 0 for files read from 'neg2'.
  """
  from random import Random

  positive_path = os.path.join(filepath, 'pos2')
  negative_path = os.path.join(filepath, 'neg2')

  pos_label = 1
  neg_label = 0

  dataset = []
  for label, directory in ((pos_label, positive_path), (neg_label, negative_path)):
    # glob returns files in filesystem order; sort so a seeded shuffle is
    # reproducible across machines.
    for filename in sorted(glob.glob(os.path.join(directory, '*.txt'))):
      # Explicit encoding: the default depends on the platform locale and can
      # fail or garble non-ASCII characters in the reviews.
      with open(filename, 'r', encoding='utf-8') as f:
        dataset.append((label, f.read()))

  if seed is None:
    shuffle(dataset)
  else:
    Random(seed).shuffle(dataset)
  return dataset


In [None]:
# Read every review under /content/pos2 and /content/neg2 into one shuffled list.
dataset = pre_process_data(filepath="/content/")

In [None]:
# Peek at the first (label, text) pair to sanity-check what the loader produced.
print(dataset[0])

(0, 'I have to say I am really surprised at the high ratings for this movie. I found it to be absolutely idiotic. The mother gets "visions" when she touches certain things or people? And one thing she touched twice made her vision continue... Just seemed so ridiculous. Deedee Pfieffer\'s performance was awful I thought. She was very irritating. The girl who played Lori did a good job and so did most of the supporting cast for what they had to work with.<br /><br />I usually love LMN and am very open minded when it comes to movies but this movie seemed to have a ridiculous plot and over the top acting and it just was not for me.')


In [None]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors

In [None]:
import gensim.downloader as api

In [None]:
# Fetch the pretrained Google News word2vec model via gensim's downloader.
# With return_path=True this yields the path of the downloaded file (see the
# printed value in the next cell) rather than a loaded model.
word_vecs = api.load("word2vec-google-news-300", return_path=True)



In [None]:
# Show where the downloaded word2vec archive lives on disk.
print(word_vecs)

/root/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [None]:
# Load the binary word2vec file into a KeyedVectors lookup table.
# NOTE(review): limit=200000 presumably caps how many vectors are read, to
# save memory — confirm against the gensim load_word2vec_format docs.
word_vectors = KeyedVectors.load_word2vec_format(word_vecs, binary=True, limit=200000)

In [None]:
def tokenize_and_vectorize(dataset):
  """Turn each sample's text into a list of word vectors.

  Args:
    dataset: iterable of (label, text) pairs; only the text (index 1) is used.

  Returns:
    A list with one entry per sample, in dataset order. Each entry is the list
    of vectors looked up in the module-level `word_vectors` for the sample's
    tokens, in token order. Tokens missing from `word_vectors` are skipped, so
    an entry may be shorter than the token count, or empty.
  """
  tokenizer = TreebankWordTokenizer()
  vectorized_data = []
  for sample in dataset:
    sample_vecs = []
    for token in tokenizer.tokenize(sample[1]):
      try:
        sample_vecs.append(word_vectors[token])
      except KeyError:
        # Out-of-vocabulary token: skip it. Only KeyError is caught so that
        # real failures (and KeyboardInterrupt) are no longer silently hidden.
        continue

    vectorized_data.append(sample_vecs)

  return vectorized_data

In [None]:
def collect_expected(dataset):
  """Return the target value (element 0) of every sample, in dataset order."""
  return [sample[0] for sample in dataset]

In [None]:
# Split the (label, text) pairs into parallel lists: labels and word vectors.
# Both are derived from the same `dataset`, so index i lines up in each.
expected = collect_expected(dataset)
vectorized_data = tokenize_and_vectorize(dataset)

In [None]:
# Index separating the first 80% of samples (training) from the rest (test).
split_point = int(0.8 * len(vectorized_data))

In [None]:
# Features and labels are cut at the same index so they stay aligned.
X_train, X_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]

In [None]:
maxlen = 400  # number of token vectors each sample is padded/truncated to
batch_size = 32  # samples per batch passed to model.fit
embedding_dims = 300  # length of one word vector (len(X_train[0][0]) is 300)
epochs = 2  # number of epochs passed to model.fit

In [None]:
def pad_trunc(data, maxlen):
  """
      For a given dataset pad with zero vectors or truncate to maxlen.

      Args:
        data: sequence of samples, each a list of equal-length word vectors.
        maxlen: number of vectors every returned sample must contain.

      Returns:
        A new list with one new list per sample, each exactly maxlen long.
        The input samples are never modified. All padding entries refer to
        one shared zero-vector list, so treat them as read-only.

      Raises:
        ValueError: if padding is needed but every sample is empty, so the
          length of the zero vector cannot be inferred.
  """
  # Infer the vector length from the first non-empty sample; the first sample
  # alone may be empty when none of its tokens were found.
  vector_dim = next((len(sample[0]) for sample in data if len(sample) > 0), None)
  zero_vector = None if vector_dim is None else [0.0] * vector_dim

  new_data = []
  for sample in data:
    # Slicing then list() always yields a fresh list, so the appends below
    # cannot leak into the caller's data.
    temp = list(sample[:maxlen])
    additional_elems = maxlen - len(temp)
    if additional_elems > 0:
      if zero_vector is None:
        raise ValueError(
            "cannot pad: every sample is empty, so the vector length is unknown")
      temp.extend([zero_vector] * additional_elems)
    new_data.append(temp)

  return new_data

In [None]:
# Length of the first word vector of the first training sample.
len(X_train[0][0])

300

In [None]:
import numpy as np

# Bring every sample in both splits to exactly `maxlen` word vectors.
X_train, X_test = (pad_trunc(split, maxlen) for split in (X_train, X_test))

In [None]:
# After pad_trunc the first training sample should hold maxlen vectors.
len(X_train[0])

400

In [None]:
# After pad_trunc the first test sample should hold maxlen vectors.
len(X_test[0])

400

In [None]:
# Convert the nested lists to a (samples, maxlen, embedding_dims) array and
# the labels to a 1-D array, the forms handed to model.fit.
X_train = np.asarray(X_train).reshape((len(X_train), maxlen, embedding_dims))
y_train = np.array(y_train)

In [None]:
# Same conversion for the held-out split: (samples, maxlen, embedding_dims)
# features and a 1-D label array.
X_test = np.asarray(X_test).reshape((len(X_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [None]:
from keras.models import Sequential

In [None]:
from keras.layers import Dense, Dropout, Flatten, SimpleRNN

In [None]:
num_neurons = 50  # units in the SimpleRNN layer of the first model

In [None]:
# Start an empty layer stack; the following cells add layers one at a time.
model = Sequential()

In [None]:
# Recurrent layer over (maxlen, embedding_dims) inputs. With
# return_sequences=True its output keeps the time axis: (None, 400, 50) in the
# summary below.
model.add(
  SimpleRNN(
    units=num_neurons,
    return_sequences=True,
    input_shape=(maxlen, embedding_dims),
  )
)

In [None]:
# Dropout with rate 0.2 on the recurrent outputs.
model.add(Dropout(rate=0.2))

In [None]:
# Flatten the per-step outputs into one vector, then map it to a single
# sigmoid unit for the binary pos/neg decision.
model.add(Flatten())
model.add(Dense(units=1, activation='sigmoid'))

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; track accuracy.
model.compile(
  optimizer='rmsprop',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, 400, 50)           17550     
_________________________________________________________________
dropout (Dropout)            (None, 400, 50)           0         
_________________________________________________________________
flatten (Flatten)            (None, 20000)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 20001     
Total params: 37,551
Trainable params: 37,551
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the first model; the held-out split is used only for validation metrics.
model.fit(
  x=X_train,
  y=y_train,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(X_test, y_test),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f063709bcc0>

In [None]:
# Persist the first model's architecture as JSON (weights are saved separately).
# The filename was "simplernn_modell.json" (typo: double "l"); it now follows
# the numbering of simplernn_weights1.h5 and simplernn_model2/3.json.
model_structure = model.to_json()
with open("simplernn_model1.json", "w") as json_file:
  json_file.write(model_structure)

In [None]:
# Save the first model's trained weights next to its JSON architecture file.
model.save_weights("simplernn_weights1.h5")

In [None]:
# Second experiment: same architecture as the first model but with 100
# recurrent units instead of 50. Rebinding `model` discards the first model.
num_neurons = 100
model = Sequential()
model.add(
  SimpleRNN(
    units=num_neurons,
    return_sequences=True,
    input_shape=(maxlen, embedding_dims),
  )
)
model.add(Dropout(rate=0.2))
model.add(Flatten())
model.add(Dense(units=1, activation='sigmoid'))
model.compile(
  optimizer='rmsprop',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 400, 100)          40100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 100)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 40000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 40001     
Total params: 80,101
Trainable params: 80,101
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the second model with the same batch size and epoch count as the first.
model.fit(
  x=X_train,
  y=y_train,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(X_test, y_test),
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f0633769550>

In [None]:
# Persist the second model's architecture as JSON (weights are saved separately).
model_structure = model.to_json()
with open("simplernn_model2.json", "w") as json_file:
  json_file.write(model_structure)

In [None]:
# Save the second model's trained weights.
model.save_weights("simplernn_weights2.h5")

In [None]:
# Inspect the raw embedding stored for a single word.
word_vectors['dog']

array([ 5.12695312e-02, -2.23388672e-02, -1.72851562e-01,  1.61132812e-01,
       -8.44726562e-02,  5.73730469e-02,  5.85937500e-02, -8.25195312e-02,
       -1.53808594e-02, -6.34765625e-02,  1.79687500e-01, -4.23828125e-01,
       -2.25830078e-02, -1.66015625e-01, -2.51464844e-02,  1.07421875e-01,
       -1.99218750e-01,  1.59179688e-01, -1.87500000e-01, -1.20117188e-01,
        1.55273438e-01, -9.91210938e-02,  1.42578125e-01, -1.64062500e-01,
       -8.93554688e-02,  2.00195312e-01, -1.49414062e-01,  3.20312500e-01,
        3.28125000e-01,  2.44140625e-02, -9.71679688e-02, -8.20312500e-02,
       -3.63769531e-02, -8.59375000e-02, -9.86328125e-02,  7.78198242e-03,
       -1.34277344e-02,  5.27343750e-02,  1.48437500e-01,  3.33984375e-01,
        1.66015625e-02, -2.12890625e-01, -1.50756836e-02,  5.24902344e-02,
       -1.07421875e-01, -8.88671875e-02,  2.49023438e-01, -7.03125000e-02,
       -1.59912109e-02,  7.56835938e-02, -7.03125000e-02,  1.19140625e-01,
        2.29492188e-01,  

In [None]:
# Third experiment: smaller layer (25 units), heavier dropout (0.4), more
# epochs and a smaller batch. Note that `epochs` and `batch_size` overwrite the
# values set in the earlier configuration cell.
num_neurons = 25
epochs = 8
batch_size = 8
model = Sequential()
model.add(
  SimpleRNN(
    units=num_neurons,
    return_sequences=True,
    input_shape=(maxlen, embedding_dims),
  )
)
model.add(Dropout(rate=0.4))
model.add(Flatten())
model.add(Dense(units=1, activation='sigmoid'))
model.compile(
  optimizer='rmsprop',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_11 (SimpleRNN)    (None, 400, 25)           8150      
_________________________________________________________________
dropout_11 (Dropout)         (None, 400, 25)           0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 10000)             0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 10001     
Total params: 18,151
Trainable params: 18,151
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the third model using the epochs/batch_size set in the cell above.
model.fit(
  x=X_train,
  y=y_train,
  batch_size=batch_size,
  epochs=epochs,
  validation_data=(X_test, y_test),
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x7f0629c19ef0>

In [None]:
# Persist the third model's architecture as JSON; it is reloaded further down.
model_structure = model.to_json()
with open("simplernn_model3.json", "w") as json_file:
  json_file.write(model_structure)

In [None]:
# Save the third model's trained weights; they are reloaded further down.
model.save_weights("simplernn_weights3.h5")

In [None]:
# Hand-written review text that is vectorized and passed to the model below.
sample_1 = "I'm hate that the dismal weather that had me down for so long, when will it break! Ugh, when does happiness return?  The sun is blinding and the puffy clouds are too thin.  I can't wait for the weekend."

In [None]:
# Rebuild the third model from disk: architecture from the JSON file, then the
# trained weights from the matching .h5 file. This replaces the in-memory `model`.
from keras.models import model_from_json
with open("simplernn_model3.json", "r") as json_file:
  json_string = json_file.read()

model = model_from_json(json_string)
model.load_weights('simplernn_weights3.h5')

In [None]:
# Run the sample through the same pipeline as the training data. The label 1
# in the tuple is a placeholder; tokenize_and_vectorize reads only the text.
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.asarray(test_vec_list).reshape(
  (len(test_vec_list), maxlen, embedding_dims)
)

In [None]:
# Display the model input built from sample_1 (trailing rows are zero padding).
test_vec

array([[[ 0.07910156, -0.0050354 ,  0.11181641, ..., -0.0067749 ,
          0.04272461, -0.10351562],
        [ 0.19335938, -0.07128906,  0.10839844, ...,  0.0480957 ,
          0.16503906,  0.04418945],
        [ 0.1328125 ,  0.08007812,  0.28710938, ..., -0.02404785,
         -0.02697754,  0.125     ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]]])

In [None]:
# `predict_classes` is deprecated (see the warning recorded below this cell).
# For a single sigmoid output, threshold the predicted probability at 0.5 to
# get the class: 1 = positive, 0 = negative.
(model.predict(test_vec) > 0.5).astype("int32")

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


array([[0]], dtype=int32)