In [None]:
from google.colab import drive
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Mount Google Drive inside the Colab filesystem; the embeddings file loaded
# later in this notebook is read from a path underneath this mount.
mount_point = "/content/drive"
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# Toy corpus: each sentence is paired with its binary label
# (presumably 1 = positive, 0 = negative -- confirm with the author).
labelled_examples = [
    ("I am very happy today", 1),
    ("No, I do not like the movie", 0),
]
x = [text for text, _ in labelled_examples]
y = [label for _, label in labelled_examples]

In [None]:
# Create a tokenizer whose encoded output is restricted to the most frequent
# words of the corpus it is fitted on.
# NOTE(review): Keras documents that only the `num_words - 1` most common
# words are kept -- confirm if the exact cut-off matters.
vocabulary_cap = 20
tokenizer = Tokenizer(num_words=vocabulary_cap)

In [None]:
# Learn the word -> integer index mapping from the example sentences
tokenizer.fit_on_texts(texts=x)

In [None]:
# Encode every sentence as a list of integer word indices
sequences = tokenizer.texts_to_sequences(texts=x)

In [None]:
# Show the vocabulary: the word -> index mapping built by the tokenizer,
# followed by the number of distinct words it contains.
word_index = tokenizer.word_index
print(word_index)
vocabulary_size = len(word_index)
print('Found %s unique tokens.' % vocabulary_size)

{'i': 1, 'am': 2, 'very': 3, 'happy': 4, 'today': 5, 'no': 6, 'do': 7, 'not': 8, 'like': 9, 'the': 10, 'movie': 11}
Found 11 unique tokens.


In [None]:
# Inspect the integer-encoded sentences (one list of word indices per input text)
print(sequences)

[[1, 2, 3, 4, 5], [6, 1, 7, 8, 9, 10, 11]]


In [None]:
# Length (in tokens) of the longest encoded review; stays 0 when there are
# no sequences at all. Used below as the common padded length.
max_length = max((len(tokens) for tokens in sequences), default=0)
print(max_length)

7


In [None]:
# Bring every encoded review to the same length (max_length) so the batch can
# be stored as one 2-D integer array, and turn the labels into a NumPy array.
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

data = pad_sequences(sequences, maxlen=max_length)
y = np.asarray(y)
print(data)
# Report the shapes of the padded inputs and of the labels
for caption, tensor in (('Shape of data tensor:', data),
                        ('Shape of label tensor:', y)):
    print(caption, tensor.shape)

[[ 0  0  1  2  3  4  5]
 [ 6  1  7  8  9 10 11]]
Shape of data tensor: (2, 7)
Shape of label tensor: (2,)


In [None]:
# Load the pretrained word vectors stored in binary word2vec format.
# The path is relative, so it resolves against the current working directory
# (presumably /content on Colab, where Drive was mounted -- confirm).
import gensim

word2vec_path = 'drive/My Drive/CNNPhraseEmbeddings/Resources/GoogleNews-vectors-negative300.bin'
wordembeddings = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Build the weight matrix for the Embedding layer: one row per word index plus
# one extra row for index 0, and `embedding_dim` columns (300 here, giving a
# (12, 300) matrix for the 11-word toy vocabulary).
# Row 0 is never written by the loop, so it stays all zeros.
unique_words = len(word_index)
total_words = unique_words + 1
skipped_words = 0
embedding_dim = 300
embedding_matrix = np.zeros((total_words, embedding_dim))
for word, index in tokenizer.word_index.items():
  try:
    embedding_vector = wordembeddings[word]
  except KeyError:
    # Word has no pretrained vector: count it and leave its row as zeros.
    # Skipping explicitly matters -- falling through would copy the vector
    # left over from the previous iteration into this word's row (or raise
    # NameError if the very first word is the missing one).
    skipped_words += 1
    continue
  embedding_matrix[index] = embedding_vector
print("Embeddings Matrix shape : ",embedding_matrix.shape)

Embeddings Matrix shape :  (12, 300)


In [None]:
# Sanity check: the matrix row for 'very' must equal its pretrained vector, so
# the displayed difference should be all zeros.
# The row is looked up through word_index instead of a hard-coded 3, and the
# vector is read directly from the KeyedVectors object because its `.wv`
# alias is deprecated (it triggered the warning shown with the old output).
embedding_matrix[word_index['very']] - wordembeddings['very']

  """Entry point for launching an IPython kernel.


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [None]:
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding
# Create the embedding layer from the pretrained matrix.
# The matrix is supplied through a Constant initializer rather than the legacy
# `weights=[...]` keyword, which newer Keras releases no longer accept.
# trainable=False keeps the pretrained vectors frozen during training.
embedding_layer = Embedding(
    total_words,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=max_length,
    trainable=False,
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN
# Define the model: frozen pretrained embeddings -> two stacked SimpleRNN
# layers -> a single sigmoid unit for the binary label.
model = Sequential()
model.add(embedding_layer)
# return_sequences=True hands the full sequence of outputs to the next
# recurrent layer instead of only the last time step.
model.add(SimpleRNN(128, activation='relu', return_sequences=True))
model.add(SimpleRNN(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() additionally emitted a stray "None" line.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 7, 300)            3600      
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 7, 128)            54912     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 256)               98560     
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 157,329
Trainable params: 153,729
Non-trainable params: 3,600
_________________________________________________________________
None


In [None]:
# The optimizer comes from tensorflow.keras, the same package the model was
# built with, rather than from the standalone `keras` package.
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers.schedules import InverseTimeDecay
# compile network
# The former `lr=0.01, decay=1e-6` arguments are deprecated. The same
# learning-rate curve, 0.01 / (1 + 1e-6 * step), is expressed as a schedule.
lr_schedule = InverseTimeDecay(initial_learning_rate=0.01, decay_steps=1, decay_rate=1e-6)
opt = SGD(learning_rate=lr_schedule)
model.compile(loss="binary_crossentropy", optimizer=opt, metrics=['accuracy'])
# fit network
model.fit(data, y, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7efd000465f8>