* Movie review dataset- Sentiment analysis
* Dataset from keras: 25,000 reviews 
* Words are encoded as well, eg 3rd most common occuring word is 3rd encoded 

In [2]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os
import numpy as np
import keras

VOCAB_SIZE = 88584
BATCH_SIZE = 64

MAX_LEN = 250
(train_data, train_labels) , (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [3]:
train_data[0]
# look at one review!
# the review is already encoded words into a vector of numbers

print(len(train_data[9]))
# each train data element has different length

130


## More pre-processing 

* if the review is more than 250 words we will trim off the extra words to make it 250 words length
* if the review is less than 250 words we will add 0's to make it 250 words length, ie, blank words encoded as 0 to the left side of the encoding

In [5]:
train_data  = sequence.pad_sequences(train_data, MAX_LEN)
test_data = sequence.pad_sequences(test_data, MAX_LEN)


print(len(train_data[0]))

print((train_data[0]))

250
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     1    14    22    16
    43   530   973  1622  1385    65   458  4468    66  3941     4   173
    36   256     5    25   100    43   838   112    50   670 22665     9
    35   480   284     5   150     4   172   112   167 21631   336   385
    39     4   172  4536  1111    17   546    38    13   447     4   192
    50    16     6   147  2025    19    14    22     4  1920  4613   469
     4    22    71    87    12    16    43   530    38    76    15    13
  1247     4    22    17   515    17    12    16   626    18 19193     5
    62   386    12     8   316     8   106     5     4  2223  5244    16
   480    66  3785    33     4   130    12    16    38   619     5    25
   124    51    36   135    48    25  1415    33     6    22    12   215
    28    77    52     5    14   407    16    8

## CReating the model

In [6]:
#  sigmoid squeshes the value between 0 and 1 to normalise the reviews

model = tf.keras.Sequential([
                             tf.keras.layers.Embedding(VOCAB_SIZE, 32),
                             tf.keras.layers.LSTM(32),
                             tf.keras.layers.Dense(1, activation='sigmoid')
])

In [7]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2834688   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


## Training the model


In [8]:
#  binary cross entropy cuz we will know how far the output is from 0 or 1
# optimizer is adam or rsmprop, anything will do 
# validation_split = 0.2 20% of the training data is used for validation

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics = ['acc'])

history = model.fit(train_data, train_labels, epochs = 10, validation_split= 0.2)

# while running the model or fitting you can observe the accuracy parameter slowly increasing as the 
# model is trained with more data!

# model is over fit to 98% and validation acc is fixed to 80%, meaning we dont have too much data
# WE need more data

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
results  = model.evaluate(test_data, test_labels)
print(results)

[0.5055547952651978, 0.8512399792671204]


## Making the prediction

In [9]:
word_index = imdb.get_word_index()

def encode_text(text):
  tokens = keras.preprocessing.text.text_to_word_sequence(text)
  tokens = [word_index[word] if word in word_index else 0 for word in tokens]
  return sequence.pad_sequences([tokens], MAX_LEN)[0]

text = "the movie was just amazing, so amazing"
encoded = encode_text(text)

print(text)
print('encoded text is ',encoded)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
the movie was just amazing, so amazing
encoded text is  [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  

In [36]:
# function for decoding back to imdb text

reverse_word_index = { value: key for (key, value) in  word_index.items()}

def decode_integers(integers):
  PAD = 0
  text = ""
  i = 0
  for n in integers:
    # print(n)
    if n != PAD:
      text += reverse_word_index[n] + " "
      # print(text)
  
  for num in integers:
    # print('inside for loop',num, i)
    i += 1
    if num != PAD:
      # text += reverse_word_index[num]+ " " 
      print("ok")

    return text

# print(encoded)
print(decode_integers(encoded))

the movie was just amazing so amazing 


In [38]:
decode_integers([0,1,2,3,4,5])

'the and a of to '

In [51]:
# make prediction

def predict(text):
  encoded_text = encode_text(text)
  pred = np.zeros((1,250))
  pred[0] = encoded_text
  result = model.predict(pred)
  print(result[0])


positive_review = "That movie was realy brilliant! I would love to watch it again as it was amazing"
predict(positive_review)

negative_review = "This was one of the  movies I have seen! will never watch it again in my life. it really sucked. was one of the worst things I have ever seen"
predict(negative_review)

[0.5192108]
[0.3667158]
