In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
%matplotlib inline

In [2]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, LSTM,Embedding
from tensorflow.python.keras.optimizers import Adam 
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [3]:
import imdb

In [4]:
# Directory (relative to the working directory) that the local `imdb` helper
# module is told to use for the dataset files.
imdb.data_dir = "data/IMDB/"

In [5]:
# Fetch and unpack the dataset through the helper module; the recorded output
# shows it reporting download progress and then extracting the files.
# NOTE(review): presumably a no-op when the data already exists — confirm in imdb.py.
imdb.maybe_download_and_extract()

- Download progress: 100.0%
Download finished. Extracting files.
Done.


In [7]:
# Load the review texts and their labels for the training and the test split.
x_train_text, y_train = imdb.load_data(train=True)
x_test_text, y_test = imdb.load_data(train=False)

In [9]:
# Report how many reviews each split contains.
for _label, _split in (('Train-set size: ', x_train_text),
                       ('Test-set size: ', x_test_text)):
    print(_label, len(_split))

Train-set size:  25000
Test-set size:  25000


In [10]:
# Training texts followed by test texts; this combined collection is what the
# tokenizer's vocabulary is fitted on further down.
data_text = x_train_text + x_test_text

In [13]:
# Show one training review, then its sentiment: a label of 1.0 is reported as
# positive, anything else as negative.
print(x_train_text[1])
print('\n\nPositive review' if y_train[1] == 1.0 else '\n\nNegative review')

Finally! An Iranian film that is not made by Majidi, Kiarostami or the Makhmalbafs. This is a non-documentary, an entertaining black comedy with subversive young girls subtly kicking the 'system' in its ass. It's all about football and its funny, its really funny. The director says "The places are real, the event is real, and so are the characters and the extras. This is why I purposely chose not to use professional actors, as their presence would have introduced a notion of falseness." The non-actors will have you rooting for them straightaway unless a. your heart is made of stone b. you are blind. Excellently scripted, the film challenges patriarchal authority with an almost absurd freshness. It has won the Jury Grand Prize, Berlin, 2006. Dear reader, it's near-perfect. WHERE, where can I get hold of it?


Positive review


In [3]:
# Vocabulary-size setting handed to the Keras Tokenizer; the same value is
# reused later as the Embedding layer's input_dim.
num_words = 10000
tokenizer = Tokenizer(num_words = num_words)

In [15]:
%%time
# Build the tokenizer's vocabulary from the combined train + test texts
# (timed because this pass over all reviews takes several seconds).
tokenizer.fit_on_texts(data_text)

CPU times: user 9.47 s, sys: 0 ns, total: 9.47 s
Wall time: 9.48 s


In [16]:
# Inspect the word -> integer index mapping learned by the tokenizer.
# NOTE(review): this displays the whole dictionary; consider showing only a slice.
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'one': 27,
 'he': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'so': 34,
 'who': 35,
 'from': 36,
 'like': 37,
 'or': 38,
 'just': 39,
 'her': 40,
 'out': 41,
 'about': 42,
 'if': 43,
 "it's": 44,
 'has': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'my': 56,
 'even': 57,
 'would': 58,
 'she': 59,
 'which': 60,
 'only': 61,
 'really': 62,
 'see': 63,
 'story': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'me': 68,
 'well': 69,
 'were': 70,
 'than': 71,
 'much': 72,
 'we': 73,
 'bad': 74,
 'been': 75,
 'get': 76,
 'do': 77,
 'great': 78,
 'other': 79,
 'will': 80,
 'also': 81,
 'into': 82,
 'p

In [18]:
# Convert every training review into a sequence of integer tokens.
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)

In [19]:
# Raw text of the example review, for comparison with its token sequence.
x_train_text[1]

'Finally! An Iranian film that is not made by Majidi, Kiarostami or the Makhmalbafs. This is a non-documentary, an entertaining black comedy with subversive young girls subtly kicking the \'system\' in its ass. It\'s all about football and its funny, its really funny. The director says "The places are real, the event is real, and so are the characters and the extras. This is why I purposely chose not to use professional actors, as their presence would have introduced a notion of falseness." The non-actors will have you rooting for them straightaway unless a. your heart is made of stone b. you are blind. Excellently scripted, the film challenges patriarchal authority with an almost absurd freshness. It has won the Jury Grand Prize, Berlin, 2006. Dear reader, it\'s near-perfect. WHERE, where can I get hold of it?'

In [20]:
# The same example review as integer tokens (wrapped in an array for compact display).
np.array(x_train_tokens[1])

array([ 415,   32, 9408,   19,   12,    6,   21,   90,   31,   38,    1,
         11,    6,    3,  684,  640,   32,  427,  317,  202,   16,  186,
        534, 5719, 4709,    1,    8,   92, 1906,   44,   29,   42, 2095,
          2,   92,  152,   92,   62,  152,    1,  164,  546,    1, 1363,
         23,  144,    1, 1560,    6,  144,    2,   34,   23,    1,  102,
          2,    1, 2305,   11,    6,  134,   10, 2508,   21,    5,  356,
       1644,  150,   14,   65, 1308,   58,   25, 1705,    3, 4212,    4,
          1,  684,  150,   80,   25,   22, 6229,   15,   93,  886,    3,
        125,  488,    6,   90,    4, 1861,  492,   22,   23, 1998, 7154,
       3677,    1,   19, 5510, 4679,   16,   32,  216, 1874,    9,   45,
       1274,    1, 6302, 1767, 4515, 5103, 3184, 2997, 5370,   44,  734,
        399,  117,  117,   67,   10,   76, 1074,    4,    9])

In [22]:
# Convert every test review into a sequence of integer tokens with the same tokenizer.
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)

In [24]:
# Token count of every review (training sequences followed by test sequences),
# stored as a NumPy array so statistics can be computed on it.
num_tokens = np.array([len(seq) for seq in x_train_tokens + x_test_tokens])

In [25]:
# Average number of tokens per review across both splits.
np.mean(num_tokens)

221.27716000000001

In [26]:
# Length of the longest review, in tokens.
np.max(num_tokens)

2208

In [27]:
# Sequence-length cap: mean review length plus two standard deviations,
# truncated to an integer.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
max_tokens

544

In [28]:
# Fraction of reviews that are strictly shorter than the length cap
# (the mean of a boolean mask is the share of True entries).
np.mean(num_tokens < max_tokens)

0.94533999999999996

In [29]:
# The same mode is used for both padding and truncating the sequences.
pad = 'pre'

# Bring every training sequence to exactly max_tokens entries.
x_train_pad = pad_sequences(
    x_train_tokens,
    maxlen=max_tokens,
    padding=pad,
    truncating=pad,
)

# Apply the identical treatment to the test sequences.
x_test_pad = pad_sequences(
    x_test_tokens,
    maxlen=max_tokens,
    padding=pad,
    truncating=pad,
)

In [30]:
# Padded version of the example review; the recorded output starts with zeros.
x_train_pad[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [None]:
# Inverse lookup table (integer token -> word), used to turn a list of tokens
# back into text.
idx = tokenizer.word_index
# zip needs the values and the keys as two separate arguments; both views come
# from the same dict, so they iterate in matching order.
inverse_map = dict(zip(idx.values(), idx.keys()))

In [5]:
# Start an empty Sequential model; layers are appended one at a time below.
model = Sequential()

In [6]:
# Length of each word-embedding vector; matches the Embedding layer's output_dim of 8.
embedding_size = 8

In [7]:
# First layer: map each integer token to a trainable vector.
# The sizes come from the notebook's own settings rather than literals, so the
# layer stays in sync if embedding_size or the padding length (max_tokens) changes.
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))

In [8]:
# Three stacked recurrent layers with shrinking width (16 -> 8 -> 4 units).
# The first two emit their full output sequence so the next LSTM can consume it.
model.add(LSTM(16, return_sequences=True))
model.add(LSTM(8, return_sequences=True))
# The last LSTM keeps the default and returns only its final output.
model.add(LSTM(4))

In [9]:
# Output layer: a single sigmoid unit, giving one value between 0 and 1 per review.
model.add(Dense(1,activation = 'sigmoid'))

In [10]:
# Adam optimizer with a learning rate of 1e-3. The rate is the first positional
# parameter, so passing it positionally avoids depending on the keyword name
# (`lr` in older Keras, `learning_rate` in newer releases).
optimizer = Adam(1e-3)

In [11]:
# Configure training: binary cross-entropy loss, the Adam optimizer created
# above, and accuracy reported as an additional metric.
model.compile(loss='binary_crossentropy',
              optimizer = optimizer,
              metrics = ['accuracy'])

In [12]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 544, 8)            80000     
_________________________________________________________________
lstm (LSTM)                  (None, 544, 16)           1600      
_________________________________________________________________
lstm_1 (LSTM)                (None, 544, 8)            800       
_________________________________________________________________
lstm_2 (LSTM)                (None, 4)                 208       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 82,613
Trainable params: 82,613
Non-trainable params: 0
_________________________________________________________________


In [44]:
# Train for 3 epochs in batches of 64, holding out 5% of the training data for
# validation (23750 training / 1250 validation samples per the recorded output).
# NOTE(review): no random seed is set anywhere visible, so results may vary between runs.
model.fit(x_train_pad,y_train,validation_split = 0.05,epochs = 3,batch_size= 64)

Train on 23750 samples, validate on 1250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f07cdc7f2e8>

In [47]:
# Evaluate the trained model on the padded test set.
results = model.evaluate(x_test_pad,y_test)



In [49]:
# results[1] is printed as the accuracy, formatted as a percentage.
# NOTE(review): presumably index 0 holds the loss and index 1 the 'accuracy' metric — confirm.
print("Accuracy: {0:.2%}".format(results[1]))

Accuracy: 86.00%


In [50]:
# Hand-written example sentences used to try out the trained model.
texts = [
    "product is very good",
    "awesome service",
    "every thing is just working fine but its too costly",
    "this service sucks",
    "not a good sevice",
]
# Keep the individual names available as well.
txt1, txt2, txt3, txt4, txt5 = texts


In [51]:
# Run the example sentences through the same tokenising and padding steps
# as the training data.
tokens = tokenizer.texts_to_sequences(texts)
tokens_pad = pad_sequences(
    tokens,
    maxlen=max_tokens,
    padding=pad,
    truncating=pad,
)
# One row per sentence, max_tokens columns.
tokens_pad.shape

(5, 544)

In [52]:
# Model output for each example sentence: one sigmoid value per row.
model.predict(tokens_pad)

array([[ 0.5218007 ],
       [ 0.52453828],
       [ 0.41996172],
       [ 0.10146295],
       [ 0.39317515]], dtype=float32)