<a href="https://colab.research.google.com/github/venkat2ram/Keras-and-Tensorflow/blob/master/uda_L10_NLP_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np


In [10]:
path=keras.utils.get_file('reviews.csv','https://drive.google.com/uc?id=13ySLC_ue6Umt9RJYSeM2t-V0kCv-4C-P')
dataset=pd.read_csv(path)
sentences=dataset['text']
labels=dataset['sentiment']

In [16]:
import tensorflow_datasets as tfds
vocab_size=1000
tokenizer=tfds.features.text.SubwordTextEncoder.build_from_corpus(sentences, vocab_size, max_subword_length=5)

In [21]:
word=sentences[5]
print(word)
enc=tokenizer.encode(sentences[5])
print(enc)

I have to jiggle the plug to get it to line up right to get decent volume.
[4, 31, 6, 849, 162, 450, 12, 1, 600, 438, 775, 6, 175, 14, 6, 55, 213, 159, 474, 775, 6, 175, 614, 380, 295, 148, 72, 789]


In [27]:
for i in enc:
  print(tokenizer.decode([i]))

I 
have 
to 
j
ig
gl
e 
the 
pl
ug
 
to 
get 
it 
to 
li
ne 
up 
right
 
to 
get 
dec
ent 
vo
lu
me
.


In [28]:
for i, sentence in enumerate(sentences):
  sentences[i] = tokenizer.encode(sentence)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [32]:
sentences[5]

[4,
 31,
 6,
 849,
 162,
 450,
 12,
 1,
 600,
 438,
 775,
 6,
 175,
 14,
 6,
 55,
 213,
 159,
 474,
 775,
 6,
 175,
 614,
 380,
 295,
 148,
 72,
 789]

In [41]:
train_size=int(sentences.size*0.8)
sequence_size=50
padding_type='post'
truncating_type='post'

train_sentences=sentences[:train_size]
test_sentences=sentences[train_size:]

padded_train_sentences=pad_sequences(train_sentences,maxlen=sequence_size,padding=padding_type, truncating=truncating_type)
padded_test_sentences=pad_sequences(test_sentences,maxlen=sequence_size,padding=padding_type, truncating=truncating_type)

train_labels=labels[:train_size]
test_labels=labels[train_size:]

train_labels_final=np.array(train_labels)
test_labels_final=np.array(test_labels)

In [42]:
padded_test_sentences[5]

array([ 54,   9, 179,  60, 511, 789,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

# Model using Embedding

In [45]:
embedding_dim=16

model=tf.keras.models.Sequential(
    [tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=sequence_size),
     tf.keras.layers.Dense(6,activation='relu'),
     tf.keras.layers.Dense(1,activation='sigmoid')]
)

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [47]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 16)            16000     
_________________________________________________________________
dense_4 (Dense)              (None, 50, 6)             102       
_________________________________________________________________
dense_5 (Dense)              (None, 50, 1)             7         
Total params: 16,109
Trainable params: 16,109
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(padded_train_sentences,train_labels_final,epochs=100,validation_data=(padded_test_sentences,test_labels_final))

In [50]:
padded_test_sentences

array([[211, 284, 646, ...,   0,   0,   0],
       [413, 233,  31, ...,   0,   0,   0],
       [625, 633, 148, ...,   0,   0,   0],
       ...,
       [822, 500,  37, ...,   0,   0,   0],
       [ 13, 219, 147, ...,   0,   0,   0],
       [291,  38,  61, ..., 789,   0,   0]], dtype=int32)

Model using LSTM

In [57]:
model1=tf.keras.models.Sequential(
    [tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=sequence_size),
     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
     tf.keras.layers.Dense(6,activation='relu'),
     tf.keras.layers.Dense(1,activation='sigmoid')]
)
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [58]:
model1.fit(padded_train_sentences,train_labels_final,epochs=15,validation_data=(padded_test_sentences,test_labels_final))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fbe00871be0>

# Model with two LSTM layers

In [60]:
model2=tf.keras.models.Sequential(
    [tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=sequence_size),
     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim,return_sequences=True)),
     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
     tf.keras.layers.Dense(6,activation='relu'),
     tf.keras.layers.Dense(1,activation='sigmoid')]
)
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [61]:
model2.fit(padded_train_sentences,train_labels_final,epochs=15,validation_data=(padded_test_sentences,test_labels_final))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fbe0316dd68>