# Natural Language Processing

### RNNs to
- Sentiment Analysis
- Character/Text Generation

<-> Recurrent Neural Network other than (-) CNN or Dense() is that it contains an internal loop, doesnot process the entire data at once, process it at different steps and maintains a memory (internal state) and uses it to put new input. Next word based on the previous word.

### Sentiment Analysis

In [3]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import tensorflow as tf
import os
import numpy as np

VOCAB_SIZE = 88584

MAXLEN = 250
BATCH_SIZE = 64

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [5]:
len(train_data[0])

218

### More Preprocessing

In [6]:
train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)

Make all same length

### Creating the Model

In [9]:
model = tf.keras.Sequential([ \
                            tf.keras.layers.Embedding(VOCAB_SIZE, 32),
                            tf.keras.layers.LSTM(32),
                            tf.keras.layers.Dense(1, activation="sigmoid")])

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          2834688   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


### Training

In [11]:
model.compile(loss = "binary_crossentropy", optimizer="rmsprop",metrics=['acc'])

history = model.fit(train_data, train_labels, epochs = 10, validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
results = model.evaluate(test_data, test_labels)
print(results)

[0.48671820759773254, 0.8568000197410583]


### Making Predictions

In [16]:
word_index = imdb.get_word_index()

def encode_text(text):
    tokens = tf.keras.preprocessing.text.text_to_word_sequence(text)
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    return sequence.pad_sequences([tokens], MAXLEN)[0]

text = "that movie was just amazing, so amazing"
encode = encode_text(text)
print(encode)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  12  17  13  4

In [19]:
### the decode function
reverse_word_index = {value: key for (key,value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + " "
    return text[:-1]
print(decode_integers(encode))

that movie was just amazing so amazing


### The prediction

In [24]:
def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1,250))
    pred[0] = encoded_text
    result = model.predict(pred)
    print(result[0])
    
positive_rev = "That movie was good and great! I really loreverse_word_indexnd watch it again because it was amazingly great"
predict(positive_rev)
negative_rev = "That movie sucked. I hated it and would not watch it again. Was one of the worst things I've ever watched"
predict(negative_rev)

[0.5558649]
[0.564133]


In [25]:
## STRANGE OUTPUTS... NEED TO REVISIT