#  Part 1:  Recurrent Neural Network

###  Importing packages

In [None]:
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
#from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.datasets import imdb

from keras.utils import to_categorical

import warnings
# Silence every Python warning for the rest of the session; note that this also
# hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')
import ssl
# NOTE(review): this swaps the factory used for default HTTPS contexts for the
# unverified one, so HTTPS requests made through it skip certificate validation.
# Presumably a workaround for certificate errors while downloading the datasets --
# confirm it is still needed before reusing this notebook elsewhere.
ssl._create_default_https_context = ssl._create_unverified_context

### Preparing Dataset

In [None]:
# Hyper-parameters shared by the IMDB experiments.
max_features = 1000  # vocabulary size: only the most frequent words keep their own id
maxlen = 80          # every review is brought to exactly this many tokens
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
# pad_sequences pads and truncates at the front by default, so reviews longer
# than `maxlen` keep their last `maxlen` tokens.
x_train, x_test = (pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test))
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)


In [None]:
# Peek at the first training review after padding/truncation (a vector of word ids).
x_train[0]

array([ 15, 256,   4,   2,   7,   2,   5, 723,  36,  71,  43, 530, 476,
        26, 400, 317,  46,   7,   4,   2,   2,  13, 104,  88,   4, 381,
        15, 297,  98,  32,   2,  56,  26, 141,   6, 194,   2,  18,   4,
       226,  22,  21, 134, 476,  26, 480,   5, 144,  30,   2,  18,  51,
        36,  28, 224,  92,  25, 104,   4, 226,  65,  16,  38,   2,  88,
        12,  16, 283,   5,  16,   2, 113, 103,  32,  15,  16,   2,  19,
       178,  32], dtype=int32)

### Visualize the data

In [None]:
INDEX_FROM = 3   # word index offset (matches the default `index_from` of imdb.load_data)

# Shift the raw word ranks by the offset and register the reserved ids 0-2.
word_to_id = {
    **{word: rank + INDEX_FROM for word, rank in imdb.get_word_index().items()},
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,
}

# Reverse lookup (id -> token), used to turn an encoded review back into text.
id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))
print(' '.join(id_to_word[token_id] for token_id in x_train[10]))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
a lot of <UNK> <UNK> the <UNK> plot the characters are all very interesting in their own way and the fact that the book itself almost takes on its own character is very well done anyone <UNK> that the film won't <UNK> by the end won't be disappointed either as the ending both makes sense and <UNK> to be quite <UNK> overall <UNK> is a truly great horror film and one of the best of the <UNK> highly <UNK> viewing


### Building a Model

In [None]:
# The Embedding layer converts each word into a fixed-length vector of a defined size.
# The resulting vector is dense, holding real values instead of just 0s and 1s.
# Fixed-length word vectors let us represent words more effectively
# and with fewer dimensions.

# input_dim : Size of the vocabulary
# output_dim : Length of the vector for each word

In [None]:
print('Build model...')
# Embedding (8-dim word vectors) -> LSTM (16 units, recurrent dropout) -> one
# sigmoid unit producing the positive-review probability.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=8),
    LSTM(units=16, recurrent_dropout=0.2),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Build model...




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 8)           8000      
                                                                 
 lstm (LSTM)                 (None, 16)                1600      
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 9617 (37.57 KB)
Trainable params: 9617 (37.57 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Model Training

In [None]:
# Binary target -> binary cross-entropy. Adam is only a starting point; other
# optimizers and optimizer settings are worth trying here.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# One pass over the training reviews; the test split doubles as validation data.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)



<keras.src.callbacks.History at 0x7f3f5c1e0ac0>

### Testing

In [None]:
# With one compiled metric, evaluate() yields the loss followed by the accuracy.
test_metrics = model.evaluate(x_test, y_test, batch_size=batch_size)
score, acc = test_metrics
print('Test accuracy:', acc)

Test accuracy: 0.8055999875068665


### Prediction

In [None]:
# Score one held-out review; a length-1 slice (rather than an index) keeps the
# leading batch axis on both the input and the label.
sample = slice(22220, 22221)
prediction = model.predict(x_test[sample])
print('Prediction value:', prediction[0])
print('Test Label:', y_test[sample])

Prediction value: [0.8761016]
Test Label: [1]


### Other RNN Layers

* keras.layers.RNN(cell, return_sequences=False)
* keras.layers.SimpleRNN(units, activation='tanh')
* keras.layers.GRU(units, activation='tanh', recurrent_activation='hard_sigmoid')
* keras.layers.ConvLSTM2D(filters, kernel_size, strides=(1, 1), padding='valid')
* keras.layers.SimpleRNNCell(units, activation='tanh')
* keras.layers.GRUCell(units, activation='tanh', recurrent_activation='hard_sigmoid')
* keras.layers.LSTMCell(units, activation='tanh', recurrent_activation='hard_sigmoid')
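* keras.layers.CuDNNGRU(units, kernel_initializer='glorot_uniform')
* keras.layers.CuDNNLSTM(units, kernel_initializer='glorot_uniform')

Note: `CuDNNGRU` and `CuDNNLSTM` are legacy layers. In TensorFlow 2.x they are only available under `tf.compat.v1.keras.layers`; the regular `GRU` and `LSTM` layers use the cuDNN kernel automatically on a GPU when their arguments are cuDNN-compatible.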

# Part 2: Recurrent Neural Network with Custom Dataset

In [None]:
# Credits to Peter Nagy

In [None]:
# NOTE(review): the recorded output of this cell shows the URL answering with
# 302/301 redirects to a visualstudio.microsoft.com page, so the file saved as
# Senti.csv may not be the dataset. Verify its contents (or supply Senti.csv
# manually) before running the cells below.
!wget "https://notebooks.azure.com/vipulmishra/projects/labgail/raw/Senti.csv"

--2023-10-05 05:53:09--  https://notebooks.azure.com/vipulmishra/projects/labgail/raw/Senti.csv
Resolving notebooks.azure.com (notebooks.azure.com)... 13.107.246.38, 13.107.213.38, 2620:1ec:bdf::38, ...
Connecting to notebooks.azure.com (notebooks.azure.com)|13.107.246.38|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://visualstudio.microsoft.com/vs/features/notebooks-at-microsoft [following]
--2023-10-05 05:53:09--  https://visualstudio.microsoft.com/vs/features/notebooks-at-microsoft
Resolving visualstudio.microsoft.com (visualstudio.microsoft.com)... 23.60.121.74
Connecting to visualstudio.microsoft.com (visualstudio.microsoft.com)|23.60.121.74|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://visualstudio.microsoft.com/vs/features/notebooks-at-microsoft/ [following]
--2023-10-05 05:53:09--  https://visualstudio.microsoft.com/vs/features/notebooks-at-microsoft/
Reusing existing connection to visua

### Load data

In [None]:
import pandas as pd

# Read the sentiment CSV and keep only the two necessary columns:
# the raw text and its sentiment label.
data = pd.read_csv('Senti.csv')[['text', 'sentiment']]

### Visualize data

In [None]:
# Show the first ten rows to sanity-check the text and sentiment columns.
data.head(10)

Unnamed: 0,text,sentiment
0,I love this car,Positive
1,This view is amazing,Positive
2,I feel great this morning,Positive
3,I am so excited about the concert,Positive
4,He is my best friend,Positive
5,I do not like this car,Negative
6,This view is horrible,Negative
7,I feel tired this morning,Negative
8,I am not looking forward to the concert,Negative
9,He is my enemy,Negative


### Format data

In [None]:
# Drop neutral rows. The explicit copy makes `data` an independent frame, so the
# column assignment below does not write into a slice of the original
# (which triggers pandas' SettingWithCopyWarning).
data = data[data.sentiment != "Neutral"].copy()

# Normalise the text:
#   1. lower-case everything;
#   2. remove every character that is not a letter, digit or whitespace. The
#      previous class `A-z` also let `[ \ ] ^ _` and the backtick through, and the
#      pattern is now a raw string so `\s` is not an invalid string escape;
#   3. blank out stand-alone "rt" tokens (presumably the retweet marker of the
#      original Twitter data -- confirm). The previous iterrows() loop only
#      modified per-row copies, so it never changed `data`, and a plain
#      replace('rt', ' ') would also have mangled words such as "concert".
data['text'] = (
    data['text']
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    .str.replace(r'\brt\b', ' ', regex=True)
)

# Vocabulary cap for the tokenizer. The (misspelled) name is kept as-is because
# it is also assigned elsewhere in the notebook.
max_fatures = 2000
# `num_words` is the current name of the deprecated `nb_words` argument.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)

# Encode every sentence as word ids and left-pad to the longest sentence.
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

### Training set

In [None]:
# One-hot encode the sentiment labels: one column per distinct label.
Y = pd.get_dummies(data['sentiment']).to_numpy()

# Hold out a third of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

for split_label, features, targets in (
    ('training', X_train, Y_train),
    ('testing', X_test, Y_test),
):
    print('Shape of %s samples:' % split_label, features.shape, targets.shape)

Shape of training samples: (6, 8) (6, 2)
Shape of testing samples: (4, 8) (4, 2)


### Design a model

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# The embedding table must have a row for every id the tokenizer can emit.
# The tokenizer for this dataset is capped at `max_fatures` words, whereas
# `max_features` is the (smaller) IMDB vocabulary size from Part 1; using the
# latter here leaves ids beyond its range without an embedding row.
model = Sequential()
model.add(Embedding(max_fatures, 128, input_length=X.shape[1]))
model.add(Dropout(0.2))  # regularise the word vectors before the recurrent layer
model.add(LSTM(128))
# Two softmax outputs, one per column of the one-hot encoded sentiment labels.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


### Training

In [None]:
batch_size = 32
# verbose=2 prints one summary line per epoch instead of a progress bar.
model.fit(x=X_train, y=Y_train, batch_size=batch_size, epochs=5, verbose=2)

Epoch 1/5
1/1 - 3s - loss: 0.6918 - accuracy: 0.6667 - 3s/epoch - 3s/step
Epoch 2/5
1/1 - 0s - loss: 0.6842 - accuracy: 0.8333 - 10ms/epoch - 10ms/step
Epoch 3/5
1/1 - 0s - loss: 0.6766 - accuracy: 0.6667 - 9ms/epoch - 9ms/step
Epoch 4/5
1/1 - 0s - loss: 0.6678 - accuracy: 0.8333 - 9ms/epoch - 9ms/step
Epoch 5/5
1/1 - 0s - loss: 0.6613 - accuracy: 0.8333 - 9ms/epoch - 9ms/step


<keras.src.callbacks.History at 0x7f3f63909630>

### Validation

In [None]:
# Loss ("score") and accuracy on the held-out split.
held_out_metrics = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=2)
score, acc = held_out_metrics
print("Score: %.2f" % (score))
print("Accuracy: %.2f" % (acc))

1/1 - 1s - loss: 0.7380 - accuracy: 0.0000e+00 - 952ms/epoch - 952ms/step
Score: 0.74
Accuracy: 0.00


### Formatting Test Example

In [None]:
# Wrap the sentence in a one-row frame so it is cleaned like the training text.
text = 'We are going to Delhi'
tester = pd.DataFrame({'text': [text]})

# Lower-case, then drop everything except letters, digits and whitespace.
# (`A-Z` replaces the former `A-z`, which also let `[ \ ] ^ _` and the backtick
# through; the raw string keeps `\s` from being an invalid string escape.)
tester['text'] = tester['text'].str.lower()
tester['text'] = tester['text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

# Encode with the tokenizer fitted on the training text, then pad or truncate
# to the training width. Letting pad_sequences do this always yields a
# (1, X.shape[1]) batch: the previous manual np.pad + np.array([test]) only
# produced the right shape for sentences shorter than the training width, gave a
# 3-D array otherwise, and never truncated longer sentences.
test = tokenizer.texts_to_sequences(tester['text'].values)
test = pad_sequences(test, maxlen=X.shape[1])

prediction = model.predict(test)
print('Prediction value:',prediction[0])

Prediction value: [0.54026747 0.45973253]


# Part 3: RNN Design Choices

## Influence of number of nodes

### LSTM with 8 nodes

In [None]:
# Node-count experiment, small variant: 8-dim embeddings feeding an 8-unit LSTM,
# with both dropout rates switched off.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=8),
    LSTM(units=8, dropout=0.0, recurrent_dropout=0.0),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)

# evaluate() yields the loss followed by the single compiled metric.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, None, 8)           8000      
                                                                 
 lstm_2 (LSTM)               (None, 8)                 544       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 8553 (33.41 KB)
Trainable params: 8553 (33.41 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Test score: 0.43627506494522095
Test accuracy: 0.8042399883270264


### LSTM with 16 nodes

In [None]:
# Write your code here
model = Sequential()
model.add(Embedding(max_features, 8))
model.add(LSTM(16, dropout=0.0, recurrent_dropout=0.0))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)
# Use the same layer design from the above cell

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 8)           8000      
                                                                 
 lstm_5 (LSTM)               (None, 16)                1600      
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 9617 (37.57 KB)
Trainable params: 9617 (37.57 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Test score: 0.4011673331260681
Test accuracy: 0.8136799931526184


## Influence of Embedding

In [None]:
# Embedding-size experiment: word vectors shrink to 4 dimensions while the LSTM
# keeps 16 units and no dropout.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=4),
    LSTM(units=16, dropout=0.0, recurrent_dropout=0.0),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)

# Report loss ("score") and accuracy on the test split.
test_metrics = model.evaluate(x_test, y_test, batch_size=batch_size)
score, acc = test_metrics
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 4)           4000      
                                                                 
 lstm_3 (LSTM)               (None, 16)                1344      
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 5361 (20.94 KB)
Trainable params: 5361 (20.94 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Test score: 0.4199399948120117
Test accuracy: 0.8073599934577942


## Influence of Dropout

### Dropout with probability 0.5

In [None]:
# Dropout experiment: 32-dim embeddings and an 8-unit LSTM with a rate of 0.5
# for both the input dropout and the recurrent dropout.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=32),
    LSTM(units=8, dropout=0.5, recurrent_dropout=0.5),
    Dense(units=1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)

# Report loss ("score") and accuracy on the test split.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)



Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 32)          32000     
                                                                 
 lstm_4 (LSTM)               (None, 8)                 1312      
                                                                 
 dense_4 (Dense)             (None, 1)                 9         
                                                                 
Total params: 33321 (130.16 KB)
Trainable params: 33321 (130.16 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Test score: 0.4355815649032593
Test accuracy: 0.8011599779129028


## Multilayered RNNs

### RNN with 3-layer LSTM

In [None]:
# Stacked recurrent model: three 8-unit LSTM layers. Every LSTM that feeds
# another LSTM returns its full output sequence; the last one returns only its
# final output, which goes to the sigmoid classifier.
model = Sequential(
    [Embedding(max_features, 8)]
    + [
        LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True)
        for _ in range(2)
    ]
    + [
        LSTM(8, dropout=0.0, recurrent_dropout=0.0),
        Dense(1, activation='sigmoid'),
    ]
)
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(x_test, y_test),
)

# Report loss ("score") and accuracy on the test split.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, None, 8)           8000      
                                                                 
 lstm_6 (LSTM)               (None, None, 8)           544       
                                                                 
 lstm_7 (LSTM)               (None, None, 8)           544       
                                                                 
 lstm_8 (LSTM)               (None, 8)                 544       
                                                                 
 dense_6 (Dense)             (None, 1)                 9         
                                                                 
Total params: 9641 (37.66 KB)
Trainable params: 9641 (37.66 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Test score: 0.430943