In [27]:
import numpy as np

In [28]:
docs = ['recurrent neural network #@!',
		'neural network @',
		'artificial neural',
		'connections between nodes',
		'can create a cycle',
		'allowing output',
		'some nodes to affect subsequent',
		'exhibit temporal',
		'dynamic behavior',
		'type of Neural Network',
    'affect subsequent']

In [29]:
# pip install tensorflow==2.13.1

### Above tensorflow 2.2, keras is available as wrapper over tf

In [30]:
# pip install keras==2.13.1

In [31]:
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)

2.17.0


In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer

### Performing tokenization

**tf.keras.layers.TextVectorization** is used now

Here any OOV token will be assigned \<UNK>

In [33]:
tokenizer = Tokenizer(oov_token="<UNK>")

In [34]:
tokenizer.fit_on_texts(docs)

In [35]:
tokenizer.word_counts

OrderedDict([('recurrent', 1),
             ('neural', 4),
             ('network', 3),
             ('artificial', 1),
             ('connections', 1),
             ('between', 1),
             ('nodes', 2),
             ('can', 1),
             ('create', 1),
             ('a', 1),
             ('cycle', 1),
             ('allowing', 1),
             ('output', 1),
             ('some', 1),
             ('to', 1),
             ('affect', 2),
             ('subsequent', 2),
             ('exhibit', 1),
             ('temporal', 1),
             ('dynamic', 1),
             ('behavior', 1),
             ('type', 1),
             ('of', 1)])

### 0 isn't used for index because it is used for padding

In [36]:
# Checking index for each word
tokenizer.word_index

{'<UNK>': 1,
 'neural': 2,
 'network': 3,
 'nodes': 4,
 'affect': 5,
 'subsequent': 6,
 'recurrent': 7,
 'artificial': 8,
 'connections': 9,
 'between': 10,
 'can': 11,
 'create': 12,
 'a': 13,
 'cycle': 14,
 'allowing': 15,
 'output': 16,
 'some': 17,
 'to': 18,
 'exhibit': 19,
 'temporal': 20,
 'dynamic': 21,
 'behavior': 22,
 'type': 23,
 'of': 24}

In [37]:
# Counting no. of sentences
tokenizer.document_count

11

In [38]:
# Converting each word to a token id
# Each word is replaced by its index in the generated vocubalary as its token id
sequences = tokenizer.texts_to_sequences(docs)
sequences

[[7, 2, 3],
 [2, 3],
 [8, 2],
 [9, 10, 4],
 [11, 12, 13, 14],
 [15, 16],
 [17, 4, 18, 5, 6],
 [19, 20],
 [21, 22],
 [23, 24, 2, 3],
 [5, 6]]

### Perform padding

In [39]:
# Makes every sentence length equal to maxlen
from keras.utils import pad_sequences

n_sequences = pad_sequences(sequences=sequences, maxlen=10, padding='post')
n_sequences

array([[ 7,  2,  3,  0,  0,  0,  0,  0,  0,  0],
       [ 2,  3,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 8,  2,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 9, 10,  4,  0,  0,  0,  0,  0,  0,  0],
       [11, 12, 13, 14,  0,  0,  0,  0,  0,  0],
       [15, 16,  0,  0,  0,  0,  0,  0,  0,  0],
       [17,  4, 18,  5,  6,  0,  0,  0,  0,  0],
       [19, 20,  0,  0,  0,  0,  0,  0,  0,  0],
       [21, 22,  0,  0,  0,  0,  0,  0,  0,  0],
       [23, 24,  2,  3,  0,  0,  0,  0,  0,  0],
       [ 5,  6,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

### Sentiment Analysis using RNN

In [4]:
from keras.datasets import imdb

data = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [41]:
print(np.array(data).shape, type(data), len(data))

(2, 2, 25000) <class 'tuple'> 2


In [42]:
print(f"Total number of movies = {len(data[0][0])}")

Total number of movies = 25000


### Loading in splitted format

In [43]:
(X_train, y_train), (X_test, y_test) = imdb.load_data()

In [44]:
print(type(X_train), type(y_train))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


X_train and X_test are 2D array having no. of rows=25000 and no. of columns vary. Since the reviews are of different lengths, the dataset is loaded as a 1D array of lists, not as a 2D matrix.

In [45]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [46]:
# Max number of words in a sentence
num_words = 0
for i in range(len(X_train)):
    num_words = max(num_words, len(X_train[i]))

num_words

2494

### Perform padding

In [47]:
from keras.utils import pad_sequences

X_train = pad_sequences(sequences=X_train, maxlen=num_words, padding='post')
X_test = pad_sequences(sequences=X_test, maxlen=num_words, padding='post')

### Build model
- for Seq 2 Seq data, we use **Sequential**
-----
##### Visualizing RNN model
- **return_sequences** return whether RNN should return full sequence of output for each timesteps(No. of words in a sentence) or only the output from last timestep

- If **True**, return shape = (None, #Timesteps, #units)
- If **False**, returnn shape = (None, #units)

where **None** refers to the batch size

---------
### Parameters of a RNN
- Weights for the input to the hidden state: These are learned weights that map the input data to the hidden state.
- Weights for the recurrent connections: These are the weights that connect the hidden state at time t to the hidden state at time t+1.
- Biases: Each unit has a bias term.
- The formula to calculate the total number of parameters in an RNN is: **RNN Parameters =(units×input size)+(units×units)+units**

Here:

units = 32
input size = 1 (since each time step has only 1 feature)

Substituting these values:

RNN Parameters=(32×1)+(32×32)+32 = 1088

In [48]:
# units refer to the number of neurons 1 one layer
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, Flatten

# X_train.shape => ((25000, 2494))

model = Sequential([
SimpleRNN(units=32, input_shape=(X_train.shape[1], 1), return_sequences=False),
Dense(units=1, activation='sigmoid')
])

model.summary()

In [49]:
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [50]:
# model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

In [51]:
# model.save('model.h5')
# model.save('model.pkl')

In [52]:
print(model.weights)
model.load_weights('model.h5')
print(model.weights)

[<KerasVariable shape=(1, 32), dtype=float32, path=sequential_1/simple_rnn_1/simple_rnn_cell/kernel>, <KerasVariable shape=(32, 32), dtype=float32, path=sequential_1/simple_rnn_1/simple_rnn_cell/recurrent_kernel>, <KerasVariable shape=(32,), dtype=float32, path=sequential_1/simple_rnn_1/simple_rnn_cell/bias>, <KerasVariable shape=(32, 1), dtype=float32, path=sequential_1/dense_1/kernel>, <KerasVariable shape=(1,), dtype=float32, path=sequential_1/dense_1/bias>]
[<KerasVariable shape=(1, 32), dtype=float32, path=sequential_1/simple_rnn_1/simple_rnn_cell/kernel>, <KerasVariable shape=(32, 32), dtype=float32, path=sequential_1/simple_rnn_1/simple_rnn_cell/recurrent_kernel>, <KerasVariable shape=(32,), dtype=float32, path=sequential_1/simple_rnn_1/simple_rnn_cell/bias>, <KerasVariable shape=(32, 1), dtype=float32, path=sequential_1/dense_1/kernel>, <KerasVariable shape=(1,), dtype=float32, path=sequential_1/dense_1/bias>]


In [53]:
# predictions = model.predict(X_test)
# predictions.shape

### Using Embedding layer of keras

In [54]:
from tensorflow.keras.utils import pad_sequences

tokenizer = Tokenizer(oov_token='UNK')
tokenizer.fit_on_texts(docs)
print(tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(docs)
print(sequences)

# Perform padding
sequences = pad_sequences(sequences, padding='post')
sequences

{'UNK': 1, 'neural': 2, 'network': 3, 'nodes': 4, 'affect': 5, 'subsequent': 6, 'recurrent': 7, 'artificial': 8, 'connections': 9, 'between': 10, 'can': 11, 'create': 12, 'a': 13, 'cycle': 14, 'allowing': 15, 'output': 16, 'some': 17, 'to': 18, 'exhibit': 19, 'temporal': 20, 'dynamic': 21, 'behavior': 22, 'type': 23, 'of': 24}
[[7, 2, 3], [2, 3], [8, 2], [9, 10, 4], [11, 12, 13, 14], [15, 16], [17, 4, 18, 5, 6], [19, 20], [21, 22], [23, 24, 2, 3], [5, 6]]


array([[ 7,  2,  3,  0,  0],
       [ 2,  3,  0,  0,  0],
       [ 8,  2,  0,  0,  0],
       [ 9, 10,  4,  0,  0],
       [11, 12, 13, 14,  0],
       [15, 16,  0,  0,  0],
       [17,  4, 18,  5,  6],
       [19, 20,  0,  0,  0],
       [21, 22,  0,  0,  0],
       [23, 24,  2,  3,  0],
       [ 5,  6,  0,  0,  0]], dtype=int32)

In [55]:
from keras.datasets import imdb
from keras.utils import pad_sequences

(X_train, y_train), (X_test, y_test) = imdb.load_data()

word_index = imdb.get_word_index()
vocab_size = len(word_index) + 1  # Add 1 for padding or any out-of-vocabulary words
maxlen=0

for x in X_train:
  maxlen = max(maxlen, len(x))
print(maxlen, vocab_size)
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

2494 88585


### Model Building

For **Embedding layer()**
- **input_dim** = The size of the vocabulary, i.e., the total number of unique words (or tokens) in the input data.
- **output_dim** =  The size of the dense vectors (embedding size) that the layer will learn for each word. It defines how many dimensions each word will be represented by.
- **input_length** = The length of input sequences (number of words per sentence)

In [56]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=2, input_length=maxlen))
model.add(SimpleRNN(units=32, return_sequences=False))
model.add(Dense(units=1, activation='sigmoid'))

model.summary()



In [57]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

In [58]:
from keras.datasets import imdb
from keras.utils import pad_sequences

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

X_train = pad_sequences(X_train, padding='post', maxlen=50)
X_test = pad_sequences(X_test, padding='post', maxlen=50)

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding

model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=2, input_length=50))
model.add(SimpleRNN(units=32, input_shape=(50, 1), return_sequences=False))
model.add(Dense(units=1, activation='sigmoid'))

print(model.summary())

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

None
Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 13ms/step - accuracy: 0.5188 - loss: 0.6903 - val_accuracy: 0.7630 - val_loss: 0.4925
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.7950 - loss: 0.4485 - val_accuracy: 0.8048 - val_loss: 0.4296
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.8592 - loss: 0.3372 - val_accuracy: 0.8072 - val_loss: 0.4423
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.8939 - loss: 0.2769 - val_accuracy: 0.8042 - val_loss: 0.4529
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.9061 - loss: 0.2397 - val_accuracy: 0.7918 - val_loss: 0.4718


<keras.src.callbacks.history.History at 0x79673b7d12d0>

In [59]:
sen = X_test[0][:50].reshape(1, 50)
if model.predict(sen)<0.5:
  print("Negative")
else:
  print("Positive")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 281ms/step
Negative


In [6]:
# units refer to the number of neurons 1 one layer
from keras import Sequential
from keras.utils import pad_sequences
from keras.layers import Dense, SimpleRNN, Embedding, Flatten
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

X_train = pad_sequences(X_train, padding='post', maxlen=50)
X_test = pad_sequences(X_test, padding='post', maxlen=50)
# X_train.shape => ((25000, 2494))

model = Sequential([
SimpleRNN(units=32, input_shape=(X_train.shape[1], 1), return_sequences=False),
Dense(units=1, activation='sigmoid')
])

model.summary()

  super().__init__(**kwargs)


In [13]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=2, input_length=50))
model.add(SimpleRNN(units=32, return_sequences=True))
model.add(SimpleRNN(units=35, return_sequences=False))
model.add(Dense(units=1, activation='sigmoid'))

print(model.summary())

None


In [14]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 31ms/step - accuracy: 0.5073 - loss: 0.6950 - val_accuracy: 0.6006 - val_loss: 0.6442
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.7421 - loss: 0.5158 - val_accuracy: 0.7962 - val_loss: 0.4368
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 30ms/step - accuracy: 0.8636 - loss: 0.3311 - val_accuracy: 0.8015 - val_loss: 0.4311
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 32ms/step - accuracy: 0.8960 - loss: 0.2670 - val_accuracy: 0.8054 - val_loss: 0.4812
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 30ms/step - accuracy: 0.9177 - loss: 0.2216 - val_accuracy: 0.7859 - val_loss: 0.5020


<keras.src.callbacks.history.History at 0x7e40da909360>