# Lecture 12 - Deep Learning (3) - RNN
## Building a spam filtering system with RNN

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split

%matplotlib inline

* Load the SMS Spam Collection Data Set from UCI Machine Learning Repository.

In [2]:
df_sms = pd.read_csv('SMS_Spam.tsv', sep='\t')

* Replace the label "ham" and "spam" as "0" and "1".

In [3]:
df_sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df_sms['label'] = df_sms['label'].replace(['ham', 'spam'], [0, 1])

  df_sms['label'] = df_sms['label'].replace(['ham', 'spam'], [0, 1])


In [5]:
df_sms

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


* Set a feature and target for classification.

In [6]:
x = df_sms['message']
y = df_sms['label']

* Here, we use the tokenization with Keras.

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)

In [9]:
sequences

[[49,
  471,
  4436,
  842,
  755,
  658,
  64,
  8,
  1327,
  88,
  123,
  351,
  1328,
  148,
  2997,
  1329,
  67,
  58,
  4437,
  144],
 [46, 336, 1499, 472, 6, 1940],
 [47,
  489,
  8,
  19,
  4,
  797,
  901,
  2,
  176,
  1941,
  1105,
  659,
  1942,
  2331,
  261,
  2332,
  71,
  1941,
  2,
  1943,
  2,
  337,
  489,
  555,
  960,
  73,
  391,
  174,
  660,
  392,
  2998],
 [6, 248, 150, 23, 382, 2999, 6, 139, 154, 57, 150],
 [1024, 1, 98, 108, 69, 490, 2, 961, 69, 1944, 221, 112, 473],
 [798,
  129,
  67,
  1690,
  145,
  109,
  158,
  1945,
  21,
  7,
  38,
  338,
  89,
  902,
  55,
  116,
  414,
  3,
  44,
  12,
  14,
  86,
  1946,
  46,
  365,
  960,
  4438,
  2,
  68,
  323,
  232,
  2,
  3000],
 [210, 11, 633, 9, 25, 55, 2, 383, 36, 10, 110, 718, 10, 55, 4439, 4440],
 [72,
  235,
  13,
  1204,
  2333,
  2334,
  1947,
  2335,
  2336,
  2337,
  799,
  118,
  109,
  609,
  72,
  13,
  1025,
  12,
  51,
  1691,
  843,
  393,
  2,
  1106,
  13,
  249,
  1025],
 [719,
  72,
  4

* Now, we have to assign the number of training and test data for sequences.

In [10]:
len(sequences)

5572

In [11]:
n_train = int(len(sequences)*0.8)
n_test = int(len(sequences) - n_train)

In [12]:
x_data = sequences
max_len = max(len(i) for i in x_data)

In [13]:
max_len

189

In [14]:
data = pad_sequences(x_data, maxlen=max_len)

In [15]:
data

array([[   0,    0,    0, ...,   58, 4437,  144],
       [   0,    0,    0, ...,  472,    6, 1940],
       [   0,    0,    0, ...,  660,  392, 2998],
       ...,
       [   0,    0,    0, ...,  107,  251, 9008],
       [   0,    0,    0, ...,  200,   12,   47],
       [   0,    0,    0, ...,    2,   61,  268]], dtype=int32)

* Split train and test sets.

In [16]:
x_train = data[:n_train]
y_train = y[:n_train]
x_test = data[n_train:]
y_test = y[n_train:]

* Now, let's construct simple RNN model to classification.

In [17]:
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
from tensorflow.keras.models import Sequential

In [18]:
word_size = len(tokenizer.word_index) + 1

In [19]:
word_size

9010

In [20]:
model = Sequential()
model.add(Embedding(word_size, 32))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=4, batch_size=64, validation_split=0.2)

Epoch 1/4
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 57ms/step - acc: 0.8543 - loss: 0.4455 - val_acc: 0.8599 - val_loss: 0.3586
Epoch 2/4
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - acc: 0.9144 - loss: 0.2315 - val_acc: 0.9731 - val_loss: 0.0836
Epoch 3/4
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - acc: 0.9847 - loss: 0.0587 - val_acc: 0.9821 - val_loss: 0.0597
Epoch 4/4
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - acc: 0.9933 - loss: 0.0277 - val_acc: 0.9832 - val_loss: 0.0546


* How much is the model accurate?

In [21]:
model.summary()

In [22]:
predict_x=(model.predict(x_test) > 0.5).astype("int64")

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step


In [25]:
predict_x

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [23]:
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

In [24]:
print(accuracy_score(y_test,predict_x))

0.9802690582959641


## Sentiment analysis of movie review data

* Here, we will use movie review of IMDB dataset (http://ai.stanford.edu/~amaas/data/sentiment/) using LSTM model.
* Keras has a built-in review data for that.

In [26]:
from tensorflow.keras.datasets import imdb

In [27]:
(x_train, y_train), (x_test, y_test) = imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


* Note that the review is saved as a sequence of integers.

In [28]:
x_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

* We can extract the original words.

In [29]:
word_to_index = imdb.get_word_index()
index_to_word={}
for key, value in word_to_index.items():
    index_to_word[value] = key

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [30]:
for i in range(1,21):
  print(index_to_word[i])

the
and
a
of
to
is
br
in
it
i
this
that
was
as
for
with
movie
but
film
on


* We can also find the (preprocessed) review data.

In [31]:
for i in range(10):
  print(' '.join([index_to_word[x] for x in x_train[i]]))

the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room titillate it so heart shows to years of every never going villaronga help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but pratfalls to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other tricky in of seen over landed for anyone of gilmore's br show's to whether from than out themselves history he name half some br of 'n odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but whe

In [32]:
y_train

array([1, 0, 0, ..., 0, 1, 0])

* Now, let's import all packages for training data

In [33]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [34]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

* In order to feed this data into our RNN, all input documents need to be the same length. We will limit the maximum review length to max_words by truncating longer reviews and padding shorter reviews with a null value (0).

In [35]:
x_train = pad_sequences(x_train, maxlen=500)
x_test = pad_sequences(x_test, maxlen=500)

* Building a LSTM model for sentiment analysis

In [36]:
model=Sequential()
model.add(Embedding(10000, 120, input_length=500))
model.add(LSTM(120))
model.add(Dense(1, activation='sigmoid'))



In [37]:
early_stop = EarlyStopping(monitor='val_loss', mode='min', patience=4)

In [38]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=64, callbacks=[early_stop])

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 33ms/step - acc: 0.7019 - loss: 0.5551 - val_acc: 0.7257 - val_loss: 0.5385
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 45ms/step - acc: 0.8381 - loss: 0.3766 - val_acc: 0.8220 - val_loss: 0.3894
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 32ms/step - acc: 0.9111 - loss: 0.2300 - val_acc: 0.8304 - val_loss: 0.3962
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 32ms/step - acc: 0.9261 - loss: 0.1992 - val_acc: 0.8562 - val_loss: 0.3493
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 33ms/step - acc: 0.9477 - loss: 0.1457 - val_acc: 0.8702 - val_loss: 0.3824
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 33ms/step - acc: 0.9573 - loss: 0.1178 - val_acc: 0.8659 - val_loss: 0.4043
Epoch 7/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0

<keras.src.callbacks.history.History at 0x7b369a1557f0>

In [39]:
predict_x=(model.predict(x_test) > 0.5).astype("int64")
print(accuracy_score(y_test,predict_x))

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step
0.8558


## News Classification

* Here, we will use Reuters news dataset from Keras for classification using LSTM.

In [40]:
from tensorflow.keras.datasets import reuters
from tensorflow.keras.utils import to_categorical

In [52]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=1000, test_split=0.2)

In [53]:
x_train = pad_sequences(x_train, maxlen=100)
x_test = pad_sequences(x_test, maxlen=100)

In [54]:
y_train

array([ 3,  4,  3, ..., 25,  3, 25])

In [55]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [56]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [46]:
model=Sequential()
model.add(Embedding(1000, 128, input_length=100))
model.add(LSTM(128))
model.add(Dense(46, activation='softmax'))



In [47]:
early_stop = EarlyStopping(monitor='val_loss', mode='min', patience=4)

In [48]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=30, batch_size=128, callbacks=[early_stop])

Epoch 1/30
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - acc: 0.3264 - loss: 2.9085 - val_acc: 0.4261 - val_loss: 2.3422
Epoch 2/30
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - acc: 0.4746 - loss: 2.1216 - val_acc: 0.4849 - val_loss: 1.9262
Epoch 3/30
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - acc: 0.5277 - loss: 1.8087 - val_acc: 0.5361 - val_loss: 1.7753
Epoch 4/30
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - acc: 0.5488 - loss: 1.7216 - val_acc: 0.5521 - val_loss: 1.7384
Epoch 5/30
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - acc: 0.5794 - loss: 1.6530 - val_acc: 0.5953 - val_loss: 1.6424
Epoch 6/30
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - acc: 0.6020 - loss: 1.5809 - val_acc: 0.6224 - val_loss: 1.5567
Epoch 7/30
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - acc: 0.

<keras.src.callbacks.history.History at 0x7b37087e9af0>

In [49]:
predict_x=(model.predict(x_test) > 0.5).astype("int64")
print(accuracy_score(y_test,predict_x))

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
0.6731967943009796


## References
* https://towardsdatascience.com/a-beginners-guide-on-sentiment-analysis-with-rnn-9e100627c02e
* https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/