# Text Messages Classification using LSTM, Bi-LSTM, and GRU
- Author: Nuzulul Khairu Nissa
- Link: https://nzlul.medium.com/the-classification-of-text-messages-using-lstm-bi-lstm-and-gru-f79b207f90ad

## Import Package

In [1]:
# Load, explore and plot data
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Train test split
from sklearn.model_selection import train_test_split
# Text pre-processing

# from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

# Modeling
from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional




## Load Data

In [2]:
df = pd.read_csv('./data/sms_spam.csv', sep='\t', names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [4]:
df.groupby('label').describe().T

Unnamed: 0,label,ham,spam
message,count,4825,747
message,unique,4516,653
message,top,"Sorry, I'll call later",Please call our customer service representativ...
message,freq,30,4


## Text preprocessing

In [5]:
df['msg_type'] = df['label'].map({'ham':0, 'spam':1})

df_label = df['msg_type'].values
df.head()

Unnamed: 0,label,message,msg_type
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
x_train, x_test, y_train, y_test = train_test_split(df['message'], df_label, test_size=0.2, random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [6]:
df['message'].apply(lambda x: len(x)).describe()

count    5572.000000
mean       80.489950
std        59.942907
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: message, dtype: float64

In [8]:
# Tokenization
max_len = 50 
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' # out of vocabulary token
vocab_size = 500

In [9]:
tokenizer = Tokenizer(num_words = vocab_size, 
                      char_level = False,
                      oov_token = oov_tok)
tokenizer.fit_on_texts(x_train)

In [10]:
# Get the word_index
word_index = tokenizer.word_index
len(word_index)

7954

In [11]:
# Sequence and padding
training_sequences = tokenizer.texts_to_sequences(x_train)
training_padded = pad_sequences(training_sequences,
                                maxlen = max_len,
                                padding = padding_type,
                                truncating = trunc_type)

In [12]:
testing_sequences = tokenizer.texts_to_sequences(x_test)
testing_padded = pad_sequences(testing_sequences,
                               maxlen = max_len,
                               padding = padding_type,
                               truncating = trunc_type)

In [13]:
print('Shape of training tensor: ', training_padded.shape)
print('Shape of testing tensor: ', testing_padded.shape)

Shape of training tensor:  (4457, 50)
Shape of testing tensor:  (1115, 50)


## RNN

### Define the model architecture

In [14]:
# Define parameter
vocab_size = 500 
embedding_dim = 16
drop_value = 0.2
n_dense = 24

# Define Dense Model Architecture
model = Sequential()
model.add(Embedding(vocab_size,
                    embedding_dim,
                    input_length = max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dropout(drop_value))
model.add(Dense(1, activation='sigmoid'))




In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 16)            8000      
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dropout (Dropout)           (None, 24)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 8433 (32.94 KB)
Trainable params: 8433 (32.94 KB)
Non-trainable params: 0 (0.00 Byte)
______________________

In [16]:
model.compile(loss='binary_crossentropy', optimizer = 'adam' , metrics = ['accuracy'])




###  Train the model

In [17]:
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(training_padded,
                    y_train,
                    epochs=num_epochs, 
                    validation_data=(testing_padded, y_test),
                    callbacks =[early_stop],
                    verbose=2)

Epoch 1/30


140/140 - 1s - loss: 0.5012 - accuracy: 0.8654 - val_loss: 0.3368 - val_accuracy: 0.8664 - 807ms/epoch - 6ms/step
Epoch 2/30
140/140 - 0s - loss: 0.2965 - accuracy: 0.8658 - val_loss: 0.2424 - val_accuracy: 0.8664 - 186ms/epoch - 1ms/step
Epoch 3/30
140/140 - 0s - loss: 0.2025 - accuracy: 0.9085 - val_loss: 0.1662 - val_accuracy: 0.9462 - 188ms/epoch - 1ms/step
Epoch 4/30
140/140 - 0s - loss: 0.1410 - accuracy: 0.9598 - val_loss: 0.1156 - val_accuracy: 0.9614 - 187ms/epoch - 1ms/step
Epoch 5/30
140/140 - 0s - loss: 0.1047 - accuracy: 0.9702 - val_loss: 0.0835 - val_accuracy: 0.9758 - 186ms/epoch - 1ms/step
Epoch 6/30
140/140 - 0s - loss: 0.0782 - accuracy: 0.9749 - val_loss: 0.0686 - val_accuracy: 0.9839 - 185ms/epoch - 1ms/step
Epoch 7/30
140/140 - 0s - loss: 0.0670 - accuracy: 0.9809 - val_loss: 0.0568 - val_accuracy: 0.9848 - 189ms/epoch - 1ms/step
Epoch 8/30
140/140 - 0s - loss: 0.0588 - accuracy: 0.9814 - val_loss: 0.0518 - val_accuracy: 0.9883 - 187ms/epoch - 1ms/ste

### Evaluation

In [18]:
model.evaluate(testing_padded, y_test)



[0.04302380979061127, 0.9892376661300659]

In [19]:
train_dense_results = model.evaluate(training_padded, np.asarray(y_train), verbose=2, batch_size=256)
valid_dense_results = model.evaluate(testing_padded, np.asarray(y_test), verbose=2, batch_size=256)
print(f'Train accuracy: {train_dense_results[1]*100:0.2f}')
print(f'Valid accuracy: {valid_dense_results[1]*100:0.2f}')

18/18 - 0s - loss: 0.0263 - accuracy: 0.9906 - 48ms/epoch - 3ms/step
5/5 - 0s - loss: 0.0430 - accuracy: 0.9892 - 23ms/epoch - 5ms/step
Train accuracy: 99.06
Valid accuracy: 98.92


## LSTM

### Define the LSTM model architecture

In [20]:
# Define parameter
n_lstm = 128
drop_lstm = 0.2
# Define LSTM Model 
model1 = Sequential()
model1.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model1.add(SpatialDropout1D(drop_lstm))
model1.add(LSTM(n_lstm, return_sequences=False))
model1.add(Dropout(drop_lstm))
model1.add(Dense(1, activation='sigmoid'))

In [21]:
model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 16)            8000      
                                                                 
 spatial_dropout1d (Spatial  (None, 50, 16)            0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 128)               74240     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 1)                 129       
                                                                 
Total params: 82369 (321.75 KB)
Trainable params: 82369 (321.75 KB)
Non-trainable params: 0 (0.00 Byte)
________________

In [22]:
model1.compile(loss = 'binary_crossentropy',
               optimizer = 'adam',
               metrics = ['accuracy'])

In [23]:
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model1.fit(training_padded,
                     y_train,
                     epochs=num_epochs, 
                     validation_data=(testing_padded, y_test),
                     callbacks =[early_stop],
                     verbose=2)

Epoch 1/30
140/140 - 5s - loss: 0.2232 - accuracy: 0.9278 - val_loss: 0.2200 - val_accuracy: 0.8834 - 5s/epoch - 33ms/step
Epoch 2/30
140/140 - 3s - loss: 0.0943 - accuracy: 0.9713 - val_loss: 0.0763 - val_accuracy: 0.9812 - 3s/epoch - 21ms/step
Epoch 3/30
140/140 - 3s - loss: 0.0785 - accuracy: 0.9807 - val_loss: 0.0487 - val_accuracy: 0.9874 - 3s/epoch - 21ms/step
Epoch 4/30
140/140 - 3s - loss: 0.0762 - accuracy: 0.9809 - val_loss: 0.1655 - val_accuracy: 0.9552 - 3s/epoch - 21ms/step
Epoch 5/30
140/140 - 3s - loss: 0.0711 - accuracy: 0.9816 - val_loss: 0.0684 - val_accuracy: 0.9740 - 3s/epoch - 22ms/step


In [30]:
train_dense_results = model1.evaluate(training_padded, np.asarray(y_train), verbose=2, batch_size=256)
valid_dense_results = model1.evaluate(testing_padded, np.asarray(y_test), verbose=2, batch_size=256)
print(f'Train accuracy: {train_dense_results[1]*100:0.2f}')
print(f'Valid accuracy: {valid_dense_results[1]*100:0.2f}')

18/18 - 1s - loss: 0.0666 - accuracy: 0.9787 - 564ms/epoch - 31ms/step
5/5 - 0s - loss: 0.0684 - accuracy: 0.9740 - 166ms/epoch - 33ms/step
Train accuracy: 97.87
Valid accuracy: 97.40


## Bi-LSTM

In [25]:
model2 = Sequential()
model2.add(Embedding(vocab_size,
                     embedding_dim,
                     input_length = max_len))
model2.add(Bidirectional(LSTM(n_lstm,
                              return_sequences = False)))
model2.add(Dropout(drop_lstm))
model2.add(Dense(1, activation='sigmoid'))

In [26]:
model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 16)            8000      
                                                                 
 bidirectional (Bidirection  (None, 256)               148480    
 al)                                                             
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 257       
                                                                 
Total params: 156737 (612.25 KB)
Trainable params: 156737 (612.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
model2.compile(loss = 'binary_crossentropy',
               optimizer = 'adam',
               metrics=['accuracy'])

In [28]:
num_epochs = 30
early_stop = EarlyStopping(monitor = 'val_loss',
                           patience = 2)
history = model2.fit(training_padded,
                     y_train,
                     epochs = num_epochs,
                     validation_data = (testing_padded, y_test),
                     callbacks = [early_stop],
                     verbose = 2)

Epoch 1/30
140/140 - 5s - loss: 0.1965 - accuracy: 0.9329 - val_loss: 0.0774 - val_accuracy: 0.9776 - 5s/epoch - 36ms/step
Epoch 2/30
140/140 - 3s - loss: 0.0653 - accuracy: 0.9782 - val_loss: 0.0639 - val_accuracy: 0.9803 - 3s/epoch - 20ms/step
Epoch 3/30
140/140 - 3s - loss: 0.0559 - accuracy: 0.9838 - val_loss: 0.0476 - val_accuracy: 0.9874 - 3s/epoch - 19ms/step
Epoch 4/30
140/140 - 3s - loss: 0.0502 - accuracy: 0.9872 - val_loss: 0.0478 - val_accuracy: 0.9865 - 3s/epoch - 19ms/step
Epoch 5/30
140/140 - 3s - loss: 0.0519 - accuracy: 0.9850 - val_loss: 0.0415 - val_accuracy: 0.9892 - 3s/epoch - 20ms/step
Epoch 6/30
140/140 - 3s - loss: 0.0417 - accuracy: 0.9870 - val_loss: 0.0566 - val_accuracy: 0.9839 - 3s/epoch - 20ms/step
Epoch 7/30
140/140 - 3s - loss: 0.0560 - accuracy: 0.9825 - val_loss: 0.0514 - val_accuracy: 0.9874 - 3s/epoch - 20ms/step


In [31]:
train_dense_results = model2.evaluate(training_padded, np.asarray(y_train), verbose=2, batch_size=256)
valid_dense_results = model2.evaluate(testing_padded, np.asarray(y_test), verbose=2, batch_size=256)
print(f'Train accuracy: {train_dense_results[1]*100:0.2f}')
print(f'Valid accuracy: {valid_dense_results[1]*100:0.2f}')

18/18 - 1s - loss: 0.0333 - accuracy: 0.9921 - 551ms/epoch - 31ms/step
5/5 - 0s - loss: 0.0514 - accuracy: 0.9874 - 148ms/epoch - 30ms/step
Train accuracy: 99.21
Valid accuracy: 98.74


## GRU

In [32]:
model3 = Sequential()
model3.add(Embedding(vocab_size,
                     embedding_dim,
                     input_length = max_len))
model3.add(SpatialDropout1D(0.2))
model3.add(GRU(128, return_sequences = False))
model3.add(Dropout(0.2))
model3.add(Dense(1, activation = 'sigmoid'))

In [33]:
model3.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 50, 16)            8000      
                                                                 
 spatial_dropout1d_1 (Spati  (None, 50, 16)            0         
 alDropout1D)                                                    
                                                                 
 gru (GRU)                   (None, 128)               56064     
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 64193 (250.75 KB)
Trainable params: 64193 (250.75 KB)
Non-trainable params: 0 (0.00 Byte)
________________

In [34]:
model3.compile(loss = 'binary_crossentropy',
                       optimizer = 'adam',
                       metrics=['accuracy'])

In [35]:
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model3.fit(training_padded,
                     y_train,
                     epochs=num_epochs, 
                     validation_data=(testing_padded, y_test),
                     callbacks =[early_stop],
                     verbose=2)

Epoch 1/30
140/140 - 4s - loss: 0.4282 - accuracy: 0.8613 - val_loss: 0.3943 - val_accuracy: 0.8664 - 4s/epoch - 28ms/step
Epoch 2/30
140/140 - 2s - loss: 0.3977 - accuracy: 0.8658 - val_loss: 0.3969 - val_accuracy: 0.8664 - 2s/epoch - 17ms/step
Epoch 3/30
140/140 - 2s - loss: 0.3817 - accuracy: 0.8665 - val_loss: 0.1566 - val_accuracy: 0.9623 - 2s/epoch - 17ms/step
Epoch 4/30
140/140 - 2s - loss: 0.0865 - accuracy: 0.9769 - val_loss: 0.0459 - val_accuracy: 0.9919 - 2s/epoch - 17ms/step
Epoch 5/30
140/140 - 2s - loss: 0.0542 - accuracy: 0.9847 - val_loss: 0.0412 - val_accuracy: 0.9892 - 2s/epoch - 17ms/step
Epoch 6/30
140/140 - 2s - loss: 0.0454 - accuracy: 0.9868 - val_loss: 0.0371 - val_accuracy: 0.9910 - 2s/epoch - 16ms/step
Epoch 7/30
140/140 - 2s - loss: 0.0394 - accuracy: 0.9874 - val_loss: 0.0407 - val_accuracy: 0.9910 - 2s/epoch - 16ms/step
Epoch 8/30
140/140 - 2s - loss: 0.0330 - accuracy: 0.9917 - val_loss: 0.0361 - val_accuracy: 0.9910 - 2s/epoch - 16ms/step
Epoch 9/30
140/1

In [36]:
train_dense_results = model3.evaluate(training_padded, np.asarray(y_train), verbose=2, batch_size=256)
valid_dense_results = model3.evaluate(testing_padded, np.asarray(y_test), verbose=2, batch_size=256)
print(f'Train accuracy: {train_dense_results[1]*100:0.2f}')
print(f'Valid accuracy: {valid_dense_results[1]*100:0.2f}')

18/18 - 0s - loss: 0.0184 - accuracy: 0.9957 - 403ms/epoch - 22ms/step
5/5 - 0s - loss: 0.0462 - accuracy: 0.9901 - 113ms/epoch - 23ms/step
Train accuracy: 99.57
Valid accuracy: 99.01


## Comparing the four different models

In [37]:
# Comparing the four different models
print(f"Dense model loss and accuracy: {model.evaluate(testing_padded, y_test)} " )
print(f"LSTM model loss and accuracy: {model1.evaluate(testing_padded, y_test)} " )
print(f"Bi-LSTM model loss and accuracy: {model2.evaluate(testing_padded, y_test)} " )
print(f"GRU model loss and accuracy: {model3.evaluate(testing_padded, y_test)}")

Dense model loss and accuracy: [0.04302380979061127, 0.9892376661300659] 
LSTM model loss and accuracy: [0.06843268126249313, 0.9739910364151001] 
Bi-LSTM model loss and accuracy: [0.05136081948876381, 0.9874439239501953] 
GRU model loss and accuracy: [0.04619617387652397, 0.9901345372200012]


## Predict the Ham or Spam for the new messages

In [45]:
df.loc[df['msg_type']==1]['message'].values[:10]

array(["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
       'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
       'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030',
       'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info',
       'URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18',
       'XXXMobileMovieClub: To

In [47]:
predict_msg = [
  "Have friends and colleagues who could benefit from these weekly updates? Send them to this link to subscribe",
  "Call me",
  "SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info",
  "Only one chance to win CASH! From 100 to 20,000 pounds txt"
  ]

def predict_spam(predict_msg):
  new_seq = tokenizer.texts_to_sequences(predict_msg)
  padded = pad_sequences(new_seq,
                         maxlen = max_len,
                         padding = padding_type,
                         truncating = trunc_type)
  return(model.predict(padded))

predict_spam(predict_msg)



array([[0.32852173],
       [0.00537429],
       [0.9997755 ],
       [0.9550846 ]], dtype=float32)

In [48]:
df['msg_type'].value_counts()

msg_type
0    4825
1     747
Name: count, dtype: int64