# LSTM Model
1. Import required libraries
2. Load clean data from drive
3. Tokenize and padding
4. Compile model and evaluate with actual (imbalanced) data
5. Print Classification report for the base model
6. Upsample minotrity data
7. Build Model, Evaluate and print classification report
8. Conclusion

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report



In [2]:
path = '/content/drive/MyDrive/Guvi Files/Final Project2-V2/'
train_x = pd.read_pickle(path + 'data/train_tweets_clean.pkl')

In [3]:
train_x.shape

(29530, 5)

In [4]:
# Keep only the needed columns
train_x = train_x[['clean_tweet', 'label']]

In [5]:
train_x.head()

Unnamed: 0,clean_tweet,label
0,when a father is dysfunctional and is so sel...,0
1,thanks for lyft credit i cant use cause they...,0
2,bihday your majesty,0
3,model i love u take with u all the time in u...,0
4,factsguide society now motivation,0


In [6]:
# Tokenize and pad
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train_x['clean_tweet'].values)
X = tokenizer.texts_to_sequences(train_x['clean_tweet'].values)
X = pad_sequences(X)

Y = pd.get_dummies(train_x['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(23624, 33) (23624, 2)
(5906, 33) (5906, 2)


In [7]:
print(Y_train[0])

[ True False]


In [8]:
# Compile model
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())



None


In [9]:
batch_size = 50
model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 1)

Epoch 1/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 110ms/step - accuracy: 0.9289 - loss: 0.2486
Epoch 2/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 109ms/step - accuracy: 0.9539 - loss: 0.1337
Epoch 3/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 111ms/step - accuracy: 0.9586 - loss: 0.1156
Epoch 4/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 107ms/step - accuracy: 0.9612 - loss: 0.1015
Epoch 5/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 107ms/step - accuracy: 0.9651 - loss: 0.0962
Epoch 6/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 116ms/step - accuracy: 0.9693 - loss: 0.0822
Epoch 7/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 108ms/step - accuracy: 0.9744 - loss: 0.0701
Epoch 8/10
[1m473/473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 106ms/step - accuracy: 0.9729 - loss: 0.0724
Epoch 9/10
[1m4

<keras.src.callbacks.history.History at 0x792bd5d63bb0>

In [10]:
model.summary()

In [11]:
# Evaluate the model
Y_pred = model.predict(X_test,batch_size = batch_size)
#print classification report
print(classification_report(np.argmax(Y_test,axis=1), np.argmax(Y_pred,axis=1)))

[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      5498
           1       0.68      0.50      0.57       408

    accuracy                           0.95      5906
   macro avg       0.82      0.74      0.77      5906
weighted avg       0.94      0.95      0.95      5906



## Address Imbalance and try again

In [12]:
train_x['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,27517
1,2013


In [13]:
# Separate majority and minority classes
data_majority = train_x[train_x['label'] == 0]
data_minority = train_x[train_x['label'] == 1]

bias = data_minority.shape[0]/data_majority.shape[0]
# lets split train/test data first then
train = pd.concat([data_majority.sample(frac=0.8,random_state=200),
         data_minority.sample(frac=0.8,random_state=200)])
test = pd.concat([data_majority.drop(data_majority.sample(frac=0.8,random_state=200).index),
        data_minority.drop(data_minority.sample(frac=0.8,random_state=200).index)])

train = shuffle(train)
test = shuffle(test)

In [14]:
print(train['label'].value_counts())
print(test['label'].value_counts())
#

label
0    22014
1     1610
Name: count, dtype: int64
label
0    5503
1     403
Name: count, dtype: int64


In [15]:
# Separate majority and minority classes in training data for upsampling
data_majority = train_x[train_x['label'] == 0]
data_minority = train_x[train_x['label'] == 1]

print("majority class before upsample:",data_majority.shape)
print("minority class before upsample:",data_minority.shape)

# Upsample minority class
data_minority_upsampled = resample(data_minority,
                                 replace=True,     # sample with replacement
                                 n_samples= data_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])

# Display new class counts
print("After upsampling\n",data_upsampled.label.value_counts(),sep = "")

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(train_x['clean_tweet'].values) # training with whole data

X_train = tokenizer.texts_to_sequences(data_upsampled['clean_tweet'].values)
X_train = pad_sequences(X_train,maxlen=32)
Y_train = pd.get_dummies(data_upsampled['label']).values
print('x_train shape:',X_train.shape)
X_test = tokenizer.texts_to_sequences(test['clean_tweet'].values)
X_test = pad_sequences(X_test,maxlen=32)
Y_test = pd.get_dummies(test['label']).values
print("x_test shape", X_test.shape)

majority class before upsample: (27517, 2)
minority class before upsample: (2013, 2)
After upsampling
label
0    27517
1    27517
Name: count, dtype: int64
x_train shape: (55034, 32)
x_test shape (5906, 32)


In [16]:
# compile model for upsampled data
embed_dim = 128
lstm_out = 192

model_up = Sequential()
model_up.add(Embedding(max_fatures, embed_dim,input_length = X_train.shape[1]))
model_up.add(SpatialDropout1D(0.4))
model_up.add(LSTM(lstm_out, dropout=0.4, recurrent_dropout=0.4))
model_up.add(Dense(2,activation='softmax'))
model_up.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])



In [18]:
batch_size = 128
# also adding weights
class_weights = {0: 1 ,
                1: 1.6/bias }
model_up.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 1,
          class_weight=class_weights)

Epoch 1/15
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 181ms/step - accuracy: 0.9569 - loss: 0.2080
Epoch 2/15
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 188ms/step - accuracy: 0.9557 - loss: 0.2114
Epoch 3/15
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 179ms/step - accuracy: 0.9590 - loss: 0.1936
Epoch 4/15
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 178ms/step - accuracy: 0.9623 - loss: 0.1788
Epoch 5/15
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 184ms/step - accuracy: 0.9625 - loss: 0.1782
Epoch 6/15
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 179ms/step - accuracy: 0.9639 - loss: 0.1681
Epoch 7/15
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 183ms/step - accuracy: 0.9665 - loss: 0.1578
Epoch 8/15
[1m430/430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 180ms/step - accuracy: 0.9661 - loss: 0.1671
Epoch 9/15
[1m4

<keras.src.callbacks.history.History at 0x792bd112edd0>

In [19]:
# Evaluate the tuned model
Y_pred = model_up.predict(X_test,batch_size = batch_size)
#print classification report
print(classification_report(np.argmax(Y_test,axis=1), np.argmax(Y_pred,axis=1)))

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      5503
           1       0.68      1.00      0.81       403

    accuracy                           0.97      5906
   macro avg       0.84      0.98      0.90      5906
weighted avg       0.98      0.97      0.97      5906



In [22]:
# save the model as pickle file for perdiction
import pickle
with open(path + 'models/lstm_model.pkl', 'wb') as file:
    pickle.dump(model_up, file)