# LSTM Model
1. Import required libraries
2. Load clean data from drive
3. Tokenize and padding
4. Compile model and evaluate with actual (imbalanced) data
5. Print Classification report for the base model
6. Upsample minotrity data
7. Build Model, Evaluate and print classification report
8. Conclusion

In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report



In [3]:
path = '/content/drive/MyDrive/Guvi Files/Final Project2-V2/'
train_x = pd.read_pickle(path + 'data/train_tweets_clean.pkl')

In [4]:
train_x.shape

(29530, 5)

In [5]:
# Keep only the needed columns
train_x = train_x[['clean_tweet', 'label']]

In [6]:
train_x.head()

Unnamed: 0,clean_tweet,label
0,when a father is dysfunctional and is so sel...,0
1,thanks for lyft credit i cant use cause they...,0
2,bihday your majesty,0
3,model i love u take with u all the time in u...,0
4,factsguide society now motivation,0


In [21]:
from keras.utils import to_categorical

# Tokenize and pad
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_x['clean_tweet'].values)
max_features = len(tokenizer.word_index) + 1
X = tokenizer.texts_to_sequences(train_x['clean_tweet'].values)
X = pad_sequences(X)

Y = to_categorical(train_x['label'].values)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(23624, 33) (23624, 2)
(5906, 33) (5906, 2)


In [22]:
print(Y_train[0])

[1. 0.]


In [23]:
# Compile model
from keras.optimizers import Adam
from keras import regularizers
opt = Adam(learning_rate=0.01)

model = Sequential()
model.add(Embedding(max_features, 16, input_length=X_train.shape[1], mask_zero=True))
model.add(LSTM(12, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(6, kernel_regularizer=regularizers.l1_l2(0.3)))
model.add(Dropout(0.9))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
print(model.summary())



None


In [24]:
#import Earlystopping
from keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='val_loss', mode='min',patience=5)
history_lstm = model.fit(X_train, Y_train, epochs=120,batch_size=300,validation_data=(X_test,Y_test),shuffle=False, callbacks=[es])


Epoch 1/120
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 56ms/step - accuracy: 0.9181 - loss: 4.4806 - val_accuracy: 0.9309 - val_loss: 0.3655
Epoch 2/120
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 52ms/step - accuracy: 0.9313 - loss: 0.3991 - val_accuracy: 0.9309 - val_loss: 0.3029
Epoch 3/120
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - accuracy: 0.9313 - loss: 0.3289 - val_accuracy: 0.9309 - val_loss: 0.2909
Epoch 4/120
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.9313 - loss: 0.3007 - val_accuracy: 0.9309 - val_loss: 0.2854
Epoch 5/120
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 52ms/step - accuracy: 0.9313 - loss: 0.2885 - val_accuracy: 0.9309 - val_loss: 0.2751
Epoch 6/120
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.9313 - loss: 0.2816 - val_accuracy: 0.9309 - val_loss: 0.2780
Epoch 7/120
[1m79/79[0m [

In [25]:
model.summary()

In [26]:
# Evaluate the model
Y_pred = model.predict(X_test)
#print classification report
print(classification_report(np.argmax(Y_test,axis=1), np.argmax(Y_pred,axis=1)))

[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      5498
           1       0.00      0.00      0.00       408

    accuracy                           0.93      5906
   macro avg       0.47      0.50      0.48      5906
weighted avg       0.87      0.93      0.90      5906



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Address Imbalance and try again

In [27]:
train_x['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,27517
1,2013


In [28]:
# Separate majority and minority classes
data_majority = train_x[train_x['label'] == 0]
data_minority = train_x[train_x['label'] == 1]

bias = data_minority.shape[0]/data_majority.shape[0]
# lets split train/test data first then
train = pd.concat([data_majority.sample(frac=0.8,random_state=200),
         data_minority.sample(frac=0.8,random_state=200)])
test = pd.concat([data_majority.drop(data_majority.sample(frac=0.8,random_state=200).index),
        data_minority.drop(data_minority.sample(frac=0.8,random_state=200).index)])

train = shuffle(train)
test = shuffle(test)

In [29]:
print(train['label'].value_counts())
print(test['label'].value_counts())
#

label
0    22014
1     1610
Name: count, dtype: int64
label
0    5503
1     403
Name: count, dtype: int64


In [31]:
# Separate majority and minority classes in training data for upsampling
data_majority = train_x[train_x['label'] == 0]
data_minority = train_x[train_x['label'] == 1]

print("majority class before upsample:",data_majority.shape)
print("minority class before upsample:",data_minority.shape)

# Upsample minority class
data_minority_upsampled = resample(data_minority,
                                 replace=True,     # sample with replacement
                                 n_samples= data_majority.shape[0],    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])

# Display new class counts
print("After upsampling\n",data_upsampled.label.value_counts(),sep = "")

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(train_x['clean_tweet'].values) # training with whole data

X_train = tokenizer.texts_to_sequences(data_upsampled['clean_tweet'].values)
X_train = pad_sequences(X_train,maxlen=32)
Y_train = to_categorical(data_upsampled['label'].values)
print('x_train shape:',X_train.shape)
X_test = tokenizer.texts_to_sequences(test['clean_tweet'].values)
X_test = pad_sequences(X_test,maxlen=32)
Y_test = to_categorical(test['label'].values)
print("x_test shape", X_test.shape)

majority class before upsample: (27517, 2)
minority class before upsample: (2013, 2)
After upsampling
label
0    27517
1    27517
Name: count, dtype: int64
x_train shape: (55034, 32)
x_test shape (5906, 32)


In [40]:
# compile model for upsampled data
opt = Adam(learning_rate=0.01)

model_up = Sequential()
model_up.add(Embedding(max_features, 16, input_length=X_train.shape[1], mask_zero=True))
model_up.add(LSTM(12, dropout=0.7, recurrent_dropout=0.7))
model_up.add(Dense(6, kernel_regularizer=regularizers.l1_l2(0.3)))
model_up.add(Dropout(0.9))
model_up.add(Dense(2, activation='softmax'))
model_up.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
print(model_up.summary())

None


In [41]:
history_lstm_up = model_up.fit(X_train, Y_train, epochs=120,batch_size=300,validation_data=(X_test,Y_test),shuffle=False, callbacks=[es])

Epoch 1/120
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 40ms/step - accuracy: 0.8771 - loss: 2.3470 - val_accuracy: 0.0682 - val_loss: 1.5795
Epoch 2/120
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 37ms/step - accuracy: 0.5285 - loss: 0.9387 - val_accuracy: 0.0682 - val_loss: 1.8351
Epoch 3/120
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 49ms/step - accuracy: 0.5167 - loss: 0.9755 - val_accuracy: 0.0682 - val_loss: 1.9199
Epoch 4/120
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 50ms/step - accuracy: 0.5090 - loss: 0.9867 - val_accuracy: 0.0682 - val_loss: 1.9770
Epoch 5/120
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 39ms/step - accuracy: 0.5142 - loss: 0.9771 - val_accuracy: 0.0682 - val_loss: 2.0672


In [37]:
# Evaluate the tuned model
Y_pred = model_up.predict(X_test)
#print classification report
print(classification_report(np.argmax(Y_test,axis=1), np.argmax(Y_pred,axis=1)))

[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5503
           1       0.07      1.00      0.13       403

    accuracy                           0.07      5906
   macro avg       0.03      0.50      0.06      5906
weighted avg       0.00      0.07      0.01      5906



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# save the model as pickle file for perdiction
import pickle
with open(path + 'models/lstm_model.pkl', 'wb') as file:
    pickle.dump(model_up, file)