In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix

import pickle
import pandas as pd
import numpy as np

In [0]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten

In [0]:
def Create_LSTM_Model(No_Features=300, No_Hidden_Layers=2, No_Hidden_Neurons=150, 
                    Hidden_Activation ="relu", No_OP_Neurons=1, 
                    Output_Activation="sigmoid", Kernel_Initializer="random_normal",
                    Optimizer="adam", Loss='binary_crossentropy', Metrics =['accuracy']):
  
  classifier = Sequential()

  ## Input Layer
  # classifier.add(Dense(No_Hidden_Neurons, activation=Hidden_Activation, 
  #                      kernel_initializer=Kernel_Initializer, input_dim=No_Features))
  
  classifier.add(LSTM(No_Hidden_Neurons, input_shape=(1,No_Features), return_sequences=True,
                      activation=Hidden_Activation, kernel_initializer=Kernel_Initializer))
  
  ## Hidden layers with default tanh activation and sigmoid recurrent_activation
  for i in range(No_Hidden_Layers):
    classifier.add(LSTM(No_Hidden_Neurons, return_sequences=True))
  classifier.add(Flatten())
  ## Output Layer
  classifier.add(Dense(No_OP_Neurons, activation=Output_Activation, 
                       kernel_initializer=Kernel_Initializer))
  
  classifier.compile(optimizer =Optimizer, loss=Loss, metrics = Metrics)

  return classifier
  

In [0]:
def Train_LSTM(LSTM_classifier, train_data, feature_list=[], Batch_Size=50, Epochs=100):

  train_data.dropna()
  train_data = pd.DataFrame(np.nan_to_num(np.array(train_data)), columns = train_data.columns)
  train_data['Label'] = pd.to_numeric(train_data['Label'], errors='coerce')
  train_data = train_data.dropna(subset=['Label'])
  
  train_features = train_data[feature_list]    
  train_labels = train_data["Label"]
  train_labels = train_labels.astype('int')

  print("train_features shape: ", train_features.shape)
  # print("train_features col: ", train_features.columns)
  # print("train_features head: ", train_features.head)
  train_features = np.array(train_features)
  train_features = np.reshape(train_features, (train_features.shape[0], 1, train_features.shape[1]))
  print("train_features shape: ", train_features.shape)
  print(train_labels.shape)
  LSTM_classifier.fit(train_features,train_labels, batch_size=Batch_Size, epochs=Epochs)
  eval_model=LSTM_classifier.evaluate(train_features, train_labels)

  print("Loss: ", eval_model[0])
  print("Accuracy of the model: ", eval_model[1])
  return LSTM_classifier


In [0]:
## Store trained model in a file to reuse in other codes without training again on same data

def Store_Trained_LSTM(LSTM_obj, Filepath):
  
  with open(Filepath, "wb") as file:
    pickle.dump(LSTM_obj, file)

In [0]:
## Load stored trained model and returns random forest model object

def Load_Trained_LSTM(Filepath):
  
  with open(Filepath, "rb") as file:
    LSTM_obj = pickle.load(file)

  return LSTM_obj

In [0]:
def Evaluate_LSTM(test_data, LSTM_Model_FilePath, feature_list=[], threshold=0.5):
  
  test_data.dropna()
  test_data = pd.DataFrame(np.nan_to_num(np.array(test_data)),  columns = test_data.columns)
  test_data['Label'] = pd.to_numeric(test_data['Label'], errors='coerce')
  test_data = test_data.dropna(subset=['Label'])

  test_features = test_data[feature_list]
  test_labels = test_data["Label"]
  test_labels = test_labels.astype('int')
  # print(test_features)
  test_features=np.array(test_features);
  test_features = np.reshape(test_features, (test_features.shape[0], 1, test_features.shape[1]))
  LSTM_obj = Load_Trained_LSTM(LSTM_Model_FilePath) 
  # print(test_features.shape)
  predictions = LSTM_obj.predict(test_features)
  predictions_list = [int(p[0]) for p in predictions]
  
  for i in range(len(predictions_list)):
    if predictions_list[i] >= threshold:
      predictions_list[i] = 1
    else:
      predictions_list[i] = 0
  
  errors = abs(predictions_list - test_labels)

  # Calculate mean absolute error (MAE)
  MAE = round(np.mean(errors), 2)
  
  ## Confusion Matrix and Classification Report
  Confusion_Matrix = confusion_matrix(test_labels,predictions_list)
  Report = classification_report(test_labels,predictions_list)
  
  return MAE, Confusion_Matrix, Report
  

In [0]:
## WORD2VEC EMBEDDINGS

Column_List = [ "Caption"]
Vector_Size = 300
Embedding_Cols = [str(i) for i in range(Vector_Size)]
Column_List.extend(Embedding_Cols)
Column_List.append("Label")

Train_Embedding_FilePath = "/content/TrainData_Word2Vec_Embeddings.csv"
Test_Embedding_FilePath = "/content/TestData_Word2Vec_Embeddings.csv"
LSTM_Model_FilePath = "/content/LSTM_Word2Vec_Train_Model.pkl"

train_data = pd.read_csv(Train_Embedding_FilePath, usecols=Column_List)
test_data = pd.read_csv(Test_Embedding_FilePath, usecols=Column_List)
print(train_data.shape)
print(len(Embedding_Cols))
## Training Phase
LSTM_Classifier = Create_LSTM_Model()
LSTM_obj = Train_LSTM(LSTM_Classifier, train_data, Embedding_Cols)


In [0]:
Store_Trained_LSTM(LSTM_obj, LSTM_Model_FilePath)

In [133]:
## Testing Phase
MAE, Confusion_Matrix, Report = Evaluate_LSTM(test_data, LSTM_Model_FilePath, Embedding_Cols, 0.5)

print("============ FOR WORD2VEC EMBEDDINGS ============")

print("MEAN ABSOLUTE ERROR: ", MAE)

print("\n")
print("============ CONFUSION MATRIX ===============")
print(Confusion_Matrix)

print("\n")
print("============ CLASSIFICATION REPORT ==============")
print(Report)

tn, fp, fn, tp = Confusion_Matrix.ravel()
Accuracy = (tn+tp)/(tn + fp + fn + tp)

print("Accuracy: ", Accuracy*100)

MEAN ABSOLUTE ERROR:  0.22


[[719  14]
 [310 413]]


              precision    recall  f1-score   support

           0       0.70      0.98      0.82       733
           1       0.97      0.57      0.72       723

    accuracy                           0.78      1456
   macro avg       0.83      0.78      0.77      1456
weighted avg       0.83      0.78      0.77      1456

Accuracy:  77.74725274725274


In [138]:
## GLOVE EMBEDDINGS

Column_List = [ "Caption_Tokens"]
Vector_Size = 300
Embedding_Cols = [str(i) for i in range(Vector_Size)]
Column_List.extend(Embedding_Cols)
Column_List.append("Label")

Train_Embedding_FilePath = "/content/TrainData_Glove_Embeddings.csv"
Test_Embedding_FilePath = "/content/TestData_Glove_Embeddings.csv"
LSTM_Model_FilePath = "/content/LSTM_Glove_Train_Model.pkl"

train_data = pd.read_csv(Train_Embedding_FilePath, usecols=Column_List)
test_data = pd.read_csv(Test_Embedding_FilePath, usecols=Column_List)

## Training Phase
LSTM_Classifier = Create_LSTM_Model()
LSTM_obj = Train_LSTM(LSTM_Classifier, train_data, Embedding_Cols)
Store_Trained_LSTM(LSTM_obj, LSTM_Model_FilePath)

train_features shape:  (5824, 300)
train_features shape:  (5824, 1, 300)
(5824,)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch

In [139]:
## Testing Phase
MAE, Confusion_Matrix, Report = Evaluate_LSTM(test_data, LSTM_Model_FilePath, Embedding_Cols, 0.5)

print("============ FOR GLOVE EMBEDDINGS ============")

print("MEAN ABSOLUTE ERROR: ", MAE)

print("\n")
print("============ CONFUSION MATRIX ===============")
print(Confusion_Matrix)

print("\n")
print("============ CLASSIFICATION REPORT ==============")
print(Report)

tn, fp, fn, tp = Confusion_Matrix.ravel()
Accuracy = (tn+tp)/(tn + fp + fn + tp)

print("Accuracy: ", Accuracy*100)

MEAN ABSOLUTE ERROR:  0.42


[[714   3]
 [611 128]]


              precision    recall  f1-score   support

           0       0.54      1.00      0.70       717
           1       0.98      0.17      0.29       739

    accuracy                           0.58      1456
   macro avg       0.76      0.58      0.50      1456
weighted avg       0.76      0.58      0.49      1456

Accuracy:  57.829670329670336
