In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

import pickle
import pandas as pd
import numpy as np

Using TensorFlow backend.


In [0]:
from keras import Sequential
from keras.layers import Dense

In [0]:
def Create_NN_Model(No_Features=300, No_Hidden_Layers=2, No_Hidden_Neurons=150, 
                    Hidden_Activation ="relu", No_OP_Neurons=1, 
                    Output_Activation="sigmoid", Kernel_Initializer="random_normal",
                    Optimizer="adam", Loss='binary_crossentropy', Metrics =['accuracy']):
  
  classifier = Sequential()

  ## Input Layer
  classifier.add(Dense(No_Hidden_Neurons, activation=Hidden_Activation, 
                       kernel_initializer=Kernel_Initializer, input_dim=No_Features))
  
  ## Hidden layers
  for i in range(No_Hidden_Layers):
    classifier.add(Dense(No_Hidden_Neurons, activation=Hidden_Activation, 
                         kernel_initializer=Kernel_Initializer))
    
  ## Output Layer
  classifier.add(Dense(No_OP_Neurons, activation=Output_Activation, 
                       kernel_initializer=Kernel_Initializer))
  
  classifier.compile(optimizer =Optimizer, loss=Loss, metrics = Metrics)

  return classifier
  

In [0]:
def Train_NN(NN_classifier, train_data, feature_list=[], Batch_Size=50, Epochs=100):

  train_data.dropna()
  train_data = pd.DataFrame(np.nan_to_num(np.array(train_data)), columns = train_data.columns)
  train_data['Label'] = pd.to_numeric(train_data['Label'], errors='coerce')
  train_data = train_data.dropna(subset=['Label'])
  
  train_features = train_data[feature_list]    
  train_labels = train_data["Label"]
  train_labels = train_labels.astype('int')

  NN_classifier.fit(train_features,train_labels, batch_size=Batch_Size, epochs=Epochs)

  eval_model=NN_classifier.evaluate(train_features, train_labels)
  print("Loss: ", eval_model[0])
  print("Accuracy of the model: ", eval_model[1])

  return NN_classifier


In [0]:
## Store trained model in a file to reuse in other codes without training again on same data

def Store_Trained_NN(NN_obj, Filepath):
  
  with open(Filepath, "wb") as file:
    pickle.dump(NN_obj, file)

In [0]:
## Load stored trained model and returns random forest model object

def Load_Trained_NN(Filepath):
  
  with open(Filepath, "rb") as file:
    NN_obj = pickle.load(file)

  return NN_obj

In [0]:
def Evaluate_NN(test_data, NN_Model_FilePath, feature_list=[], threshold=0.5):
  
  test_data.dropna()
  test_data = pd.DataFrame(np.nan_to_num(np.array(test_data)),  columns = test_data.columns)
  test_data['Label'] = pd.to_numeric(test_data['Label'], errors='coerce')
  test_data = test_data.dropna(subset=['Label'])

  test_features = test_data[feature_list]
  test_labels = test_data["Label"]
  test_labels = test_labels.astype('int')

  NN_obj = Load_Trained_NN(NN_Model_FilePath) 
  predictions = NN_obj.predict(test_features)
  predictions_list = [int(p[0]) for p in predictions]
  
  for i in range(len(predictions_list)):
    if predictions_list[i] >= threshold:
      predictions_list[i] = 1
    else:
      predictions_list[i] = 0
  
  errors = abs(predictions_list - test_labels)

  # Calculate mean absolute error (MAE)
  MAE = round(np.mean(errors), 2)
  
  ## Confusion Matrix and Classification Report
  Confusion_Matrix = confusion_matrix(test_labels,predictions_list)
  Report = classification_report(test_labels,predictions_list)
  
  return MAE, Confusion_Matrix, Report
  

In [8]:
## WORD2VEC EMBEDDINGS

Column_List = [ "Caption"]
Vector_Size = 300
Embedding_Cols = [str(i) for i in range(Vector_Size)]
Column_List.extend(Embedding_Cols)
Column_List.append("Label")

Train_Embedding_FilePath = "/content/TrainData_Word2Vec_Embeddings.csv"
Test_Embedding_FilePath = "/content/TestData_Word2Vec_Embeddings.csv"
NN_Model_FilePath = "/content/NN_Word2Vec_Train_Model.pkl"

train_data = pd.read_csv(Train_Embedding_FilePath, usecols=Column_List)
test_data = pd.read_csv(Test_Embedding_FilePath, usecols=Column_List)

## Training Phase
NN_Classifier = Create_NN_Model()
NN_obj = Train_NN(NN_Classifier, train_data, Embedding_Cols)
Store_Trained_NN(NN_obj, NN_Model_FilePath)






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/100





Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch

In [9]:
## Testing Phase
MAE, Confusion_Matrix, Report = Evaluate_NN(test_data, NN_Model_FilePath, Embedding_Cols, 0.5)

print("============ FOR WORD2VEC EMBEDDINGS ============")

print("MEAN ABSOLUTE ERROR: ", MAE)

print("\n")
print("============ CONFUSION MATRIX ===============")
print(Confusion_Matrix)

print("\n")
print("============ CLASSIFICATION REPORT ==============")
print(Report)

tn, fp, fn, tp = Confusion_Matrix.ravel()
Accuracy = (tn+tp)/(tn + fp + fn + tp)

print("Accuracy: ", Accuracy*100)

MEAN ABSOLUTE ERROR:  0.37


[[732   1]
 [536 187]]


              precision    recall  f1-score   support

           0       0.58      1.00      0.73       733
           1       0.99      0.26      0.41       723

    accuracy                           0.63      1456
   macro avg       0.79      0.63      0.57      1456
weighted avg       0.78      0.63      0.57      1456

Accuracy:  63.11813186813187


In [10]:
## Cross Validation

train_data.dropna()
train_data = pd.DataFrame(np.nan_to_num(np.array(train_data)), columns = train_data.columns)
train_data['Label'] = pd.to_numeric(train_data['Label'], errors='coerce')
train_data = train_data.dropna(subset=['Label'])

train_features = train_data[Embedding_Cols]    
train_labels = train_data["Label"]
train_labels = train_labels.astype('int')

# create the sklearn model for the network
model_CV = KerasClassifier(build_fn=Create_NN_Model, verbose=1)

# we choose the initializers that came at the top in our previous cross-validation!!
kernel_initializer = ['random_uniform']
batches = [64*x for x in range(1, 3)]
epochs = [50, 100, 150]
# units = [x for x in range(50, Vector_Size, 50)]
No_Hidden_Layers = [2]
optimizer = ['adam', 'rmsprop']

# No_Hidden_Neurons=units

# grid search for initializer, batch size and number of epochs
param_grid = dict(epochs=epochs, batch_size=batches, Kernel_Initializer=kernel_initializer, 
                 No_Hidden_Layers=No_Hidden_Layers, Optimizer=optimizer)
grid = GridSearchCV(estimator=model_CV, param_grid=param_grid,cv=3)
grid_result = grid.fit(train_features, train_labels)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/

In [11]:
# print results of cross validation

print(f'Best Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')

Best Accuracy for 0.8593750001023431 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 2, 'Optimizer': 'rmsprop', 'batch_size': 128, 'epochs': 50}
 mean=0.8525, std=0.004631 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 2, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 50}
 mean=0.8503, std=0.004649 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 2, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 100}
 mean=0.8472, std=0.009036 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 2, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 150}
 mean=0.8486, std=0.00638 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 2, 'Optimizer': 'adam', 'batch_size': 128, 'epochs': 50}
 mean=0.8518, std=0.005884 using {'Kernel_Initializer': 'random_uniform', 'No_Hidden_Layers': 2, 'Optimizer': 'adam', 'batch_size': 128, 'epochs': 100}
 mean=0.8436, std=0.003162 using {'Kernel_Initializer': 'random_uniform', 'No_Hidd

In [12]:
## GLOVE EMBEDDINGS

Column_List = [ "Caption_Tokens"]
Vector_Size = 200
Embedding_Cols = [str(i) for i in range(Vector_Size)]
Column_List.extend(Embedding_Cols)
Column_List.append("Label")

Train_Embedding_FilePath = "/content/TrainData_preTrainedGlove_Embeddings.csv"
Test_Embedding_FilePath = "/content/TestData_preTrainedGlove_Embeddings.csv"
NN_Model_FilePath = "/content/NN_Glove_Train_Model.pkl"

train_data = pd.read_csv(Train_Embedding_FilePath, usecols=Column_List)
test_data = pd.read_csv(Test_Embedding_FilePath, usecols=Column_List)

## Training Phase
NN_Classifier = Create_NN_Model(No_Features=200, No_Hidden_Neurons=100)
NN_obj = Train_NN(NN_Classifier, train_data, Embedding_Cols)
Store_Trained_NN(NN_obj, NN_Model_FilePath)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [13]:
## Testing Phase
MAE, Confusion_Matrix, Report = Evaluate_NN(test_data, NN_Model_FilePath, Embedding_Cols, 0.5)

print("============ FOR GLOVE EMBEDDINGS ============")

print("MEAN ABSOLUTE ERROR: ", MAE)

print("\n")
print("============ CONFUSION MATRIX ===============")
print(Confusion_Matrix)

print("\n")
print("============ CLASSIFICATION REPORT ==============")
print(Report)

tn, fp, fn, tp = Confusion_Matrix.ravel()
Accuracy = (tn+tp)/(tn + fp + fn + tp)

print("Accuracy: ", Accuracy*100)

MEAN ABSOLUTE ERROR:  0.27


[[686  14]
 [374 382]]


              precision    recall  f1-score   support

           0       0.65      0.98      0.78       700
           1       0.96      0.51      0.66       756

    accuracy                           0.73      1456
   macro avg       0.81      0.74      0.72      1456
weighted avg       0.81      0.73      0.72      1456

Accuracy:  73.35164835164835


In [16]:
## Cross Validation

train_data.dropna()
train_data = pd.DataFrame(np.nan_to_num(np.array(train_data)), columns = train_data.columns)
train_data['Label'] = pd.to_numeric(train_data['Label'], errors='coerce')
train_data = train_data.dropna(subset=['Label'])

train_features = train_data[Embedding_Cols]    
train_labels = train_data["Label"]
train_labels = train_labels.astype('int')

# create the sklearn model for the network
model_CV = KerasClassifier(build_fn=Create_NN_Model, verbose=1)

# we choose the initializers that came at the top in our previous cross-validation!!
kernel_initializer = ['random_uniform']
batches = [64*x for x in range(1, 3)]
epochs = [50, 100, 150]
# units = [x for x in range(50, Vector_Size, 50)]
No_Hidden_Layers = [2]
optimizer = ['adam', 'rmsprop']

# No_Hidden_Neurons=units

# grid search for initializer, batch size and number of epochs
param_grid = dict(No_Features=[200], epochs=epochs, batch_size=batches, Kernel_Initializer=kernel_initializer, 
                 No_Hidden_Layers=No_Hidden_Layers, Optimizer=optimizer)
grid = GridSearchCV(estimator=model_CV, param_grid=param_grid,cv=3)
grid_result = grid.fit(train_features, train_labels)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

In [17]:
# print results of cross validation

print(f'Best Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')

Best Accuracy for 0.8619505495835955 using {'Kernel_Initializer': 'random_uniform', 'No_Features': 200, 'No_Hidden_Layers': 2, 'Optimizer': 'rmsprop', 'batch_size': 128, 'epochs': 150}
 mean=0.8546, std=0.00613 using {'Kernel_Initializer': 'random_uniform', 'No_Features': 200, 'No_Hidden_Layers': 2, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 50}
 mean=0.8546, std=0.004388 using {'Kernel_Initializer': 'random_uniform', 'No_Features': 200, 'No_Hidden_Layers': 2, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 100}
 mean=0.8513, std=0.006133 using {'Kernel_Initializer': 'random_uniform', 'No_Features': 200, 'No_Hidden_Layers': 2, 'Optimizer': 'adam', 'batch_size': 64, 'epochs': 150}
 mean=0.8388, std=0.02358 using {'Kernel_Initializer': 'random_uniform', 'No_Features': 200, 'No_Hidden_Layers': 2, 'Optimizer': 'adam', 'batch_size': 128, 'epochs': 50}
 mean=0.8553, std=0.006532 using {'Kernel_Initializer': 'random_uniform', 'No_Features': 200, 'No_Hidden_Layers': 2, 'Optimizer': 'ada