In [22]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.layers import BatchNormalization
import keras_tuner as kt
import tensorflow as tf
import swifter
from sklearn.model_selection import StratifiedKFold


In [23]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

Decisions
1. Use keras hyperparameter tuner to optimize learning rate, # of neurons, epochs, etc.

In [24]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

Num GPUs Available:  1
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10782995102282667847
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5828050944
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6132659599877217875
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2070 SUPER, pci bus id: 0000:07:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [25]:
DATASET_NAME = 'dataset_v2'
train_df = pd.read_csv(f'../output/train_{DATASET_NAME}.csv')
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,...,question1_length,question2_length,question1_punctuation_count,question2_punctuation_count,question1_hash,question2_hash,question1_degree,question2_degree,question1_degree_deviation,question2_degree_deviation
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market,71,100,...,66,57,1,1,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,1,1,-0.504718,-0.504718
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,43,91,...,51,88,5,5,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,4,2,2.495282,0.495282
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection use vpn,internet speed increase hack dns,41,55,...,73,59,1,1,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,1,1,-0.504718,-0.504718
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math 23 24 math divide 24 23,25,30,...,50,65,2,10,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,1,1,-0.504718,-0.504718
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane...,fish would survive salt water,41,53,...,76,39,3,1,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,3,1,1.495282,-0.504718


In [26]:
features = ['simple_ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio', 'question1_type', 'question2_type', 'question1_punctuation_count', 'question2_punctuation_count']

train_is_duplicate_df = train_df[['is_duplicate']].copy()
train_features_df = train_df[features].copy()

In [28]:
input_dim = len(train_features_df.columns)

def model_builder(hp):
  model = Sequential()

  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
  input_layer_units = hp.Int('input_layer_units', min_value=4, max_value=16, step=2)
  layer_1_units = hp.Int('layer_1_units', min_value=4, max_value=16, step=2)
    
  input_layer_activation = hp.Choice('input_layer_activation', values=['relu', 'tanh', 'sigmoid'], default='relu')
  layer_activation_1 = hp.Choice('layer_activation_1', values=['relu', 'tanh', 'sigmoid'], default='relu')
  
  model.add(Dense(units=input_layer_units, input_dim=input_dim, activation=input_layer_activation))
  model.add(Dense(units=layer_1_units, activation=layer_activation_1))
  model.add(BatchNormalization())
  model.add(Dense(1, activation='sigmoid'))

  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                loss='binary_crossentropy'
                )

  return model


skf = StratifiedKFold(n_splits=5)

results = []

fold = 0
for train_index, test_index in skf.split(train_features_df, train_is_duplicate_df):
    result = {}
    
    train_features_fold_df = train_features_df.iloc[train_index]
    train_is_duplicate_fold_df = train_is_duplicate_df.iloc[train_index]
    
    valid_features_fold_df = train_features_df.iloc[test_index]
    valid_is_duplicate_fold_df = train_is_duplicate_df.iloc[test_index]
    
    tuner = kt.Hyperband(model_builder,
                        objective=kt.Objective('val_loss', direction='min'), # same as binary cross entropy
                        max_epochs=10,
                        factor=3,
                        directory='output',
                        project_name='quora_question_pairs')

    stop_early = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

    tuner.search(train_features_fold_df, train_is_duplicate_fold_df, epochs=15, validation_split=0.2, callbacks=[stop_early])

    best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

    model = tuner.hypermodel.build(best_hps)

    history = model.fit(train_features_fold_df, train_is_duplicate_fold_df, epochs=15, validation_split=0.2, batch_size=1000)

    val_acc_per_epoch = history.history['val_loss']
    best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
    result['best_epoch'] = best_epoch

    hypermodel = tuner.hypermodel.build(best_hps)

    # Retrain the model
    hypermodel.fit(train_features_fold_df, train_is_duplicate_fold_df, epochs=best_epoch, validation_split=0.2)

    eval_result = hypermodel.evaluate(valid_features_fold_df, valid_is_duplicate_fold_df)
    result['test_loss'] = str(eval_result)
    results.append(result)
    
print(results)
    

Trial 30 Complete [00h 03m 03s]
val_loss: 0.5814000368118286

Best val_loss So Far: 0.5274534821510315
Total elapsed time: 00h 35m 51s
INFO:tensorflow:Oracle triggered exit
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
INFO:tensorflow:Reloading Oracle from existing project output/quora_question_pairs/oracle.json
INFO:tensorflow:Reloading Tuner from output/quora_question_pairs/tuner0.json
INFO:tensorflow:Oracle triggered exit
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
INFO:tensorflow:Reloading Oracle from existing project output/quora_question_pairs/oracle.json
INFO:tensorflow:Reloading Tuner from output/quora_question_pairs/tuner0.json
INFO:tensorflow:Oracle triggered exit
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15

In [29]:
# predictions = hypermodel.predict(valid_features_df)

# def is_correct(row):
#     is_duplicate = True if row['is_duplicate'] == 1 else False
#     return is_duplicate == (predictions[row.name] > 0.5)

# valid_df['is_correct'] = valid_df.swifter.apply(lambda row: is_correct(row)[0], axis=1)

In [30]:
# incorrect = valid_df.loc[valid_df['is_correct'] == False]
# incorrect.to_csv(f'../output/v3_nn_{DATASET_NAME}_incorrect.csv', index=False)

# correct = valid_df.loc[valid_df['is_correct'] == True]
# correct.to_csv(f'../output/v3_nn_{DATASET_NAME}_correct.csv', index=False)