In [5]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.layers import BatchNormalization
import keras_tuner as kt
import tensorflow as tf
import swifter


In [6]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

Decisions
1. Use keras hyperparameter tuner to optimize learning rate, # of neurons, epochs, etc.

In [7]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

Num GPUs Available:  1
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8833777564792535668
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5897977856
locality {
  bus_id: 1
  links {
  }
}
incarnation: 649234863903934078
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2070 SUPER, pci bus id: 0000:07:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [8]:
DATASET_NAME = 'dataset_v2'
train_df = pd.read_csv(f'../output/train_{DATASET_NAME}.csv')
valid_df = pd.read_csv(f'../output/valid_{DATASET_NAME}.csv')
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,...,question1_length,question2_length,question1_punctuation_count,question2_punctuation_count,question1_hash,question2_hash,question1_degree,question2_degree,question1_degree_deviation,question2_degree_deviation
0,224553,371,27778,What was the significance of the battle of Som...,What was the significance of the battle of Som...,1,wa significance battle somme battle compare c...,wa significance battle somme battle compare c...,64,91,...,119,123,2,2,What was the significance of the battle of Som...,What was the significance of the battle of Som...,20,20,18.495282,18.495282
1,31197,57541,8255,How do I get meth out of my system in 2 days?,How can I get meth out of my system ASAP?,1,get meth system 2 days,get meth system asap,50,81,...,45,41,1,1,How do I get meth out of my system in 2 days?,How can I get meth out of my system ASAP?,17,17,15.495282,15.495282
2,310228,23108,29504,Why is salt water taffy candy imported in France?,Why is saltwater taffy candy imported in Brazil?,1,salt water taffy candy import france,saltwater taffy candy import brazil,73,86,...,49,48,1,1,Why is salt water taffy candy imported in France?,Why is saltwater taffy candy imported in Brazil?,26,27,24.495282,25.495282
3,15827,30203,30204,What is the best way to take a picture with a ...,How do I take good pictures with my phone?,1,best way take picture phone,take good picture phone,57,79,...,52,42,1,1,What is the best way to take a picture with a ...,How do I take good pictures with my phone?,1,1,-0.504718,-0.504718
4,128555,206708,206709,Who is the most beautiful actress in China?,Who is the most beautiful actress in Europe?,0,beautiful actress china,beautiful actress europe,62,75,...,43,44,1,1,Who is the most beautiful actress in China?,Who is the most beautiful actress in Europe?,2,2,0.495282,0.495282


In [9]:
features = ['simple_ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio', 'question1_type', 'question2_type', 'question1_punctuation_count', 'question2_punctuation_count']

train_is_duplicate_df = train_df[['is_duplicate']].copy()
train_features_df = train_df[features].copy()

valid_is_duplicate_df = valid_df[['is_duplicate']].copy()
valid_features_df = valid_df[features].copy()

In [10]:
input_dim = len(train_features_df.columns)

def model_builder(hp):
  model = Sequential()

  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
  input_layer_units = hp.Int('input_layer_units', min_value=4, max_value=16, step=2)
  layer_1_units = hp.Int('layer_1_units', min_value=4, max_value=16, step=2)
    
  input_layer_activation = hp.Choice('input_layer_activation', values=['relu', 'tanh', 'sigmoid'], default='relu')
  layer_activation_1 = hp.Choice('layer_activation_1', values=['relu', 'tanh', 'sigmoid'], default='relu')
  
  model.add(Dense(units=input_layer_units, input_dim=input_dim, activation=input_layer_activation))
  model.add(Dense(units=layer_1_units, activation=layer_activation_1))
  model.add(BatchNormalization())
  model.add(Dense(1, activation='sigmoid'))

  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                loss='binary_crossentropy'
                )

  return model


tuner = kt.Hyperband(model_builder,
                     objective=kt.Objective('val_loss', direction='min'), # same as binary cross entropy
                     max_epochs=10,
                     factor=3,
                     directory='output',
                     project_name='quora_question_pairs')

stop_early = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

tuner.search(train_features_df, train_is_duplicate_df, epochs=15, validation_split=0.2, callbacks=[stop_early])

best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

model = tuner.hypermodel.build(best_hps)
history = model.fit(train_features_df, train_is_duplicate_df, epochs=15, validation_split=0.2, batch_size=1000)

val_acc_per_epoch = history.history['val_loss']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(train_features_df, train_is_duplicate_df, epochs=best_epoch, validation_split=0.2)

eval_result = hypermodel.evaluate(valid_features_df, valid_is_duplicate_df)
print("test loss:", eval_result)

Trial 30 Complete [00h 03m 10s]
val_loss: 0.5368462800979614

Best val_loss So Far: 0.5356546640396118
Total elapsed time: 00h 36m 10s
INFO:tensorflow:Oracle triggered exit
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Best epoch: 1
test loss: 0.5612156391143799


In [11]:
predictions = hypermodel.predict(valid_features_df)

def is_correct(row):
    is_duplicate = True if row['is_duplicate'] == 1 else False
    return is_duplicate == (predictions[row.name] > 0.5)

valid_df['is_correct'] = valid_df.swifter.apply(lambda row: is_correct(row)[0], axis=1)

Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 106301.33it/s]


In [12]:
incorrect = valid_df.loc[valid_df['is_correct'] == False]
incorrect.to_csv(f'../output/v3_nn_{DATASET_NAME}_incorrect.csv', index=False)

correct = valid_df.loc[valid_df['is_correct'] == True]
correct.to_csv(f'../output/v3_nn_{DATASET_NAME}_correct.csv', index=False)