In [1]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import BinaryAccuracy
import keras_tuner as kt
import tensorflow as tf
import swifter


In [2]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

Decisions
1. Use keras hyperparameter tuner to optimize learning rate, # of neurons, epochs, etc.

In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

Num GPUs Available:  1
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11789145601497034975
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5866061824
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7162466448552807067
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2070 SUPER, pci bus id: 0000:07:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [4]:
DATASET_NAME = 'dataset_v2'
train_df = pd.read_csv(f'../output/train_{DATASET_NAME}.csv')
valid_df = pd.read_csv(f'../output/valid_{DATASET_NAME}.csv')
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_lemma,question2_lemma,simple_ratio,partial_ratio,token_sort_ratio,token_set_ratio,question1_type,question2_type
0,227692,336605,336606,What are the negative consequences of polyamory?,Could Russia have prevented WWI?,0,negative consequence polyamory,could russia prevent wwi,29,32,33,33,2,-1
1,150484,236840,236841,Is Zeus still powerful without his thunderbolts?,Is Zeus powerful without his thunderbolts and ...,1,zeus still powerful without thunderbolts,zeus powerful without thunderbolt,75,88,90,90,-1,1
2,183632,280761,280762,How do we buy a house?,How do you buy a house?,1,buy house,buy house,55,100,100,100,6,6
3,342846,470884,106122,How do I bring back dead one?,Will science ever make it possible to bring ba...,0,bring back dead one,science ever make possible bring back dead,38,89,62,88,6,-1
4,403822,537442,372925,How will rising sea levels and subsequent tida...,How would I calculate the visible surface area...,0,rise sea level subsequent tidal change affect ...,would calculate visible surface area moon caus...,44,48,48,51,6,6


In [5]:
features = ['simple_ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio', 'question1_type', 'question2_type']

train_is_duplicate_df = train_df[['is_duplicate']].copy()
train_features_df = train_df[features].copy()

valid_is_duplicate_df = valid_df[['is_duplicate']].copy()
valid_features_df = valid_df[features].copy()

In [6]:
input_dim = len(train_features_df.columns)

def model_builder(hp):
  model = Sequential()

  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
  input_layer_units = hp.Int('input_layer_units', min_value=4, max_value=16, step=2)
  layer_1_units = hp.Int('layer_1_units', min_value=4, max_value=16, step=2)
  
  threshold = hp.Float('threshold', min_value=0.05, max_value=0.95, step=0.05)
  
  input_layer_activation = hp.Choice('input_layer_activation', values=['relu', 'tanh', 'sigmoid'], default='relu')
  layer_activation_1 = hp.Choice('layer_activation_1', values=['relu', 'tanh', 'sigmoid'], default='relu')
  
  model.add(Dense(units=input_layer_units, input_dim=input_dim, activation=input_layer_activation))
  model.add(Dense(units=layer_1_units, activation=layer_activation_1))
  model.add(Dense(1, activation='sigmoid'))

  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                loss='binary_crossentropy',
                metrics=[BinaryAccuracy(name="binary_accuracy", dtype=None, threshold=threshold)])

  return model


tuner = kt.Hyperband(model_builder,
                     objective='binary_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='output',
                     project_name='quora_question_pairs')

stop_early = EarlyStopping(monitor='val_loss', patience=5)

tuner.search(train_features_df, train_is_duplicate_df, epochs=2, validation_split=0.2, callbacks=[stop_early])

best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

model = tuner.hypermodel.build(best_hps)
history = model.fit(train_features_df, train_is_duplicate_df, epochs=2, validation_split=0.2, batch_size=len(train_features_df))

val_acc_per_epoch = history.history['binary_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(train_features_df, train_is_duplicate_df, epochs=best_epoch, validation_split=0.2)

eval_result = hypermodel.evaluate(valid_features_df, valid_is_duplicate_df)
print("[test loss, test accuracy]:", eval_result)

INFO:tensorflow:Reloading Oracle from existing project output/quora_question_pairs/oracle.json
INFO:tensorflow:Reloading Tuner from output/quora_question_pairs/tuner0.json
INFO:tensorflow:Oracle triggered exit
Epoch 1/2
Epoch 2/2
Best epoch: 2
Epoch 1/2
Epoch 2/2
[test loss, test accuracy]: [0.5745606422424316, 0.6719062924385071]


In [7]:
predictions = hypermodel.predict(valid_features_df)

def is_correct(row):
    is_duplicate = True if row['is_duplicate'] == 1 else False
    return is_duplicate == (predictions[row.name] > 0.5)

valid_df['is_correct'] = valid_df.swifter.apply(lambda row: is_correct(row)[0], axis=1)

Pandas Apply: 100%|██████████| 80858/80858 [00:00<00:00, 116105.31it/s]


In [8]:
false_positives = valid_df.loc[valid_df['is_correct'] == False]
false_positives.to_csv(f'../output/v3_nn_{DATASET_NAME}_false_positives.csv', index=False)
false_positives.count()

id                  26529
qid1                26529
qid2                26529
question1           26529
question2           26529
is_duplicate        26529
question1_lemma     26528
question2_lemma     26529
simple_ratio        26529
partial_ratio       26529
token_sort_ratio    26529
token_set_ratio     26529
question1_type      26529
question2_type      26529
is_correct          26529
dtype: int64