In [23]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import kerastuner as kt
from kerastuner import HyperModel

# Columns treated as continuous values; each stat appears once per player,
# listed as a p1/p2 pair.
NUMERIC_FEATURES = [
    'p1_height', 'p2_height',
    'p1_age', 'p2_age',
    'p1_rating', 'p2_rating',
    'p1_dev', 'p2_dev',
    'p1_surface_rating', 'p2_surface_rating',
    'p1_surface_dev', 'p2_surface_dev',
    'p1_w', 'p2_w',
    'p1_l', 'p2_l',
    'p1_surface_w', 'p2_surface_w',
    'p1_surface_l', 'p2_surface_l',
    'p1_inactive_days', 'p2_inactive_days',
    'p1_recent_rating', 'p2_recent_rating',
]

# Flag-style columns: four match-level flags followed by two per-player pairs.
CATEGORICAL_FEATURES = [
    'is_hard', 'is_clay', 'is_grass', 'is_bo5',
    'p1_lefty', 'p2_lefty',
    'p1_home', 'p2_home',
]

# Number of leading rows used for training; every row after that is held out
# for validation.
TRAIN_ROWS = 76425

# Parse the CSV once and slice it, rather than reading the same file twice.
# The slices are copied so that later column removal operates on independent
# frames instead of on views of the full table.
matches = pd.read_csv('../data/matches.csv')
dataframe = matches.iloc[:TRAIN_ROWS].copy()
validation = matches.iloc[TRAIN_ROWS:].copy()


# def build_model(preprocessing_head, inputs, hp):
#   body = tf.keras.Sequential([
#     keras.layers.InputLayer(input_shape=(32,)),
#     keras.layers.Dense(hp.Int('hidden_size', 30, 300, step=30), activation='relu'),
#     keras.layers.Dropout(hp.Float('dropout', 0, 0.5, step=0.1)),
#     keras.layers.Dense(1, activation='sigmoid')
#   ])

#   preprocessed_inputs = preprocessing_head(inputs)
#   result = body(preprocessed_inputs)
#   model = tf.keras.Model(inputs, result)

#   model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
#                 optimizer=tf.keras.optimizers.Adam(hp.Float('learning_rate', 0.0001, 0.01, sampling='log')))
#   return model


In [24]:
class MyHyperModel(HyperModel):
    """Tunable binary classifier stacked on a fixed preprocessing head.

    The network is: preprocessing head -> Dense(hidden_size, relu) ->
    Dropout(dropout) -> Dense(1, sigmoid), compiled with binary cross-entropy
    and Adam. Tuned hyperparameters are ``hidden_size``, ``dropout``,
    ``learning_rate`` and (at fit time) ``batch_size``.

    Args:
        preprocessing_head: callable Keras model applied to ``inputs`` to
            produce a single feature tensor.
        inputs: the symbolic Keras inputs consumed by ``preprocessing_head``
            and used as the inputs of the final model.
    """

    def __init__(self, preprocessing_head, inputs):
        # Let the HyperModel base class initialise its own state before the
        # model-specific attributes are attached.
        super().__init__()
        self.preprocessing_head = preprocessing_head
        self.inputs = inputs

    def build(self, hp):
        """Create and compile one candidate model for the given hyperparameters.

        Args:
            hp: KerasTuner ``HyperParameters`` object to sample values from.

        Returns:
            A compiled ``tf.keras.Model`` mapping ``self.inputs`` to a sigmoid
            probability.
        """
        preprocessed_inputs = self.preprocessing_head(self.inputs)

        # Take the feature width from the preprocessing head's output rather
        # than hard-coding it, so the body follows the number of input columns.
        feature_shape = tuple(preprocessed_inputs.shape[1:])

        body = tf.keras.Sequential([
            keras.layers.InputLayer(input_shape=feature_shape),
            keras.layers.Dense(hp.Int('hidden_size', 20, 500, step=20), activation='relu'),
            keras.layers.Dropout(hp.Float('dropout', 0, 0.5, step=0.05)),
            keras.layers.Dense(1, activation='sigmoid')
        ])

        result = body(preprocessed_inputs)
        model = tf.keras.Model(self.inputs, result)

        model.compile(
            # The final layer already applies a sigmoid, hence from_logits=False.
            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
            optimizer=tf.keras.optimizers.Adam(
                hp.Float('learning_rate', 0.0001, 0.01, sampling='log')
            ),
        )
        return model

    def fit(self, hp, model, *args, **kwargs):
        """Train ``model`` with a tuned batch size.

        All positional and keyword arguments are forwarded to ``model.fit``.
        ``batch_size`` is supplied here, so callers must not pass it
        themselves (doing so raises ``TypeError`` for a duplicate keyword).

        Returns:
            Whatever ``model.fit`` returns.
        """
        return model.fit(
            *args,
            batch_size=hp.Int("batch_size", 50, 10050, step=500),
            **kwargs,
        )

In [25]:
def get_model(dataframe):
    """Build the tunable hypermodel and the feature/label data for one split.

    The input frame is not modified: label and identifier columns are dropped
    from a copy, so the function can be called repeatedly on the same frame.

    Args:
        dataframe: pandas DataFrame containing the ``'p1_win'`` label column,
            the identifier columns ``'match_hash'``, ``'tourney_name'``,
            ``'tourney_date'``, ``'p1_name'``, ``'p2_name'``, and the feature
            columns. Every remaining column becomes a float32 scalar input.

    Returns:
        A tuple ``(data_model, features_dict, labels)`` where ``data_model``
        is a ``MyHyperModel`` wrapping the preprocessing head,
        ``features_dict`` maps each feature column name to a NumPy array, and
        ``labels`` is the ``'p1_win'`` column as a pandas Series.

    Raises:
        KeyError: if the label column or any identifier column is missing.
    """
    labels = dataframe['p1_win'].copy()
    dataframe_features = dataframe.drop(columns=[
        'p1_win',
        'match_hash',
        'tourney_name',
        'tourney_date',
        'p1_name',
        'p2_name',
    ])

    # One symbolic scalar input per remaining column, keyed by column name.
    inputs = {
        name: tf.keras.Input(shape=(1,), name=name, dtype=tf.float32)
        for name in dataframe_features.columns
    }

    # Numeric columns in frame order, so the concatenated tensor and the
    # array used to adapt the normalizer share the same column layout.
    numeric_names = [name for name in inputs if name in NUMERIC_FEATURES]

    # Normalize the numeric inputs using statistics computed from this frame.
    x = keras.layers.Concatenate()([inputs[name] for name in numeric_names])
    norm = keras.layers.Normalization()
    norm.adapt(dataframe_features[numeric_names].to_numpy())
    all_numeric_inputs = norm(x)

    # Non-numeric inputs are appended after the normalized block, unscaled.
    passthrough_inputs = [
        tensor for name, tensor in inputs.items()
        if name not in NUMERIC_FEATURES
    ]

    preprocessed_inputs_cat = keras.layers.Concatenate()(
        [all_numeric_inputs] + passthrough_inputs
    )
    preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

    features_dict = {
        name: np.array(values) for name, values in dataframe_features.items()
    }
    # Run the head once on the real data; the result is discarded, so this
    # only serves to surface any input mismatch early.
    preprocessing(features_dict)

    data_model = MyHyperModel(preprocessing, inputs)

    return data_model, features_dict, labels

In [26]:
# Training split: the tunable hypermodel plus its feature arrays and labels.
data_model, features_dict, labels = get_model(dataframe)
# data_model.fit(x=features_dict, y=labels, epochs=50)
# data_model.save('test')

# Validation split: only the features and labels are kept; the hypermodel
# built alongside them is discarded.
_, val_features_dict, val_labels = get_model(validation)

# Bayesian search over the hypermodel's hyperparameters, minimising
# validation loss; trial state is stored under the 'tuning' directory.
tuner = kt.BayesianOptimization(
    data_model,
    objective='val_loss',
    max_trials=100,
    directory='tuning',
)
tuner.search(
    x=features_dict,
    y=labels,
    validation_data=(val_features_dict, val_labels),
    epochs=100,
    # Stop a trial once val_loss has failed to improve for 4 epochs.
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4)],
)
# num_rows = dataframe.shape[0]
# training_end_index = math.floor((2/3) * num_rows)
# # train_dataframe = dataframe.iloc[:training_end_index]
# # val_dataframe = dataframe.iloc[training_end_index:]
# train_dataframe = dataframe.iloc[:300]
# val_dataframe = dataframe.iloc[300:450]

INFO:tensorflow:Reloading Tuner from tuning\untitled_project\tuner0.json

Search: Running Trial #11

Value             |Best Value So Far |Hyperparameter
380               |420               |hidden_size
0.15              |0                 |dropout
0.00029284        |0.0034253         |learning_rate
50                |1050              |batch_size

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

KeyboardInterrupt: 

In [None]:
# Ask the tuner for its single top-ranked model and hyperparameter set.
best_model = tuner.get_best_models(num_models=1)[0]
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

print('best', best_hp.values)


# reloaded = tf.keras.models.load_model('test')
# validation = pd.read_csv('../data/matches.csv')[76425:80000]
# _, features_dict, labels = get_model(validation)
# results = reloaded.evaluate(features_dict, labels)
# print(results)
# predictions = reloaded.predict(features_dict)
# for i in range(76425, 76525):
#     pred_index = i - 76425
#     print('Prediction:', predictions[pred_index], 'Result:', labels[i])

best {'hidden_size': 420, 'dropout': 0.0, 'learning_rate': 0.003425317024450299, 'batch_size': 1050}
