In [72]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import kerastuner as kt
from kerastuner import HyperModel

NUMERIC_FEATURES = [
    'p1_height',
    'p2_height',
    'p1_age',
    'p2_age',
    'p1_rating',
    'p2_rating',
    'p1_dev',
    'p2_dev',
    'p1_surface_rating',
    'p2_surface_rating',
    'p1_surface_dev',
    'p2_surface_dev',
    'p1_w',
    'p2_w',
    'p1_l',
    'p2_l',
    'p1_surface_w',
    'p2_surface_w',
    'p1_surface_l',
    'p2_surface_l',
    'p1_inactive_days',
    'p2_inactive_days',
    'p1_recent_rating',
    'p2_recent_rating'
]
CATEGORICAL_FEATURES = [
    'is_hard',
    'is_clay',
    'is_grass',
    'is_bo5',
    'p1_lefty',
    'p2_lefty',
    'p1_home',
    'p2_home'
    
]

dataframe = pd.read_csv('../data/matches.csv')[:76425]
validation = pd.read_csv('../data/matches.csv')[76425:]


def build_model(preprocessing_head, inputs, hidden_layer, learning_rate, dropout):
  body = tf.keras.Sequential([
    keras.layers.InputLayer(input_shape=(32,)),
    keras.layers.Dense(hidden_layer, activation='relu'),
    keras.layers.Dropout(dropout),
    keras.layers.Dense(1, activation='sigmoid')
  ])

  preprocessed_inputs = preprocessing_head(inputs)
  result = body(preprocessed_inputs)
  model = tf.keras.Model(inputs, result)

  model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                optimizer=tf.keras.optimizers.Adam(learning_rate),
                metrics='acc')
  return model


In [73]:
class MyHyperModel(HyperModel):
    def __init__(self, preprocessing_head, inputs):
        self.preprocessing_head = preprocessing_head
        self.inputs = inputs
    def build(self, hp):
        body = tf.keras.Sequential([
            keras.layers.InputLayer(input_shape=(32,)),
            keras.layers.Dense(hp.Int('hidden_size', 80, 400), activation='relu'),
            keras.layers.Dropout(hp.Float('dropout', 0.35, 0.8)),
            keras.layers.Dense(1, activation='sigmoid')
        ])

        preprocessed_inputs = self.preprocessing_head(self.inputs)
        result = body(preprocessed_inputs)
        model = tf.keras.Model(self.inputs, result)

        model.compile(
                      loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                      optimizer=tf.keras.optimizers.Adam(hp.Float('learning_rate', 0.001, 0.01)),
                      metrics=['acc']
                      )
        return model
    
    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=hp.Int("batch_size", 2000, 6000),
            **kwargs,
        )

In [74]:
def get_model(dataframe, hidden_layer, learning_rate, dropout):
    labels = dataframe.pop('p1_win')
    dataframe.pop('match_hash')
    dataframe.pop('tourney_name')
    dataframe.pop('tourney_date')
    dataframe.pop('p1_name')
    dataframe.pop('p2_name')

    dataframe_features = dataframe.copy()

    inputs = {}

    # match column names with input objects
    for name, column in dataframe_features.items():
        dtype = tf.float32
        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=dtype)

    numeric_inputs = {name:input for name,input in inputs.items()
                    if name in NUMERIC_FEATURES}

    # normalize the numeric inputs and gather them in an array
    x = keras.layers.Concatenate()(list(numeric_inputs.values()))
    norm = keras.layers.Normalization()
    norm.adapt(np.array(dataframe[numeric_inputs.keys()]))
    all_numeric_inputs = norm(x)

    preprocessed_inputs = [all_numeric_inputs]

    for name, input in inputs.items():
        if name in NUMERIC_FEATURES:
            continue
        preprocessed_inputs.append(input)

    preprocessed_inputs_cat = keras.layers.Concatenate()(preprocessed_inputs)
    preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

    features_dict = {name: np.array(value) for name, value in dataframe_features.items()}
    f_dict = {name:values for name, values in features_dict.items()}
    preprocessing(f_dict)

    # data_model = MyHyperModel(preprocessing, inputs)
    data_model = build_model(preprocessing, inputs, hidden_layer, learning_rate, dropout)

    return data_model, features_dict, labels

In [75]:
for hidden_layer in range(400, 601, 100):
    for learning_rate in range(8, 15):
        learning_rate = learning_rate / 1000
        for int_dropout in range(5, 15, 2):
            dropout = int_dropout / 100
            dataframe = pd.read_csv('../data/matches.csv')[:76425]
            validation = pd.read_csv('../data/matches.csv')[76425:]
            data_model, features_dict, labels = get_model(dataframe, hidden_layer, learning_rate, dropout)
            _, val_features_dict, val_labels = get_model(validation, hidden_layer, learning_rate, dropout)

            for trial in range(2):
                data_model.fit(x=features_dict, y=labels, epochs=20000, batch_size=2000, 
                            callbacks=[tf.keras.callbacks.EarlyStopping('loss', patience=20)],
                            validation_data=(val_features_dict, val_labels),
                        )
                data_model.save(f'hidden{hidden_layer}lr{learning_rate}dropout{dropout}trial{trial}')
# data_model.save('test1')

# tuner = kt.BayesianOptimization(data_model, objective='loss', directory='training_weights', max_trials=200)
# tuner.search(x=features_dict, y=labels, epochs=20000, 
#              validation_data=(val_features_dict, val_labels),
#              callbacks=[tf.keras.callbacks.EarlyStopping('loss', patience=15)]
#             )
# num_rows = dataframe.shape[0]
# training_end_index = math.floor((2/3) * num_rows)
# # train_dataframe = dataframe.iloc[:training_end_index]
# # val_dataframe = dataframe.iloc[training_end_index:]
# train_dataframe = dataframe.iloc[:300]
# val_dataframe = dataframe.iloc[300:450]

Epoch 1/20000
Epoch 2/20000
Epoch 3/20000
Epoch 4/20000
Epoch 5/20000
Epoch 6/20000
Epoch 7/20000
Epoch 8/20000
Epoch 9/20000
Epoch 10/20000
Epoch 11/20000
Epoch 12/20000
Epoch 13/20000
Epoch 14/20000
Epoch 15/20000
Epoch 16/20000
Epoch 17/20000
Epoch 18/20000
Epoch 19/20000
Epoch 20/20000
Epoch 21/20000
Epoch 22/20000
Epoch 23/20000
Epoch 24/20000
Epoch 25/20000
Epoch 26/20000
Epoch 27/20000
Epoch 28/20000
Epoch 29/20000
Epoch 30/20000
Epoch 31/20000
Epoch 32/20000
Epoch 33/20000
Epoch 34/20000
Epoch 35/20000
Epoch 36/20000
Epoch 37/20000
Epoch 38/20000
Epoch 39/20000
Epoch 40/20000
Epoch 41/20000
Epoch 42/20000
Epoch 43/20000
Epoch 44/20000
Epoch 45/20000
Epoch 46/20000
Epoch 47/20000
Epoch 48/20000
Epoch 49/20000
Epoch 50/20000
Epoch 51/20000
Epoch 52/20000
Epoch 53/20000
Epoch 54/20000
Epoch 55/20000
Epoch 56/20000
Epoch 57/20000
Epoch 58/20000
Epoch 59/20000
Epoch 60/20000
Epoch 61/20000
Epoch 62/20000
Epoch 63/20000
Epoch 64/20000
Epoch 65/20000
Epoch 66/20000
Epoch 67/20000
Epoc



INFO:tensorflow:Assets written to: hidden400lr0.008dropout0.05trial0\assets


INFO:tensorflow:Assets written to: hidden400lr0.008dropout0.05trial0\assets


Epoch 1/20000
Epoch 2/20000
Epoch 3/20000
Epoch 4/20000
Epoch 5/20000
Epoch 6/20000
Epoch 7/20000
Epoch 8/20000
Epoch 9/20000
Epoch 10/20000
Epoch 11/20000
Epoch 12/20000
Epoch 13/20000
Epoch 14/20000
Epoch 15/20000
Epoch 16/20000
Epoch 17/20000
Epoch 18/20000
Epoch 19/20000
Epoch 20/20000
Epoch 21/20000
Epoch 22/20000
Epoch 23/20000
Epoch 24/20000
Epoch 25/20000
Epoch 26/20000
Epoch 27/20000
Epoch 28/20000
Epoch 29/20000
Epoch 30/20000
Epoch 31/20000
Epoch 32/20000
Epoch 33/20000
Epoch 34/20000
Epoch 35/20000
Epoch 36/20000
Epoch 37/20000
Epoch 38/20000
Epoch 39/20000
Epoch 40/20000
Epoch 41/20000
Epoch 42/20000
Epoch 43/20000
Epoch 44/20000
Epoch 45/20000
Epoch 46/20000
Epoch 47/20000
Epoch 48/20000
Epoch 49/20000
Epoch 50/20000
Epoch 51/20000




INFO:tensorflow:Assets written to: hidden400lr0.008dropout0.05trial1\assets


INFO:tensorflow:Assets written to: hidden400lr0.008dropout0.05trial1\assets


Epoch 1/20000


In [None]:
reloaded = tf.keras.models.load_model('test1')
results = reloaded.evaluate(val_features_dict, val_labels)
val_predictions = reloaded.predict(val_features_dict)
val_labels_list = val_labels.tolist()

predictions = reloaded.predict(features_dict)
labels_list = labels.tolist()

from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

x, y = calibration_curve(val_labels_list, val_predictions, n_bins = 100)
plt.plot(y, x, 'b')
plt.plot([0, 1], [0, 1])

x, y = calibration_curve(labels_list, predictions, n_bins = 100)
plt.plot(y, x, 'r')
plt.plot([0, 1], [0, 1])


#_____________________________________________________________
# validation = pd.read_csv('../data/matches.csv')[85000:]
# _, val_features_dict, val_labels = get_model(validation)


# data_model.load_weights('asdf/metrics')
# results = data_model.evaluate(val_features_dict, val_labels)
# predictions = data_model.predict(val_features_dict)
# labels_list = val_labels.tolist()
# print(len(val_features_dict), len(labels_list))
# print(val_features_dict)

# for i in range(0, len(predictions)):
#     print('Prediction:', predictions[i], 'Result:', labels_list[i])



OSError: No file or directory found at test1