In [1]:
import pandas as pd
import numpy as np
import math
import glob

from model import TennisModel
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam

In [2]:
# Files
DATASET_FULL = 'dataset/dataset_full_2023-06-26.csv'
DATASET = 'dataset/dataset_2023-06-26.csv'
COUNTRY_CODES = 'dataset/ioc_iso.csv'
TOURNAMENTS = 'dataset/tournaments.csv'
PLAYERS = 'dataset/atp_players.csv'
NEW_DATASET = 'dataset/latest_matches/atp_matches_2023.csv'

LATEST_TOUR_MATCHES = 'dataset/latest_matches/atp_matches_2023.csv'
LATEST_CHALLENGER_MATCHES = 'dataset/latest_matches/atp_matches_qual_chall_2023.csv'
LATEST_FUTURES_MATCHES = 'dataset/latest_matches/atp_matches_futures_2023.csv'
LATEST_MATCHES = glob.glob('dataset/latest_matches/*.csv')
ALL_MATCHES = glob.glob('dataset/unused_datasets/*.csv')

pd.set_option('mode.chained_assignment', None)

In [3]:
df = pd.read_csv(DATASET, keep_default_na=False)

compare_df = pd.read_csv(DATASET, keep_default_na=False)
compare_df.index = range(compare_df.shape[0])

are_equal = df.equals(compare_df)

if are_equal:
    print("The DataFrames are equal.")
else:
    print("The DataFrames are not equal.")

The DataFrames are equal.


In [3]:
model = TennisModel(DATASET, DATASET_FULL, TOURNAMENTS, COUNTRY_CODES, PLAYERS, 'model_files/tennis_model_2l_2023-08-07.keras')

In [4]:
model.add_matches(LATEST_MATCHES)

Unnamed: 0,tourney_name,tourney_level,tourney_date,p1_id,p1_name,p1_age,p2_id,p2_name,p2_age,best_of,...,p2_tourney_losses,p1_last2w_games,p2_last2w_games,p1_weeks_inactive,p2_weeks_inactive,p1_cwins,p2_cwins,p1_closses,p2_closses,new_col
0,Bloemfontein,A,19680108,109950,Francois Van Der Merwe,23.6,210786,J Oosthuysen,23.6,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,Bloemfontein,A,19680108,103616,Andre Van Der Merwe B,23.6,100011,Torben Ulrich,39.2,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,Bloemfontein,A,19680108,202674,Keith Brebnor,20.4,106867,Derek Schroder,23.6,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,Bloemfontein,A,19680108,109914,Jackie Saul,23.6,210785,P Bonfa,23.6,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,Bloemfontein,A,19680108,107078,George Rudman,23.6,210784,Byron Humphrey,23.6,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870049,Washington,A,20230731,134868,Tallon Griekspoor,27.0,200670,J J Wolf,24.6,3,...,2.0,2.0,6.0,0.0,0.0,2.0,2.0,0.0,0.0,1
870050,Washington,A,20230731,126203,Taylor Fritz,25.7,111442,Jordan Thompson,29.2,3,...,5.0,6.0,7.0,0.0,0.0,7.0,3.0,0.0,0.0,1
870051,Washington,A,20230731,105554,Daniel Evans,33.1,105777,Grigor Dimitrov,32.2,3,...,7.0,4.0,3.0,0.0,0.0,3.0,6.0,0.0,0.0,1
870052,Washington,A,20230731,134868,Tallon Griekspoor,27.0,126203,Taylor Fritz,25.7,3,...,3.0,3.0,7.0,0.0,0.0,3.0,8.0,0.0,0.0,1


In [6]:
model.df.to_csv('dataset/dataset_2023-08-07.csv', index=False)
model.df_symm.to_csv('dataset/dataset_2023-08-07_cnn.csv', index=False)
model.df_full.to_csv('dataset/dataset_full_2023-08-07.csv', index=False)

In [4]:
cols = ['Tournament', 'Surface', 'Round', 'Winner', 'Loser', 'B365W', 'B365L']
wimbledon = pd.read_csv('dataset/wimbledon.csv', usecols=cols)
player_df = pd.read_csv('dataset/atp_players.csv')
player_df['full_name'] = player_df['name_first'] + ' ' + player_df['name_last']

In [6]:
one_unit_sizing_1l = 0
one_unit_sizing_2l = 0
one_unit_sizing_3l = 0
kelly_sizing_1l = 0
kelly_sizing_2l = 0
kelly_sizing_3l = 0

for index, row in wimbledon.iterrows():
    winner = row['Winner']
    loser = row['Loser']

    if len(winner.split(' ')) > 2:
        winner_last_name = winner.split(' ')[:-1]
        winner_last_name = ' '.join(winner_last_name)
        winner_first_initial = winner.split(' ')[-1]
    else:
        winner_last_name = winner.split(' ')[0]
        winner_first_initial = winner.split(' ')[1][0]

    if len(loser.split(' ')) > 2:
        loser_last_name = loser.split(' ')[:-1]
        loser_last_name = ' '.join(loser_last_name)
        loser_first_initial = loser.split(' ')[-1]
    else:
        loser_last_name = loser.split(' ')[0]
        loser_first_initial = loser.split(' ')[1][0]

    if winner_last_name == 'Barrios': 
        p1 = 'Tomas Barrios Vera'
    elif winner_last_name == 'Davidovich':
        p1 = 'Alejandro Davidovich Fokina'
    elif winner_last_name == 'Carballes':
        p1 = 'Roberto Carballes Baena'
    elif winner_last_name == 'O Connell':
        p1 = 'Christopher Oconnell'
    elif winner_last_name == 'Thompson':
        p1 = 'Jordan Thompson' 
    elif winner_last_name == 'Zhang':
        p1 = 'Zhizhen Zhang'
    else:
        p1_vals = player_df[player_df['name_last'] == winner_last_name].dropna(subset=['dob'])
        if len(p1_vals) == 0:
            raise Exception(f'winner {winner} does not exist in database.')
        elif len(p1_vals) == 1:
            p1 = p1_vals['full_name'].values[0]
        else:
            p1_same_initials = p1_vals[p1_vals['name_first'].str[0] == winner_first_initial]
            if len(p1_same_initials) == 0:
                raise Exception(f'winner {winner} initial does not exist in database')
            elif len(p1_same_initials) == 1:
                p1 = p1_same_initials['full_name'].values[0]
            else:
                p1_youngest_player = p1_same_initials.sort_values('dob')
                p1 = p1_youngest_player.iloc[-1]['full_name']

    if loser_last_name == 'Barrios':
        p2 = 'Tomas Barrios Vera'
    elif loser_last_name == 'Ramos-Vinolas':
        p2 = 'Albert Ramos'
    elif loser_last_name == 'Auger-Aliassime':
        p2 = 'Felix Auger Aliassime'
    elif loser_last_name == 'Zapata':
        p2 = 'Bernabe Zapata Miralles'
    elif loser_last_name == 'Van':
        p2 = 'Luca Van Assche'
    elif loser_last_name == 'Marozsan':
        p2 = 'Fabian Marozsan'
    elif loser_last_name == 'Davidovich':
        p2 = 'Alejandro Davidovich Fokina'
    elif loser_last_name == 'Carballes':
        p2 = 'Roberto Carballes Baena'
    elif loser_last_name == 'O Connell':
        p2 = 'Christopher Oconnell'
    elif loser_last_name == 'Thompson':
        p2 = 'Jordan Thompson'
    elif loser_last_name == 'Nakashima':
        p2 = 'Brandon Nakashima'
    elif loser_last_name == 'Zhang':
        p2 = 'Zhizhen Zhang'
    else:       
        p2_vals = player_df[player_df['name_last'] == loser_last_name].dropna(subset=['dob'])
        if len(p2_vals) == 0:
            raise Exception(f'loser {loser} does not exist in database.')
        elif len(p2_vals) == 1:
            p2 = p2_vals['full_name'].values[0]
        else:
            p2_same_initials = p2_vals[p2_vals['name_first'].str[0] == loser_first_initial]
            if len(p2_same_initials) == 0:
                raise Exception(f'loser {loser} initial does not exist in database')
            elif len(p2_same_initials) == 1:
                p2 = p2_same_initials['full_name'].values[0]
            else:
                p2_youngest_player = p2_same_initials.sort_values('dob')
                p2 = p2_youngest_player.iloc[-1]['full_name']
    
    match = {
        'tourney_name' : 'Wimbledon',
        'surface' : 'Grass',
        'tourney_level' : 'G',
        'tourney_date' : 20230716,
        'best_of' : 5,
        'file_origin' : 'tour',
        'p1_name' : p1,
        'p2_name' : p2,   
    }
        
    preds = model.predict_match(match, 0)
    if preds['p1_win_prob'] > 0.5:
        one_unit_sizing_1l += (row['B365W'] - 1)
        print(f'CNN_1L: Unit bet was good for {p1} and {p2} -> increasing record by {(row["B365W"]) - 1} units')
    else:
        one_unit_sizing_1l -= 1
        print(f'CNN_1L: Unit bet was bad for {p1} and {p2} -> decreasing record by {1} units')
    print(f'CNN_1L: Unit record is now {one_unit_sizing_1l}')

    preds = model.predict_match(match, 1)
    if preds['p1_win_prob'] > 0.5:
        one_unit_sizing_2l += (row['B365W'] - 1)
        print(f'CNN_2L: Unit bet was good for {p1} and {p2} -> increasing record by {(row["B365W"]) - 1} units')
    else:
        one_unit_sizing_2l -= 1
        print(f'CNN_2L: Unit bet was bad for {p1} and {p2} -> decreasing record by {1} units')
    print(f'CNN_2L: Unit record is now {one_unit_sizing_2l}')

    preds = model.predict_match(match, 2)
    if preds['p1_win_prob'] > 0.5:
        one_unit_sizing_3l += (row['B365W'] - 1)
        print(f'CNN_3L: Unit bet was good for {p1} and {p2} -> increasing record by {(row["B365W"]) - 1} units')
    else:
        one_unit_sizing_3l -= 1
        print(f'CNN_3L: Unit bet was bad for {p1} and {p2} -> decreasing record by {1} units')
    print(f'CNN_3L: Unit record is now {one_unit_sizing_3l}')

    p1_bet, p2_bet = model.get_bet(match, row['B365W'], row['B365L'], 0)
    if p1_bet > 0 :
        print(f'CNN_1L: kelly bet was good for {p1} and {p2} -> increasing record by {(p1_bet * row["B365W"]) - p1_bet} units')
        kelly_sizing_1l += ((p1_bet * row["B365W"]) - p1_bet)
    elif p2_bet > 0:
        print(f'CNN_1L: kelly bet was bad for {p1} and {p2} -> decreasing record by {p2_bet} units')
        kelly_sizing_1l -= p2_bet
    else:
        print(f'CNN_1L: No kelly bet was made for {p1} and {p2}')
    print(f'CNN_1L: kelly record is now {kelly_sizing_1l}U')

    p1_bet, p2_bet = model.get_bet(match, row['B365W'], row['B365L'], 1)
    if p1_bet > 0:
        print(f'CNN_2L: kelly bet was good for {p1} and {p2} -> increasing record by {(p1_bet * row["B365W"]) - p1_bet} units')
        kelly_sizing_2l += ((p1_bet * row["B365W"]) - p1_bet)
    elif p2_bet > 0:
        print(f'CNN_2L: kelly bet was bad for {p1} and {p2} -> decreasing record by {p2_bet} units')
        kelly_sizing_2l -= p2_bet
    else:
        print(f'CNN_2L: No kelly bet was made for {p1} and {p2}')
    print(f'CNN_2L: kelly record is now {kelly_sizing_2l}U')

    p1_bet, p2_bet = model.get_bet(match, row['B365W'], row['B365L'], 2)
    if p1_bet > 0:
        print(f'CNN_3L: kelly bet was good for {p1} and {p2} -> increasing record by {(p1_bet * row["B365W"]) - p1_bet} units')
        kelly_sizing_3l += ((p1_bet * row["B365W"]) - p1_bet)
    elif p2_bet > 0:
        print(f'CNN_3L: kelly bet was bad for {p1} and {p2} -> decreasing record by {p2_bet} units')
        kelly_sizing_3l -= p2_bet
    else:
        print(f'CNN_3L: No kelly bet was made for {p1} and {p2}')
    print(f'CNN_3L: kelly record is now {kelly_sizing_3l}U')
        

print(f'Final record for CNN_1L unit bet is {one_unit_sizing_1l}U')
print(f'Final record for CNN_2L unit bet is {one_unit_sizing_2l}U')
print(f'Final record for CNN_3L unit bet is {one_unit_sizing_3l}U')
print(f'Final record for CNN_1L kelly bet is {kelly_sizing_1l}U')
print(f'Final record for CNN_2L kelly bet is {kelly_sizing_2l}U')
print(f'Final record for CNN_3L kelly bet is {kelly_sizing_3l}U')

CNN_1L: Unit bet was bad for Tomas Barrios Vera and Sebastian Baez -> decreasing record by 1 units
CNN_1L: Unit record is now -1
CNN_2L: Unit bet was bad for Tomas Barrios Vera and Sebastian Baez -> decreasing record by 1 units
CNN_2L: Unit record is now -1
CNN_3L: Unit bet was bad for Tomas Barrios Vera and Sebastian Baez -> decreasing record by 1 units
CNN_3L: Unit record is now -1
CNN_1L: No kelly bet was made for Tomas Barrios Vera and Sebastian Baez
CNN_1L: kelly record is now 0U
CNN_2L: No kelly bet was made for Tomas Barrios Vera and Sebastian Baez
CNN_2L: kelly record is now 0U
CNN_3L: No kelly bet was made for Tomas Barrios Vera and Sebastian Baez
CNN_3L: kelly record is now 0U
CNN_1L: Unit bet was good for Lorenzo Musetti and Juan Pablo Varillas -> increasing record by 0.06000000000000005 units
CNN_1L: Unit record is now -0.94
CNN_2L: Unit bet was good for Lorenzo Musetti and Juan Pablo Varillas -> increasing record by 0.06000000000000005 units
CNN_2L: Unit record is now -0.9

In [2]:
# Add matches below that are for ongoing tourneys that data have not been updated yet
latest_matches = [
    {
        'tourney_name' : 'Canadian Masters',
        'surface' : 'Hard',
        'tourney_level' : 'G',
        'tourney_date' : 20230808,
        'best_of' : 3,
        'file_origin' : 'tour',
        'winner_name' : 'Mackenzie McDonald',
        'loser_name' : 'Milos Raonic',
        'winner' : 
    },
    {
        'tourney_name' : 'Canadian Masters',
        'surface' : 'Hard',
        'tourney_level' : 'G',
        'tourney_date' : 20230808,
        'best_of' : 3,
        'file_origin' : 'tour',
        'p1_name' : 'Adrian Mannarino',
        'p2_name' : 'Taro Daniel',
    },
    {
        'tourney_name' : 'Canadian Masters',
        'surface' : 'Hard',
        'tourney_level' : 'G',
        'tourney_date' : 20230808,
        'best_of' : 3,
        'file_origin' : 'tour',
        'p1_name' : 'Mackenzie McDonald',
        'p2_name' : 'Milos Raonic',
    },
]

In [6]:
new_matches = pd.DataFrame(matches)

In [18]:
match = {
        'tourney_name' : 'Wimbledon',
        'surface' : 'Grass',
        'tourney_level' : 'G',
        'tourney_date' : 20230716,
        'best_of' : 5,
        'file_origin' : 'tour',
        'p1_name' : 'Novak Djokovic',
        'p2_name' : 'Andrey Rublev',
    }
preds = model.predict_match(match)

In [20]:
preds['p1_win_prob']

0.46600647270679474

In [64]:
#'p1_name' : 'Tomas Barrios Vera',
#'p2_name' : 'Sebastian Baez',
#'p1_name' : 'Lorenzo Musetti',
#'p2_name' : 'Juan Pablo Varillas',
# 'p1_name' : 'Aslan Karatsev',
# 'p2_name' : 'Luca Van Assche',
# 'p1_name' : 'Jordon Thompson',
# 'p2_name' : 'Brandon Nakashima',
# 'p1_name' : 'Andrey Rublev',
# 'p2_name' : 'Max Purcell',

match = {
    'tourney_name' : 'Wimbledon',
    'surface' : 'Grass',
    'tourney_level' : 'G',
    'tourney_date' : 20230716,
    'best_of' : 5,
    'file_origin' : 'tour',
    'p1_name' : 'Maximilian Marterer',
    'p2_name' : 'Borna Gojo',
}

# match = {
#     'tourney_name' : 'Hamburg',
#     'surface' : 'Clay',
#     'tourney_level' : 'A',
#     'tourney_date' : 20230716,
#     'best_of' : 3,
#     'file_origin' : 'tour',
#     'p1_name' : 'Maximilian Marterer',
# 'p2_name' : 'Borna Gojo',
# }

In [65]:
model.get_bet(match, 2, 1.8)

type of preds is {'p1_win_prob': 0.359793484210968, 'p2_win_prob': 0.640206515789032}
Model suggests a 0U bet for Maximilian Marterer
Model suggests a 0.0U bet for Borna Gojo


0U

In [35]:
def convert_to_other_dec(decimal):
    prob = 1 / decimal
    return 1 / ((1 - prob))
convert_to_other_dec(1.66)

2.5151515151515156

In [7]:
df = pd.read_csv('dataset/dataset_2023-08-07_cnn.csv')

In [8]:
# Data splitting/scaling
n_train = math.ceil(df.shape[0] * 0.8)
train_df = df[:n_train]
test_df = df[n_train:]
X_train = train_df.drop(['winner'], axis=1)
y_train = train_df['winner']
X_test = test_df.drop(['winner'], axis=1)
y_test = test_df['winner']
X = df.drop(['winner'], axis=1)
y = df['winner']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

full_scaler = StandardScaler()
X_scaled = full_scaler.fit_transform(X)

In [9]:
# SOME TESTING I DID
from tensorflow import keras

def build_model(units, dropout, lr):
    # Create the model
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=84, input_shape=(84,), activation='relu'))
    model.add(keras.layers.Dense(units=units, activation='relu'))
    model.add(keras.layers.Dropout(dropout))
    model.add(keras.layers.Dense(units=1, activation='sigmoid'))
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_model_2l(units_1, units_2, dropout_1, dropout_2, lr):
    # Create the model
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=84, input_shape=(84,), activation='relu'))
    model.add(keras.layers.Dense(units=units_1, activation='relu'))
    model.add(keras.layers.Dropout(dropout_1))
    model.add(keras.layers.Dense(units=units_2, activation='relu'))
    model.add(keras.layers.Dropout(dropout_2))
    model.add(keras.layers.Dense(units=1, activation='sigmoid'))
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

def build_model_3l(units_1, units_2, units_3, dropout_1, dropout_2, dropout_3, lr):
    # Create the model
    model = keras.Sequential()
    model.add(keras.layers.Dense(units=84, input_shape=(84,), activation='relu'))
    model.add(keras.layers.Dense(units=units_1, activation='relu'))
    model.add(keras.layers.Dropout(dropout_1))
    model.add(keras.layers.Dense(units=units_2, activation='relu'))
    model.add(keras.layers.Dropout(dropout_2))
    model.add(keras.layers.Dense(units=units_3, activation='relu'))
    model.add(keras.layers.Dropout(dropout_3))
    model.add(keras.layers.Dense(units=1, activation='sigmoid'))
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# prev_best_model = build_model(128, 0.2, 0.01)
# prev_best_model.fit(X_train_scaled, y_train, batch_size=5696, epochs=64)

# BEST MODEL SO FAR
best_model = build_model(192, 0.45, 0.00048509380878639)
best_model.fit(X_scaled, y, batch_size=256, epochs=64)
best_model.save('model_files/tennis_model_1l_2023-08-07.keras')

best_model_2l = build_model_2l(480, 512, 0.35, 0.35, 0.00073156663)
best_model_2l.fit(X_scaled, y, batch_size=256, epochs=256)
best_model_2l.save('model_files/tennis_model_2l_2023-08-07.keras')

best_model_3l = build_model_3l(224, 32, 96, 0.3, 0.45, 0.2, 0.004828779)
best_model_3l.fit(X_scaled, y, batch_size=5696, epochs=64)
best_model_3l.save('model_files/tennis_model_3l_2023-08-07.keras')

# test_model_1 = build_model(256, 0.2, 0.01)
# test_model_1.fit(X_train_scaled, y_train, batch_size=5696, epochs=64)

# test_model_2 = build_model(192, 0.2, 0.01)
# test_model_2.fit(X_train_scaled, y_train, batch_size=5696, epochs=64)


Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64
Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epo

: 

In [5]:
true_labels = y_test.values.flatten()

def get_accuracy(model, data):
    preds = model.predict(data)
    pred_labels = np.round(preds).flatten()
    return np.mean(pred_labels == true_labels)

# print(f'Model #1 Accuracy: {get_accuracy(prev_best_model, X_test_scaled)}')
#VAL accuracy got abt 0.859816312789917
print(f'Model 1 layer Accuracy: {get_accuracy(best_model, X_test_scaled)}')
print(f'Model 2 layers Accuracy: {get_accuracy(best_model_2l, X_test_scaled)}')
print(f'Model 3 layers Accuracy: {get_accuracy(best_model_3l, X_test_scaled)}')
# print(f'Model #2 Accuracy: {get_accuracy(best_model, X_test_scaled)}')
# print(f'Model #3 Accuracy: {get_accuracy(test_model_1, X_test_scaled)}')
# print(f'Model #4 Accuracy: {get_accuracy(test_model_2, X_test_scaled)}')

Model 1 layer Accuracy: 0.8638343504342737
Model 2 layers Accuracy: 0.8668928599225716
Model 3 layers Accuracy: 0.8617847743304285


In [14]:
true_labels = y_test.values.flatten()

def get_accuracy(model, data):
    preds = model.predict(data)
    pred_labels = np.round(preds).flatten()
    return np.mean(pred_labels == true_labels)

# print(f'Model #1 Accuracy: {get_accuracy(prev_best_model, X_test_scaled)}')
#VAL accuracy got abt 0.859816312789917
print(f'Model 1 layer Accuracy: {get_accuracy(best_model, X_test_scaled)}')
print(f'Model 2 layers Accuracy: {get_accuracy(best_model_2l, X_test_scaled)}')
print(f'Model 3 layers Accuracy: {get_accuracy(best_model_3l, X_test_scaled)}')
# print(f'Model #2 Accuracy: {get_accuracy(best_model, X_test_scaled)}')
# print(f'Model #3 Accuracy: {get_accuracy(test_model_1, X_test_scaled)}')
# print(f'Model #4 Accuracy: {get_accuracy(test_model_2, X_test_scaled)}')

Model 1 layer Accuracy: 0.8620240356758845
Model 2 layers Accuracy: 0.8654947665184015


In [4]:
# Data splitting/scaling
n_train = math.ceil(df.shape[0] * 0.8)
train_df = df[:n_train]
test_df = df[n_train:]
X_train = train_df.drop(['winner'], axis=1)
y_train = train_df['winner']
X_test = test_df.drop(['winner'], axis=1)
y_test = test_df['winner']
X = df.drop(['winner'], axis=1)
y = df['winner']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

full_scaler = StandardScaler()
X_scaled = full_scaler.fit_transform(X)

In [26]:
X = df.drop(['winner'], axis=1)
y = df['winner']

full_scaler = StandardScaler()
X_scaled = full_scaler.fit_transform(X)

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
import kerastuner as kt

# Define the HyperModel class
class MyHyperModel(kt.HyperModel):
    def __init__(self, input_shape):
        self.input_shape = input_shape

    def build(self, hp):
        model = keras.Sequential()
        model.add(layers.Dense(units=84, input_shape=self.input_shape, activation='relu'))
        model.add(layers.Dense(units=hp.Int('units_1', min_value=32, max_value=512, step=32), activation='relu'))
        model.add(layers.Dropout(hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.05)))
        model.add(layers.Dense(units=hp.Int('units_2', min_value=32, max_value=512, step=32), activation='relu'))
        model.add(layers.Dropout(hp.Float('dropout_2', min_value=0.2, max_value=0.5, step=0.05)))
        model.add(layers.Dense(units=hp.Int('units_3', min_value=32, max_value=512, step=32), activation='relu'))
        model.add(layers.Dropout(hp.Float('dropout_3', min_value=0.2, max_value=0.5, step=0.05)))
        model.add(layers.Dense(units=1, activation='sigmoid'))
        optimizer = Adam(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='LOG'))
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=hp.Choice("batch_size", [16, 32, 64, 128, 256, 512, 1024, 2048, 5696]),
            epochs=hp.Choice("epochs", [16, 32, 59, 64, 128, 256]),
            **kwargs,
        )

# Initialize the MyHyperModel
hypermodel = MyHyperModel(input_shape=(84,))

# Initialize the BayesianOptimization tuner
tuner = kt.BayesianOptimization(hypermodel,
                                objective='val_accuracy',
                                max_trials=100,
                                directory='keras_results',
                                project_name='my_project')

# Perform the hyperparameter search without specifying epochs and batch_size
tuner.search(X_scaled, y, validation_split=0.2, callbacks=[tf.keras.callbacks.EarlyStopping('val_loss', patience=3)])

# Get the best hyperparameters and retrain the model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
model = tuner.hypermodel.build(best_hps)
tuner.hypermodel.fit(best_hps, model, X_scaled, y, validation_split=0.2)

Trial 100 Complete [00h 00m 15s]
val_accuracy: 0.8489400744438171

Best val_accuracy So Far: 0.8598220944404602
Total elapsed time: 1d 19h 23m 39s
INFO:tensorflow:Oracle triggered exit


Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.History at 0x2ad848e69e0>

In [10]:
three_model = keras.models.load_model('model_files/tennis_model_3l.keras')

In [12]:
three_model.get_config()

{'name': 'sequential_1',
 'layers': [{'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 84),
    'dtype': 'float32',
    'sparse': False,
    'ragged': False,
    'name': 'dense_5_input'}},
  {'class_name': 'Dense',
   'config': {'name': 'dense_5',
    'trainable': True,
    'batch_input_shape': (None, 84),
    'dtype': 'float32',
    'units': 84,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': {'seed': None}},
    'bias_initializer': {'class_name': 'Zeros', 'config': {}},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'activity_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None}},
  {'class_name': 'Dense',
   'config': {'name': 'dense_6',
    'trainable': True,
    'dtype': 'float32',
    'units': 224,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': {'seed': None}},
    'bias

In [14]:
model.get_config()

{'name': 'sequential_2',
 'layers': [{'class_name': 'InputLayer',
   'config': {'batch_input_shape': (None, 84),
    'dtype': 'float32',
    'sparse': False,
    'ragged': False,
    'name': 'dense_10_input'}},
  {'class_name': 'Dense',
   'config': {'name': 'dense_10',
    'trainable': True,
    'batch_input_shape': (None, 84),
    'dtype': 'float32',
    'units': 84,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': {'seed': None}},
    'bias_initializer': {'class_name': 'Zeros', 'config': {}},
    'kernel_regularizer': None,
    'bias_regularizer': None,
    'activity_regularizer': None,
    'kernel_constraint': None,
    'bias_constraint': None}},
  {'class_name': 'Dense',
   'config': {'name': 'dense_11',
    'trainable': True,
    'dtype': 'float32',
    'units': 224,
    'activation': 'relu',
    'use_bias': True,
    'kernel_initializer': {'class_name': 'GlorotUniform',
     'config': {'seed': None}},
    'b

In [6]:
model.save('model_files/tennis_model_3l.keras')

In [7]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
import kerastuner as kt

# Define the HyperModel class
class MyHyperModel(kt.HyperModel):
    def __init__(self, input_shape):
        self.input_shape = input_shape

    def build(self, hp):
        model = keras.Sequential()
        model.add(layers.Dense(units=84, input_shape=self.input_shape, activation='relu'))
        model.add(layers.Dense(units=hp.Int('units_1', min_value=32, max_value=512, step=32), activation='relu'))
        model.add(layers.Dropout(hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.05)))
        model.add(layers.Dense(units=hp.Int('units_2', min_value=32, max_value=512, step=32), activation='relu'))
        model.add(layers.Dropout(hp.Float('dropout_2', min_value=0.2, max_value=0.5, step=0.05)))
        model.add(layers.Dense(units=hp.Int('units_3', min_value=32, max_value=512, step=32), activation='relu'))
        model.add(layers.Dropout(hp.Float('dropout_3', min_value=0.2, max_value=0.5, step=0.05)))
        model.add(layers.Dense(units=hp.Int('units_4', min_value=32, max_value=512, step=32), activation='relu'))
        model.add(layers.Dropout(hp.Float('dropout_4', min_value=0.2, max_value=0.5, step=0.05)))
        model.add(layers.Dense(units=1, activation='sigmoid'))
        optimizer = Adam(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='LOG'))
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        return model
    
    def fit(self, hp, model, *args, **kwargs):
        return model.fit(
            *args,
            batch_size=hp.Choice("batch_size", [16, 32, 64, 128, 256, 512, 1024, 2048, 5696]),
            epochs=hp.Choice("epochs", [16, 32, 59, 64, 128, 256]),
            **kwargs,
        )

# Initialize the MyHyperModel
hypermodel = MyHyperModel(input_shape=(84,))

# Initialize the BayesianOptimization tuner
tuner = kt.BayesianOptimization(hypermodel,
                                objective='val_accuracy',
                                max_trials=100,
                                directory='keras_results',
                                project_name='my_project')

# Perform the hyperparameter search without specifying epochs and batch_size
tuner.search(X_scaled, y, validation_split=0.2, callbacks=[tf.keras.callbacks.EarlyStopping('val_loss', patience=3)])

# Get the best hyperparameters and retrain the model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
model = tuner.hypermodel.build(best_hps)
tuner.hypermodel.fit(best_hps, model, X_scaled, y, validation_split=0.2)

INFO:tensorflow:Reloading Tuner from keras_results\my_project\tuner0.json
INFO:tensorflow:Oracle triggered exit
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.History at 0x2ad849f1bd0>

In [8]:
model.save('model_files/tennis_model_4l.keras')

In [11]:
last_row = df.tail(1).drop(['winner'], axis=1)
df.drop(df.index[-1], inplace=True)

In [14]:
# Data splitting/scaling
n_train = math.ceil(df.shape[0] * 0.8)
train_df = df[:n_train]
test_df = df[n_train:]
X_train = train_df.drop(['winner'], axis=1)
y_train = train_df['winner']
X_test = test_df.drop(['winner'], axis=1)
y_test = test_df['winner']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)

In [11]:
# Hyperparameter Tuning of SGDCClassifier
sgd = SGDClassifier(loss='log_loss', max_iter=10000)

sgd_grid = {
    'alpha': np.logspace(-4, 0, num=10),
    'l1_ratio': np.linspace(0, 1, num=10),
}

sgd_search = RandomizedSearchCV(sgd, sgd_grid, n_iter=5, cv=10, scoring='accuracy', n_jobs=-1, random_state=1)
sgd_result = sgd_search.fit(X_train_scaled, y_train)
print('Best Score: %s' % sgd_result.best_score_)
print('Best Hyperparameters: %s' % sgd_result.best_params_)


Best Score: 0.8366844061643338
Best Hyperparameters: {'l1_ratio': 0.3333333333333333, 'alpha': 0.002154434690031882}


In [12]:
accuracy_score(sgd_result.predict(X_test_scaled), y_test)

0.7879734793888729

In [17]:
# Get soft predictions on the test data
soft_predictions = sgd_result.best_estimator_.predict_proba(X_test_scaled)

# Store the soft predictions for the positive class
positive_probabilities = soft_predictions[:, 1]  # Assuming positive class is at index 1

# Print the list of soft predictions
print(positive_probabilities)

[0.051782   0.12659339 0.36959372 ... 0.42179688 0.56396445 0.49341695]


In [18]:
sgd_result.predict(X_test_scaled)

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [19]:
last_row_scaled = scaler.transform(last_row.values)

In [20]:
last_row_scaled

array([[ 0.66466963,  2.37423513,  2.40622664, -0.85943664,  0.92415122,
         0.35500351, -0.40946442, -0.25553629,  1.61574935, -0.40567998,
        -1.53193405, -0.09183244, -0.29852639, -0.25628614, -0.42889935,
        -0.32782647, -0.40680271,  1.78105616,  1.52146332,  0.99839768,
         0.6519688 ,  0.00869852,  0.04233081,  0.46824724,  0.51660507,
         2.70911283,  1.3613346 ,  2.71521904,  1.72559854,  0.57839841,
         2.23870095,  1.37204774,  2.91258684,  5.04432303,  4.69900636,
         1.15723864, -0.33284642,  0.39586629, -0.51973475,  1.67174177,
         0.81886844, -0.83752684, -1.01021883, -0.92954405, -1.08154756,
         0.71671815, -0.20299294, -0.22722429,  1.01646057, -0.17198426,
        -0.86915249,  2.29330119,  1.54862069, -0.2292402 , -0.43657297,
        -0.35518317, -0.48121709, -0.46273843, -0.59059882,  1.98547253,
         1.18518667,  4.73369462,  4.05020844,  3.09886878,  3.51880616,
        -0.22489999, -0.11083021, -0.48670934, -0.4

In [None]:
sgd_result.predict_proba(last_row_scaled)