In [4]:
# Imports and settings
import pandas as pd
import duckdb
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import xgboost as xgb
import random
import os
import shap

from joblib import dump
from datetime import datetime
from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

# Reproducibility: one seed value shared by the stdlib and NumPy global RNGs.
random_seed = 909
np.random.seed(random_seed)
random.seed(random_seed)

# Display: render floats in pandas output with six decimal places.
pd.options.display.float_format = '{:.6f}'.format

In [ ]:
# Unique identifier for this run: a fixed prefix plus a second-resolution
# timestamp taken when the cell executes.
model_id = f'baseline_{datetime.now().strftime("%Y%m%d_%H%M%S")}'

# Per-run output directory (kept as a string with a trailing slash).
dirname = f"outputs/{model_id}/"

# makedirs instead of mkdir: creates the parent "outputs/" directory when it
# does not exist yet, and does not raise if the cell is re-run within the same
# second and the directory is already there.
os.makedirs(dirname, exist_ok=True)

In [33]:
# Load the whole `base_table` relation from the DuckDB file into a pandas
# DataFrame. The database is opened read-only.
# NOTE(review): the path is an absolute, machine-specific location; consider
# moving it to a configurable constant so the notebook runs elsewhere.
con = duckdb.connect("E:/duckdb/tennis.duckdb", read_only=True)
try:
    base_table = con.execute("SELECT * FROM base_table").df()
finally:
    # Always release the connection, even if the query or the conversion to
    # pandas raises.
    con.close()

In [36]:
# Display the loaded frame as the cell output (pandas renders a truncated
# preview showing the first and last rows with elided columns).
base_table

Unnamed: 0,id,groundType,tournament_name,tournament_category,tournament_points,tournament_round_category,datetime,match_status,winner,score_period1,...,winnersTotal,breakPointsSavedAttempted,firstReturnPointsAttempted,firstServeAccuracyAttempted,firstServePointsAccuracyAttempted,secondReturnPointsAttempted,secondServeAccuracyAttempted,secondServePointsAccuracyAttempted,pre_match_elo,post_match_elo
0,4842840,Hardcourt outdoor,Chennai,ATP,250.000000,Other,2013-12-31 23:50:00,Retired,0,4.000000,...,,,,,,,,,1500.000000,1484.000000
1,4842840,Hardcourt outdoor,Chennai,ATP,250.000000,Other,2013-12-31 23:50:00,Retired,1,6.000000,...,,,,,,,,,1500.000000,1516.000000
2,4842848,Hardcourt outdoor,Chennai,ATP,250.000000,Other,2014-01-01 00:40:00,Ended,0,6.000000,...,,,,,,,,,1500.000000,1484.000000
3,4842848,Hardcourt outdoor,Chennai,ATP,250.000000,Other,2014-01-01 00:40:00,Ended,1,7.000000,...,,,,,,,,,1500.000000,1516.000000
4,4843638,Hardcourt outdoor,Doha,ATP,250.000000,Middle Stages,2014-01-01 01:20:00,Ended,0,3.000000,...,,,,,,,,,1500.000000,1484.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1598789,12898841,Hardcourt outdoor,"ITF China F13, Men Singles",ITF Men,,Middle Stages,2024-09-26 10:45:00,Ended,1,8.000000,...,,3.000000,45.000000,67.000000,40.000000,28.000000,27.000000,27.000000,2013.174789,2014.918650
1598790,12899383,Hardcourt outdoor,"Monastir, Singles M-ITF-TUN-51A",ITF Men,,Middle Stages,2024-09-26 02:20:00,Ended,0,4.000000,...,,7.000000,52.000000,58.000000,32.000000,44.000000,26.000000,26.000000,1559.817361,1558.092945
1598791,12899383,Hardcourt outdoor,"Monastir, Singles M-ITF-TUN-51A",ITF Men,,Middle Stages,2024-09-26 02:20:00,Ended,1,6.000000,...,,14.000000,32.000000,96.000000,52.000000,26.000000,44.000000,44.000000,2057.597605,2059.322020
1598792,12901201,Hardcourt indoor,"ITF Slovakia 05A, Women Singles",ITF Women,,Middle Stages,2024-09-26 02:55:00,Ended,1,8.000000,...,,8.000000,33.000000,68.000000,36.000000,29.000000,32.000000,32.000000,1731.257921,1742.681536


In [41]:
# Column-role configuration used by the rest of the notebook.

# Name of the label column.
# NOTE(review): 'home_winner' is not among the columns visible in the frame
# preview (many are elided) - confirm it exists or is derived later.
target = 'home_winner'

# Identifier columns, and the column used for time-based splitting.
id_cols = [
    'id',
    'tournament_name',
    'match',
]
time_split_col = 'datetime'

# Columns to exclude (currently none) and the categorical columns.
exclude_cols = []
cat_cols = [
    'groundType',
    'tournament_category',
    'tournament_round_category',
]

# Per-match statistic columns; these are coerced to numeric dtypes below.
match_stat_cols = [
    'aces',
    'backhandErrors',
    'backhandUnforcedErrors',
    'backhandWinners',
    'breakPointsSaved',
    'breakPointsScored',
    'doubleFaults',
    'dropShotUnforcedErrors',
    'dropShotWinners',
    'errorsTotal',
    'firstReturnPoints',
    'firstServeAccuracy',
    'firstServePointsAccuracy',
    'forehandErrors',
    'forehandUnforcedErrors',
    'forehandWinners',
    'gamesWon',
    'groundstrokeErrors',
    'groundstrokeUnforcedErrors',
    'groundstrokeWinners',
    'lobUnforcedErrors',
    'lobWinners',
    'maxGamesInRow',
    'maxPointsInRow',
    'overheadStrokeErrors',
    'overheadWinners',
    'pointsTotal',
    'receiverPointsScored',
    'returnErrors',
    'returnWinners',
    'secondReturnPoints',
    'secondServeAccuracy',
    'secondServePointsAccuracy',
    'serviceGamesTotal',
    'serviceGamesWon',
    'servicePointsScored',
    'tiebreaks',
    'unforcedErrorsTotal',
    'volleyUnforcedErrors',
    'volleyWinners',
    'winnersTotal',
    'breakPointsSavedAttempted',
    'firstReturnPointsAttempted',
    'firstServeAccuracyAttempted',
    'firstServePointsAccuracyAttempted',
    'secondReturnPointsAttempted',
    'secondServeAccuracyAttempted',
    'secondServePointsAccuracyAttempted',
]


In [42]:
# Coerce each per-match statistic column to a numeric dtype, writing the
# converted values back into `base_table` in place. pd.to_numeric is called
# with its defaults, so a value that cannot be parsed raises.
for col in match_stat_cols:
    numeric_values = pd.to_numeric(base_table[col])
    base_table[col] = numeric_values

In [43]:
# Match-level descriptor columns (identifier, surface, tournament metadata,
# timestamp and status) selected from `base_table` as the `spine` frame.
spine_cols = [
    'id',
    'groundType',
    'tournament_name',
    'tournament_category',
    'tournament_points',
    'tournament_round_category',
    'datetime',
    'match_status',
]
spine = base_table[spine_cols]

id                                      int32
groundType                             object
tournament_name                        object
tournament_category                    object
tournament_points                     float64
                                       ...   
secondReturnPointsAttempted           float64
secondServeAccuracyAttempted          float64
secondServePointsAccuracyAttempted    float64
pre_match_elo                         float64
post_match_elo                        float64
Length: 79, dtype: object