In [169]:
# # Imports and settings
import pandas as pd
import duckdb
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import xgboost as xgb
import random
import os
import shap
from joblib import dump
from datetime import datetime
from pathlib import Path
from sklearn.metrics import mean_squared_error
from functools import reduce
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.metrics import log_loss
from tqdm import tqdm
from sklearn.metrics import brier_score_loss

pd.set_option('display.float_format', '{:.6f}'.format)
random_seed = 909
random.seed(random_seed)
np.random.seed(random_seed)

In [170]:
con = duckdb.connect("E:/duckdb/tennis.duckdb",read_only=True)
df = con.execute("SELECT * FROM model_table WHERE market_id IS NOT NULL").df()
con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [355]:
sim_file = pd.read_csv(r"E:\tennis-model\model\outputs\20241021_161003\simulation_file.csv")
price_file = pd.read_csv('../../data/price_file.csv')
sim_file_melt = pd.melt(sim_file, id_vars=[x for x in sim_file.columns if 'selection_id' not in x],
                        value_name='selection_id', var_name='position')
sim_file_melt['selection_id'] = sim_file_melt['selection_id'].astype(int)
sim_file_melt['market_id'] = sim_file_melt['market_id'].astype(str).str.pad(9, fillchar='0', side='right')
price_file['market_id'] = price_file['market_id'].astype(str).str.pad(9, fillchar='0', side='right')
for col in [x for x in sim_file_melt.columns if 'prediction' in x]:
    sim_file_melt.loc[sim_file_melt['position'] == 'selection_id_away', col] = 1 - sim_file_melt[col]
mm_base_table = sim_file_melt.merge(price_file, on=['market_id', 'selection_id'])

In [356]:
con = duckdb.connect("E:/duckdb/tennis.duckdb", read_only=True)
tennis_markets = con.execute("""
SELECT market_id, selection_id, event_date, result FROM  market_summaries m
""").df()
con.close()
mm_base_table = mm_base_table.merge(tennis_markets, on=['market_id', 'selection_id']).query('tournament_category in ["Challenger"] and tournament_points == 75')

In [357]:
mm_base_table

Unnamed: 0,market_id,tournament_category,tournament_points,prediction_xgb_standard_0,prediction_lgb_standard_0,prediction_xgb_standard_avg,prediction_lgb_standard_avg,prediction_xgb_price_0,prediction_xgb_price_avg,prediction_xgb_adv_0,...,prediction_lgb_standard_rfs_avg,position,selection_id,seconds_to_start,total_matched,atb,atb_size,last_traded_price,event_date,result
41,1.178057268,Challenger,75,0.642880,0.633397,0.642880,0.633397,0.482978,0.482978,0.496949,...,0.643215,selection_id_home,9633532,297.690000,0.000000,2.120000,10.310000,2.120000,2021-01-18 10:20:00,WINNER
42,1.178057269,Challenger,75,0.413948,0.340624,0.413948,0.340624,0.404937,0.404937,0.532813,...,0.380732,selection_id_home,8174357,232.363000,0.000000,1.870000,51.000000,1.920000,2021-01-19 08:36:00,LOSER
43,1.178057315,Challenger,75,0.505555,0.435099,0.505555,0.435099,0.447084,0.447084,0.498575,...,0.457069,selection_id_home,6948116,-307.951000,0.000000,2.460000,14.060000,2.500000,2021-01-18 15:10:00,WINNER
44,1.178057397,Challenger,75,0.573304,0.503051,0.573304,0.503051,0.517713,0.517713,0.500986,...,0.444502,selection_id_home,9630472,295.976000,0.000000,1.580000,15.320000,1.590000,2021-01-18 13:28:00,LOSER
45,1.178057428,Challenger,75,0.467223,0.394191,0.467223,0.394191,0.472110,0.472110,0.506068,...,0.402879,selection_id_home,8842202,299.105000,0.000000,1.720000,24.910000,1.720000,2021-01-18 14:57:00,LOSER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77267,1.232456927,Challenger,75,0.738796,0.771660,0.738796,0.771660,0.626767,0.626767,0.598401,...,0.702985,selection_id_away,40594116,292.162000,0.000000,1.360000,68.120000,1.360000,2024-08-31 05:25:00,WINNER
77269,1.232471602,Challenger,75,0.569946,0.576532,0.569946,0.576532,0.497678,0.497678,0.452879,...,0.446255,selection_id_away,25215601,269.355000,1092.900000,1.860000,25.430000,1.870000,2024-08-31 12:30:00,WINNER
77270,1.232471726,Challenger,75,0.542264,0.652042,0.542264,0.652042,0.599558,0.599558,0.566910,...,0.569439,selection_id_away,22733757,299.244000,0.000000,1.680000,4.710000,1.680000,2024-08-31 15:00:00,LOSER
77271,1.232472082,Challenger,75,0.532976,0.487617,0.532976,0.487617,0.455039,0.455039,0.437403,...,0.624645,selection_id_away,8477117,278.114000,0.000000,2.060000,21.000000,2.080000,2024-08-31 15:10:00,LOSER


In [358]:
def assign_slice_id(df, slice_size=250):
    # Sort the dataframe by market_id to ensure consistent slice assignment
    df = df.sort_values('market_id')
    
    # Get unique market_ids
    unique_markets = df['market_id'].unique()
    
    # Calculate the number of slices
    num_slices = len(unique_markets) // slice_size
    if len(unique_markets) % slice_size != 0:
        num_slices += 1
    
    # Create a dictionary mapping market_id to slice_id
    market_to_slice = {market_id: slice_id 
                       for slice_id, markets in enumerate(np.array_split(unique_markets, num_slices))
                       for market_id in markets}
    
    # Assign slice_id to each row based on its market_id
    df['slice_id'] = df['market_id'].map(market_to_slice)
    
    return df

# Assuming df_reduced is your dataframe
mm_base_table = assign_slice_id(mm_base_table).reset_index(drop=True)

In [353]:
import numpy as np
import pandas as pd
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

def optimized_prediction_and_betting(df, params, prediction_columns, test=False, return_df=False):
    df = df.copy()
    
    # 1. Create new column using optimized params
    weights = np.array([params.get(f'weight_{col}', 0) for col in prediction_columns])
    
    predictions_matrix = df[prediction_columns].values
    weighted_sum = np.dot(predictions_matrix, weights)

    df['weighted_prediction'] = weighted_sum

    if not test and not return_df and (df['weighted_prediction'] > 1).any():
        max_pred = df['weighted_prediction'].max()
        return (-1000000.0) * (max_pred - 1.0), 1.0

    if not test and not return_df and (df['weighted_prediction'] < 0).any():
        min_pred = df['weighted_prediction'].min()
        return (1000000.0) * (min_pred), 1.0

    # 2. Calculate EV by comparing weighted prediction to 'atb'
    df['ev'] = (df['weighted_prediction'] * (df['atb'] - 1) * 0.95) - (1 - df['weighted_prediction'])

    # 3. Filter to bets only (positive EV)
    df_bets = df[df['ev'] > 0.0 #params['min_ev']
    ].copy()

    if df_bets.empty:
        return 0, 0

    # 4. Calculate stake and winnings
    max_stake = params.get('max_stake')
    df_bets['stake'] = np.minimum(max_stake, df_bets['atb_size'])
    
    df_bets['winnings'] = np.where(df_bets['result'] == "WINNER", 
                                   (df_bets['atb'] - 1) * 0.95 * df_bets['stake'],
                                   -df_bets['stake'])

    if return_df:
        return df_bets

    total_stake = df_bets['stake'].sum()
    total_winnings = df_bets['winnings'].sum()

    return total_stake, total_winnings

def evaluate_slice(df_slice, params, prediction_columns, test=False):
    total_staked, winnings = optimized_prediction_and_betting(df_slice, params, prediction_columns, test)
    if total_staked < 1000.0:  # Adjust this threshold based on your data
        return -(1000.0-total_staked)
    return winnings / total_staked

def objective(trial, train_slices, prediction_columns):
    weights = {f'weight_{col}': trial.suggest_float(f'weight_{col}', -1.0, 1.0) for col in prediction_columns}
    
    # Normalize weights to sum to 1
    total_weight = sum(weights.values())
    normalized_weights = {k: v / total_weight for k, v in weights.items()}
    
    params = {
        **normalized_weights,
        'max_stake': 10.0,#trial.suggest_float('max_stake', 1.0, 50.0),
    }

    slice_rois = []
    for df_slice in train_slices:
        roi = evaluate_slice(df_slice, params, prediction_columns)
        if roi == float('-inf'):
            return float('-inf')
        slice_rois.append(roi)
    
    mean_roi = np.mean(slice_rois)
    
    return mean_roi

def sliding_cv_optimization(mm_base_table, prediction_columns, n_trials):
    # Sort the dataframe by slice_id to ensure chronological order
    mm_base_table = mm_base_table.sort_values('slice_id')
    unique_slices = mm_base_table['slice_id'].unique()

    # Define the number of slices for training and testing
    train_size = 10
    test_size = 1

    # Create TimeSeriesSplit object
    tscv = TimeSeriesSplit(n_splits=5, test_size=test_size, gap=0)

    results = []

    for fold, (train_index, test_index) in enumerate(tscv.split(unique_slices)):
        print(f"Processing fold {fold + 1}")

        # Get the slice IDs for this fold
        train_slices = unique_slices[train_index]
        test_slice = unique_slices[test_index][0]

        # Prepare the training data
        train_data = [mm_base_table[mm_base_table['slice_id'] == slice_id] for slice_id in train_slices[-train_size:]]

        # Create a study object and run the optimization
        study = optuna.create_study(direction='maximize')        
        study.optimize(lambda trial: objective(trial, train_data, prediction_columns), n_trials=n_trials, show_progress_bar=False)

        # Get the best parameters
        best_params = study.best_params

        # Normalize weights
        weight_keys = [k for k in best_params.keys() if k.startswith('weight_')]
        total_weight = sum(best_params[k] for k in weight_keys)
        normalized_weights = {k: best_params[k] / total_weight for k in weight_keys}

        params = {
            **normalized_weights,
            'max_stake':10.0,# best_params['max_stake'],
        }

        # Evaluate on the training slices
        train_roi = np.mean([evaluate_slice(slice_data, params, prediction_columns) for slice_data in train_data])

        # Evaluate on the test slice
        test_data = mm_base_table[mm_base_table['slice_id'] == test_slice]
        test_roi = evaluate_slice(test_data, params, prediction_columns, test=True)

        print(f"Fold {fold + 1} - Train ROI: {train_roi:.4f}, Test ROI: {test_roi:.4f}")

        results.append({
            'fold': fold + 1,
            'train_slices': train_slices[-train_size:],
            'test_slice': test_slice,
            'best_params': params,
            'train_roi': train_roi,
            'test_roi': test_roi
        })

    return results

# Main execution
prediction_columns = ['prediction_xgb_standard_0','prediction_lgb_standard_0','prediction_xgb_of_0','prediction_xgb_price_0']

# Preprocess the data
mm_base_table['result'] = pd.Categorical(mm_base_table['result'], categories=["WINNER", "LOSER"])
mm_base_table['implied_odds'] = 1 / mm_base_table['atb']
mm_base_table = mm_base_table[~mm_base_table['atb'].isna()]

# Run the sliding CV optimization
results = sliding_cv_optimization(mm_base_table, prediction_columns, n_trials=2000)

# Calculate overall performance
overall_train_roi = np.mean([r['train_roi'] for r in results])
overall_test_roi = np.mean([r['test_roi'] for r in results])
test_roi_std = np.std([r['test_roi'] for r in results])

print(f"\nOverall Performance:")
print(f"Mean Train ROI: {overall_train_roi:.4f}")
print(f"Mean Test ROI: {overall_test_roi:.4f}")
print(f"Test ROI Standard Deviation: {test_roi_std:.4f}")

Processing fold 1
Fold 1 - Train ROI: -0.0309, Test ROI: -0.0435
Processing fold 2
Fold 2 - Train ROI: -0.0346, Test ROI: -0.0265
Processing fold 3
Fold 3 - Train ROI: -0.0319, Test ROI: 0.0165
Processing fold 4
Fold 4 - Train ROI: -0.0257, Test ROI: -0.0047
Processing fold 5
Fold 5 - Train ROI: -0.0210, Test ROI: -0.0377

Overall Performance:
Mean Train ROI: -0.0288
Mean Test ROI: -0.0192
Test ROI Standard Deviation: 0.0222


In [348]:
mm_base_table

Unnamed: 0,market_id,tournament_category,tournament_points,prediction_xgb_standard_0,prediction_lgb_standard_0,prediction_xgb_standard_avg,prediction_lgb_standard_avg,prediction_xgb_price_0,prediction_xgb_price_avg,prediction_xgb_adv_0,...,selection_id,seconds_to_start,total_matched,atb,atb_size,last_traded_price,event_date,result,slice_id,implied_odds
0,1.178057268,Challenger,75,0.642880,0.633397,0.642880,0.633397,0.482978,0.482978,0.496949,...,9633532,297.690000,0.000000,2.120000,10.310000,2.120000,2021-01-18 10:20:00,WINNER,0,0.471698
1,1.178057268,Challenger,75,0.357120,0.366603,0.357120,0.366603,0.517022,0.517022,0.503051,...,2345099,297.690000,3740.090000,1.890000,5.580000,1.890000,2021-01-18 10:20:00,LOSER,0,0.529101
2,1.178057269,Challenger,75,0.586052,0.659376,0.586052,0.659376,0.595063,0.595063,0.467187,...,9632014,232.363000,0.000000,2.040000,12.820000,2.100000,2021-01-19 08:36:00,WINNER,0,0.490196
3,1.178057269,Challenger,75,0.413948,0.340624,0.413948,0.340624,0.404937,0.404937,0.532813,...,8174357,232.363000,0.000000,1.870000,51.000000,1.920000,2021-01-19 08:36:00,LOSER,0,0.534759
4,1.178057315,Challenger,75,0.505555,0.435099,0.505555,0.435099,0.447084,0.447084,0.498575,...,6948116,-307.951000,0.000000,2.460000,14.060000,2.500000,2021-01-18 15:10:00,WINNER,0,0.406504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45853,1.232472082,Challenger,75,0.467025,0.512383,0.467025,0.512383,0.544961,0.544961,0.562597,...,38599010,278.114000,0.000000,1.920000,21.760000,1.920000,2024-08-31 15:10:00,WINNER,22,0.520833
45854,1.232472367,Challenger,0,0.344657,0.417167,0.344657,0.417167,0.347115,0.347115,0.461976,...,45377348,298.860000,0.000000,3.800000,4.000000,3.950000,2024-08-31 11:55:00,LOSER,22,0.263158
45855,1.232472367,Challenger,0,0.655343,0.582833,0.655343,0.582833,0.652885,0.652885,0.538024,...,9631075,298.860000,0.000000,1.340000,118.730000,1.340000,2024-08-31 11:55:00,WINNER,22,0.746269
45856,1.232472766,Challenger,75,0.374917,0.475173,0.374917,0.475173,0.445141,0.445141,0.458956,...,13369459,209.253000,0.000000,2.780000,21.490000,2.820000,2024-08-31 16:50:00,LOSER,22,0.359712


In [293]:
from optuna.visualization import plot_slice

def create_filtered_study(original_study, threshold):
    filtered_study = optuna.create_study(direction=original_study.direction)
    for trial in original_study.trials:
        if trial.value is not None and trial.value > threshold:
            filtered_study.add_trial(trial)
    return filtered_study

# Create a new study with only the trials you want to visualize
filtered_study = create_filtered_study(study, -0.1)

# Use the filtered study for visualization
plot_slice(filtered_study, ['weight_prediction_xgb_standard_0'])

[I 2024-10-22 09:48:06,087] A new study created in memory with name: no-name-50d1b798-7fca-49c4-8340-01ab35ea0156


In [50]:
from optuna.visualization import plot_param_importances

plot_param_importances(study)