In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import optuna
from sklearn.metrics import mean_squared_error

In [2]:
games = pd.read_csv(f'data/cumulative_with_cur_year_and_last_year_ratings_2010_2023.csv')
games = games[games['completed'] == True]
games = games[games['year'] != 2023]
games['last_year_team_rating*num_games_into_season'] = games.apply(lambda row: row['last_year_team_rating'] * row['num_games_into_season'], axis=1)
games['last_year_opponent_rating*num_games_into_season'] = games.apply(lambda row: row['last_year_opponent_rating'] * row['num_games_into_season'], axis=1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(games[['team_rating', 'opponent_rating', 'team_win_total_future', 'opponent_win_total_future', 'last_year_team_rating', 'last_year_opponent_rating', 'num_games_into_season', 'last_year_team_rating*num_games_into_season', 'last_year_opponent_rating*num_games_into_season', 'team_last_10_rating', 'opponent_last_10_rating', 'team_last_5_rating', 'opponent_last_5_rating', 'team_last_3_rating', 'opponent_last_3_rating', 'team_last_1_rating', 'opponent_last_1_rating', 'team_days_since_most_recent_game', 'opponent_days_since_most_recent_game']], games['margin'], test_size=0.2, random_state=41)

In [17]:
# use optuna to tune the parameters
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [52]:
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=100)

[32m[I 2023-01-25 22:07:11,355][0m A new study created in memory with name: regression[0m
[33m[W 2023-01-25 22:07:11,360][0m Trial 0 failed with parameters: {'max_depth': 10, 'learning_rate': 0.6343428806487641, 'n_estimators': 214, 'min_child_weight': 2, 'gamma': 0.029961678608671688, 'subsample': 0.16432938669846478, 'colsample_bytree': 0.10971432743278545, 'reg_alpha': 0.6255840696406675, 'reg_lambda': 0.4420785327106033, 'random_state': 53} because of the following error: ValueError('Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17\n  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35\n  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53\n  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71\n  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89\n  90  91  92  93  94  95  96  97  98  99 100 101 102 103], got [-57. 

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103], got [-57. -56. -54. -51. -49. -48. -47. -45. -44. -42. -41. -40. -39. -38.
 -37. -36. -35. -34. -33. -32. -31. -30. -29. -28. -27. -26. -25. -24.
 -23. -22. -21. -20. -19. -18. -17. -16. -15. -14. -13. -12. -11. -10.
  -9.  -8.  -7.  -6.  -5.  -4.  -3.  -2.  -1.   1.   2.   3.   4.   5.
   6.   7.   8.   9.  10.  11.  12.  13.  14.  15.  16.  17.  18.  19.
  20.  21.  22.  23.  24.  25.  26.  27.  28.  29.  30.  31.  32.  33.
  34.  35.  36.  37.  38.  39.  40.  41.  42.  43.  44.  45.  46.  47.
  48.  49.  50.  53.  55.  61.]

In [15]:
print('Best parameters', study.best_params)
print('Best value', study.best_value)

Best parameters {'max_depth': 3, 'learning_rate': 0.029107780559027997, 'n_estimators': 335, 'min_child_weight': 9, 'gamma': 0.5195681210769822, 'subsample': 0.908711599613052, 'colsample_bytree': 0.6667307628090092, 'reg_alpha': 0.3146270282768631, 'reg_lambda': 0.1375893443806859, 'random_state': 631}
Best value 155.3689425052693


In [10]:
games['win'] = games.apply(lambda row: 1 if row['margin'] > 0 else 0, axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(games[['team_rating', 'opponent_rating', 'team_win_total_future', 'opponent_win_total_future', 'last_year_team_rating', 'last_year_opponent_rating', 'num_games_into_season', 'team_last_10_rating', 'opponent_last_10_rating', 'team_last_5_rating', 'opponent_last_5_rating', 'team_last_3_rating', 'opponent_last_3_rating', 'team_last_1_rating', 'opponent_last_1_rating', 'team_days_since_most_recent_game', 'opponent_days_since_most_recent_game']], games['win'], test_size=0.5, random_state=41)

In [15]:
# use optuna to tune the parameters for a classifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)
    return log_loss(y_test, y_pred)

In [16]:
study = optuna.create_study(direction='minimize', study_name='win_probability')
study.optimize(objective, n_trials=100)

[32m[I 2023-01-28 11:28:56,837][0m A new study created in memory with name: win_probability[0m
[32m[I 2023-01-28 11:28:58,995][0m Trial 0 finished with value: 0.66156079666028 and parameters: {'max_depth': 8, 'learning_rate': 0.05746408648351279, 'n_estimators': 311, 'min_child_weight': 2, 'gamma': 0.4367635460744282, 'subsample': 0.018860949915606946, 'colsample_bytree': 0.3894982801279265, 'reg_alpha': 0.6794355184376749, 'reg_lambda': 0.6041807001047681, 'random_state': 1}. Best is trial 0 with value: 0.66156079666028.[0m
[32m[I 2023-01-28 11:29:26,760][0m Trial 1 finished with value: 1.039901039920528 and parameters: {'max_depth': 10, 'learning_rate': 0.3976800746515197, 'n_estimators': 172, 'min_child_weight': 6, 'gamma': 0.2725445297548599, 'subsample': 0.4727425393032529, 'colsample_bytree': 0.8391931534413788, 'reg_alpha': 0.04239507583964167, 'reg_lambda': 0.22018751183594557, 'random_state': 376}. Best is trial 0 with value: 0.66156079666028.[0m
[32m[I 2023-01-28 11

In [32]:
y_pred = [0.5] * len(y_test)
print('Baseline log loss', log_loss(y_test, y_pred))

Baseline log loss 0.6931471805599452


In [62]:
games = pd.read_csv(f'data/cumulative_with_cur_year_and_last_year_ratings_2010_2023.csv')
games = games[games['completed'] == True]
games = games[games['year'] == 2023]
games['win'] = games.apply(lambda row: 1 if row['margin'] > 0 else 0, axis=1)
# test the model on the 2023 season
params = {'max_depth': 1, 'learning_rate': 0.6642774658597874, 'n_estimators': 365, 'min_child_weight': 3, 'gamma': 0.9138018877462927, 'subsample': 0.9850665433158399, 'colsample_bytree': 0.6714438416337997, 'reg_alpha': 0.6338885144969151, 'reg_lambda': 0.18021132749565166, 'random_state': 23}
model = XGBClassifier(**params)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
print('Log loss', log_loss(y_test, y_pred))

Log loss 0.6154474363917245
