In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import optuna
from sklearn.metrics import mean_squared_error

: 

In [None]:
games = pd.read_csv(f'data/cumulative_with_cur_year_and_last_year_ratings_2010_2023.csv')
games = games[games['completed'] == True]
games['last_year_team_rating*num_games_into_season'] = games.apply(lambda row: row['last_year_team_rating'] * row['num_games_into_season'], axis=1)
games['last_year_opponent_rating*num_games_into_season'] = games.apply(lambda row: row['last_year_opponent_rating'] * row['num_games_into_season'], axis=1)

: 

In [17]:
X_train, X_test, y_train, y_test = train_test_split(games[['team_rating', 'opponent_rating', 'team_win_total_future', 'opponent_win_total_future', 'last_year_team_rating', 'last_year_opponent_rating', 'num_games_into_season', 'last_year_team_rating*num_games_into_season', 'last_year_opponent_rating*num_games_into_season', 'team_last_10_rating', 'opponent_last_10_rating', 'team_last_5_rating', 'opponent_last_5_rating', 'team_last_3_rating', 'opponent_last_3_rating', 'team_last_1_rating', 'opponent_last_1_rating', 'team_days_since_most_recent_game', 'opponent_days_since_most_recent_game']], games['margin'], test_size=0.2, random_state=41)

In [18]:
# use optuna to tune the parameters
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)

In [19]:
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=100)

[32m[I 2023-01-02 18:26:16,133][0m A new study created in memory with name: regression[0m
[32m[I 2023-01-02 18:26:21,111][0m Trial 0 finished with value: 180.91813254833698 and parameters: {'max_depth': 6, 'learning_rate': 0.18662304996481519, 'n_estimators': 413, 'min_child_weight': 5, 'gamma': 0.5074909242320084, 'subsample': 0.7114585923655443, 'colsample_bytree': 0.4775447545054888, 'reg_alpha': 0.6863384841156724, 'reg_lambda': 0.22508916765999518, 'random_state': 724}. Best is trial 0 with value: 180.91813254833698.[0m
[32m[I 2023-01-02 18:26:24,700][0m Trial 1 finished with value: 292.6862730564269 and parameters: {'max_depth': 4, 'learning_rate': 0.1088503012806579, 'n_estimators': 630, 'min_child_weight': 2, 'gamma': 0.6558083647527146, 'subsample': 0.018423549576626962, 'colsample_bytree': 0.6752309430481509, 'reg_alpha': 0.557647907247734, 'reg_lambda': 0.972583664293339, 'random_state': 162}. Best is trial 0 with value: 180.91813254833698.[0m
[32m[I 2023-01-02 18:

In [15]:
print('Best parameters', study.best_params)
print('Best value', study.best_value)

Best parameters {'max_depth': 3, 'learning_rate': 0.029107780559027997, 'n_estimators': 335, 'min_child_weight': 9, 'gamma': 0.5195681210769822, 'subsample': 0.908711599613052, 'colsample_bytree': 0.6667307628090092, 'reg_alpha': 0.3146270282768631, 'reg_lambda': 0.1375893443806859, 'random_state': 631}
Best value 155.3689425052693
