# NCAABB Elo

In [None]:
# Move up one directory so the `src` imports and the `./data/...` paths used
# by the cells below resolve (assumes the notebook is opened from a
# subfolder of the project root — confirm).
cd ..

In [None]:
from src.consts import ESPNSportTypes, START_SEASONS, SEASON_GROUPS
from src.utils import find_year_for_season, create_dataframe, put_dataframe, get_dataframe, df_rename_fold
from src.sport import ESPNSport
from src.event import ESPNEventsAPI
import numpy as np
import pandas as pd
from src.consts import ESPNSportTypes, SEASON_GROUPS, ELO_HYPERPARAMETERS

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, brier_score_loss, log_loss
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score,roc_auc_score
import datetime
def _classification_evaluation(y_true, y_pred, k=20, hfa=68):
    """Score predictions and return the metrics as a long-format DataFrame.

    Labels with at most two distinct values are treated as a binary problem:
    ``y_pred`` is read as the probability of the positive class and is
    thresholded at 0.5 for the label-based metrics (accuracy, precision,
    recall, F1). Otherwise ``y_pred`` is read as a per-class score matrix and
    the predicted class is the argmax of each row.

    Args:
        y_true: Ground-truth labels; flattened to one dimension.
        y_pred: Predicted probabilities (1-D for binary, 2-D for multiclass).
        k: Elo K-factor used for the run; only recorded in the output rows.
        hfa: Elo home-field advantage used for the run; only recorded in the
            output rows.

    Returns:
        pandas.DataFrame with one row per metric and the columns
        ``created_at``, ``sample_size``, ``k``, ``hfa``, ``metric_name`` and
        ``value``. Metrics that do not apply to the problem type are ``None``.
    """
    metadata = {
        'created_at': datetime.datetime.now().strftime('%m-%d-%Y'),
        'sample_size': len(y_true),
        'k': k,
        'hfa': hfa,
    }

    y_true = np.array(y_true).ravel()
    # Coerce once so lists, Series and arrays all support the operations below.
    y_pred = np.asarray(y_pred)
    n_classes = len(set(y_true))

    if n_classes <= 2:
        # Binary classification
        y_true_binary = y_true.astype(bool)
        y_pred_binary = (y_pred > 0.5).astype(int)
        brier_score = brier_score_loss(y_true_binary, y_pred)
        accuracy = accuracy_score(y_true_binary, y_pred_binary)
        precision = precision_score(y_true_binary, y_pred_binary)
        recall = recall_score(y_true_binary, y_pred_binary)
        f1 = f1_score(y_true_binary, y_pred_binary)
        if n_classes < 2:
            # ROC AUC and log loss need both classes present in y_true.
            roc_auc = None
            log_loss_score = None
        else:
            # ROC AUC ranks the raw probabilities; feeding it the thresholded
            # labels would collapse the curve to a single operating point.
            roc_auc = roc_auc_score(y_true_binary, y_pred)
            log_loss_score = log_loss(y_true_binary, y_pred)
    else:
        # Multiclass classification
        log_loss_score = log_loss(y_true, y_pred)
        y_pred_labels = np.argmax(y_pred, axis=1)
        accuracy = accuracy_score(y_true, y_pred_labels)
        precision = precision_score(y_true, y_pred_labels, average='weighted')
        recall = recall_score(y_true, y_pred_labels, average='weighted')
        f1 = f1_score(y_true, y_pred_labels, average='weighted')
        roc_auc = None
        brier_score = None

    metric_values = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'brier_score': brier_score,
        'log_loss_score': log_loss_score,
    }

    # One output row per metric, each carrying the shared run metadata.
    rows = [
        {**metadata, 'metric_name': name, 'value': value}
        for name, value in metric_values.items()
    ]
    return pd.DataFrame(rows)
## If it's a refresh, pull the whole thing

## If it's an upsert, get the latest seasons in the fs and determine active sports that need pulling

# Load every stored season of events. Frames are collected first and
# concatenated once, rather than re-copying the growing frame per season.
event_frames = []
for sport in [ESPNSportTypes.COLLEGE_BASKETBALL]:
    current_season = find_year_for_season(sport)
    seasons = list(range(START_SEASONS[sport], current_season + 1))
    for season in seasons:
        fs_df = get_dataframe(f'./data/events/{sport.value}/{season}.parquet')
        event_frames.append(fs_df)
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()

# df_rename_fold presumably stacks the home_/away_ columns into shared
# team_* columns (team_name and team_score are read from its result below).
score_cols = ['season', 'home_team_name', 'home_team_score', 'away_team_name', 'away_team_score']
folded_df = df_rename_fold(df[score_cols], 'away_', 'home_')

# Games per team-season, averaged within each season and then across seasons.
num_games = folded_df.groupby(['team_name', 'season']).agg({'team_score': 'count'})
num_games = num_games.reset_index().groupby('season')['team_score'].mean().mean()

# Mean score per team-season, averaged the same way.
avg_score = folded_df.groupby(['team_name', 'season']).agg({'team_score': 'mean'})
avg_score = avg_score.reset_index().groupby('season')['team_score'].mean().mean()

# Home win rate on non-neutral courts, per season, then averaged over seasons.
# NOTE(review): every non-neutral row is counted in the denominator; if df
# holds unfinished games they count as home losses — confirm that is intended.
home_games = df.loc[df.neutral_site == 0].copy()
home_games['home_is_winner'] = home_games['home_team_score'] > home_games['away_team_score']
home_by_season = (
    home_games.groupby(['season'])
    .agg({'home_is_winner': 'sum', 'id': 'count'})
    .reset_index()
)
home_by_season['perc'] = home_by_season['home_is_winner'] / home_by_season['id']
hw = home_by_season['perc'].mean()

print(f'Averge Number of Games per Season: {round(num_games, 2)}')
print(f'Averge Score per Season: {round(avg_score, 2)}')
print(f'Averge Home Team Win Percentage per Season: {round(hw, 2)}')
from src.elo import EloRunner
elo_cols = ['str_event_id', 'season', 'date','neutral_site',  'home_team_id', 'home_team_score', 'away_team_id','away_team_score']

# Hyperparameters come from the per-sport configuration. This cell used to
# also assign data-derived values (k = int(num_games * 2) + 2,
# hfa = int(hw * 100) + 2) and hard-coded 25 / 75, but both pairs were
# overwritten before use, so only the effective assignment is kept.
k = ELO_HYPERPARAMETERS[sport]['k']
hfa = ELO_HYPERPARAMETERS[sport]['hfa']

# Team ids are passed to EloRunner under the *_team_name column names.
er = EloRunner(
    df=df[elo_cols].rename(columns={'home_team_id':'home_team_name','away_team_id':'away_team_name'}),
    allow_future=True,
    k=k,
    mean_elo=1505,
    home_field_advantage=hfa,
    width=800,
    preloaded_elos=None
)

elo_df = er.run_to_date()
# Restore the id column names, then attach the descriptive event columns.
elo_df = elo_df.rename(columns={'home_team_name':'home_team_id','away_team_name':'away_team_id'})
event_meta_cols = ['id','str_event_id','home_team_name','away_team_name','is_postseason','tournament_id','is_finished','datetime']
elo_df = pd.merge(elo_df, df[event_meta_cols], on=['str_event_id'])
elo_df['result'] = elo_df['home_team_score'] > elo_df['away_team_score']

# Evaluate finished games only, starting two seasons after the first stored
# season (presumably to let the ratings settle — confirm).
eval_df = elo_df.loc[((elo_df.is_finished == 1)&(elo_df.season >= START_SEASONS[sport] + 2))].copy()

eval_df['error'] = eval_df['home_elo_prob'].round(2) - eval_df['result']
eval_df['sq_error'] = (eval_df['error']) **2
# NOTE(review): points uses the signed error while sq_error is computed but
# never used — confirm whether 25 - 100 * sq_error was intended.
eval_df['points'] = 25 - (100 * eval_df['error'])

y_true = eval_df['result']
y_pred = eval_df['home_elo_prob']
points = eval_df['points'].sum() / eval_df.shape[0]
print(f"Model Score: {round(points,2)}")

report = _classification_evaluation(y_true, y_pred, k=k,hfa=hfa)
report

In [None]:
# Run the packaged Elo step for college basketball, handing it the events
# directory and the Elo directory (presumably the input and output
# locations — confirm in src.steps.elo_runner).
from src.steps.elo_runner import run_elo_for_sport
run_elo_for_sport('./data/events','./data/elo', ESPNSportTypes.COLLEGE_BASKETBALL)

In [None]:
# Load the per-season Elo parquet files. Frames are collected first and
# concatenated once, rather than re-copying the growing frame per season.
elo_frames = []
for sport in [ESPNSportTypes.COLLEGE_BASKETBALL]:
    current_season = find_year_for_season(sport)
    seasons = list(range(START_SEASONS[sport], current_season + 1))
    for season in seasons:
        fs_df = get_dataframe(f'./data/elo/{sport.value}/{season}.parquet')
        elo_frames.append(fs_df)
# pd.concat raises on an empty list, so fall back to an empty frame.
elo_df2 = pd.concat(elo_frames, ignore_index=True) if elo_frames else pd.DataFrame()
elo_df2['result'] = elo_df2['home_team_score'] > elo_df2['away_team_score']

# Evaluate finished games only, starting two seasons after the first stored
# season (presumably to let the ratings settle — confirm).
eval_df2 = elo_df2.loc[((elo_df2.is_finished == 1)&(elo_df2.season >= START_SEASONS[sport] + 2))].copy()

eval_df2['error'] = eval_df2['home_elo_prob'].round(2) - eval_df2['result']
eval_df2['sq_error'] = (eval_df2['error']) **2
# NOTE(review): points uses the signed error while sq_error is computed but
# never used — confirm whether 25 - 100 * sq_error was intended.
eval_df2['points'] = 25 - (100 * eval_df2['error'])

y_true = eval_df2['result']
y_pred = eval_df2['home_elo_prob']
points = eval_df2['points'].sum() / eval_df2.shape[0]
print(f"Model Score: {round(points,2)}")

# k and hfa are whatever the notebook currently holds; they are only recorded
# in the report rows.
report2 = _classification_evaluation(y_true, y_pred, k=k,hfa=hfa)
report2

In [None]:
# Show the 25 teams with the highest most-recent post-game Elo.
# df_rename_fold presumably stacks the home_/away_ columns into shared
# columns (team_id and elo_post are read from its result below).
elo_history_cols = [
    'season', 'datetime',
    'home_team_name', 'away_team_name',
    'home_team_id', 'away_team_id',
    'home_elo_pre', 'away_elo_pre',
    'home_elo_post', 'away_elo_post',
]
finished_games = elo_df.loc[elo_df.is_finished == 1]
folded_elo_df = df_rename_fold(finished_games[elo_history_cols], 'away_', 'home_')
# Chronological order makes the last row per team its latest game.
folded_elo_df = folded_elo_df.sort_values('datetime')
latest_per_team = folded_elo_df.groupby('team_id').nth(-1)
latest_per_team.sort_values(['elo_post'], ascending=False)[0:25]

In [None]:
from itertools import product

from src.elo import EloRunner
elo_cols = ['str_event_id', 'season', 'date','neutral_site',  'home_team_id', 'home_team_score', 'away_team_id','away_team_score']

# Grid-search the K-factor and home-field advantage. The loop uses its own
# variable names so it does not overwrite the notebook's k, hfa, er, elo_df,
# eval_df, report, y_true or y_pred that other cells read.
grid_meta = df[['id','str_event_id','home_team_name','away_team_name','is_postseason','tournament_id','is_finished','datetime']]
grid_first_season = START_SEASONS[sport] + 2

grid_reports = []
# product() iterates k in the outer position and hfa in the inner one.
for grid_k, grid_hfa in product([10, 15, 20, 25, 30], [55, 60, 65, 70, 75]):
    grid_runner = EloRunner(
        # A fresh input frame is built for every run; whether EloRunner
        # mutates its input is not visible from this notebook.
        df=df[elo_cols].rename(columns={'home_team_id':'home_team_name','away_team_id':'away_team_name'}),
        allow_future=True,
        k=grid_k,
        mean_elo=1505,
        home_field_advantage=grid_hfa,
        width=800,
        preloaded_elos=None
    )

    grid_elo_df = grid_runner.run_to_date()
    grid_elo_df = grid_elo_df.rename(columns={'home_team_name':'home_team_id','away_team_name':'away_team_id'})
    grid_elo_df = pd.merge(grid_elo_df, grid_meta, on=['str_event_id'])
    grid_elo_df['result'] = grid_elo_df['home_team_score'] > grid_elo_df['away_team_score']

    # Finished games only, starting two seasons after the first stored season.
    grid_eval_df = grid_elo_df.loc[(grid_elo_df.is_finished == 1) & (grid_elo_df.season >= grid_first_season)]

    grid_reports.append(
        _classification_evaluation(
            grid_eval_df['result'],
            grid_eval_df['home_elo_prob'],
            k=grid_k,
            hfa=grid_hfa,
        )
    )

# Concatenate once instead of growing the frame inside the loop.
reports = pd.concat(grid_reports, ignore_index=True)

In [None]:
# Accuracy rows of the grid search, sorted ascending so the best
# (k, hfa) combination is displayed last.
accuracy_rows = reports[reports['metric_name'] == 'accuracy']
accuracy_rows.sort_values('value')