In [None]:
%matplotlib inline
import matplotlib as pyplot
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Load the three source tables from local pickle files.
# NOTE(review): unpickling can execute arbitrary code; only read files that
# come from a trusted source.
gd2_df, covers_df, os_df = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('data/gd2_df.pkl', 'data/covers_df.pkl', 'data/os_df.pkl')
)

In [None]:
# Inner-join covers_df onto gd2_df: rows pair up when 'gid' matches and
# gd2_df's 'team_code' equals covers_df's 'team'.
join_keys = {'left_on': ['gid', 'team_code'],
             'right_on': ['gid', 'team']}
df = gd2_df.merge(covers_df, **join_keys)

In [None]:
# Parse the game date once and derive the calendar features from it.
game_dates = pd.to_datetime(df['date'])
df['date'] = game_dates
df['season'] = game_dates.dt.year
df['weekday'] = game_dates.dt.weekday

# Order each team's games chronologically; the shift()-based features built
# in later cells operate on rows in this order.
df = df.sort_values(by=['team_code', 'date'])

# Cast the count columns to int and the game score to float.
int_cols = ['batting_r', 'other_team_runs', 'wins', 'starter_w']
df[int_cols] = df[int_cols].astype(int)
df['starter_game_score'] = df['starter_game_score'].astype(float)

# Run margin of the row's team in this game.
df['run_diff'] = df['batting_r'] - df['other_team_runs']

In [None]:
# A negative money line marks the row's team as the favorite; zero or
# positive marks it as the underdog.
is_favorite = df['us_line'] < 0
is_underdog = df['us_line'] >= 0
team_won = df['batting_r'] > df['other_team_runs']

# 0/1 indicators for the role played and for winning in that role.
df['game_as_favorite'] = is_favorite.astype(int)
df['game_as_underdog'] = is_underdog.astype(int)

df['win_as_favorite'] = (is_favorite & team_won).astype(int)
df['win_as_underdog'] = (is_underdog & team_won).astype(int)

In [None]:
# Game-level columns (identical for both teams of a game).
game_cols = [
    'gid', 'status_ind', 'date', 'weekday',
    'clouds', 'hp_ump', 'temp', 'wind_dir', 'wind_speed',
    'over_under', 'league', 'away_time_diff', 'division_game', 'game_type',
]

# df has one row per team per game; keep the first row seen for each gid.
game_df = df.loc[:, game_cols].drop_duplicates(subset='gid')

In [None]:
# Convert the odds columns to numbers. Only rows holding a real value are
# cast; assigning the shorter result back aligns on the index, so rows that
# were empty strings or null end up as NaN.
for side in ['home', 'away']:
    col = '{}_money_line'.format(side)
    indexer = os_df[col].notnull() & (os_df[col] != '')
    os_df[col] = os_df.loc[indexer, col].astype(int)

col = 'home_spread'
indexer = os_df[col].notnull() & (os_df[col] != '')
os_df[col] = os_df.loc[indexer, col].astype(float)

# Column whitelist as a regex; '.' matches any non-empty column name, so
# every column is currently kept.
os_keep_re = '.'
os_df = os_df.filter(regex=os_keep_re)

# Left join: games without a matching os_df row keep NaN odds columns.
game_df = game_df.merge(os_df, on='gid', how='left')

In [None]:
team_group = ['season', 'team_code']
shift_cols = ['batting_avg', 'batting_obp', 'batting_ops', 'pitching_era', 'wins', 'loss']

# One grouping per (season, team), reused for every per-team feature below.
by_team_season = df.groupby(team_group)


def _prior_cumsum(series):
    """Running total of the rows strictly before the current one."""
    return series.shift().cumsum()


# Lag each stat by one game within the group so a row only carries values
# from the team's previous row.
team_df = by_team_season[shift_cols].transform(lambda s: s.shift()).astype(float)

# Runs scored in the current game (not lagged).
team_df['team_runs'] = df['batting_r']

games_played = team_df['wins'] + team_df['loss']
team_df['team_win_pct'] = team_df['wins'] / games_played

# Difference between the 'wins' value one row back and six rows back,
# divided by 5.
team_df['team_last_5_win_pct'] = by_team_season['wins'].transform(
    lambda s: (s.shift() - s.shift(6)) / 5)

team_df['season_complete_pct'] = games_played / 162

# Betting lines are copied as-is for the current game.
no_shift_cols = ['us_line', 'decimal_line']
team_df[no_shift_cols] = df[no_shift_cols]

# Lagged games-back columns, with gaps forward-filled inside each group.
gb_cols = ['games_back', 'games_back_wildcard']
team_df[gb_cols] = by_team_season[gb_cols].transform(lambda s: s.shift().ffill())

# Wins in a role divided by games in that role, over earlier rows only.
prior_fav_wins = by_team_season['win_as_favorite'].transform(_prior_cumsum)
prior_fav_games = by_team_season['game_as_favorite'].transform(_prior_cumsum)
team_df['season_win_pct_as_favorite'] = prior_fav_wins / prior_fav_games

prior_dog_wins = by_team_season['win_as_underdog'].transform(_prior_cumsum)
prior_dog_games = by_team_season['game_as_underdog'].transform(_prior_cumsum)
team_df['season_win_pct_as_underdog'] = prior_dog_wins / prior_dog_games

In [None]:
oth_group = ['season', 'team_code', 'other_team_code']

# Score of the previous meeting between the same two teams in the same
# season. Columns are selected with a list: the tuple form
# groupby(...)['a', 'b'] is no longer accepted by current pandas.
oth_df = (df.groupby(oth_group)[['batting_r', 'other_team_runs']]
            .transform(lambda x: x.astype(int).shift()))

oth_df = pd.concat([df[oth_group], oth_df], axis=1)

# The first meeting of a season has no previous score (NaN after the shift).
# Keep it as NaN rather than 0 so it is not counted as a loss in the
# running mean below.
has_prior_meeting = oth_df[['batting_r', 'other_team_runs']].notnull().all(axis=1)
oth_df['oth_win'] = ((oth_df['batting_r'] > oth_df['other_team_runs'])
                     .astype(float)
                     .where(has_prior_meeting))

# Running win rate over the earlier meetings only; NaN until at least one
# earlier meeting exists (expanding().mean() skips NaN observations).
team_df['season_win_pct_against_other_team'] = (oth_df.groupby(oth_group)['oth_win']
                                                      .transform(lambda x: x.expanding().mean()))

In [None]:
starter_group = ['season', 'team_code', 'starter_name_display_first_last']
starter_cols = ['starter_w', 'starter_l', 'starter_s_bb', 'starter_s_ip',
                'starter_s_h', 'starter_s_so', 'starter_era']

# One grouping per (season, team, starter), reused for every feature below.
by_starter = df.groupby(starter_group)

# Lag each stat by one row within the group so a row only carries values
# from that starter's previous row.
starter_df = by_starter[starter_cols].transform(lambda s: s.shift()).astype(float)

# Running home-run total over the starter's earlier rows.
starter_df['starter_s_hr'] = by_starter['starter_hr'].transform(
    lambda s: s.astype(int).shift().cumsum())

# Mean game score over the starter's earlier rows.
starter_df['starter_s_game_score'] = by_starter['starter_game_score'].transform(
    lambda s: s.shift().expanding().mean())

innings = starter_df['starter_s_ip']
decisions = starter_df['starter_w'] + starter_df['starter_l']

starter_df['starter_hr_ip'] = starter_df['starter_s_hr'] / innings

starter_df['starter_win_pct'] = starter_df['starter_w'] / decisions

# Difference between 'starter_w' one row back and six rows back, divided by 5.
starter_df['starter_last_5_win_pct'] = by_starter['starter_w'].transform(
    lambda s: (s.shift() - s.shift(6)) / 5)

# (walks + hits) per inning, and strikeouts per inning.
starter_df['starter_s_whip'] = (starter_df['starter_s_bb'] + starter_df['starter_s_h']) / innings

starter_df['starter_s_soip'] = starter_df['starter_s_so'] / innings

In [None]:
add_cols = ['team_code', 'starter_name_display_first_last']


def _side_frame(mask, name_template):
    """Build the per-game feature frame for one side of the matchup.

    mask selects the rows of df (and of the row-aligned team_df and
    starter_df) for that side; name_template is a format string applied to
    every feature column name. The 'gid' column is added last so it stays
    unprefixed.
    """
    side = pd.concat([df.loc[mask, add_cols], team_df[mask], starter_df[mask]], axis=1)
    side = side.rename(columns=lambda c: name_template.format(c))
    return pd.concat([df.loc[mask, 'gid'], side], axis=1)


hix = (df['home_away'] == 'home')
home_df = _side_frame(hix, 'home_{}')

aix = (df['home_away'] == 'away')
away_df = _side_frame(aix, 'away_{}')

In [None]:
# Pair the home and away feature rows of each game, then attach the
# game-level columns; both joins are inner joins on 'gid'.
matchup_df = pd.merge(home_df, away_df, on='gid')
train_df = pd.merge(game_df, matchup_df, on='gid')
# Zero-filling of missing values is left disabled:
# train_df = train_df.fillna(0)

In [None]:
# Columns that must not be fed to the model: identifiers, raw cumulative
# counts, and the current game's run totals (used to build the target).
ignore = ['gid', 'status_ind', 'date', 'home_team_code', 'home_wins', 'home_loss',
    'home_starter_name_display_first_last', 'home_starter_w',
    'home_starter_l', 'home_starter_s_bb', 'home_starter_s_ip',
    'home_starter_s_h', 'home_starter_s_so', 'home_starter_s_hr',
    'away_team_code', 'away_wins', 'away_loss', 'away_starter_name_display_first_last',
    'away_starter_w', 'away_starter_l', 'away_starter_s_bb', 'away_starter_s_ip',
    'away_starter_s_h', 'away_starter_s_so', 'away_starter_s_hr', 'home_team_runs', 'away_team_runs',
    'event_id', 'event_date', 'home_abbreviation', 'away_abbreviation']

# ignore = ignore + ['home_us_line', 'home_decimal_line', 'away_us_line', 'away_decimal_line']
# ignore = ignore + ['home_season_win_pct_against_other_team', 'away_season_win_pct_against_other_team']

# Columns handled as categorical features.
cats = ['weekday', 'clouds', 'hp_ump', 'wind_dir', 'league', 'game_type']

# Every remaining column is numeric. Filter in train_df's own column order
# instead of going through list(set(...)): set iteration order for strings
# changes between kernel sessions, which would reorder the feature matrix
# and make runs non-reproducible.
excluded = set(cats + ignore)
num = [c for c in train_df.columns if c not in excluded]

In [None]:
# Design matrix: numeric columns as floats plus one-hot encoded categoricals.
# columns=cats forces every listed column to be encoded; without it
# get_dummies only encodes object/category columns, so the integer 'weekday'
# would pass through as a plain number. The dummies are cast to float so the
# whole matrix has one numeric dtype.
x = pd.concat([train_df[num].astype(float),
               pd.get_dummies(train_df[cats], columns=cats).astype(float)], axis=1)

In [None]:
# Target: 1 when the home team scored more runs than the away team, else 0.
home_runs = train_df['home_team_runs'].astype(int)
away_runs = train_df['away_team_runs'].astype(int)
y = (home_runs > away_runs).astype(int)

In [None]:
# 10 contiguous, unshuffled folds; the same splitter is reused for xgb.cv and
# for the manual out-of-fold prediction loop.
kf = KFold(10, shuffle=False)

In [None]:
# Full data set in xgboost's native container, with named features.
dtrain = xgb.DMatrix(data=x, label=y, feature_names=x.columns)

In [None]:
# Booster settings: shallow trees with L1/L2 regularisation, scored by AUC.
param = {
    'max_depth': 2,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'reg_alpha': 1,
    'reg_lambda': 1,
    'min_child_weight': 5
}
# Cross-validate over the folds defined by kf. The splits are materialised
# into a list because xgb.cv expects a splitter object or a list of
# (train_idx, test_idx) tuples; kf.split(x) alone is a one-shot generator
# that is exhausted after a single pass over it.
history = xgb.cv(params=param, dtrain=dtrain, num_boost_round=100, folds=list(kf.split(x)),
                 metrics=['auc'], verbose_eval=1,
                 early_stopping_rounds=5)

In [None]:
best_auc = history['test-auc-mean'].max()
# history has one row per boosting round, labelled from 0, so the number of
# rounds needed to reach the best score is the row label plus one. The value
# is used as num_boost_round by the training cells that follow.
best_auc_iter = history['test-auc-mean'].idxmax() + 1
print("""naive accuracy: {}
best test auc: {} on {} boosting iterations""".format(y.mean(),
                                                      best_auc,
                                                      best_auc_iter))

In [None]:
# Out-of-fold predictions: for each fold, fit on the training rows and score
# the held-out rows, remembering which row positions were scored.
cv_preds = []
for fit_ix, holdout_ix in kf.split(x):
    fit_matrix = xgb.DMatrix(x.iloc[fit_ix], label=y.iloc[fit_ix],
                             feature_names=x.columns)
    holdout_matrix = xgb.DMatrix(x.iloc[holdout_ix], feature_names=x.columns)
    bst = xgb.train(params=param, dtrain=fit_matrix, num_boost_round=best_auc_iter)
    fold_result = {'cv_index': holdout_ix, 'cv_pred': bst.predict(holdout_matrix)}
    cv_preds.append(fold_result)

In [None]:
# Stack the per-fold predictions into one frame ordered by original row.
cv_df = pd.concat([pd.DataFrame(f) for f in cv_preds], ignore_index=True)
cv_df = cv_df.sort_values('cv_index')
# Index by the original row position so the label-aligned assignments below
# (y and train_df columns) stay correct even if the folds were shuffled.
cv_df.index = cv_df['cv_index'].values

cv_df['pred_class'] = (cv_df['cv_pred'] >= 0.5).astype(int)
cv_df['home_wins'] = y

# Ten equal-width bins covering the whole [0, 1] range. np.arange(0.0, 1.0, 0.1)
# stops at 0.9, which left every probability above 0.9 without a bin.
prob_bins = np.linspace(0.0, 1.0, 11)
cv_df['pred_bin'] = pd.cut(cv_df['cv_pred'], bins=prob_bins, include_lowest=True)

# Win probability implied by the home team's American money line:
# negative line L -> L / (L - 100); otherwise 100 / (L + 100).
cv_df['implied_probability'] = train_df['home_us_line'].apply(
    lambda line: line / (line - 100) if line < 0 else 100/(line+100))
cv_df['line_bin'] = pd.cut(cv_df['implied_probability'], bins=prob_bins, include_lowest=True)

print('cv accuracy: {}'.format(accuracy_score(y_pred=cv_df['pred_class'], y_true=y)))
print('cv auc: {}'.format(roc_auc_score(y, cv_df['cv_pred'])))

# Baseline: pick whichever side has the lower money line.
line_acc = accuracy_score((train_df['home_us_line'] < train_df['away_us_line']).astype(int), y)
line_auc = roc_auc_score(y, cv_df['implied_probability'])
print('line favorite accuracy: {}'.format(line_acc))
print('line favorite auc: {}'.format(line_auc))

# Calibration table: games and observed home-win rate per probability bin,
# for the model and for the betting line; keys label the two column pairs.
pd.concat([cv_df.groupby('pred_bin')['home_wins'].agg(['count', 'mean']),
           cv_df.groupby('line_bin')['home_wins'].agg(['count', 'mean'])],
          axis=1, keys=['model', 'line'])

In [None]:
# Append both sides' decimal and American lines to the prediction frame
# (aligned on the row index).
# NOTE(review): re-running this cell appends the columns again.
line_cols = ['home_decimal_line', 'away_decimal_line',
             'home_us_line', 'away_us_line']
cv_df = pd.concat([cv_df, train_df[line_cols]], axis=1)

In [None]:
# Predicted win probability times the decimal line, for each side.
cv_df['home_conf'] = cv_df['cv_pred'] * cv_df['home_decimal_line']
cv_df['away_conf'] = (1 - cv_df['cv_pred']) * cv_df['away_decimal_line']

# Back the home team only when its value is strictly higher; ties go to away.
bet_on_home = cv_df['home_conf'] > cv_df['away_conf']

cv_df['bet_conf'] = np.where(bet_on_home, cv_df['home_conf'], cv_df['away_conf'])
cv_df['bet_wins'] = np.where(bet_on_home, cv_df['home_wins'], 1 - cv_df['home_wins'])
cv_df['bet_decimal_line'] = np.where(bet_on_home,
                                     cv_df['home_decimal_line'],
                                     cv_df['away_decimal_line'])

# Decimal line when the chosen side wins, 0 when it loses.
cv_df['bet_return'] = cv_df['bet_wins'] * cv_df['bet_decimal_line']

# Highest-value bets first.
cv_df = cv_df.sort_values(by='bet_conf', ascending=False)

In [None]:
pyplot.rcParams.update({'figure.figsize': [14, 8]})
# Running mean of return and hit rate in the frame's current row order;
# values appear once at least 100 rows have accumulated.
running_means = cv_df[['bet_return', 'bet_wins']].expanding(min_periods=100).mean()
running_means.plot(use_index=False)

In [None]:
# Refit on the full data set with the round count chosen by cross-validation.
bst = xgb.train(param, dtrain, num_boost_round=best_auc_iter)

In [None]:
%matplotlib inline
pyplot.rcParams['figure.figsize'] = [14, 16]
xgb.plot_importance(bst, importance_type='gain')