## Imports

In [11]:
%%capture
import warnings
import math
import db_func
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook, show
import matplotlib.pyplot as plt
from datetime import date, datetime
from tqdm.notebook import tqdm
import re
from collections import defaultdict


warnings.filterwarnings('ignore')

try:
    %load_ext autotime
    %load_ext jupyterlab_notify
except:
    !pip3 install ipython-autotime
    %load_ext autotime

time: 863 ms (started: 2023-03-02 21:56:07 -05:00)


In [5]:
## Database Connection

time: 235 µs (started: 2023-03-02 21:50:27 -05:00)


In [6]:
conn = db_func.get_conn()

time: 317 ms (started: 2023-03-02 21:50:27 -05:00)


# Preprocessing


match_df: The final processed dataset to be used in the machine learning models 

In [26]:
match_query = '''SELECT
				m.match_id,  m.away_id, m.home_id,
				m.date, m.away_pts, m.home_pts, m.playoff_game,
				h_ml.home_ml, a_ml.away_ml,
				h_ps.home_spread, a_ps.away_spread,
				h_ps.home_ps_odds, a_ps.away_ps_odds,
				over.over, under.under, ou.spread
			FROM match AS m
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS home_ml,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 1 AND
					o.match_id = m.match_id AND
					o.team_id = m.home_id
				GROUP BY m.match_id
			) AS h_ml ON m.match_id = h_ml.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS away_ml,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 1 AND
					o.match_id = m.match_id AND
					o.team_id = m.away_id
				GROUP BY m.match_id
			) AS a_ml ON m.match_id = a_ml.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS home_ps_odds,
					AVG(spread) AS home_spread,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 2 AND
					o.match_id = m.match_id AND
					o.team_id = m.home_id
				GROUP BY m.match_id
			) AS h_ps ON m.match_id = h_ps.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS away_ps_odds,
					AVG(spread) AS away_spread,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 2 AND
					o.match_id = m.match_id AND
					o.team_id = m.away_id
				GROUP BY m.match_id
			) AS a_ps ON m.match_id = a_ps.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS under,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.over_under = 'under' AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS under ON m.match_id = under.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS over,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.over_under = 'over' AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS over ON m.match_id = over.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(spread) AS spread,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS ou ON m.match_id = ou.match_id
			WHERE date >= DATE('2021-10-29')
			ORDER BY date ASC
			'''

season_query = '''SELECT *
				FROM season'''

player_performance_query = '''SELECT p.*, m.date
							FROM player_performance as p, match as m
							WHERE m.match_id = p.match_id
							AND m.date >= DATE('2021-10-29')
							ORDER BY date ASC'''
team_query = '''SELECT * 
				FROM team_name'''

injury_query = '''SELECT i.* 
				FROM injury as i, match as m
				WHERE m.match_id = i.match_id
				AND m.date >= DATE('2021-10-29')
				ORDER BY m.date ASC'''

match_df = pd.read_sql(match_query, conn)
#match_df.set_index('match_id', inplace=True)
season_df = pd.read_sql(season_query, conn)
pp_df = pd.read_sql(player_performance_query, conn)
team_df = pd.read_sql(team_query, conn)
injury_df = pd.read_sql(injury_query, conn)
match_df['date'] = match_df['date'].map(lambda x: datetime(x.year, x.month, x.day))
pp_df['date'] = pp_df['date'].map(lambda x: datetime(x.year, x.month, x.day))
season_df['start_date'] =season_df['start_date'].map(lambda x: datetime(x.year, x.month, x.day))
season_df['end_date'] = season_df['end_date'].map(lambda x: datetime(x.year, x.month, x.day))

time: 3min 28s (started: 2023-03-02 22:14:34 -05:00)


In [27]:
def get_season(date):
    return season_df[(season_df['start_date'] <= date) &
                     (season_df['end_date'] >= date)]['season'].values[0]

time: 232 µs (started: 2023-03-02 22:18:03 -05:00)


In [28]:
match_df['season'] = match_df['date'].map(get_season)
pp_df['season'] = pp_df['date'].map(get_season)

time: 11 s (started: 2023-03-02 22:18:03 -05:00)


# Feature Engineering

## Basic stats with respect to the favorite (determined by bookies)

In [42]:
# favorite_df = defaultdict(list)
# p = re.compile('prev.*ema')
# match_df['home_movl'] = match_df['home_pts'] - match_df['away_pts']
# match_df['home_win'] = match_df['home_movl'].map(lambda x: 0 if x < 0 else 1)
# for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):    
#     favorite_won = False
#     if row['home_ml'] < row['away_ml']:
#         favorite_df['favorite_ml'].append(row['home_ml'])
#         favorite_df['underdog_ml'].append(row['away_ml'])
#         favorite_df['favorite_is_home'].append(1)
#         favorite_df['favorite_movl'].append(row['home_pts']-row['away_pts'])
#         favorite_df['point_spread'].append(abs(row['home_spread']))

#         if row['home_win']:
#             favorite_won = True
#     else:
#         if not row['home_win']:
#             favorite_won = True
#         favorite_df['favorite_ml'].append(row['away_ml'])
#         favorite_df['underdog_ml'].append(row['home_ml'])
#         favorite_df['favorite_is_home'].append(0)
#         favorite_df['favorite_movl'].append(row['away_pts']-row['home_pts'])
#         favorite_df['point_spread'].append(abs(row['away_spread']))

#     favorite_df['favorite_won'].append(1 if favorite_won else 0)

# favorite_df = pd.DataFrame(favorite_df)
# favorite_df['vig'] = 1/favorite_df['favorite_ml'] + 1/favorite_df['underdog_ml'] - 1
# favorite_df['favorite_implied'] = 1/favorite_df['favorite_ml'] - favorite_df['vig']/2
# favorite_df['underdog_implied'] = 1/favorite_df['underdog_ml'] - favorite_df['vig']/2

# match_df = pd.concat([match_df.reset_index(drop=True),
#                       favorite_df.reset_index(drop=True)],axis=1)
match_df['favorite_id'] = match_df.apply(lambda x: x['home_id'] if x['favorite_is_home'] else x['away_id'],axis=1)
match_df['underdog_id'] = match_df.apply(lambda x: x['home_id'] if not x['favorite_is_home'] else x['away_id'],axis=1)

print(match_df[['favorite_ml', 'home_ml', 'away_ml']][match_df.home_ml > match_df.away_ml])

      favorite_ml   home_ml   away_ml
5        1.818303  2.048750  1.818303
7        1.278492  3.910000  1.278492
17       1.631307  2.383750  1.631307
21       1.772190  2.116250  1.772190
22       1.528661  2.640000  1.528661
...           ...       ...       ...
1221     1.674758  2.464938  1.674758
1228     1.541282  2.608889  1.541282
1229     1.733700  2.173333  1.733700
1243     1.666389  2.345764  1.666389
1246     1.667990  2.305556  1.667990

[461 rows x 3 columns]
time: 18.1 ms (started: 2023-03-02 22:31:41 -05:00)


In [33]:
def get_prev_matches(date, team_id, match_df, opponent_id = 0):
    if opponent_id:
        return match_df[(match_df["date"] < date) &
                        (((match_df["favorite_id"] == team_id) & 
                          (match_df["underdog_id"] == opponent_id)) |
                         ((match_df["favorite_id"] == opponent_id) & 
                          (match_df["underdog_id"] == team_id)))]
    else:
        return match_df[(match_df["date"] < date) &
                    ((match_df["favorite_id"] == team_id) |
                     (match_df["underdog_id"] == team_id))]


time: 389 µs (started: 2023-03-02 22:23:39 -05:00)


In [39]:
match_df.favorite_id.describe()

count    1253.000000
mean        4.943336
std         3.859449
min         2.000000
25%         2.000000
50%         2.000000
75%        10.000000
max        10.000000
Name: favorite_id, dtype: float64

time: 6.39 ms (started: 2023-03-02 22:24:49 -05:00)


In [43]:
get_prev_matches('2022-06-16', 2,match_df)

Unnamed: 0,match_id,away_id,home_id,date,away_pts,home_pts,playoff_game,home_ml,away_ml,home_spread,...,underdog_ml,favorite_is_home,favorite_movl,point_spread,favorite_won,vig,favorite_implied,underdog_implied,favorite_id,underdog_id
12,20063,2,30,2021-10-30,112.0,115.0,0.0,1.777105,2.107500,-2.000000,...,2.107500,1,3.0,2.000000,1,0.037209,0.544109,0.455891,30,2
29,939,5,2,2021-11-01,128.0,114.0,0.0,1.704644,2.241250,-2.875000,...,2.241250,1,-14.0,2.875000,0,0.032812,0.570227,0.429773,2,5
39,14436,2,22,2021-11-03,92.0,79.0,0.0,3.433333,1.342341,7.111111,...,3.433333,0,13.0,7.111111,1,0.036229,0.726852,0.273148,2,22
48,10110,2,16,2021-11-04,95.0,78.0,0.0,1.382372,3.194444,-6.611111,...,3.194444,1,-17.0,6.611111,0,0.036438,0.705175,0.294825,16,2
66,4542,2,7,2021-11-06,104.0,107.0,0.0,1.615455,2.421111,-3.444444,...,2.421111,1,3.0,3.444444,1,0.032054,0.602994,0.397006,7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1247,5965,2,10,2022-06-02,120.0,108.0,0.0,1.596724,2.462500,-3.611111,...,2.462500,1,-12.0,3.611111,0,0.032374,0.610096,0.389904,10,2
1248,5966,2,10,2022-06-05,88.0,107.0,0.0,1.495142,2.727778,-4.722222,...,2.727778,1,19.0,4.722222,1,0.035432,0.651117,0.348883,10,2
1249,1280,10,2,2022-06-08,100.0,116.0,0.0,1.658921,2.338000,-4.375000,...,2.338000,1,16.0,4.375000,1,0.030517,0.587543,0.412457,2,10
1250,1271,10,2,2022-06-10,107.0,97.0,0.0,1.638972,2.367059,-4.176471,...,2.367059,1,-10.0,4.176471,0,0.032604,0.593837,0.406163,2,10


time: 15.8 ms (started: 2023-03-02 22:31:46 -05:00)


In [None]:
def get_win_streak(team_id, prev_matches,i):
    

In [44]:
def get_win_ratio(team_id, prev_matches, i):
    if len(prev_matches) < i: 
        return None
    prev_matches['res'] =  prev_matches.apply(lambda x:
                             1 if (x['favorite_id'] == team_id and x['favorite_won']) or 
                                      (x['underdog_id'] == team_id and not x['favorite_won'])        
                             else 0, axis=1)
    return prev_matches['res'].sum()/i    

time: 499 µs (started: 2023-03-02 22:32:31 -05:00)


In [46]:
window_sizes = [5,7]

for w in tqdm(window_sizes):
        match_df[f'past_{w}_favorite_win_ratio'] = match_df.groupby((lambda x: 
                                    get_win_ratio(x['favorite_id'], 
                                        get_prev_matches(x['date'], 
                                                         x['favorite_id'],
                                                         match_df
                                                        ).tail(w),
                                                    w), axis=1)
    match_df[f'past_{w}_underdog_win_ratio'] = match_df.apply(lambda x: 
                                    get_win_ratio(x['underdog_id'], 
                                        get_prev_matches(x['date'], 
                                                         x['underdog_id'],
                                                         match_df
                                                        ).tail(w),
                                                    w), axis=1)


  0%|          | 0/2 [00:00<?, ?it/s]

time: 4.5 s (started: 2023-03-02 22:32:48 -05:00)


In [50]:
#pd.set_option('max_columns', 2)
pd.set_option('min_rows', 20)


time: 611 µs (started: 2023-03-02 22:35:16 -05:00)


# Player Factors

In [54]:
def get_prev_player_match(date, player_id, pp_df):
    return pp_df[(pp_df['date'] < date) & 
                (pp_df['player_id'] == player_id)].tail(1)
def get_active_players(match_id, team_id, pp_df):
    return  pp_df[(pp_df['match_id'] == match_id) &
                      (pp_df['team_id'] == team_id) &
                  (pp_df['sp']>0)]

def get_complete_roster(match_id, team_id, match_df):
    return  pp_df[(pp_df['match_id'] == match_id) &
                      (pp_df['team_id'] == team_id)]

time: 728 µs (started: 2023-03-02 23:09:10 -05:00)


In [78]:
print(pp_df.columns)

Index(['player_id', 'match_id', 'team_id', 'sp', 'inactive', 'ts_pct',
       'efg_pct', 'threepar', 'ftr', 'orb_pct', 'drb_pct', 'trb_pct',
       'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ortg', 'drtg',
       'bpm', 'starter', 'fg', 'fga', 'fg_pct', 'threep', 'threepa',
       'threep_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl',
       'blk', 'tov', 'pf', 'pts', 'pm', 'date', 'season'],
      dtype='object')
time: 649 µs (started: 2023-03-03 00:25:51 -05:00)


In [221]:
window_sizes = [3,4,5,6,7]

gb = pp_df.groupby(['player_id'], group_keys=True)
for w in window_sizes:
    # features = ['sp', 'ts_pct',
    #    'efg_pct', 'threepar', 'ftr', 'orb_pct', 'drb_pct', 'trb_pct',
    #    'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ortg', 'drtg',
    #    'bpm', 'starter', 'fg', 'fga', 'fg_pct', 'threep', 'threepa',
    #    'threep_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl',
    #    'blk', 'tov', 'pf', 'pts', 'pm']
    
    f = ['sp']

    player_dict[1777]['rolling_sp']= player_dict[1777]['sp'].rolling(3).mean().shift(+1)

# filter fatigue dates
#sf.groupby(sf).filter(lambda x: x.sum() > 2)
player_dict[3]['date']

28      2021-10-29
545     2021-10-30
872     2021-11-01
1288    2021-11-03
1896    2021-11-05
2433    2021-11-07
2922    2021-11-10
3215    2021-11-11
3787    2021-11-13
4407    2021-11-15
           ...    
36171   2022-04-05
36532   2022-04-07
36841   2022-04-08
37312   2022-04-10
37756   2022-04-16
38029   2022-04-18
38200   2022-04-20
38488   2022-04-23
38670   2022-04-25
38873   2022-04-28
Name: date, Length: 83, dtype: datetime64[ns]

time: 6.18 ms (started: 2023-03-03 02:17:58 -05:00)


In [192]:
gb = pp_df[['match_id','player_id','sp']].groupby(['player_id']).rolling(3).mean().shift(+1)
gb.rename(columns={'sp':'rolling_sp'})
gb = pd.merge(gb.reset_index(), pp_df.reset_index(), how='inner',on=['player_id'])
print(gb[gb.player_id==9])

       player_id  level_1    match_id_x         sp_x  index  match_id_y  \
6889           9      376  17374.333333  1468.333333    376        9887   
6890           9      376  17374.333333  1468.333333    958        9890   
6891           9      376  17374.333333  1468.333333   1254        9893   
6892           9      376  17374.333333  1468.333333   1898       20047   
6893           9      376  17374.333333  1468.333333   2476        9897   
6894           9      376  17374.333333  1468.333333   3143        9898   
6895           9      376  17374.333333  1468.333333   3598        9901   
6896           9      376  17374.333333  1468.333333   3704       12893   
6897           9      376  17374.333333  1468.333333   4153        9903   
6898           9      376  17374.333333  1468.333333   5068        9907   
...          ...      ...           ...          ...    ...         ...   
14800          9    39639   7650.000000  1092.666667  38261       11741   
14801          9    39639

In [190]:
gb['sp'].rolling(3).mean().shift(+1).get_group(3)

KeyError: 'sp'

time: 19.8 ms (started: 2023-03-03 02:00:47 -05:00)


In [218]:
rolling = pp_df.groupby(['player_id']).rolling(3).mean().shift(+1)
rolling.loc[[3]]

Unnamed: 0_level_0,Unnamed: 1_level_0,match_id,team_id,sp,inactive,ts_pct,efg_pct,threepar,ftr,orb_pct,drb_pct,...,drb,trb,ast,stl,blk,tov,pf,pts,pm,season
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
3,28,,,,,,,,,,,...,,,,,,,,,,
3,545,,,,,,,,,,,...,,,,,,,,,,
3,872,,,,,,,,,,,...,,,,,,,,,,
3,1288,13415.666667,28.0,1098.666667,0.000000,0.252333,0.259000,0.185000,0.037000,5.800000,31.066667,...,5.000000,6.000000,1.000000,0.333333,0.333333,2.000000,2.000000,4.666667,-6.666667,2022.0
3,1896,13822.666667,28.0,1192.333333,0.000000,0.212333,0.185000,0.111000,0.074000,8.233333,32.600000,...,5.666667,7.333333,2.000000,0.333333,0.666667,2.000000,2.333333,4.000000,-4.666667,2022.0
3,2433,17487.000000,28.0,1222.666667,0.000000,0.353000,0.333333,0.148000,0.296333,11.966667,40.300000,...,6.666667,9.000000,1.666667,0.333333,0.333333,1.666667,2.333333,4.333333,-2.333333,2022.0
3,2922,19273.333333,28.0,1431.666667,0.000000,0.498333,0.493333,0.171667,0.357000,9.233333,34.966667,...,6.666667,8.666667,1.333333,0.333333,0.666667,1.000000,2.666667,8.000000,-3.000000,2022.0
3,3215,12913.000000,28.0,1400.333333,0.000000,0.499333,0.502667,0.171667,0.449667,8.500000,34.466667,...,6.666667,8.333333,1.000000,0.666667,1.333333,0.666667,3.000000,7.000000,-5.333333,2022.0
3,3787,11728.000000,28.0,1630.333333,0.000000,0.346000,0.313667,0.127333,0.294000,8.266667,28.300000,...,6.666667,8.666667,1.666667,1.333333,2.333333,0.666667,2.333333,6.666667,-4.000000,2022.0
3,4407,11741.666667,28.0,1085.333333,0.333333,0.163667,0.116667,0.066667,0.233333,6.933333,17.766667,...,4.333333,6.000000,1.666667,1.000000,2.000000,0.666667,1.666667,2.333333,0.333333,2022.0


time: 563 ms (started: 2023-03-03 02:13:51 -05:00)


In [None]:
fatigue, filter by dates

sf.groupby(sf).filter(lambda x: x.sum() > 2)

In [174]:
rolling.getgroup(1777)

AttributeError: 'DataFrame' object has no attribute 'getgroup'

time: 16.4 ms (started: 2023-03-03 01:54:38 -05:00)


## Team Factors

In [None]:
def

## FG%, 3P%, FT%, ORB, DRB, TRB, TOV, AST, STL, BLK, DRTG, ORTG, EFG 

In [68]:
d = defaultdict(list)
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
    favorite_players = get_active_players(row['match_id'], row['favorite_id'], pp_df)
    underdog_players = get_active_players(row['match_id'], row['underdog_id'], pp_df)
    # d['favorite_bpm'].append(favorite_players['bpm'].sum())
    # d['underdog_bpm'].append(underdog_players['bpm'].sum())
    # d['favorite_fga'].append(favorite_players['fga'].sum())
    # d['underdog_fga'].append(underdog_players['fga'].sum())
#     d['favorite_fg'].append(favorite_players['fg'].sum())
#     d['underdog_fg'].append(underdog_players['fg'].sum())
#     d['favorite_fg_pct'].append(favorite_players['fg_pct'].mean())
#     d['underdog_fg_pct'].append(underdog_players['fg_pct'].mean())
    
#     d['favorite_3p'].append(favorite_players['threep'].sum())
#     d['underdog_3p'].append(underdog_players['threep'].sum())
#     d['favorite_3pa'].append(favorite_players['threepa'].sum())
#     d['underdog_3pa'].append(underdog_players['threepa'].sum())
#     d['favorite_3p_pct'].append(favorite_players['threep_pct'].mean())
#     d['underdog_3p_pct'].append(underdog_players['threep_pct'].mean())
    
#     d['favorite_ft'].append(favorite_players['ft'].sum())
#     d['underdog_ft'].append(underdog_players['ft'].sum())
    d['favorite_fta'].append(favorite_players['fta'].sum())
    d['underdog_fta'].append(underdog_players['fta'].sum())
#     d['favorite_ft_pct'].append(favorite_players['ft_pct'].mean())
#     d['underdog_ft_pct'].append(underdog_players['ft_pct'].mean())
    
#     d['favorite_orb'].append(favorite_players['orb'].sum())
#     d['underdog_orb'].append(underdog_players['orb'].sum())
#     d['favorite_orb_pct'].append(favorite_players['orb_pct'].mean())
#     d['underdog_orb_pct'].append(underdog_players['orb_pct'].mean())
    
#     d['favorite_drb'].append(favorite_players['drb'].sum())
#     d['underdog_drb'].append(underdog_players['drb'].sum())
#     d['favorite_drb_pct'].append(favorite_players['drb_pct'].mean())
#     d['underdog_drb_pct'].append(underdog_players['drb_pct'].mean())
    
#     d['favorite_trb'].append(favorite_players['trb'].sum())
#     d['underdog_trb'].append(underdog_players['trb'].sum())
#     d['favorite_trb_pct'].append(favorite_players['trb_pct'].mean())
#     d['underdog_trb_pct'].append(underdog_players['trb_pct'].mean())
    
#     d['favorite_tov'].append(favorite_players['tov'].sum())
#     d['underdog_tov'].append(underdog_players['tov'].sum())
#     d['favorite_tov_pct'].append(favorite_players['tov_pct'].mean())
#     d['underdog_tov_pct'].append(underdog_players['tov_pct'].mean())
    
#     d['favorite_ast'].append(favorite_players['ast'].sum())
#     d['underdog_ast'].append(underdog_players['ast'].sum())
#     d['favorite_ast_pct'].append(favorite_players['ast_pct'].mean())
#     d['underdog_ast_pct'].append(underdog_players['ast_pct'].mean())
    
#     d['favorite_stl'].append(favorite_players['stl'].sum())
#     d['underdog_stl'].append(underdog_players['stl'].sum())
#     d['favorite_stl_pct'].append(favorite_players['stl_pct'].mean())
#     d['underdog_stl_pct'].append(underdog_players['stl_pct'].mean())
    
#     d['favorite_blk'].append(favorite_players['blk'].sum())
#     d['underdog_blk'].append(underdog_players['blk'].sum())
#     d['favorite_blk_pct'].append(favorite_players['blk_pct'].mean())
#     d['underdog_blk_pct'].append(underdog_players['blk_pct'].mean())
    
#     d['favorite_drtg'].append(favorite_players['drtg'].mean())
#     d['underdog_drtg'].append(underdog_players['drtg'].mean())
    
#     d['favorite_ortg'].append(favorite_players['ortg'].mean())
#     d['underdog_ortg'].append(underdog_players['ortg'].mean())
    
#     d['favorite_efg_pct'].append(favorite_players['efg_pct'].mean())
#     d['underdog_efg_pct'].append(underdog_players['efg_pct'].mean())
        
#     d['sp'].append(favorite_players['sp'].sum())

  0%|          | 0/1253 [00:00<?, ?it/s]

time: 1.04 s (started: 2023-03-02 23:37:46 -05:00)


In [69]:
df = pd.DataFrame(d)
match_df = pd.concat([match_df.reset_index(drop=True),
                      df.reset_index(drop=True)],axis=1)

time: 1.57 ms (started: 2023-03-02 23:37:49 -05:00)


In [70]:
match_df[['favorite_fta', 'favorite_ft']]

Unnamed: 0,favorite_fta,favorite_ft
0,13.0,9.0
1,22.0,15.0
2,41.0,31.0
3,16.0,10.0
4,19.0,19.0
5,25.0,24.0
6,10.0,9.0
7,25.0,20.0
8,19.0,18.0
9,8.0,4.0


time: 7.3 ms (started: 2023-03-02 23:37:56 -05:00)


In [None]:
## Possessions and Pace

In [72]:
match_df['favorite_possessions'] = \
    0.5 * ((match_df['favorite_fga'] + 0.4*match_df['favorite_fta'] - \
            1.07*(match_df['favorite_orb']/(match_df['favorite_orb'] + match_df['underdog_drb'])) * \
            (match_df['favorite_fga']-match_df['favorite_fg']) + match_df['favorite_tov']) + \
           (match_df['underdog_fga'] + 0.4*match_df['underdog_fta'] -\
           1.07 * (match_df['underdog_orb']/(match_df['underdog_orb'] + match_df['favorite_drb'])) * \
           (match_df['underdog_fga']-match_df['underdog_fg']) + match_df['underdog_tov']))

match_df['underdog_possessions'] = \
    0.5 * ((match_df['underdog_fga'] + 0.4*match_df['underdog_fta'] - \
            1.07*(match_df['underdog_orb']/(match_df['underdog_orb'] + match_df['favorite_drb'])) * \
            (match_df['underdog_fga']-match_df['underdog_fg']) + match_df['underdog_tov']) + \
           (match_df['favorite_fga'] + 0.4*match_df['favorite_fta'] -\
           1.07 * (match_df['favorite_orb']/(match_df['favorite_orb'] + match_df['underdog_drb'])) * \
           (match_df['favorite_fga']-match_df['favorite_fg']) + match_df['favorite_tov']))

match_df['favorite_pace'] = 48 * ((match_df['favorite_possessions'] + match_df['underdog_possessions']) / \
                          (2*(match_df['sp']/(60*5))))
match_df['underdog_pace'] = 48 * ((match_df['favorite_possessions'] + match_df['underdog_possessions']) / \
                          (2*(match_df['sp']/(60*5))))

time: 4.86 ms (started: 2023-03-02 23:47:13 -05:00)


In [73]:
match_df.pace

0        99.061423
1        94.417093
2       101.844706
3        99.483962
4        97.032143
5       102.279494
6        93.079746
7        97.606629
8        98.312557
9       101.078340
           ...    
1243     88.985226
1244     90.672364
1245     92.849808
1246     96.929448
1247     93.924278
1248     98.069967
1249     92.523355
1250     95.008789
1251     95.981395
1252     92.102250
Name: pace, Length: 1253, dtype: float64

time: 2.92 ms (started: 2023-03-02 23:47:23 -05:00)


In [75]:
match_df

Unnamed: 0,match_id,away_id,home_id,date,away_pts,home_pts,playoff_game,home_ml,away_ml,home_spread,...,favorite_efg_pct,underdog_efg_pct,sp,favorite_fga,underdog_fga,favorite_fta,underdog_fta,favorite_possessions,underdog_possessions,pace
0,4761,7,8,2021-10-29,75.0,106.0,0.0,1.600941,2.453750,-3.875000,...,0.542071,0.351214,14398.0,83.0,78.0,13.0,27.0,99.047665,99.047665,99.061423
1,18833,22,28,2021-10-29,109.0,110.0,0.0,1.231994,4.421250,-9.250000,...,0.598889,0.477800,14400.0,86.0,87.0,22.0,16.0,94.417093,94.417093,94.417093
2,1650,12,3,2021-10-29,98.0,105.0,0.0,1.249934,4.188750,-8.500000,...,0.406900,0.255917,14400.0,76.0,93.0,41.0,23.0,101.844706,101.844706,101.844706
3,16676,13,25,2021-10-29,92.0,111.0,0.0,1.604986,2.440000,-3.500000,...,0.626000,0.341364,14400.0,98.0,87.0,16.0,16.0,99.483962,99.483962,99.483962
4,10089,4,16,2021-10-29,99.0,114.0,0.0,1.365268,3.326250,-6.875000,...,0.424333,0.374889,14400.0,91.0,90.0,19.0,22.0,97.032143,97.032143,97.032143
5,12899,26,19,2021-10-29,113.0,109.0,0.0,2.048750,1.818303,1.500000,...,0.497556,0.475700,14399.0,80.0,98.0,25.0,25.0,102.272392,102.272392,102.279494
6,8697,6,14,2021-10-29,101.0,113.0,0.0,1.295749,3.766250,-8.062500,...,0.582000,0.369833,14400.0,84.0,87.0,10.0,22.0,93.079746,93.079746,93.079746
7,12898,20,19,2021-10-30,123.0,117.0,0.0,3.910000,1.278492,8.125000,...,0.504364,0.492000,14400.0,81.0,92.0,25.0,24.0,97.606629,97.606629,97.606629
8,15344,1,23,2021-10-30,94.0,122.0,0.0,1.626603,2.395000,-3.500000,...,0.538846,0.368857,14400.0,86.0,95.0,19.0,19.0,98.312557,98.312557,98.312557
9,6289,21,10,2021-10-30,82.0,103.0,0.0,1.120994,6.915000,-12.437500,...,0.551923,0.373846,14400.0,92.0,83.0,8.0,21.0,101.078340,101.078340,101.078340


time: 16.6 ms (started: 2023-03-02 23:47:46 -05:00)


## EMA Team stats

In [76]:
def ema(current, prev_ema, window_size, smoothing=2.0):
    k = smoothing / (1 + window_size)
    return current * k + prev_ema * (1-k)

time: 447 µs (started: 2023-03-02 23:50:40 -05:00)


In [None]:
def get_prev_team_sum(team_id, home_col, prev_matches):
    away_col = home_col.replace('home', 'away')
    prev_matches['res'] =  prev_matches.apply(lambda x:
                             x[home_col] if x['home_id'] == team_id
                             else x[away_col], axis=1)
    return prev_matches['res'].sum()

In [None]:
smoothing = 2
window_sizes = [3,4,5,6,7]

d = defaultdict(list)

for w in tqdm(range(len(window_sizes))):
    window_size = window_sizes[w]

    ema_favorite_features = \
        [(f'prev_favorite_pts_ema{window_size}',       f'post_favorite_pts_ema{window_size}'),
        (f'prev_favorite_bpm_ema{window_size}',       f'post_favorite_bpm_ema{window_size}'),
        (f'prev_favorite_fg_ema{window_size}',        f'post_favorite_fg_ema{window_size}'),
        (f'prev_favorite_fg_pct_ema{window_size}',    f'post_favorite_fg_pct_ema{window_size}'),
        (f'prev_favorite_3p_ema{window_size}',        f'post_favorite_3p_ema{window_size}'),
        (f'prev_favorite_3p_pct_ema{window_size}',    f'post_favorite_3p_pct_ema{window_size}'),
        (f'prev_favorite_ft_ema{window_size}',        f'post_favorite_ft_ema{window_size}'),
        (f'prev_favorite_ft_pct_ema{window_size}',    f'post_favorite_ft_pct_ema{window_size}'),
        (f'prev_favorite_orb_ema{window_size}',       f'post_favorite_orb_ema{window_size}'),
        (f'prev_favorite_orb_pct_ema{window_size}',   f'post_favorite_orb_pct_ema{window_size}'),
        (f'prev_favorite_drb_ema{window_size}',       f'post_favorite_drb_ema{window_size}'),
        (f'prev_favorite_drb_pct_ema{window_size}',   f'post_favorite_drb_pct_ema{window_size}'),
        (f'prev_favorite_trb_ema{window_size}',       f'post_favorite_trb_ema{window_size}'),
        (f'prev_favorite_trb_pct_ema{window_size}',   f'post_favorite_trb_pct_ema{window_size}'),
        (f'prev_favorite_tov_ema{window_size}',       f'post_favorite_tov_ema{window_size}'),
        (f'prev_favorite_tov_pct_ema{window_size}',   f'post_favorite_tov_pct_ema{window_size}'),
        (f'prev_favorite_ast_ema{window_size}',       f'post_favorite_ast_ema{window_size}'),
        (f'prev_favorite_ast_pct_ema{window_size}',   f'post_favorite_ast_pct_ema{window_size}'),
        (f'prev_favorite_stl_ema{window_size}',       f'post_favorite_stl_ema{window_size}'),
        (f'prev_favorite_stl_pct_ema{window_size}',   f'post_favorite_stl_pct_ema{window_size}'),
        (f'prev_favorite_blk_ema{window_size}',       f'post_favorite_blk_ema{window_size}'),
        (f'prev_favorite_blk_pct_ema{window_size}',   f'post_favorite_blk_pct_ema{window_size}'),
        (f'prev_favorite_drtg_ema{window_size}',      f'post_favorite_drtg_ema{window_size}'),
        (f'prev_favorite_ortg_ema{window_size}',      f'post_favorite_ortg_ema{window_size}'),
        (f'prev_favorite_efg_pct_ema{window_size}',   f'post_favorite_efg_pct_ema{window_size}'),
        (f'prev_favorite_pace_ema{window_size}',      f'post_favorite_pace_ema{window_size}')]

    ema_underdog_features = [(f[0].replace('favorite','underdog'), \
        f[1].replace('favorite','underdog')) for f in ema_favorite_features]
    ema_underdog_features = [(f[0].replace('favorite','underdog'), f[1].replace('favorite','underdog')) for f in ema_favorite_features]
    sma_favorite_features = [(f[0].replace('ema','sma'), f[1].replace('ema','sma')) for f in ema_favorite_features]
    sma_underdog_features = [(f[0].replace('favorite','underdog'), f[1].replace('favorite','underdog')) for f in sma_favorite_features]

    for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
        prev_favorite_matches = get_prev_matches(row['date'], \
            row['favorite_id'], match_df).tail(window_size)
        prev_underdog_matches = get_prev_matches(row['date'], \
            row['underdog_id'], match_df).tail(window_size)
        len_prev_favorite_matches = len(prev_favorite_matches)
        len_prev_underdog_matches = len(prev_underdog_matches)
        for i in range(len(ema_favorite_features)):
            favorite_feature = re.findall('favorite_.*_ema', ema_favorite_features[i][0])[0].replace('_ema', '')
            underdog_feature = favorite_feature.replace('favorite', 'underdog') 

            if not prev_favorite_matches.empty:
                prev_match = prev_favorite_matches.iloc[-1:]
                d[sma_favorite_features[i][0]].append(get_prev_team_sum(row['favorite_id'], \
                    favorite_feature, prev_favorite_matches)/len_prev_favorite_matches)

                if len_prev_favorite_matches < window_size:
                    d[ema_favorite_features[i][0]].append(d[sma_favorite_features[i][0]][-1]) 
                    d[ema_favorite_features[i][1]].append((d[sma_favorite_features[i][0][-1]] \
                        * len_prev_favorite_matches + row[favorite_feature])/(len_prev_favorite_matches + 1))
                else:
                    d[ema_favorite_features[i][0]].append(prev_match[ema_favorite_features[i][1]] \
                                        if prev_match['favorite_id'].values[0] == row['favorite_id'] \
                                        else prev_match[ema_underdog_features[i][1]])

                    d[ema_favorite_features[i][1]].append(ema(row[favorite_feature],  
                                        d[ema_favorite_features[i][0]][-1], 
                                        window_size))
            else:
                d[ema_favorite_features[i][1]].append(row[favorite_feature])


            if not prev_underdog_matches.empty:
                prev_match = prev_underdog_matches.iloc[-1:]
                d[sma_underdog_features[i][0]].append(get_prev_team_sum(row['underdog_id'], \
                    favorite_feature, prev_underdog_matches)/len_prev_underdog_matches)

                if len_prev_underdog_matches < window_size:
                    d[ema_underdog_features[i][0]].append(d[sma_underdog_features[i][0]][-1]) 
                    d[ema_underdog_features[i][1]].append((d[sma_underdog_features[i][0]][-1] \
                        * len_prev_underdog_matches + row[underdog_feature])/(len_prev_underdog_matches + 1))
                else:
                    d[ema_underdog_features[i][0]].append(prev_match[ema_favorite_features[i][1]] \
                                if prev_match['favorite_id'].values[0] == row['favorite_id'] \
                                else prev_match[ema_underdog_features[i][1]])

                    d[ema_underdog_features[i][1]].append(ema(row[underdog_feature],  
                                            d[ema_underdog_features[i][0]][-1], 
                                            window_size))
            else:
                d[ema_underdog_features[i][1]].append(row[underdog_feature])



In [None]:
df = pd.DataFrame(d)
match_df = pd.concat([match_df.reset_index(drop=True),
                      df.reset_index(drop=True)],axis=1)


# Fatigue Factors