## Imports

In [1]:
%%capture
import warnings
import math
import db_func
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook, show
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
from tqdm.notebook import tqdm
import re
from collections import defaultdict, ChainMap
import multiprocessing
from functools import partial
import numpy as np
from itertools import repeat

warnings.filterwarnings('ignore')

try:
    %load_ext autotime
    %load_ext jupyterlab_notify
except:
    !pip3 install ipython-autotime
    %load_ext autotime

time: 1.11 s (started: 2024-02-13 18:07:11 -05:00)


In [2]:
#pd.set_option('max_columns', 2)
#pd.set_option('min_rows', 20)

time: 264 µs (started: 2023-03-05 03:37:12 -05:00)


## Database Connection

In [3]:
conn = db_func.get_conn()

time: 380 ms (started: 2023-03-05 03:37:12 -05:00)


# Preprocessing


match_df: The final processed dataset to be used in the machine learning models 

In [4]:
match_query = '''SELECT
				m.match_id,  m.away_id, m.home_id,
				m.date, m.away_pts, m.home_pts, m.playoff_game,
				h_ml.home_ml, a_ml.away_ml,
				h_ps.home_spread, a_ps.away_spread,
				h_ps.home_ps_odds, a_ps.away_ps_odds,
				over.over, under.under, ou.spread
			FROM match AS m
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS home_ml,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 1 AND
					o.match_id = m.match_id AND
					o.team_id = m.home_id
				GROUP BY m.match_id
			) AS h_ml ON m.match_id = h_ml.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS away_ml,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 1 AND
					o.match_id = m.match_id AND
					o.team_id = m.away_id
				GROUP BY m.match_id
			) AS a_ml ON m.match_id = a_ml.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS home_ps_odds,
					AVG(spread) AS home_spread,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 2 AND
					o.match_id = m.match_id AND
					o.team_id = m.home_id
				GROUP BY m.match_id
			) AS h_ps ON m.match_id = h_ps.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS away_ps_odds,
					AVG(spread) AS away_spread,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 2 AND
					o.match_id = m.match_id AND
					o.team_id = m.away_id
				GROUP BY m.match_id
			) AS a_ps ON m.match_id = a_ps.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS under,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.over_under = 'under' AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS under ON m.match_id = under.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS over,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.over_under = 'over' AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS over ON m.match_id = over.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(spread) AS spread,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS ou ON m.match_id = ou.match_id
			WHERE date >= DATE('2007-10-30')
			ORDER BY date ASC
			'''

season_query = '''SELECT *
				FROM season'''

player_performance_query = '''SELECT p.*, m.date
							FROM player_performance as p, match as m
							WHERE m.match_id = p.match_id
							AND m.date >= DATE('2007-10-30')
							ORDER BY date ASC'''
team_query = '''SELECT * 
				FROM team_name'''

injury_query = '''SELECT i.* 
				FROM injury as i, match as m
				WHERE m.match_id = i.match_id
				AND m.date >= DATE('2007-10-30')
				ORDER BY m.date ASC'''

match_df = pd.read_sql(match_query, conn)
season_df = pd.read_sql(season_query, conn)
pp_df = pd.read_sql(player_performance_query, conn)
team_df = pd.read_sql(team_query, conn)
injury_df = pd.read_sql(injury_query, conn)
match_df['date'] = match_df['date'].map(lambda x: datetime(x.year, x.month, x.day))
pp_df['date'] = pp_df['date'].map(lambda x: datetime(x.year, x.month, x.day))
season_df['start_date'] =season_df['start_date'].map(lambda x: datetime(x.year, x.month, x.day))
season_df['end_date'] = season_df['end_date'].map(lambda x: datetime(x.year, x.month, x.day))

time: 6min 13s (started: 2023-03-05 03:37:12 -05:00)


In [5]:
def get_season(date):
    return season_df[(season_df['start_date'] <= date) &
                     (season_df['end_date'] >= date)]['season'].values[0]

time: 200 µs (started: 2023-03-05 03:43:26 -05:00)


In [6]:
match_df['season'] = match_df['date'].map(get_season)
pp_df['season'] = pp_df['date'].map(get_season)

time: 2min 17s (started: 2023-03-05 03:43:26 -05:00)


# Feature Engineering

## Basic stats with respect to the favorite (determined by bookies)

In [7]:
favorite_df = defaultdict(list)
p = re.compile('prev.*ema')
match_df['home_movl'] = match_df['home_pts'] - match_df['away_pts']
match_df['home_win'] = match_df['home_movl'].map(lambda x: 0 if x < 0 else 1)
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):    
    favorite_won = False
    if row['home_ml'] < row['away_ml']:
        favorite_df['favorite_ml'].append(row['home_ml'])
        favorite_df['underdog_ml'].append(row['away_ml'])
        favorite_df['favorite_is_home'].append(1)
        favorite_df['favorite_movl'].append(row['home_pts']-row['away_pts'])
        favorite_df['point_spread'].append(abs(row['home_spread']))
        favorite_df['favorite_pts'].append(row['home_pts'])
        favorite_df['underdog_pts'].append(row['away_pts'])

        if row['home_win']:
            favorite_won = True
    else:
        if not row['home_win']:
            favorite_won = True
        favorite_df['favorite_ml'].append(row['away_ml'])
        favorite_df['underdog_ml'].append(row['home_ml'])
        favorite_df['favorite_is_home'].append(0)
        favorite_df['favorite_movl'].append(row['away_pts']-row['home_pts'])
        favorite_df['point_spread'].append(abs(row['away_spread']))
        favorite_df['favorite_pts'].append(row['away_pts'])
        favorite_df['underdog_pts'].append(row['home_pts'])


    favorite_df['favorite_won'].append(1 if favorite_won else 0)

favorite_df = pd.DataFrame(favorite_df)
favorite_df['vig'] = 1/favorite_df['favorite_ml'] + 1/favorite_df['underdog_ml'] - 1
favorite_df['favorite_implied'] = 1/favorite_df['favorite_ml'] - favorite_df['vig']/2
favorite_df['underdog_implied'] = 1/favorite_df['underdog_ml'] - favorite_df['vig']/2

match_df = pd.concat([match_df.reset_index(drop=True),
                      favorite_df.reset_index(drop=True)],axis=1)
match_df['favorite_id'] = match_df.apply(lambda x: x['home_id'] if x['favorite_is_home']==1 else x['away_id'],axis=1)
match_df['underdog_id'] = match_df.apply(lambda x: x['home_id'] if x['favorite_is_home']==0 else x['away_id'],axis=1)


  0%|          | 0/19158 [00:00<?, ?it/s]

time: 900 ms (started: 2023-03-05 03:45:43 -05:00)


In [8]:
def get_prev_matches(date, team_id, match_df, opponent_id = 0):
    if opponent_id:
        return match_df[(match_df["date"] < date) &
                        (((match_df["favorite_id"] == team_id) & 
                          (match_df["underdog_id"] == opponent_id)) |
                         ((match_df["favorite_id"] == opponent_id) & 
                          (match_df["underdog_id"] == team_id)))]
    else:
        return match_df[(match_df["date"] < date) &
                    ((match_df["favorite_id"] == team_id) |
                     (match_df["underdog_id"] == team_id))]


time: 352 µs (started: 2023-03-05 03:45:44 -05:00)


In [9]:
match_df.favorite_id.describe()

count    19158.000000
mean        15.339806
std          8.668143
min          1.000000
25%          8.000000
50%         15.000000
75%         23.000000
max         30.000000
Name: favorite_id, dtype: float64

time: 8.95 ms (started: 2023-03-05 03:45:44 -05:00)


In [10]:
def get_win_ratio(team_id, prev_matches, i):
    if len(prev_matches) < i: 
        return None
    prev_matches['res'] =  prev_matches.apply(lambda x:
                             1 if (x['favorite_id'] == team_id and x['favorite_won']) or 
                                      (x['underdog_id'] == team_id and not x['favorite_won'])        
                             else 0, axis=1)
    return prev_matches['res'].sum()/i    

time: 280 µs (started: 2023-03-05 03:45:44 -05:00)


In [11]:
window_sizes = [5,8,12]

for w in tqdm(window_sizes):
    match_df[f'past_{w}_favorite_win_ratio'] = match_df.apply(lambda x: 
                                    get_win_ratio(x['favorite_id'], 
                                        get_prev_matches(x['date'], 
                                                         x['favorite_id'],
                                                         match_df
                                                        ).tail(w),
                                                    w), axis=1)
    match_df[f'past_{w}_underdog_win_ratio'] = match_df.apply(lambda x: 
                                    get_win_ratio(x['underdog_id'], 
                                        get_prev_matches(x['date'], 
                                                         x['underdog_id'],
                                                         match_df
                                                        ).tail(w),
                                                    w), axis=1)


  0%|          | 0/3 [00:00<?, ?it/s]

time: 1min 44s (started: 2023-03-05 03:45:44 -05:00)


# Player Factors

In [12]:
def get_prev_player_match(date, player_id, pp_df):
    return pp_df[(pp_df['date'] < date) & 
                (pp_df['player_id'] == player_id)].tail(1)
def get_active_players(match_id, team_id, pp_df):
    return  pp_df[(pp_df['match_id'] == match_id) &
                      (pp_df['team_id'] == team_id) &
                  (pp_df['sp']>0)]

def get_complete_roster(match_id, team_id, match_df):
    return  pp_df[(pp_df['match_id'] == match_id) &
                      (pp_df['team_id'] == team_id)]

time: 378 µs (started: 2023-03-05 03:47:29 -05:00)


## Define Stats per minute played

In [13]:
pm_stats = ['pts', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk','tov', 'pf']

for f in pm_stats:
    pp_df[f'{f}_avg'] = pp_df.apply(lambda x: x[f]*60/x['sp'] if x['sp']>0 else 0, axis=1)

avg_stats = [f'{f}_avg' for f in pm_stats]

time: 37.1 s (started: 2023-03-05 03:47:29 -05:00)


## EWM Player Stats

In [14]:
def get_player_dict(gb):
    window_sizes = [10,20,40]
    player_dict = dict(tuple(gb))
    
    features = ['ts_pct',
       'efg_pct', 'threepar', 'ftr', 'orb_pct', 'drb_pct', 'trb_pct',
       'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ortg', 'drtg',
       'fg', 'fga', 'fg_pct', 'threep', 'threepa',
       'threep_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl',
       'blk', 'tov', 'pf', 'pts', 'pts_avg', 'orb_avg', 'drb_avg', 'trb_avg',
       'ast_avg', 'stl_avg', 'blk_avg', 'tov_avg', 'pf_avg']
    n_features = len(features)
    for w in window_sizes:
        ewm_features = [f'{f}_ewm_{w}' for f in features]
        for p in player_dict:
            # set the index to date for use in finding variable length windows later
            for i in range(n_features):
                player_dict[p].loc[player_dict[p].sp > 0, ewm_features[i]] = player_dict[p].loc[player_dict[p].sp > 0, features[i]].ewm(span=w,min_periods=0,adjust=False,ignore_na=False).mean().shift(+1).values.tolist()
            player_dict[p].index = player_dict[p]['date']
            player_dict[p].fillna(method='ffill', inplace=True)
    return player_dict

time: 619 µs (started: 2023-03-05 03:48:06 -05:00)


In [3]:
def parallelize_gb(pp_df, func):
    num_cores = multiprocessing.cpu_count()-1  #leave one free to not freeze machine
    num_partitions = num_cores #number of partitions to split dataframe
    gb = pp_df.groupby(['player_id'], group_keys=True)
    df_split = np.array_split(gb, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    list_dict = pool.map(func, df_split)
    res = dict(ChainMap(*list_dict))
    pool.close()
    pool.join()
    return res

player_dict = parallelize_gb(pp_df, get_player_dict)

NameError: name 'get_player_dict' is not defined

time: 215 ms (started: 2023-03-06 14:50:59 -05:00)


In [16]:
player_dict[3]

Unnamed: 0_level_0,player_id,match_id,team_id,sp,inactive,ts_pct,efg_pct,threepar,ftr,orb_pct,...,pts_ewm_40,pts_avg_ewm_40,orb_avg_ewm_40,drb_avg_ewm_40,trb_avg_ewm_40,ast_avg_ewm_40,stl_avg_ewm_40,blk_avg_ewm_40,tov_avg_ewm_40,pf_avg_ewm_40
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-12-23,3,14920,16,819.0,0.0,0.694,0.750,0.000,1.000,9.5,...,,,,,,,,,,
2020-12-25,3,10398,16,1168.0,0.0,0.698,0.714,0.000,0.286,7.3,...,8.000000,0.586081,0.073260,0.146520,0.219780,0.000000,0.000000,0.000000,0.000000,0.146520
2020-12-29,3,10389,16,1058.0,0.0,0.563,0.625,0.000,0.250,0.0,...,8.146341,0.585056,0.072192,0.144385,0.216577,0.005012,0.000000,0.002506,0.002506,0.154408
2020-12-30,3,10386,16,721.0,0.0,0.000,0.000,0.000,0.000,29.2,...,8.236764,0.584180,0.068671,0.148407,0.217078,0.004767,0.002766,0.002384,0.007916,0.155175
2021-01-01,3,4224,16,719.0,0.0,0.753,0.750,0.000,1.500,16.0,...,7.834970,0.555684,0.077499,0.149286,0.226785,0.004535,0.002631,0.006327,0.019708,0.155724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-18,3,15112,28,1739.0,0.0,0.415,0.375,0.125,0.125,7.0,...,10.341972,0.446731,0.066428,0.161688,0.228117,0.046173,0.018711,0.025121,0.055287,0.086776
2022-04-20,3,18533,28,2133.0,0.0,0.842,0.909,0.182,0.182,6.5,...,10.178949,0.436721,0.066554,0.160533,0.227087,0.043921,0.019481,0.023896,0.052590,0.087592
2022-04-23,3,18528,28,1465.0,0.0,0.363,0.333,0.333,0.333,8.2,...,10.658024,0.442861,0.066052,0.158191,0.224243,0.043151,0.018531,0.024103,0.055513,0.086064
2022-04-25,3,15109,28,1606.0,0.0,0.715,0.682,0.455,0.182,5.6,...,10.382023,0.431247,0.066825,0.154470,0.221296,0.043044,0.017627,0.022927,0.058799,0.083864


time: 18.6 ms (started: 2023-03-05 03:48:19 -05:00)


## Player Fatigue

Seconds played over the last 2 weeks

In [17]:
def get_player_fatigue(x,w,p):
    window = player_dict[p].loc[x.date - timedelta(days=w):x.date - timedelta(days=1)]
    ewm_sp = np.nan
    if len(window):
        #set ewm span=7. A player is unlikely to play more than 7 games in 14 days.
        window['ewm_sp'] = window['sp'].ewm(span=7,min_periods=0,adjust=False,ignore_na=False).mean()
        ewm_sp = window.iloc[-1]['ewm_sp']
    return ewm_sp

window_sizes = [10, 14]

for w in tqdm(window_sizes):
    for p in tqdm(player_dict):
        player_dict[p][f'sp_ewm_{w}'] = player_dict[p].apply(lambda x: get_player_fatigue(x,w+1, p), axis=1)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1708 [00:00<?, ?it/s]

  0%|          | 0/1708 [00:00<?, ?it/s]

time: 8min 37s (started: 2023-03-05 03:48:19 -05:00)


In [18]:
player_dict[3]['sp_ewm_14']

date
2020-12-23            NaN
2020-12-25     819.000000
2020-12-29     906.250000
2020-12-30     944.187500
2021-01-01     888.390625
                 ...     
2022-04-18    1485.213867
2022-04-20    1581.230469
2022-04-23    1502.988281
2022-04-25    1687.843750
2022-04-28    1702.503906
Name: sp_ewm_14, Length: 164, dtype: float64

time: 1.75 ms (started: 2023-03-05 03:56:56 -05:00)


In [19]:
match_df['n_f_players'] = match_df.apply(lambda row: len(get_active_players(row['match_id'], row['favorite_id'], pp_df)), axis=1)
match_df['n_u_players'] = match_df.apply(lambda row: len(get_active_players(row['match_id'], row['underdog_id'], pp_df)), axis=1)

time: 29.9 s (started: 2023-03-05 03:56:56 -05:00)


In [20]:
match_df['n_f_players'][match_df['n_f_players']<8]
match_df['n_u_players'][match_df['n_u_players']<8]

304      7
915      7
1009     7
1495     7
1551     7
1569     7
1614     7
1620     7
1632     7
1678     7
1717     7
1810     7
1898     7
2124     7
2261     7
2424     7
2427     7
2440     7
2463     7
2494     7
2504     7
2523     7
2532     7
2751     7
2781     7
2837     6
3051     7
3843     7
5004     7
5152     7
6232     7
7140     7
7534     7
7594     7
8242     7
8861     7
10008    7
10178    6
11491    7
11500    7
12807    7
12816    7
12878    7
14198    7
14200    7
16280    7
16662    7
16800    7
17557    7
17646    7
17718    7
17742    6
18484    7
18504    7
19044    6
19057    6
19062    7
Name: n_u_players, dtype: int64

time: 1.91 ms (started: 2023-03-05 03:57:26 -05:00)


## Insert player factors into match dataframe

Most NBA teams rotate at least 8 players. 
We will use the top 8 players in terms of time played as features

In [21]:
def insert_player_features(match_df, player_dict, pp_df):
    window_sizes = [10,20,40]
    d = defaultdict(list)
    player_features = ['ts_pct',
       'efg_pct', 'threepar', 'ftr', 'orb_pct', 'drb_pct', 'trb_pct',
       'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ortg', 'drtg',
       'fg', 'fga', 'fg_pct', 'threep', 'threepa',
       'threep_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl',
       'blk', 'tov', 'pf', 'pts', 'pts_avg', 'orb_avg', 'drb_avg', 'trb_avg',
       'ast_avg', 'stl_avg', 'blk_avg', 'tov_avg', 'pf_avg']
    err = 0
    n_features = len(player_features)
    for idx, row in match_df.iterrows():
        match_id = row['match_id']
        favorite_players = get_active_players(row['match_id'], row['favorite_id'], pp_df).sort_values(by=['sp'], ascending=False).head(8)
        underdog_players = get_active_players(row['match_id'], row['underdog_id'], pp_df).sort_values(by=['sp'], ascending=False).head(8)
        if favorite_players.shape[0] < 8 or underdog_players.shape[0] < 8:
            err += 1
        for w in window_sizes:
            # ewm_features = [f'{f}_ewm_{w}' for f in player_features]
#             for i in range(8):
#                 fav_player_ewm_features = [f'fav_p{i}_{f}_ewm_{w}' for f in player_features]
#                 und_player_ewm_features = [f'und_p{i}_{f}_ewm_{w}' for f in player_features]
#                 if favorite_players.shape[0] >= 8 and underdog_players.shape[0] >= 8:
#                     fav_player_id = favorite_players.iloc[i]['player_id']
#                     und_player_id = underdog_players.iloc[i]['player_id']

#                     fav_player_df = player_dict[fav_player_id]
#                     und_player_df = player_dict[und_player_id]
#                 for j in range(n_features):
#                     fav_feature = fav_player_ewm_features[j]
#                     und_feature = und_player_ewm_features[j]
                    
#                     if favorite_players.shape[0] < 8 or underdog_players.shape[0] < 8:
#                         d[fav_feature].append(np.nan)
#                         d[und_feature].append(np.nan)
#                     else:
#                         d[fav_feature].append(fav_player_df[fav_player_df['match_id'] == match_id][ewm_features[j]].values[0])
#                         d[und_feature].append(und_player_df[und_player_df['match_id'] == match_id][ewm_features[j]].values[0])
                for j in [10,14]:
                    if favorite_players.shape[0] < 8 or underdog_players.shape[0] < 8:
                        d[f'sp_ewm_{j}'].append(np.nan)
                        d[f'sp_ewm_{j}'].append(np.nan)
                    else:
                        d[f'fav_p{i}_sp_ewm_{j}'].append(fav_player_df[fav_player_df['match_id'] == match_id][f'sp_ewm_{j}'].values[0])
                        d[f'und_p{i}_sp_ewm_{j}'].append(und_player_df[und_player_df['match_id'] == match_id][f'sp_ewm_{j}'].values[0])
    df = pd.DataFrame(d)
    
    new_df = pd.concat([match_df.reset_index(drop=True),
                      df.reset_index(drop=True)],axis=1)
    print('err:', err)
    return new_df

time: 1.91 ms (started: 2023-03-05 03:57:26 -05:00)


In [22]:
insert_player_features(match_df.tail(100), player_dict, pp_df)

err: 1


Unnamed: 0,match_id,away_id,home_id,date,away_pts,home_pts,playoff_game,home_ml,away_ml,home_spread,...,fav_p7_ast_avg_ewm_40,und_p7_ast_avg_ewm_40,fav_p7_stl_avg_ewm_40,und_p7_stl_avg_ewm_40,fav_p7_blk_avg_ewm_40,und_p7_blk_avg_ewm_40,fav_p7_tov_avg_ewm_40,und_p7_tov_avg_ewm_40,fav_p7_pf_avg_ewm_40,und_p7_pf_avg_ewm_40
0,10011,2,15,2022-04-10,139.0,110.0,0.0,4.152222,1.250862,9.166667,...,0.048016,0.111106,0.034579,0.051510,0.014444,0.020868,0.075589,0.076093,0.102111,0.081044
1,4791,14,8,2022-04-10,146.0,141.0,0.0,1.420668,3.011111,-6.000000,...,0.099542,0.102980,0.035678,0.017965,0.023244,0.002443,0.162772,0.048768,0.234735,0.049644
2,6923,1,11,2022-04-10,130.0,114.0,0.0,6.341111,1.136773,11.888889,...,0.118023,0.089840,0.074180,0.070265,0.013626,0.101475,0.023061,0.030411,0.038348,0.110892
3,3093,30,4,2022-04-10,108.0,124.0,0.0,1.115298,7.080000,-12.722222,...,0.071830,0.085062,0.031664,0.013259,0.022134,0.094317,0.045249,0.033402,0.115354,0.045403
4,3364,17,6,2022-04-10,115.0,133.0,0.0,1.263102,4.042222,-8.388889,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5966,2,10,2022-06-05,88.0,107.0,0.0,1.495142,2.727778,-4.722222,...,0.071789,0.051152,0.044479,0.019747,0.015597,0.050313,0.031607,0.062771,0.075290,0.127350
96,1280,10,2,2022-06-08,100.0,116.0,0.0,1.658921,2.338000,-4.375000,...,0.119471,0.059651,0.022813,0.061109,0.007314,0.022243,0.019934,0.031978,0.091964,0.098992
97,1271,10,2,2022-06-10,107.0,97.0,0.0,1.638972,2.367059,-4.176471,...,0.118629,0.061014,0.021700,0.062400,0.006957,0.021158,0.023948,0.030418,0.087478,0.102709
98,5978,2,10,2022-06-13,94.0,104.0,0.0,1.605194,2.437778,-3.833333,...,0.138134,0.112842,0.027784,0.020641,0.013387,0.006618,0.091271,0.027626,0.099795,0.083211


time: 34.1 s (started: 2023-03-05 03:57:26 -05:00)


In [23]:
player_dict[1691]

Unnamed: 0_level_0,player_id,match_id,team_id,sp,inactive,ts_pct,efg_pct,threepar,ftr,orb_pct,...,orb_avg_ewm_40,drb_avg_ewm_40,trb_avg_ewm_40,ast_avg_ewm_40,stl_avg_ewm_40,blk_avg_ewm_40,tov_avg_ewm_40,pf_avg_ewm_40,sp_ewm_10,sp_ewm_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-20,1691,18005,27,0.0,1.0,0.000,0.0,0.0,0.000,0.0,...,,,,,,,,,,
2021-10-22,1691,4959,27,0.0,1.0,0.000,0.0,0.0,0.000,0.0,...,,,,,,,,,0.000000,0.000000
2021-10-23,1691,17970,27,0.0,1.0,0.000,0.0,0.0,0.000,0.0,...,,,,,,,,,0.000000,0.000000
2021-10-26,1691,17975,27,0.0,1.0,0.000,0.0,0.0,0.000,0.0,...,,,,,,,,,0.000000,0.000000
2021-10-28,1691,4559,27,0.0,1.0,0.000,0.0,0.0,0.000,0.0,...,,,,,,,,,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-05,1691,4824,27,198.0,0.0,0.000,0.0,1.0,0.000,0.0,...,0.032849,0.031504,0.064353,0.115104,0.054788,0.004230,0.004615,0.070837,186.121094,348.437500
2022-04-07,1691,11763,27,507.0,0.0,0.174,0.0,0.5,1.000,0.0,...,0.031246,0.044749,0.075995,0.109489,0.052115,0.004024,0.004390,0.067382,71.625000,310.828125
2022-04-09,1691,17764,27,1343.0,0.0,0.515,0.5,1.0,0.667,0.0,...,0.029722,0.048339,0.078061,0.104148,0.049573,0.003828,0.004176,0.075640,180.468750,268.568115
2022-04-10,1691,4298,27,527.0,0.0,0.000,0.0,0.0,0.000,0.0,...,0.028272,0.050340,0.078612,0.103427,0.047155,0.003641,0.003972,0.071951,471.101562,537.176086


time: 10.3 ms (started: 2023-03-05 03:58:00 -05:00)


In [24]:
def parallelize_dataframe(df, func, player_dict, pp_df):
    num_cores = multiprocessing.cpu_count()-1  #leave one free to not freeze machine
    num_partitions = num_cores #number of partitions to split dataframe
    df_split = np.array_split(df, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    list_df = pool.map(partial(func, player_dict = player_dict, pp_df = pp_df), df_split)
    df = pd.concat(list_df)
    pool.close()
    pool.join()
    return df

match_df = parallelize_dataframe(match_df, insert_player_features, player_dict, pp_df)

err: 9
err: 11
err: 11
err: 2
err: 2
err: 4
err: 1
err: 4
err: 1
err: 0
err: 3
err: 2
err: 2
err: 0
err: 2
err: 3
err: 2
err: 1
err: 3
err: 0
err: 2
err: 6
err: 5
time: 11min 6s (started: 2023-03-05 03:58:00 -05:00)


In [25]:
match_df['fav_p2_pf_avg_ewm_10']

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
         ...   
827    0.081345
828    0.102807
829    0.084706
830    0.079755
831    0.071826
Name: fav_p2_pf_avg_ewm_10, Length: 19158, dtype: float64

time: 4.37 ms (started: 2023-03-05 04:09:06 -05:00)


## Team Factors

## Team Fatigue(Away game streak) and Win Streak

In [26]:
d = defaultdict(list)
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
    prev_favorite_match = get_prev_matches(row['date'], \
        row['favorite_id'], match_df).tail(1)
    prev_underdog_match = get_prev_matches(row['date'], \
        row['underdog_id'], match_df).tail(1)
    if prev_favorite_match.shape[0] > 0:
        prev_f_win_streak = prev_favorite_match['post_favorite_win_streak'].values[0] if row['favorite_id'] == prev_favorite_match['favorite_id'].values[0] else prev_favorite_match['post_underdog_win_streak'].values[0]
        prev_f_home_streak = prev_favorite_match['post_favorite_home_streak'].values[0] if row['favorite_id'] == prev_favorite_match['favorite_id'].values[0] else prev_favorite_match['post_underdog_home_streak'].values[0]
    else:
        prev_f_win_streak = 0
        prev_f_home_streak = 0
        
    if prev_underdog_match.shape[0] > 0:
        prev_u_win_streak = prev_underdog_match['post_favorite_win_streak'].values[0] if row['underdog_id'] == prev_underdog_match['favorite_id'].values[0] else prev_underdog_match['post_underdog_win_streak'].values[0]
        prev_u_home_streak = prev_underdog_match['post_favorite_home_streak'].values[0] if row['underdog_id'] == prev_underdog_match['favorite_id'].values[0] else prev_underdog_match['post_underdog_home_streak'].values[0]
    else:
        prev_u_win_streak = 0
        prev_u_home_streak = 0
        
    if row['favorite_won']:
        f_win_streak = max(1,prev_f_win_streak+1)
        u_win_streak = min(-1, prev_u_win_streak-1)
    else:
        f_win_streak = min(-1, prev_f_win_streak-1)
        u_win_streak = max(1,prev_u_win_streak+1)

    if row['favorite_is_home']:
        f_home_streak = max(1, prev_f_home_streak+1)
        u_home_streak = min(-1, prev_u_home_streak-1)
    else:
        f_home_streak = min(-1, prev_f_home_streak-1)
        u_home_streak = max(1, prev_u_home_streak+1)
        
    match_df.at[idx, 'prev_favorite_win_streak'] = prev_f_win_streak
    match_df.at[idx, 'prev_favorite_home_streak'] = prev_f_home_streak
    match_df.at[idx, 'prev_underdog_win_streak'] = prev_u_win_streak
    match_df.at[idx, 'prev_underdog_home_streak'] = prev_u_home_streak
    
    match_df.at[idx, 'post_favorite_win_streak'] = f_win_streak
    match_df.at[idx, 'post_favorite_home_streak'] = f_home_streak
    match_df.at[idx, 'post_underdog_win_streak'] = u_win_streak
    match_df.at[idx, 'post_underdog_home_streak'] = u_home_streak

df = pd.DataFrame(d)
match_df = pd.concat([match_df.reset_index(drop=True),
                      df.reset_index(drop=True)],axis=1)

streak_features = ['prev_favorite_win_streak', 
                   'prev_favorite_home_streak', 
                   'prev_underdog_win_streak', 
                   'prev_underdog_home_streak']


  0%|          | 0/19158 [00:00<?, ?it/s]

time: 3min 30s (started: 2023-03-05 04:09:06 -05:00)


## FG%, 3P%, FT%, ORB, DRB, TRB, TOV, AST, STL, BLK, DRTG, ORTG, EFG 

In [27]:
d = defaultdict(list)
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
    favorite_players = get_active_players(row['match_id'], row['favorite_id'], pp_df)
    underdog_players = get_active_players(row['match_id'], row['underdog_id'], pp_df)
    d['favorite_bpm'].append(favorite_players['bpm'].sum())
    d['underdog_bpm'].append(underdog_players['bpm'].sum())
    d['favorite_fga'].append(favorite_players['fga'].sum())
    d['underdog_fga'].append(underdog_players['fga'].sum())
    d['favorite_fg'].append(favorite_players['fg'].sum())
    d['underdog_fg'].append(underdog_players['fg'].sum())
    d['favorite_fg_pct'].append(favorite_players['fg_pct'].mean())
    d['underdog_fg_pct'].append(underdog_players['fg_pct'].mean())
    
    d['favorite_3p'].append(favorite_players['threep'].sum())
    d['underdog_3p'].append(underdog_players['threep'].sum())
    d['favorite_3pa'].append(favorite_players['threepa'].sum())
    d['underdog_3pa'].append(underdog_players['threepa'].sum())
    d['favorite_3p_pct'].append(favorite_players['threep_pct'].mean())
    d['underdog_3p_pct'].append(underdog_players['threep_pct'].mean())
    
    d['favorite_ft'].append(favorite_players['ft'].sum())
    d['underdog_ft'].append(underdog_players['ft'].sum())
    d['favorite_fta'].append(favorite_players['fta'].sum())
    d['underdog_fta'].append(underdog_players['fta'].sum())
    d['favorite_ft_pct'].append(favorite_players['ft_pct'].mean())
    d['underdog_ft_pct'].append(underdog_players['ft_pct'].mean())
    
    d['favorite_orb'].append(favorite_players['orb'].sum())
    d['underdog_orb'].append(underdog_players['orb'].sum())
    d['favorite_orb_pct'].append(favorite_players['orb_pct'].mean())
    d['underdog_orb_pct'].append(underdog_players['orb_pct'].mean())
    
    d['favorite_drb'].append(favorite_players['drb'].sum())
    d['underdog_drb'].append(underdog_players['drb'].sum())
    d['favorite_drb_pct'].append(favorite_players['drb_pct'].mean())
    d['underdog_drb_pct'].append(underdog_players['drb_pct'].mean())
    
    d['favorite_trb'].append(favorite_players['trb'].sum())
    d['underdog_trb'].append(underdog_players['trb'].sum())
    d['favorite_trb_pct'].append(favorite_players['trb_pct'].mean())
    d['underdog_trb_pct'].append(underdog_players['trb_pct'].mean())
    
    d['favorite_tov'].append(favorite_players['tov'].sum())
    d['underdog_tov'].append(underdog_players['tov'].sum())
    d['favorite_tov_pct'].append(favorite_players['tov_pct'].mean())
    d['underdog_tov_pct'].append(underdog_players['tov_pct'].mean())
    
    d['favorite_ast'].append(favorite_players['ast'].sum())
    d['underdog_ast'].append(underdog_players['ast'].sum())
    d['favorite_ast_pct'].append(favorite_players['ast_pct'].mean())
    d['underdog_ast_pct'].append(underdog_players['ast_pct'].mean())
    
    d['favorite_stl'].append(favorite_players['stl'].sum())
    d['underdog_stl'].append(underdog_players['stl'].sum())
    d['favorite_stl_pct'].append(favorite_players['stl_pct'].mean())
    d['underdog_stl_pct'].append(underdog_players['stl_pct'].mean())
    
    d['favorite_blk'].append(favorite_players['blk'].sum())
    d['underdog_blk'].append(underdog_players['blk'].sum())
    d['favorite_blk_pct'].append(favorite_players['blk_pct'].mean())
    d['underdog_blk_pct'].append(underdog_players['blk_pct'].mean())
    
    d['favorite_drtg'].append(favorite_players['drtg'].mean())
    d['underdog_drtg'].append(underdog_players['drtg'].mean())
    
    d['favorite_ortg'].append(favorite_players['ortg'].mean())
    d['underdog_ortg'].append(underdog_players['ortg'].mean())
    
    d['favorite_efg_pct'].append(favorite_players['efg_pct'].mean())
    d['underdog_efg_pct'].append(underdog_players['efg_pct'].mean())
        
    d['sp'].append(favorite_players['sp'].sum())

  0%|          | 0/19158 [00:00<?, ?it/s]

time: 1min 16s (started: 2023-03-05 04:12:37 -05:00)


In [28]:
df = pd.DataFrame(d)
match_df = pd.concat([match_df.reset_index(drop=True),
                      df.reset_index(drop=True)],axis=1)

time: 151 ms (started: 2023-03-05 04:13:53 -05:00)


## Possessions and Pace

In [29]:
match_df['favorite_possessions'] = \
    0.5 * ((match_df['favorite_fga'] + 0.4*match_df['favorite_fta'] - \
            1.07*(match_df['favorite_orb']/(match_df['favorite_orb'] + match_df['underdog_drb'])) * \
            (match_df['favorite_fga']-match_df['favorite_fg']) + match_df['favorite_tov']) + \
           (match_df['underdog_fga'] + 0.4*match_df['underdog_fta'] -\
           1.07 * (match_df['underdog_orb']/(match_df['underdog_orb'] + match_df['favorite_drb'])) * \
           (match_df['underdog_fga']-match_df['underdog_fg']) + match_df['underdog_tov']))

match_df['underdog_possessions'] = \
    0.5 * ((match_df['underdog_fga'] + 0.4*match_df['underdog_fta'] - \
            1.07*(match_df['underdog_orb']/(match_df['underdog_orb'] + match_df['favorite_drb'])) * \
            (match_df['underdog_fga']-match_df['underdog_fg']) + match_df['underdog_tov']) + \
           (match_df['favorite_fga'] + 0.4*match_df['favorite_fta'] -\
           1.07 * (match_df['favorite_orb']/(match_df['favorite_orb'] + match_df['underdog_drb'])) * \
           (match_df['favorite_fga']-match_df['favorite_fg']) + match_df['favorite_tov']))

match_df['favorite_pace'] = 48 * ((match_df['favorite_possessions'] + match_df['underdog_possessions']) / \
                          (2*(match_df['sp']/(60*5))))
match_df['underdog_pace'] = 48 * ((match_df['favorite_possessions'] + match_df['underdog_possessions']) / \
                          (2*(match_df['sp']/(60*5))))


time: 4.84 ms (started: 2023-03-05 04:13:54 -05:00)


## EMA Team stats

In [30]:
def ema(current, prev_ema, window_size, smoothing=2.0):
    k = smoothing / (1 + window_size)
    return current * k + prev_ema * (1-k)

time: 1.85 ms (started: 2023-03-05 04:13:54 -05:00)


In [31]:
def get_prev_team_sum(team_id, home_col, prev_matches):
    away_col = home_col.replace('home', 'away')
    prev_matches['res'] =  prev_matches.apply(lambda x:
                             x[home_col] if x['home_id'] == team_id
                             else x[away_col], axis=1)
    return prev_matches['res'].sum()

time: 548 µs (started: 2023-03-05 04:13:54 -05:00)


In [32]:
smoothing = 2
window_sizes = [5,8,12]

for w in tqdm(range(len(window_sizes))):
    window_size = window_sizes[w]

    ema_favorite_features = \
        [(f'prev_favorite_pts_ema{window_size}',      f'post_favorite_pts_ema{window_size}'),
        (f'prev_favorite_bpm_ema{window_size}',       f'post_favorite_bpm_ema{window_size}'),
        (f'prev_favorite_fg_ema{window_size}',        f'post_favorite_fg_ema{window_size}'),
        (f'prev_favorite_fg_pct_ema{window_size}',    f'post_favorite_fg_pct_ema{window_size}'),
        (f'prev_favorite_3p_ema{window_size}',        f'post_favorite_3p_ema{window_size}'),
        (f'prev_favorite_3p_pct_ema{window_size}',    f'post_favorite_3p_pct_ema{window_size}'),
        (f'prev_favorite_ft_ema{window_size}',        f'post_favorite_ft_ema{window_size}'),
        (f'prev_favorite_ft_pct_ema{window_size}',    f'post_favorite_ft_pct_ema{window_size}'),
        (f'prev_favorite_orb_ema{window_size}',       f'post_favorite_orb_ema{window_size}'),
        (f'prev_favorite_orb_pct_ema{window_size}',   f'post_favorite_orb_pct_ema{window_size}'),
        (f'prev_favorite_drb_ema{window_size}',       f'post_favorite_drb_ema{window_size}'),
        (f'prev_favorite_drb_pct_ema{window_size}',   f'post_favorite_drb_pct_ema{window_size}'),
        (f'prev_favorite_trb_ema{window_size}',       f'post_favorite_trb_ema{window_size}'),
        (f'prev_favorite_trb_pct_ema{window_size}',   f'post_favorite_trb_pct_ema{window_size}'),
        (f'prev_favorite_tov_ema{window_size}',       f'post_favorite_tov_ema{window_size}'),
        (f'prev_favorite_tov_pct_ema{window_size}',   f'post_favorite_tov_pct_ema{window_size}'),
        (f'prev_favorite_ast_ema{window_size}',       f'post_favorite_ast_ema{window_size}'),
        (f'prev_favorite_ast_pct_ema{window_size}',   f'post_favorite_ast_pct_ema{window_size}'),
        (f'prev_favorite_stl_ema{window_size}',       f'post_favorite_stl_ema{window_size}'),
        (f'prev_favorite_stl_pct_ema{window_size}',   f'post_favorite_stl_pct_ema{window_size}'),
        (f'prev_favorite_blk_ema{window_size}',       f'post_favorite_blk_ema{window_size}'),
        (f'prev_favorite_blk_pct_ema{window_size}',   f'post_favorite_blk_pct_ema{window_size}'),
        (f'prev_favorite_drtg_ema{window_size}',      f'post_favorite_drtg_ema{window_size}'),
        (f'prev_favorite_ortg_ema{window_size}',      f'post_favorite_ortg_ema{window_size}'),
        (f'prev_favorite_efg_pct_ema{window_size}',   f'post_favorite_efg_pct_ema{window_size}'),
        (f'prev_favorite_pace_ema{window_size}',      f'post_favorite_pace_ema{window_size}'),
        ]

    ema_underdog_features = [(f[0].replace('favorite','underdog'), f[1].replace('favorite','underdog')) for f in ema_favorite_features]
    sma_favorite_features = [(f[0].replace('ema','sma'), f[1].replace('ema','sma')) for f in ema_favorite_features]
    sma_underdog_features = [(f[0].replace('favorite','underdog'), f[1].replace('favorite','underdog')) for f in sma_favorite_features]

    for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
        prev_favorite_matches = get_prev_matches(row['date'], \
            row['favorite_id'], match_df).tail(window_size)
        prev_underdog_matches = get_prev_matches(row['date'], \
            row['underdog_id'], match_df).tail(window_size)
        len_prev_favorite_matches = len(prev_favorite_matches)
        len_prev_underdog_matches = len(prev_underdog_matches)
        for i in range(len(ema_favorite_features)):
            favorite_feature = re.findall('favorite_.*_ema', ema_favorite_features[i][0])[0].replace('_ema', '')
            underdog_feature = favorite_feature.replace('favorite', 'underdog') 

            if not prev_favorite_matches.empty:
                prev_match = prev_favorite_matches.iloc[-1:]
                match_df.at[idx,sma_favorite_features[i][0]] = get_prev_team_sum(row['favorite_id'], 
                                                                                 favorite_feature, 
                                                                                 prev_favorite_matches)/len_prev_favorite_matches 
                if len_prev_favorite_matches < window_size:
                    match_df.at[idx,ema_favorite_features[i][0]] = match_df.at[idx,sma_favorite_features[i][0]]
                    match_df.at[idx,ema_favorite_features[i][1]] = (match_df.at[idx,sma_favorite_features[i][0]] \
                        * len_prev_favorite_matches + row[favorite_feature])/(len_prev_favorite_matches + 1)
                    
                    match_df.at[idx,ema_favorite_features[i][0]] = match_df.loc[idx,sma_favorite_features[i][0]] 
                    match_df.at[idx,ema_favorite_features[i][1]] = (match_df.loc[idx,sma_favorite_features[i][0]] \
                                                                * len_prev_favorite_matches + row[favorite_feature])/(len_prev_favorite_matches + 1)

                else:
                    match_df.at[idx,ema_favorite_features[i][0]]= prev_match[ema_favorite_features[i][1]] \
                                        if prev_match['favorite_id'].values[0] == row['favorite_id'] \
                                        else prev_match[ema_underdog_features[i][1]]

                    match_df.at[idx,ema_favorite_features[i][1]] = ema(row[favorite_feature],  
                                        match_df.at[idx,ema_favorite_features[i][0]], 
                                        window_size)
            else:
                match_df.at[idx,ema_favorite_features[i][1]] = row[favorite_feature]


            if not prev_underdog_matches.empty:
                prev_match = prev_underdog_matches.iloc[-1:]
                match_df.at[idx,sma_underdog_features[i][0]] = get_prev_team_sum(row['underdog_id'], \
                    underdog_feature, prev_underdog_matches)/len_prev_underdog_matches

                if len_prev_underdog_matches < window_size:
                    match_df.at[idx,ema_underdog_features[i][0]] = match_df.at[idx,sma_underdog_features[i][0]]
                    match_df.at[idx,ema_underdog_features[i][1]] = (match_df.at[idx,sma_underdog_features[i][0]] \
                        * len_prev_underdog_matches + row[underdog_feature])/(len_prev_underdog_matches + 1)
                else:
                    match_df.at[idx,ema_underdog_features[i][0]] = (prev_match[ema_favorite_features[i][1]] \
                                if prev_match['favorite_id'].values[0] == row['underdog_id'] \
                                else prev_match[ema_underdog_features[i][1]])

                    match_df.at[idx,ema_underdog_features[i][1]] = ema(row[underdog_feature],  
                                            match_df.at[idx,ema_underdog_features[i][0]], 
                                            window_size)
            else:
                match_df.at[idx,ema_underdog_features[i][1]] = row[underdog_feature]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/19158 [00:00<?, ?it/s]

  0%|          | 0/19158 [00:00<?, ?it/s]

  0%|          | 0/19158 [00:00<?, ?it/s]

time: 1h 25min 54s (started: 2023-03-05 04:13:54 -05:00)


In [34]:
match_df.to_csv('nba_processed_features.csv')
pp_df.to_csv('player_df.csv')

time: 35.9 s (started: 2023-03-05 09:02:13 -05:00)


In [36]:
match_df['favorite_elo'] = 1500.0
match_df['underdog_elo'] = 1500.0

time: 1.13 ms (started: 2023-03-05 09:02:49 -05:00)


In [35]:
def get_prev_match(date, team_id, match_df):
    return match_df[(match_df["date"] < date) &
                    ((match_df["favorite_id"] == team_id) |
                     (match_df["underdog_id"] == team_id))].tail(1)

time: 252 µs (started: 2023-03-05 09:02:49 -05:00)


In [37]:
def get_prev_elo(team_id, season, prev_match):
    if prev_match.empty:
        prev_elo = 1500.0
    elif team_id == prev_match['favorite_id'].values[0]:
        prev_elo = prev_match['favorite_elo'].values[0]
    elif team_id == prev_match['underdog_id'].values[0]:
        prev_elo = prev_match['underdog_elo'].values[0]
    else: 
        print('err')

    if (not prev_match.empty and
            (prev_match['season'].values[0]
             != season)):
        prev_elo = prev_elo * 0.75 + 1505 * 0.25
    return prev_elo


def update_elo(favorite_elo, underdog_elo, movl):
    elo_diff = favorite_elo + 100.0 - underdog_elo
    if movl > 0:
        h_s = 1.0
        a_s = 0.0
        multiplier = ((movl+3)**(0.8))/(7.5+0.006*elo_diff)

    else:
        h_s = 0.0
        a_s = 1.0
        multiplier = ((-movl+3)**(0.8))/(7.5+0.006*(-elo_diff))
        
    exp_h_s = 1.0 / (1.0 + 10.0 ** (-elo_diff/400.0))
    exp_a_s = 1.0 - exp_h_s
    
    k = 20.0 * multiplier

    new_favorite_elo = favorite_elo + k * (h_s - exp_h_s)
    new_underdog_elo = underdog_elo + k * (a_s - exp_a_s)

    return (new_favorite_elo, new_underdog_elo)

time: 576 µs (started: 2023-03-05 09:02:49 -05:00)


In [38]:
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
    prev_f_match = get_prev_match(row['date'], row['favorite_id'], match_df)
    prev_u_match = get_prev_match(row['date'], row['underdog_id'], match_df)

    prev_f_elo = get_prev_elo(
        row['favorite_id'], row['season'], prev_f_match)
    prev_u_elo = get_prev_elo(
        row['underdog_id'], row['season'], prev_u_match)    

    new_elos = update_elo(prev_f_elo, prev_u_elo, row['favorite_movl'])
    match_df.at[idx, 'favorite_elo'] = new_elos[0]
    match_df.at[idx, 'underdog_elo'] = new_elos[1]

    match_df.at[idx, 'prev_favorite_elo'] = prev_f_elo
    match_df.at[idx, 'prev_underdog_elo'] = prev_u_elo

  0%|          | 0/19158 [00:00<?, ?it/s]

time: 3min 57s (started: 2023-03-05 09:02:49 -05:00)


In [39]:
match_df.to_csv('nba_processed_features + elo.csv')
pp_df.to_csv('player_df.csv')

time: 35.8 s (started: 2023-03-05 09:06:47 -05:00)


In [2]:
match_df = pd.read_csv('nba_processed_features + elo.csv')
pp_df = pd.read_csv('player_df.csv')

time: 6.56 s (started: 2023-03-06 14:49:25 -05:00)


In [None]:
ema_favorite_features = \
        [(f'prev_favorite_pts_ema{window_size}',       f'post_favorite_pts_ema{window_size}'),
        (f'prev_favorite_bpm_ema{window_size}',       f'post_favorite_bpm_ema{window_size}'),
        (f'prev_favorite_fg_ema{window_size}',        f'post_favorite_fg_ema{window_size}'),
        (f'prev_favorite_fg_pct_ema{window_size}',    f'post_favorite_fg_pct_ema{window_size}'),
        (f'prev_favorite_3p_ema{window_size}',        f'post_favorite_3p_ema{window_size}'),
        (f'prev_favorite_3p_pct_ema{window_size}',    f'post_favorite_3p_pct_ema{window_size}'),
        (f'prev_favorite_ft_ema{window_size}',        f'post_favorite_ft_ema{window_size}'),
        (f'prev_favorite_ft_pct_ema{window_size}',    f'post_favorite_ft_pct_ema{window_size}'),
        (f'prev_favorite_orb_ema{window_size}',       f'post_favorite_orb_ema{window_size}'),
        (f'prev_favorite_orb_pct_ema{window_size}',   f'post_favorite_orb_pct_ema{window_size}'),
        (f'prev_favorite_drb_ema{window_size}',       f'post_favorite_drb_ema{window_size}'),
        (f'prev_favorite_drb_pct_ema{window_size}',   f'post_favorite_drb_pct_ema{window_size}'),
        (f'prev_favorite_trb_ema{window_size}',       f'post_favorite_trb_ema{window_size}'),
        (f'prev_favorite_trb_pct_ema{window_size}',   f'post_favorite_trb_pct_ema{window_size}'),
        (f'prev_favorite_tov_ema{window_size}',       f'post_favorite_tov_ema{window_size}'),
        (f'prev_favorite_tov_pct_ema{window_size}',   f'post_favorite_tov_pct_ema{window_size}'),
        (f'prev_favorite_ast_ema{window_size}',       f'post_favorite_ast_ema{window_size}'),
        (f'prev_favorite_ast_pct_ema{window_size}',   f'post_favorite_ast_pct_ema{window_size}'),
        (f'prev_favorite_stl_ema{window_size}',       f'post_favorite_stl_ema{window_size}'),
        (f'prev_favorite_stl_pct_ema{window_size}',   f'post_favorite_stl_pct_ema{window_size}'),
        (f'prev_favorite_blk_ema{window_size}',       f'post_favorite_blk_ema{window_size}'),
        (f'prev_favorite_blk_pct_ema{window_size}',   f'post_favorite_blk_pct_ema{window_size}'),
        (f'prev_favorite_drtg_ema{window_size}',      f'post_favorite_drtg_ema{window_size}'),
        (f'prev_favorite_ortg_ema{window_size}',      f'post_favorite_ortg_ema{window_size}'),
        (f'prev_favorite_efg_pct_ema{window_size}',   f'post_favorite_efg_pct_ema{window_size}'),
        (f'prev_favorite_pace_ema{window_size}',      f'post_favorite_pace_ema{window_size}')]

features = ['favorite_pace','underdog_pace']

#all_features = 