## Imports

In [1]:
%%capture
import warnings
import math
import db_func
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook, show
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
from tqdm.notebook import tqdm
import re
from collections import defaultdict, ChainMap
import multiprocessing
from functools import partial
import numpy as np
from itertools import repeat

warnings.filterwarnings('ignore')

try:
    %load_ext autotime
    %load_ext jupyterlab_notify
except:
    !pip3 install ipython-autotime
    %load_ext autotime

time: 903 ms (started: 2023-03-04 05:50:00 -05:00)


In [2]:
#pd.set_option('max_columns', 2)
pd.set_option('min_rows', 20)

time: 516 µs (started: 2023-03-04 05:50:01 -05:00)


## Database Connection

In [3]:
conn = db_func.get_conn()

time: 317 ms (started: 2023-03-04 05:50:01 -05:00)


# Preprocessing


match_df: The final processed dataset to be used in the machine learning models 

In [4]:
match_query = '''SELECT
				m.match_id,  m.away_id, m.home_id,
				m.date, m.away_pts, m.home_pts, m.playoff_game,
				h_ml.home_ml, a_ml.away_ml,
				h_ps.home_spread, a_ps.away_spread,
				h_ps.home_ps_odds, a_ps.away_ps_odds,
				over.over, under.under, ou.spread
			FROM match AS m
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS home_ml,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 1 AND
					o.match_id = m.match_id AND
					o.team_id = m.home_id
				GROUP BY m.match_id
			) AS h_ml ON m.match_id = h_ml.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS away_ml,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 1 AND
					o.match_id = m.match_id AND
					o.team_id = m.away_id
				GROUP BY m.match_id
			) AS a_ml ON m.match_id = a_ml.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS home_ps_odds,
					AVG(spread) AS home_spread,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 2 AND
					o.match_id = m.match_id AND
					o.team_id = m.home_id
				GROUP BY m.match_id
			) AS h_ps ON m.match_id = h_ps.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS away_ps_odds,
					AVG(spread) AS away_spread,
					m.match_id AS match_id
				FROM
					odds AS o, team AS t1, team as t2,
					match AS m
				WHERE
					o.bet_type_id = 2 AND
					o.match_id = m.match_id AND
					o.team_id = m.away_id
				GROUP BY m.match_id
			) AS a_ps ON m.match_id = a_ps.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS under,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.over_under = 'under' AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS under ON m.match_id = under.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS over,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.over_under = 'over' AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS over ON m.match_id = over.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(spread) AS spread,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS ou ON m.match_id = ou.match_id
			WHERE date >= DATE('2022-04-29')
			ORDER BY date ASC
			'''

season_query = '''SELECT *
				FROM season'''

player_performance_query = '''SELECT p.*, m.date
							FROM player_performance as p, match as m
							WHERE m.match_id = p.match_id
							AND m.date >= DATE('2022-04-29')
							ORDER BY date ASC'''
team_query = '''SELECT * 
				FROM team_name'''

injury_query = '''SELECT i.* 
				FROM injury as i, match as m
				WHERE m.match_id = i.match_id
				AND m.date >= DATE('2022-04-29')
				ORDER BY m.date ASC'''

match_df = pd.read_sql(match_query, conn)
#match_df.set_index('match_id', inplace=True)
season_df = pd.read_sql(season_query, conn)
pp_df = pd.read_sql(player_performance_query, conn)
team_df = pd.read_sql(team_query, conn)
injury_df = pd.read_sql(injury_query, conn)
match_df['date'] = match_df['date'].map(lambda x: datetime(x.year, x.month, x.day))
pp_df['date'] = pp_df['date'].map(lambda x: datetime(x.year, x.month, x.day))
season_df['start_date'] =season_df['start_date'].map(lambda x: datetime(x.year, x.month, x.day))
season_df['end_date'] = season_df['end_date'].map(lambda x: datetime(x.year, x.month, x.day))

time: 2min 38s (started: 2023-03-04 05:50:01 -05:00)


In [5]:
def get_season(date):
    return season_df[(season_df['start_date'] <= date) &
                     (season_df['end_date'] >= date)]['season'].values[0]

time: 236 µs (started: 2023-03-04 05:52:40 -05:00)


In [6]:
match_df['season'] = match_df['date'].map(get_season)
pp_df['season'] = pp_df['date'].map(get_season)

time: 345 ms (started: 2023-03-04 05:52:40 -05:00)


In [7]:
match_df

Unnamed: 0,match_id,away_id,home_id,date,away_pts,home_pts,playoff_game,home_ml,away_ml,home_spread,away_spread,home_ps_odds,away_ps_odds,over,under,spread,season
0,11731,15,18,2022-04-29,114.0,106.0,0.0,2.057789,1.809814,1.611111,-1.611111,1.904006,1.927978,1.926661,1.913073,229.444444,2022
1,10021,10,15,2022-05-01,117.0,116.0,0.0,2.196923,1.721238,2.416667,-2.416667,1.912198,1.923638,1.911865,1.922943,224.833333,2022
2,852,17,2,2022-05-01,101.0,89.0,0.0,1.503512,2.703333,-4.944444,4.944444,1.920056,1.920056,1.917828,1.914478,218.277778,2022
3,15894,7,24,2022-05-02,114.0,121.0,0.0,1.407886,3.071,-6.038462,6.038462,1.918662,1.915859,1.899785,1.933323,214.333333,2022
4,10688,23,16,2022-05-02,92.0,106.0,0.0,1.279298,3.894444,-7.5,7.5,1.902513,1.938335,1.902945,1.937661,208.777778,2022
5,840,17,2,2022-05-03,86.0,109.0,0.0,1.666079,2.458693,-3.944444,3.944444,1.924095,1.915972,2.000227,1.870268,216.666667,2022
6,10023,10,15,2022-05-03,101.0,106.0,0.0,2.105,1.780612,2.0,-2.0,1.92848,1.919945,1.907054,1.934125,227.5,2022
7,10094,23,16,2022-05-04,103.0,119.0,0.0,1.261254,4.068889,-7.833333,7.833333,1.926489,1.915557,1.919889,1.91953,208.5,2022
8,15907,7,24,2022-05-04,109.0,129.0,0.0,1.405084,3.086923,-6.05,6.05,1.911273,1.936247,1.904736,1.934266,216.03125,2022
9,4321,24,7,2022-05-06,94.0,103.0,0.0,2.190139,1.900734,-1.111111,1.111111,1.934775,1.906445,1.91163,1.921346,217.611111,2022


time: 16 ms (started: 2023-03-04 05:52:40 -05:00)


# Feature Engineering

## Basic stats with respect to the favorite (determined by bookies)

In [8]:
favorite_df = defaultdict(list)
p = re.compile('prev.*ema')
match_df['home_movl'] = match_df['home_pts'] - match_df['away_pts']
match_df['home_win'] = match_df['home_movl'].map(lambda x: 0 if x < 0 else 1)
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):    
    favorite_won = False
    if row['home_ml'] < row['away_ml']:
        favorite_df['favorite_ml'].append(row['home_ml'])
        favorite_df['underdog_ml'].append(row['away_ml'])
        favorite_df['favorite_is_home'].append(1)
        favorite_df['favorite_movl'].append(row['home_pts']-row['away_pts'])
        favorite_df['point_spread'].append(abs(row['home_spread']))
        favorite_df['favorite_pts'].append(row['home_pts'])
        favorite_df['underdog_pts'].append(row['away_pts'])

        if row['home_win']:
            favorite_won = True
    else:
        if not row['home_win']:
            favorite_won = True
        favorite_df['favorite_ml'].append(row['away_ml'])
        favorite_df['underdog_ml'].append(row['home_ml'])
        favorite_df['favorite_is_home'].append(0)
        favorite_df['favorite_movl'].append(row['away_pts']-row['home_pts'])
        favorite_df['point_spread'].append(abs(row['away_spread']))
        favorite_df['favorite_pts'].append(row['away_pts'])
        favorite_df['underdog_pts'].append(row['home_pts'])


    favorite_df['favorite_won'].append(1 if favorite_won else 0)

favorite_df = pd.DataFrame(favorite_df)
favorite_df['vig'] = 1/favorite_df['favorite_ml'] + 1/favorite_df['underdog_ml'] - 1
favorite_df['favorite_implied'] = 1/favorite_df['favorite_ml'] - favorite_df['vig']/2
favorite_df['underdog_implied'] = 1/favorite_df['underdog_ml'] - favorite_df['vig']/2

match_df = pd.concat([match_df.reset_index(drop=True),
                      favorite_df.reset_index(drop=True)],axis=1)
match_df['favorite_id'] = match_df.apply(lambda x: x['home_id'] if x['favorite_is_home']==1 else x['away_id'],axis=1)
match_df['underdog_id'] = match_df.apply(lambda x: x['home_id'] if x['favorite_is_home']==0 else x['away_id'],axis=1)

print(match_df[['favorite_ml', 'home_ml', 'away_ml']][match_df.home_ml > match_df.away_ml])

  0%|          | 0/45 [00:00<?, ?it/s]

    favorite_ml   home_ml   away_ml
0      1.809814  2.057789  1.809814
1      1.721238  2.196923  1.721238
6      1.780612  2.105000  1.780612
9      1.900734  2.190139  1.900734
14     1.674758  2.464938  1.674758
20     1.541282  2.608889  1.541282
22     1.733700  2.173333  1.733700
35     1.666389  2.345764  1.666389
38     1.667990  2.305556  1.667990
time: 16.7 ms (started: 2023-03-04 05:52:40 -05:00)


In [9]:
def get_prev_matches(date, team_id, match_df, opponent_id = 0):
    if opponent_id:
        return match_df[(match_df["date"] < date) &
                        (((match_df["favorite_id"] == team_id) & 
                          (match_df["underdog_id"] == opponent_id)) |
                         ((match_df["favorite_id"] == opponent_id) & 
                          (match_df["underdog_id"] == team_id)))]
    else:
        return match_df[(match_df["date"] < date) &
                    ((match_df["favorite_id"] == team_id) |
                     (match_df["underdog_id"] == team_id))]


time: 332 µs (started: 2023-03-04 05:52:40 -05:00)


In [10]:
match_df.favorite_id.describe()

count    45.000000
mean     12.022222
std       8.024080
min       2.000000
25%       2.000000
50%      10.000000
75%      17.000000
max      24.000000
Name: favorite_id, dtype: float64

time: 1.92 ms (started: 2023-03-04 05:52:40 -05:00)


In [11]:
def get_win_ratio(team_id, prev_matches, i):
    if len(prev_matches) < i: 
        return None
    prev_matches['res'] =  prev_matches.apply(lambda x:
                             1 if (x['favorite_id'] == team_id and x['favorite_won']) or 
                                      (x['underdog_id'] == team_id and not x['favorite_won'])        
                             else 0, axis=1)
    return prev_matches['res'].sum()/i    

time: 471 µs (started: 2023-03-04 05:52:40 -05:00)


In [12]:
window_sizes = [5,7]

for w in tqdm(window_sizes):
    match_df[f'past_{w}_favorite_win_ratio'] = match_df.apply(lambda x: 
                                    get_win_ratio(x['favorite_id'], 
                                        get_prev_matches(x['date'], 
                                                         x['favorite_id'],
                                                         match_df
                                                        ).tail(w),
                                                    w), axis=1)
    match_df[f'past_{w}_underdog_win_ratio'] = match_df.apply(lambda x: 
                                    get_win_ratio(x['underdog_id'], 
                                        get_prev_matches(x['date'], 
                                                         x['underdog_id'],
                                                         match_df
                                                        ).tail(w),
                                                    w), axis=1)


  0%|          | 0/2 [00:00<?, ?it/s]

time: 124 ms (started: 2023-03-04 05:52:40 -05:00)


# Player Factors

In [13]:
def get_prev_player_match(date, player_id, pp_df):
    return pp_df[(pp_df['date'] < date) & 
                (pp_df['player_id'] == player_id)].tail(1)
def get_active_players(match_id, team_id, pp_df):
    return  pp_df[(pp_df['match_id'] == match_id) &
                      (pp_df['team_id'] == team_id) &
                  (pp_df['sp']>0)]

def get_complete_roster(match_id, team_id, match_df):
    return  pp_df[(pp_df['match_id'] == match_id) &
                      (pp_df['team_id'] == team_id)]

time: 352 µs (started: 2023-03-04 05:52:40 -05:00)


## Define Stats per minute played

In [14]:
pm_stats = ['pts', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk','tov', 'pf']

for f in pm_stats:
    pp_df[f'{f}_avg'] = pp_df.apply(lambda x: x[f]*60/x['sp'] if x['sp']>0 else 0, axis=1)

avg_stats = [f'{f}_avg' for f in pm_stats]

time: 91.6 ms (started: 2023-03-04 05:52:40 -05:00)


In [15]:
pp_df[['pts', 'pts_avg', 'sp']]

Unnamed: 0,pts,pts_avg,sp
0,2.0,0.174165,689.0
1,18.0,0.441718,2445.0
2,2.0,0.163265,735.0
3,0.0,0.000000,0.0
4,0.0,0.000000,0.0
5,7.0,0.257038,1634.0
6,0.0,0.000000,0.0
7,0.0,0.000000,0.0
8,0.0,0.000000,0.0
9,0.0,0.000000,0.0


time: 5.05 ms (started: 2023-03-04 05:52:40 -05:00)


In [16]:
avg_stats

['pts_avg',
 'orb_avg',
 'drb_avg',
 'trb_avg',
 'ast_avg',
 'stl_avg',
 'blk_avg',
 'tov_avg',
 'pf_avg']

time: 1.04 ms (started: 2023-03-04 05:52:40 -05:00)


## EWM Player Stats

In [17]:
def get_player_dict(gb):
    window_sizes = [10,20,40]
    player_dict = dict(tuple(gb))
    
    features = ['ts_pct',
       'efg_pct', 'threepar', 'ftr', 'orb_pct', 'drb_pct', 'trb_pct',
       'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ortg', 'drtg',
       'fg', 'fga', 'fg_pct', 'threep', 'threepa',
       'threep_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl',
       'blk', 'tov', 'pf', 'pts', 'pts_avg', 'orb_avg', 'drb_avg', 'trb_avg',
       'ast_avg', 'stl_avg', 'blk_avg', 'tov_avg', 'pf_avg']
    for w in window_sizes:
        ewm_features = [f'{f}_ewm_{w}' for f in features]
        for p in player_dict:
            # set the index to date for use in finding variable length windows later

            tmp_df= player_dict[p][player_dict[p].sp > 0]
            tmp_df[ewm_features] = tmp_df[features].ewm(span=w,min_periods=0,adjust=False,ignore_na=False).mean().shift(+1)
            player_dict[p] = player_dict[p].merge(tmp_df[ewm_features+['match_id']], on='match_id', how='outer')
            player_dict[p].index = player_dict[p]['date']
            player_dict[p].fillna(method='ffill', inplace=True)
    return player_dict

time: 765 µs (started: 2023-03-04 05:52:40 -05:00)


In [18]:
def parallelize_gb(df, func):
    num_cores = multiprocessing.cpu_count()-2  #leave one free to not freeze machine
    num_partitions = num_cores #number of partitions to split dataframe
    gb = pp_df.groupby(['player_id'], group_keys=True)
    df_split = np.array_split(gb, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    list_dict = pool.map(func, df_split)
    res = dict(ChainMap(*list_dict))
    pool.close()
    pool.join()
    return res


player_dict = parallelize_gb(pp_df.tail(100), get_player_dict)
player_dict[149][['pts_avg_ewm_10','pts_ewm_10','pts', 'player_id', 'date', 'sp']]

Unnamed: 0_level_0,pts_avg_ewm_10,pts_ewm_10,pts,player_id,date,sp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-01,,,0.0,149,2022-05-01,0.0
2022-05-03,,,0.0,149,2022-05-03,0.0
2022-05-07,,,5.0,149,2022-05-07,252.0
2022-05-09,,,0.0,149,2022-05-09,0.0
2022-05-11,1.190476,5.0,5.0,149,2022-05-11,1180.0
2022-05-13,1.020251,5.0,0.0,149,2022-05-13,458.0
2022-05-18,0.834751,4.090909,3.0,149,2022-05-18,303.0
2022-05-20,0.834751,4.090909,0.0,149,2022-05-20,0.0
2022-05-22,0.834751,4.090909,0.0,149,2022-05-22,0.0
2022-05-24,0.790989,3.892562,4.0,149,2022-05-24,890.0


time: 465 ms (started: 2023-03-04 05:52:40 -05:00)


## Player Fatigue

Seconds played over the last 2 weeks

In [19]:
#player_dict[3] = player_dict[3].set_index(['date'])

def get_player_fatigue(x,w,p):
    window = player_dict[p].loc[x.date - timedelta(days=w):x.date - timedelta(days=1)]
    ewm_sp = np.nan
    if len(window):
        #set ewm span=7. A player is unlikely to play more than 7 games in 14 days.
        window['ewm_sp'] = window['sp'].ewm(span=7,min_periods=0,adjust=False,ignore_na=False).mean()
        ewm_sp = window.iloc[-1]['ewm_sp']
    return ewm_sp

window_sizes = [14]

for w in tqdm(window_sizes):
    for p in tqdm(player_dict):
        player_dict[p][f'sp_ewm_{w}'] = player_dict[p].apply(lambda x: get_player_fatigue(x,w+1, p), axis=1)

player_dict[149]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

Unnamed: 0_level_0,player_id,match_id,team_id,sp,inactive,ts_pct,efg_pct,threepar,ftr,orb_pct,...,pts_avg_ewm_40,orb_avg_ewm_40,drb_avg_ewm_40,trb_avg_ewm_40,ast_avg_ewm_40,stl_avg_ewm_40,blk_avg_ewm_40,tov_avg_ewm_40,pf_avg_ewm_40,sp_ewm_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-05-01,149,10021,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2022-05-03,149,10023,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
2022-05-07,149,6459,10,252.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0.0
2022-05-09,149,6467,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,63.0
2022-05-11,149,10024,10,1180.0,0.0,0.625,0.625,0.25,0.0,0.0,...,1.190476,0.0,0.238095,0.238095,0.0,0.0,0.0,0.0,0.238095,47.25
2022-05-13,149,6490,10,458.0,0.0,0.0,0.0,1.0,0.0,21.3,...,1.144806,0.0,0.233922,0.233922,0.004961,0.0,0.0,0.00248,0.231442,330.4375
2022-05-18,149,6505,10,303.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.088962,0.012781,0.222511,0.235292,0.0175,0.0,0.0,0.002359,0.226542,362.328125
2022-05-20,149,6522,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.088962,0.012781,0.222511,0.235292,0.0175,0.0,0.0,0.002359,0.226542,407.296875
2022-05-22,149,4332,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.088962,0.012781,0.222511,0.235292,0.0175,0.0,0.0,0.002359,0.226542,305.472656
2022-05-24,149,4335,10,890.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.06482,0.012157,0.211657,0.223814,0.016646,0.0,0.0,0.002244,0.215491,184.253906


time: 732 ms (started: 2023-03-04 05:52:41 -05:00)


## Insert player factors into match dataframe

Most NBA teams rotate at least 8 players. 
We will use the top 8 players in terms of time played as features

In [20]:
def insert_player_features(match_df, player_dict, pp_df):
    window_sizes = [10,20,40]
    d = defaultdict(list)
    player_features = ['ts_pct',
       'efg_pct', 'threepar', 'ftr', 'orb_pct', 'drb_pct', 'trb_pct',
       'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ortg', 'drtg',
       'fg', 'fga', 'fg_pct', 'threep', 'threepa',
       'threep_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl',
       'blk', 'tov', 'pf', 'pts', 'pts_avg', 'orb_avg', 'drb_avg', 'trb_avg',
       'ast_avg', 'stl_avg', 'blk_avg', 'tov_avg', 'pf_avg']
    
    n_features = len(player_features)
    for idx, row in match_df.iterrows():
        match_id = row['match_id']
        favorite_players = get_active_players(row['match_id'], row['favorite_id'], pp_df).sort_values(by=['sp'], ascending=False).head(8)
        underdog_players = get_active_players(row['match_id'], row['underdog_id'], pp_df).sort_values(by=['sp'], ascending=False).head(8)
        for w in window_sizes:
            for i in range(8):
                ewm_features = [f'{f}_ewm_{w}' for f in player_features]

                fav_player_ewm_features = [f'fav_p{i}_{f}_ewm_{w}' for f in player_features]
                und_player_ewm_features = [f'und_p{i}_{f}_ewm_{w}' for f in player_features]
                for j in range(n_features):

                    fav_player_id = favorite_players.iloc[i]['player_id']
                    und_player_id = underdog_players.iloc[i]['player_id']

                    fav_player_df = player_dict[fav_player_id]
                    und_player_df = player_dict[und_player_id]

                    fav_feature = fav_player_ewm_features[j]
                    und_feature = und_player_ewm_features[j]
                    d[fav_feature].append(fav_player_df[fav_player_df['match_id'] == match_id][ewm_features[j]].values[0])
                    d[und_feature].append(und_player_df[und_player_df['match_id'] == match_id][ewm_features[j]].values[0])
                    
    df = pd.DataFrame(d)
    new_df = pd.concat([match_df.reset_index(drop=True),
                      df.reset_index(drop=True)],axis=1)
    return new_df

time: 973 µs (started: 2023-03-04 05:52:42 -05:00)


In [21]:
def parallelize_dataframe(df, func, player_dict):
    num_cores = multiprocessing.cpu_count()-2  #leave one free to not freeze machine
    num_partitions = num_cores #number of partitions to split dataframe
    df_split = np.array_split(df, num_partitions)
    pool = multiprocessing.Pool(num_cores)
    list_df = pool.map(partial(func, player_dict = player_dict, pp_df = pp_df), df_split)
    df = pd.concat(list_df)
    pool.close()
    pool.join()
    return df

parallelize_dataframe(match_df.tail(24), insert_player_features, player_dict)

Unnamed: 0,match_id,away_id,home_id,date,away_pts,home_pts,playoff_game,home_ml,away_ml,home_spread,...,fav_p7_ast_avg_ewm_40,und_p7_ast_avg_ewm_40,fav_p7_stl_avg_ewm_40,und_p7_stl_avg_ewm_40,fav_p7_blk_avg_ewm_40,und_p7_blk_avg_ewm_40,fav_p7_tov_avg_ewm_40,und_p7_tov_avg_ewm_40,fav_p7_pf_avg_ewm_40,und_p7_pf_avg_ewm_40
0,15097,16,23,2022-05-12,99.0,90.0,0.0,1.712499,2.22,-2.388889,...,0.005877,0.003762,0.005877,0.0,0.090439,0.004184,0.008296,0.008155,0.009674,1.039753
1,4325,24,7,2022-05-12,86.0,113.0,0.0,2.173333,1.7337,2.166667,...,0.012298,0.004341,0.01345,0.004341,0.012298,0.0,0.316741,0.006313,0.01345,0.155489
0,6490,15,10,2022-05-13,96.0,110.0,0.0,1.28086,3.868889,-7.5,...,0.004961,0.04883,0.0,0.10549,0.0,0.039603,0.00248,0.049076,0.231442,0.004839
1,11461,2,17,2022-05-13,108.0,95.0,0.0,1.829482,2.028042,-1.166667,...,0.087158,0.036867,0.0,0.003524,0.0,0.0,0.0,0.001771,0.092891,0.091212
0,15944,7,24,2022-05-15,123.0,90.0,0.0,1.36097,3.332222,-6.5,...,0.105666,0.004302,0.00912,0.094545,0.0,0.0,0.005618,0.0,0.09717,0.112553
0,1400,17,2,2022-05-15,81.0,109.0,0.0,1.488278,2.747778,-4.944444,...,0.004031,0.082907,0.0,0.0,0.188709,0.0,0.207087,0.0,0.408543,0.101485
0,10108,2,16,2022-05-17,107.0,118.0,0.0,1.499502,2.723333,-4.5,...,0.045154,0.433957,0.020133,0.0,0.009006,0.0,0.041089,0.0,0.077049,0.0
0,6505,7,10,2022-05-18,87.0,112.0,0.0,1.441768,2.905556,-5.555556,...,0.23511,0.007623,0.002886,0.089933,0.0,0.003531,0.235107,0.0,0.011458,0.107062
0,10116,2,16,2022-05-19,127.0,102.0,0.0,1.74271,2.236272,-1.833333,...,0.042951,0.412788,0.019151,0.004304,0.008566,0.012912,0.039085,0.004304,0.073291,0.012912
0,6522,7,10,2022-05-20,117.0,126.0,0.0,1.420326,3.014375,-6.0,...,0.012386,0.007251,0.0,0.092845,0.002727,0.003358,0.227819,0.0,0.225092,0.105489


time: 1.73 s (started: 2023-03-04 05:52:42 -05:00)


## Team Factors

## Team Fatigue(Away game streak) and Win Streak

In [22]:
d = defaultdict(list)
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
    prev_favorite_match = get_prev_matches(row['date'], \
        row['favorite_id'], match_df).tail(1)
    prev_underdog_match = get_prev_matches(row['date'], \
        row['underdog_id'], match_df).tail(1)
    if len(prev_favorite_match):
        prev_f_win_streak = prev_favorite_match['post_favorite_win_streak'].values[0] if row['favorite_id'] == prev_favorite_match['favorite_id'].values[0] else prev_favorite_match['post_underdog_win_streak'].values[0]
        prev_f_home_streak = prev_favorite_match['post_favorite_home_streak'].values[0] if row['favorite_id'] == prev_favorite_match['favorite_id'].values[0] else prev_favorite_match['post_underdog_home_streak'].values[0]
    else:
        prev_f_win_streak = 0
        prev_f_home_streak = 0
        
    if len(prev_underdog_match):
        prev_u_win_streak = prev_underdog_match['post_favorite_win_streak'].values[0] if row['underdog_id'] == prev_underdog_match['favorite_id'].values[0] else prev_underdog_match['post_underdog_win_streak'].values[0]
        prev_u_home_streak = prev_underdog_match['post_favorite_home_streak'].values[0] if row['underdog_id'] == prev_underdog_match['favorite_id'].values[0] else prev_underdog_match['post_underdog_home_streak'].values[0]
    else:
        prev_u_win_streak = 0
        prev_u_home_streak = 0
        
    if row['favorite_won']:
        f_win_streak = max(1,prev_f_win_streak+1)
        u_win_streak = min(-1, prev_u_win_streak-1)
    else:
        f_win_streak = min(-1, prev_f_win_streak-1)
        u_win_streak = max(1,prev_u_win_streak+1)

    if row['favorite_is_home']:
        f_home_streak = max(1, prev_f_home_streak+1)
        u_home_streak = min(-1, prev_u_home_streak-1)
    else:
        f_home_streak = min(-1, prev_f_home_streak-1)
        u_home_streak = max(1, prev_u_home_streak+1)
        
    match_df.at[idx, 'prev_favorite_win_streak'] = prev_f_win_streak
    match_df.at[idx, 'prev_favorite_home_streak'] = prev_f_home_streak
    match_df.at[idx, 'prev_underdog_win_streak'] = prev_u_win_streak
    match_df.at[idx, 'prev_underdog_home_streak'] = prev_u_home_streak
    
    match_df.at[idx, 'post_favorite_win_streak'] = f_win_streak
    match_df.at[idx, 'post_favorite_home_streak'] = f_home_streak
    match_df.at[idx, 'post_underdog_win_streak'] = u_win_streak
    match_df.at[idx, 'post_underdog_home_streak'] = u_home_streak

df = pd.DataFrame(d)
match_df = pd.concat([match_df.reset_index(drop=True),
                      df.reset_index(drop=True)],axis=1)

streak_features = ['prev_favorite_win_streak', 
                   'prev_favorite_home_streak', 
                   'prev_underdog_win_streak', 
                   'prev_underdog_home_streak']


  0%|          | 0/45 [00:00<?, ?it/s]

time: 50.3 ms (started: 2023-03-04 05:52:43 -05:00)


In [23]:
match_df[(match_df.favorite_id == 7) | (match_df.underdog_id == 7)]

Unnamed: 0,match_id,away_id,home_id,date,away_pts,home_pts,playoff_game,home_ml,away_ml,home_spread,...,past_7_favorite_win_ratio,past_7_underdog_win_ratio,prev_favorite_win_streak,prev_favorite_home_streak,prev_underdog_win_streak,prev_underdog_home_streak,post_favorite_win_streak,post_favorite_home_streak,post_underdog_win_streak,post_underdog_home_streak
3,15894,7,24,2022-05-02,114.0,121.0,0.0,1.407886,3.071,-6.038462,...,,,0.0,0.0,0.0,0.0,1.0,1.0,-1.0,-1.0
8,15907,7,24,2022-05-04,109.0,129.0,0.0,1.405084,3.086923,-6.05,...,,,1.0,1.0,-1.0,-1.0,2.0,2.0,-2.0,-2.0
9,4321,24,7,2022-05-06,94.0,103.0,0.0,2.190139,1.900734,-1.111111,...,,,2.0,2.0,-2.0,-2.0,-1.0,-1.0,1.0,1.0
14,4322,24,7,2022-05-08,101.0,111.0,0.0,2.464938,1.674758,1.1875,...,,,-1.0,-1.0,1.0,1.0,-2.0,-2.0,2.0,2.0
17,15921,7,24,2022-05-10,80.0,110.0,0.0,1.336215,3.467778,-7.0,...,,,-2.0,-2.0,2.0,2.0,1.0,1.0,-1.0,-1.0
22,4325,24,7,2022-05-12,86.0,113.0,0.0,2.173333,1.7337,2.166667,...,,,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0
25,15944,7,24,2022-05-15,123.0,90.0,0.0,1.36097,3.332222,-6.5,...,,,-1.0,-1.0,1.0,1.0,-2.0,1.0,2.0,-1.0
28,6505,7,10,2022-05-18,87.0,112.0,0.0,1.441768,2.905556,-5.555556,...,,0.571429,1.0,1.0,2.0,-1.0,2.0,2.0,-1.0,-2.0
30,6522,7,10,2022-05-20,117.0,126.0,0.0,1.420326,3.014375,-6.0,...,0.714286,0.571429,2.0,2.0,-1.0,-2.0,3.0,3.0,-2.0,-3.0
32,4332,10,7,2022-05-22,109.0,100.0,0.0,1.648433,2.343333,-3.111111,...,0.571429,0.714286,-2.0,-3.0,3.0,3.0,-3.0,1.0,4.0,-1.0


time: 12.9 ms (started: 2023-03-04 05:52:43 -05:00)


## FG%, 3P%, FT%, ORB, DRB, TRB, TOV, AST, STL, BLK, DRTG, ORTG, EFG 

In [24]:
d = defaultdict(list)
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
    favorite_players = get_active_players(row['match_id'], row['favorite_id'], pp_df)
    underdog_players = get_active_players(row['match_id'], row['underdog_id'], pp_df)
    d['favorite_bpm'].append(favorite_players['bpm'].sum())
    d['underdog_bpm'].append(underdog_players['bpm'].sum())
    d['favorite_fga'].append(favorite_players['fga'].sum())
    d['underdog_fga'].append(underdog_players['fga'].sum())
    d['favorite_fg'].append(favorite_players['fg'].sum())
    d['underdog_fg'].append(underdog_players['fg'].sum())
    d['favorite_fg_pct'].append(favorite_players['fg_pct'].mean())
    d['underdog_fg_pct'].append(underdog_players['fg_pct'].mean())
    
    d['favorite_3p'].append(favorite_players['threep'].sum())
    d['underdog_3p'].append(underdog_players['threep'].sum())
    d['favorite_3pa'].append(favorite_players['threepa'].sum())
    d['underdog_3pa'].append(underdog_players['threepa'].sum())
    d['favorite_3p_pct'].append(favorite_players['threep_pct'].mean())
    d['underdog_3p_pct'].append(underdog_players['threep_pct'].mean())
    
    d['favorite_ft'].append(favorite_players['ft'].sum())
    d['underdog_ft'].append(underdog_players['ft'].sum())
    d['favorite_fta'].append(favorite_players['fta'].sum())
    d['underdog_fta'].append(underdog_players['fta'].sum())
    d['favorite_ft_pct'].append(favorite_players['ft_pct'].mean())
    d['underdog_ft_pct'].append(underdog_players['ft_pct'].mean())
    
    d['favorite_orb'].append(favorite_players['orb'].sum())
    d['underdog_orb'].append(underdog_players['orb'].sum())
    d['favorite_orb_pct'].append(favorite_players['orb_pct'].mean())
    d['underdog_orb_pct'].append(underdog_players['orb_pct'].mean())
    
    d['favorite_drb'].append(favorite_players['drb'].sum())
    d['underdog_drb'].append(underdog_players['drb'].sum())
    d['favorite_drb_pct'].append(favorite_players['drb_pct'].mean())
    d['underdog_drb_pct'].append(underdog_players['drb_pct'].mean())
    
    d['favorite_trb'].append(favorite_players['trb'].sum())
    d['underdog_trb'].append(underdog_players['trb'].sum())
    d['favorite_trb_pct'].append(favorite_players['trb_pct'].mean())
    d['underdog_trb_pct'].append(underdog_players['trb_pct'].mean())
    
    d['favorite_tov'].append(favorite_players['tov'].sum())
    d['underdog_tov'].append(underdog_players['tov'].sum())
    d['favorite_tov_pct'].append(favorite_players['tov_pct'].mean())
    d['underdog_tov_pct'].append(underdog_players['tov_pct'].mean())
    
    d['favorite_ast'].append(favorite_players['ast'].sum())
    d['underdog_ast'].append(underdog_players['ast'].sum())
    d['favorite_ast_pct'].append(favorite_players['ast_pct'].mean())
    d['underdog_ast_pct'].append(underdog_players['ast_pct'].mean())
    
    d['favorite_stl'].append(favorite_players['stl'].sum())
    d['underdog_stl'].append(underdog_players['stl'].sum())
    d['favorite_stl_pct'].append(favorite_players['stl_pct'].mean())
    d['underdog_stl_pct'].append(underdog_players['stl_pct'].mean())
    
    d['favorite_blk'].append(favorite_players['blk'].sum())
    d['underdog_blk'].append(underdog_players['blk'].sum())
    d['favorite_blk_pct'].append(favorite_players['blk_pct'].mean())
    d['underdog_blk_pct'].append(underdog_players['blk_pct'].mean())
    
    d['favorite_drtg'].append(favorite_players['drtg'].mean())
    d['underdog_drtg'].append(underdog_players['drtg'].mean())
    
    d['favorite_ortg'].append(favorite_players['ortg'].mean())
    d['underdog_ortg'].append(underdog_players['ortg'].mean())
    
    d['favorite_efg_pct'].append(favorite_players['efg_pct'].mean())
    d['underdog_efg_pct'].append(underdog_players['efg_pct'].mean())
        
    d['sp'].append(favorite_players['sp'].sum())

  0%|          | 0/45 [00:00<?, ?it/s]

time: 135 ms (started: 2023-03-04 05:52:43 -05:00)


In [25]:
df = pd.DataFrame(d)
match_df = pd.concat([match_df.reset_index(drop=True),
                      df.reset_index(drop=True)],axis=1)

time: 2.24 ms (started: 2023-03-04 05:52:43 -05:00)


## Possessions and Pace

In [26]:
match_df['favorite_possessions'] = \
    0.5 * ((match_df['favorite_fga'] + 0.4*match_df['favorite_fta'] - \
            1.07*(match_df['favorite_orb']/(match_df['favorite_orb'] + match_df['underdog_drb'])) * \
            (match_df['favorite_fga']-match_df['favorite_fg']) + match_df['favorite_tov']) + \
           (match_df['underdog_fga'] + 0.4*match_df['underdog_fta'] -\
           1.07 * (match_df['underdog_orb']/(match_df['underdog_orb'] + match_df['favorite_drb'])) * \
           (match_df['underdog_fga']-match_df['underdog_fg']) + match_df['underdog_tov']))

match_df['underdog_possessions'] = \
    0.5 * ((match_df['underdog_fga'] + 0.4*match_df['underdog_fta'] - \
            1.07*(match_df['underdog_orb']/(match_df['underdog_orb'] + match_df['favorite_drb'])) * \
            (match_df['underdog_fga']-match_df['underdog_fg']) + match_df['underdog_tov']) + \
           (match_df['favorite_fga'] + 0.4*match_df['favorite_fta'] -\
           1.07 * (match_df['favorite_orb']/(match_df['favorite_orb'] + match_df['underdog_drb'])) * \
           (match_df['favorite_fga']-match_df['favorite_fg']) + match_df['favorite_tov']))

match_df['favorite_pace'] = 48 * ((match_df['favorite_possessions'] + match_df['underdog_possessions']) / \
                          (2*(match_df['sp']/(60*5))))
match_df['underdog_pace'] = 48 * ((match_df['favorite_possessions'] + match_df['underdog_possessions']) / \
                          (2*(match_df['sp']/(60*5))))


time: 7.04 ms (started: 2023-03-04 05:52:43 -05:00)


## EMA Team stats

In [27]:
def ema(current, prev_ema, window_size, smoothing=2.0):
    k = smoothing / (1 + window_size)
    return current * k + prev_ema * (1-k)

time: 638 µs (started: 2023-03-04 05:52:43 -05:00)


In [28]:
def get_prev_team_sum(team_id, home_col, prev_matches):
    away_col = home_col.replace('home', 'away')
    prev_matches['res'] =  prev_matches.apply(lambda x:
                             x[home_col] if x['home_id'] == team_id
                             else x[away_col], axis=1)
    return prev_matches['res'].sum()

time: 513 µs (started: 2023-03-04 05:52:43 -05:00)


In [36]:
smoothing = 2
window_sizes = [3,4,5,6,7]

for w in tqdm(range(len(window_sizes))):
    window_size = window_sizes[w]

    ema_favorite_features = \
        [(f'prev_favorite_pts_ema{window_size}',      f'post_favorite_pts_ema{window_size}'),
        (f'prev_favorite_bpm_ema{window_size}',       f'post_favorite_bpm_ema{window_size}'),
        (f'prev_favorite_fg_ema{window_size}',        f'post_favorite_fg_ema{window_size}'),
        (f'prev_favorite_fg_pct_ema{window_size}',    f'post_favorite_fg_pct_ema{window_size}'),
        (f'prev_favorite_3p_ema{window_size}',        f'post_favorite_3p_ema{window_size}'),
        (f'prev_favorite_3p_pct_ema{window_size}',    f'post_favorite_3p_pct_ema{window_size}'),
        (f'prev_favorite_ft_ema{window_size}',        f'post_favorite_ft_ema{window_size}'),
        (f'prev_favorite_ft_pct_ema{window_size}',    f'post_favorite_ft_pct_ema{window_size}'),
        (f'prev_favorite_orb_ema{window_size}',       f'post_favorite_orb_ema{window_size}'),
        (f'prev_favorite_orb_pct_ema{window_size}',   f'post_favorite_orb_pct_ema{window_size}'),
        (f'prev_favorite_drb_ema{window_size}',       f'post_favorite_drb_ema{window_size}'),
        (f'prev_favorite_drb_pct_ema{window_size}',   f'post_favorite_drb_pct_ema{window_size}'),
        (f'prev_favorite_trb_ema{window_size}',       f'post_favorite_trb_ema{window_size}'),
        (f'prev_favorite_trb_pct_ema{window_size}',   f'post_favorite_trb_pct_ema{window_size}'),
        (f'prev_favorite_tov_ema{window_size}',       f'post_favorite_tov_ema{window_size}'),
        (f'prev_favorite_tov_pct_ema{window_size}',   f'post_favorite_tov_pct_ema{window_size}'),
        (f'prev_favorite_ast_ema{window_size}',       f'post_favorite_ast_ema{window_size}'),
        (f'prev_favorite_ast_pct_ema{window_size}',   f'post_favorite_ast_pct_ema{window_size}'),
        (f'prev_favorite_stl_ema{window_size}',       f'post_favorite_stl_ema{window_size}'),
        (f'prev_favorite_stl_pct_ema{window_size}',   f'post_favorite_stl_pct_ema{window_size}'),
        (f'prev_favorite_blk_ema{window_size}',       f'post_favorite_blk_ema{window_size}'),
        (f'prev_favorite_blk_pct_ema{window_size}',   f'post_favorite_blk_pct_ema{window_size}'),
        (f'prev_favorite_drtg_ema{window_size}',      f'post_favorite_drtg_ema{window_size}'),
        (f'prev_favorite_ortg_ema{window_size}',      f'post_favorite_ortg_ema{window_size}'),
        (f'prev_favorite_efg_pct_ema{window_size}',   f'post_favorite_efg_pct_ema{window_size}'),
        (f'prev_favorite_pace_ema{window_size}',      f'post_favorite_pace_ema{window_size}'),
        ]

    ema_underdog_features = [(f[0].replace('favorite','underdog'), f[1].replace('favorite','underdog')) for f in ema_favorite_features]
    sma_favorite_features = [(f[0].replace('ema','sma'), f[1].replace('ema','sma')) for f in ema_favorite_features]
    sma_underdog_features = [(f[0].replace('favorite','underdog'), f[1].replace('favorite','underdog')) for f in sma_favorite_features]

    for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
        prev_favorite_matches = get_prev_matches(row['date'], \
            row['favorite_id'], match_df).tail(window_size)
        prev_underdog_matches = get_prev_matches(row['date'], \
            row['underdog_id'], match_df).tail(window_size)
        len_prev_favorite_matches = len(prev_favorite_matches)
        len_prev_underdog_matches = len(prev_underdog_matches)
        for i in range(len(ema_favorite_features)):
            favorite_feature = re.findall('favorite_.*_ema', ema_favorite_features[i][0])[0].replace('_ema', '')
            underdog_feature = favorite_feature.replace('favorite', 'underdog') 

            if not prev_favorite_matches.empty:
                prev_match = prev_favorite_matches.iloc[-1:]
                match_df.at[idx,sma_favorite_features[i][0]] = get_prev_team_sum(row['favorite_id'], 
                                                                                 favorite_feature, 
                                                                                 prev_favorite_matches)/len_prev_favorite_matches 
                if len_prev_favorite_matches < window_size:
                    match_df.at[idx,ema_favorite_features[i][0]] = match_df.at[idx,sma_favorite_features[i][0]]
                    match_df.at[idx,ema_favorite_features[i][1]] = (match_df.at[idx,sma_favorite_features[i][0]] \
                        * len_prev_favorite_matches + row[favorite_feature])/(len_prev_favorite_matches + 1)
                    
                    match_df.at[idx,ema_favorite_features[i][0]] = match_df.loc[idx,sma_favorite_features[i][0]] 
                    match_df.at[idx,ema_favorite_features[i][1]] = (match_df.loc[idx,sma_favorite_features[i][0]] \
                                                                * len_prev_favorite_matches + row[favorite_feature])/(len_prev_favorite_matches + 1)

                else:
                    match_df.at[idx,ema_favorite_features[i][0]]= prev_match[ema_favorite_features[i][1]] \
                                        if prev_match['favorite_id'].values[0] == row['favorite_id'] \
                                        else prev_match[ema_underdog_features[i][1]]

                    match_df.at[idx,ema_favorite_features[i][1]] = ema(row[favorite_feature],  
                                        match_df.at[idx,ema_favorite_features[i][0]], 
                                        window_size)
            else:
                match_df.at[idx,ema_favorite_features[i][1]] = row[favorite_feature]


            if not prev_underdog_matches.empty:
                prev_match = prev_underdog_matches.iloc[-1:]
                match_df.at[idx,sma_underdog_features[i][0]] = get_prev_team_sum(row['underdog_id'], \
                    underdog_feature, prev_underdog_matches)/len_prev_underdog_matches

                if len_prev_underdog_matches < window_size:
                    match_df.at[idx,ema_underdog_features[i][0]] = match_df.at[idx,sma_underdog_features[i][0]]
                    match_df.at[idx,ema_underdog_features[i][1]] = (match_df.at[idx,sma_underdog_features[i][0]] \
                        * len_prev_underdog_matches + row[underdog_feature])/(len_prev_underdog_matches + 1)
                else:
                    match_df.at[idx,ema_underdog_features[i][0]] = (prev_match[ema_favorite_features[i][1]] \
                                if prev_match['favorite_id'].values[0] == row['underdog_id'] \
                                else prev_match[ema_underdog_features[i][1]])

                    match_df.at[idx,ema_underdog_features[i][1]] = ema(row[underdog_feature],  
                                            match_df.at[idx,ema_underdog_features[i][0]], 
                                            window_size)
            else:
                match_df.at[idx,ema_underdog_features[i][1]] = row[underdog_feature]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

time: 9.66 s (started: 2023-03-04 05:56:36 -05:00)


In [37]:
match_df[(match_df.favorite_id == 7) | (match_df.underdog_id == 7)][['favorite_id', 'underdog_id', 'favorite_pts', 'underdog_pts', 'prev_favorite_pts_ema7', 'prev_underdog_pts_ema7']]

Unnamed: 0,favorite_id,underdog_id,favorite_pts,underdog_pts,prev_favorite_pts_ema7,prev_underdog_pts_ema7
3,24,7,121.0,114.0,,
8,24,7,129.0,109.0,121.0,114.0
9,24,7,94.0,103.0,125.0,111.5
14,24,7,101.0,111.0,114.666667,108.666667
17,24,7,110.0,80.0,111.25,109.25
22,24,7,86.0,113.0,111.0,103.4
25,24,7,90.0,123.0,106.833333,105.0
28,10,7,112.0,87.0,111.0,107.571429
30,10,7,126.0,117.0,111.142857,102.428571
32,7,10,100.0,109.0,106.071429,114.857143


time: 4.96 ms (started: 2023-03-04 05:56:48 -05:00)


In [32]:
match_df.to_csv('nba_processed_features.csv')

time: 28.7 ms (started: 2023-03-04 05:52:52 -05:00)


In [33]:
ema_favorite_features = \
        [(f'prev_favorite_pts_ema{window_size}',       f'post_favorite_pts_ema{window_size}'),
        (f'prev_favorite_bpm_ema{window_size}',       f'post_favorite_bpm_ema{window_size}'),
        (f'prev_favorite_fg_ema{window_size}',        f'post_favorite_fg_ema{window_size}'),
        (f'prev_favorite_fg_pct_ema{window_size}',    f'post_favorite_fg_pct_ema{window_size}'),
        (f'prev_favorite_3p_ema{window_size}',        f'post_favorite_3p_ema{window_size}'),
        (f'prev_favorite_3p_pct_ema{window_size}',    f'post_favorite_3p_pct_ema{window_size}'),
        (f'prev_favorite_ft_ema{window_size}',        f'post_favorite_ft_ema{window_size}'),
        (f'prev_favorite_ft_pct_ema{window_size}',    f'post_favorite_ft_pct_ema{window_size}'),
        (f'prev_favorite_orb_ema{window_size}',       f'post_favorite_orb_ema{window_size}'),
        (f'prev_favorite_orb_pct_ema{window_size}',   f'post_favorite_orb_pct_ema{window_size}'),
        (f'prev_favorite_drb_ema{window_size}',       f'post_favorite_drb_ema{window_size}'),
        (f'prev_favorite_drb_pct_ema{window_size}',   f'post_favorite_drb_pct_ema{window_size}'),
        (f'prev_favorite_trb_ema{window_size}',       f'post_favorite_trb_ema{window_size}'),
        (f'prev_favorite_trb_pct_ema{window_size}',   f'post_favorite_trb_pct_ema{window_size}'),
        (f'prev_favorite_tov_ema{window_size}',       f'post_favorite_tov_ema{window_size}'),
        (f'prev_favorite_tov_pct_ema{window_size}',   f'post_favorite_tov_pct_ema{window_size}'),
        (f'prev_favorite_ast_ema{window_size}',       f'post_favorite_ast_ema{window_size}'),
        (f'prev_favorite_ast_pct_ema{window_size}',   f'post_favorite_ast_pct_ema{window_size}'),
        (f'prev_favorite_stl_ema{window_size}',       f'post_favorite_stl_ema{window_size}'),
        (f'prev_favorite_stl_pct_ema{window_size}',   f'post_favorite_stl_pct_ema{window_size}'),
        (f'prev_favorite_blk_ema{window_size}',       f'post_favorite_blk_ema{window_size}'),
        (f'prev_favorite_blk_pct_ema{window_size}',   f'post_favorite_blk_pct_ema{window_size}'),
        (f'prev_favorite_drtg_ema{window_size}',      f'post_favorite_drtg_ema{window_size}'),
        (f'prev_favorite_ortg_ema{window_size}',      f'post_favorite_ortg_ema{window_size}'),
        (f'prev_favorite_efg_pct_ema{window_size}',   f'post_favorite_efg_pct_ema{window_size}'),
        (f'prev_favorite_pace_ema{window_size}',      f'post_favorite_pace_ema{window_size}')]

features = ['favorite_pace','underdog_pace']

#all_features = 

time: 1.07 ms (started: 2023-03-04 05:52:52 -05:00)
