## Imports

In [None]:
%%capture
import warnings
import math
import db_func
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook, show
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
from tqdm.notebook import tqdm
import re
from collections import defaultdict, ChainMap
import multiprocessing
from functools import partial
import numpy as np
from itertools import repeat

warnings.filterwarnings('ignore')

# Load the optional notebook extensions (cell timing and completion notifications).
# NOTE(review): the bare `except` hides the real failure reason, and the fallback
# only installs/loads autotime -- jupyterlab_notify is silently skipped.
try:
    %load_ext autotime
    %load_ext jupyterlab_notify
except:
    # Fallback: install the timing extension into the environment, then load it.
    !pip3 install ipython-autotime
    %load_ext autotime

In [None]:
#pd.set_option('max_columns', 2)
#pd.set_option('min_rows', 20)

## Database Connection

In [None]:
# Open the database connection through the project-local db_func helper;
# every pd.read_sql call in this notebook reuses this single connection.
conn = db_func.get_conn()

# Preprocessing


match_df: The final processed dataset to be used in the machine learning models 

In [None]:
# One row per match played on or after 2007-10-30, with the odds averaged per match:
#   * home_ml / away_ml            - average decimal moneyline odds (bet_type_id = 1)
#   * home_spread / away_spread,
#     home_ps_odds / away_ps_odds  - average point spread and its odds (bet_type_id = 2)
#   * over / under / spread        - average over/under odds and total line (bet_type_id = 3)
# Each aggregate is its own sub-select so a match missing one bet type still
# appears (LEFT OUTER JOIN) with NULLs for that bet type.
# The sub-selects join only `odds` and `match`: an unconstrained cross join with
# the team table would multiply every odds row without changing the averages.
match_query = '''SELECT
				m.match_id,  m.away_id, m.home_id,
				m.date, m.away_pts, m.home_pts, m.playoff_game,
				h_ml.home_ml, a_ml.away_ml,
				h_ps.home_spread, a_ps.away_spread,
				h_ps.home_ps_odds, a_ps.away_ps_odds,
				over.over, under.under, ou.spread
			FROM match AS m
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS home_ml,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 1 AND
					o.match_id = m.match_id AND
					o.team_id = m.home_id
				GROUP BY m.match_id
			) AS h_ml ON m.match_id = h_ml.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS away_ml,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 1 AND
					o.match_id = m.match_id AND
					o.team_id = m.away_id
				GROUP BY m.match_id
			) AS a_ml ON m.match_id = a_ml.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS home_ps_odds,
					AVG(spread) AS home_spread,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 2 AND
					o.match_id = m.match_id AND
					o.team_id = m.home_id
				GROUP BY m.match_id
			) AS h_ps ON m.match_id = h_ps.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS away_ps_odds,
					AVG(spread) AS away_spread,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 2 AND
					o.match_id = m.match_id AND
					o.team_id = m.away_id
				GROUP BY m.match_id
			) AS a_ps ON m.match_id = a_ps.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS under,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.over_under = 'under' AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS under ON m.match_id = under.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(decimal_odds) AS over,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.over_under = 'over' AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS over ON m.match_id = over.match_id
			LEFT OUTER JOIN
			(
				SELECT
					AVG(spread) AS spread,
					m.match_id AS match_id
				FROM
					odds AS o, match AS m
				WHERE
					o.bet_type_id = 3 AND
					o.match_id = m.match_id
				GROUP BY m.match_id
			) AS ou ON m.match_id = ou.match_id
			WHERE date >= DATE('2007-10-30')
			ORDER BY date ASC
			'''

# Every row of the season table (start_date / end_date / season are read below).
season_query = '''SELECT *
				FROM season'''

# Per-player box-score rows joined to their match date, restricted to matches
# on or after 2007-10-30 and returned in date order.
player_performance_query = '''SELECT p.*, m.date
							FROM player_performance as p, match as m
							WHERE m.match_id = p.match_id
							AND m.date >= DATE('2007-10-30')
							ORDER BY date ASC'''
# Every row of the team_name table.
team_query = '''SELECT * 
				FROM team_name'''

# Injury rows for matches on or after 2007-10-30, ordered by match date.
injury_query = '''SELECT i.* 
				FROM injury as i, match as m
				WHERE m.match_id = i.match_id
				AND m.date >= DATE('2007-10-30')
				ORDER BY m.date ASC'''

# Pull every table into memory over the shared connection.
match_df = pd.read_sql(match_query, conn)
season_df = pd.read_sql(season_query, conn)
pp_df = pd.read_sql(player_performance_query, conn)
team_df = pd.read_sql(team_query, conn)
injury_df = pd.read_sql(injury_query, conn)

# Rebuild every date-like value as a midnight datetime so that match dates,
# player-performance dates and season boundaries compare against each other.
for frame, column in ((match_df, 'date'),
                      (pp_df, 'date'),
                      (season_df, 'start_date'),
                      (season_df, 'end_date')):
    frame[column] = frame[column].map(
        lambda value: datetime(value.year, value.month, value.day))

In [None]:
def get_season(date):
    """Return the `season` value of the season_df row whose date range contains `date`.

    Both boundaries are inclusive. Reads the module-level `season_df`.
    Raises IndexError when no season covers `date`.
    """
    has_started = season_df['start_date'] <= date
    not_finished = season_df['end_date'] >= date
    covering = season_df[has_started & not_finished]
    return covering['season'].values[0]

In [None]:
# Label every match and every player performance with the season it falls in.
for frame in (match_df, pp_df):
    frame['season'] = frame['date'].map(get_season)

# Feature Engineering

## Basic stats with respect to the favorite (determined by bookies)

In [None]:
# Re-express every match from the point of view of the bookmakers' favorite
# (the side with the lower average decimal moneyline) and its underdog.
# Everything is computed column-wise; no per-row Python loop is needed.
match_df = match_df.reset_index(drop=True)

# Home margin of victory and a 0/1 home-win flag (anything but a negative margin counts as 1).
match_df['home_movl'] = match_df['home_pts'] - match_df['away_pts']
match_df['home_win'] = match_df['home_movl'].map(lambda x: 0 if x < 0 else 1)

# The home side is the favorite only when its moneyline is strictly lower;
# ties and missing odds fall through to "away is the favorite".
home_is_favorite = (match_df['home_ml'] < match_df['away_ml']).to_numpy()
home_won = match_df['home_win'].to_numpy().astype(bool)

favorite_pts = np.where(home_is_favorite, match_df['home_pts'], match_df['away_pts'])
underdog_pts = np.where(home_is_favorite, match_df['away_pts'], match_df['home_pts'])

favorite_df = pd.DataFrame({
    'favorite_ml': np.where(home_is_favorite, match_df['home_ml'], match_df['away_ml']),
    'underdog_ml': np.where(home_is_favorite, match_df['away_ml'], match_df['home_ml']),
    'favorite_is_home': home_is_favorite.astype(int),
    'favorite_movl': favorite_pts - underdog_pts,
    'point_spread': np.where(home_is_favorite,
                             match_df['home_spread'].abs(),
                             match_df['away_spread'].abs()),
    'favorite_pts': favorite_pts,
    'underdog_pts': underdog_pts,
    # The favorite won when "favorite is home" and "home won" agree.
    'favorite_won': (home_is_favorite == home_won).astype(int),
})

# Bookmaker margin (overround) and margin-free implied probabilities,
# with the vig split evenly between the two sides.
favorite_df['vig'] = 1/favorite_df['favorite_ml'] + 1/favorite_df['underdog_ml'] - 1
favorite_df['favorite_implied'] = 1/favorite_df['favorite_ml'] - favorite_df['vig']/2
favorite_df['underdog_implied'] = 1/favorite_df['underdog_ml'] - favorite_df['vig']/2

match_df = pd.concat([match_df, favorite_df], axis=1)
match_df['favorite_id'] = np.where(match_df['favorite_is_home'] == 1,
                                   match_df['home_id'], match_df['away_id'])
match_df['underdog_id'] = np.where(match_df['favorite_is_home'] == 0,
                                   match_df['home_id'], match_df['away_id'])


In [None]:
def get_prev_matches(date, team_id, match_df, opponent_id = 0):
    """Return the rows of `match_df` dated strictly before `date` that involve `team_id`.

    When `opponent_id` is truthy, only matches between `team_id` and
    `opponent_id` (either one as favorite) are returned. The default 0 means
    "any opponent". Rows keep the order and index labels of `match_df`.
    """
    favorite = match_df["favorite_id"]
    underdog = match_df["underdog_id"]
    earlier = match_df["date"] < date

    if not opponent_id:
        involved = (favorite == team_id) | (underdog == team_id)
    else:
        involved = (((favorite == team_id) & (underdog == opponent_id)) |
                    ((favorite == opponent_id) & (underdog == team_id)))
    return match_df[earlier & involved]


In [None]:
# Sanity check: summary statistics of the favorite_id column built above.
match_df.favorite_id.describe()

In [None]:
def get_win_ratio(team_id, prev_matches, i):
    """Return the share of `prev_matches` won by `team_id`, out of `i` games.

    Parameters
    ----------
    team_id : identifier compared against `favorite_id` / `underdog_id`.
    prev_matches : DataFrame with `favorite_id`, `underdog_id` and
        `favorite_won` columns. Callers pass the team's last `i` matches.
        The frame is not modified.
    i : window size used as the denominator.

    Returns
    -------
    float, or None when `i` is not positive or fewer than `i` matches exist.
    """
    if i <= 0 or len(prev_matches) < i:
        return None
    favorite_won = prev_matches['favorite_won'].astype(bool)
    # A team wins either as the winning favorite or as the underdog in an upset.
    team_won = (((prev_matches['favorite_id'] == team_id) & favorite_won) |
                ((prev_matches['underdog_id'] == team_id) & ~favorite_won))
    return team_won.sum() / i

In [None]:
window_sizes = [5,7]


def _recent_win_ratio(row, side, window):
    """Win ratio of the row's `side` team over its last `window` matches before the row's date."""
    team_id = row[f'{side}_id']
    recent = get_prev_matches(row['date'], team_id, match_df).tail(window)
    return get_win_ratio(team_id, recent, window)


# For each window size, add one rolling win-ratio column per side.
for w in tqdm(window_sizes):
    for side in ('favorite', 'underdog'):
        match_df[f'past_{w}_{side}_win_ratio'] = match_df.apply(
            _recent_win_ratio, axis=1, args=(side, w))


# Player Factors

In [None]:
def get_prev_player_match(date, player_id, pp_df):
    """Return (as a frame of at most one row) the last row of `pp_df` for
    `player_id` dated strictly before `date`."""
    is_earlier = pp_df['date'] < date
    is_player = pp_df['player_id'] == player_id
    history = pp_df[is_earlier & is_player]
    return history.tail(1)
def get_active_players(match_id, team_id, pp_df):
    """Return the `pp_df` rows of `team_id` in `match_id` with a positive `sp` value."""
    in_match = pp_df['match_id'] == match_id
    on_team = pp_df['team_id'] == team_id
    took_part = pp_df['sp'] > 0
    return pp_df[in_match & on_team & took_part]

def get_complete_roster(match_id, team_id, match_df):
    """Return every player-performance row of `team_id` in `match_id`,
    regardless of playing time.

    NOTE(review): the `match_df` argument is never used; the rows come from
    the module-level `pp_df`. The parameter was probably meant to be the
    player-performance frame -- confirm against callers before changing it.
    """
    in_match = pp_df['match_id'] == match_id
    on_team = pp_df['team_id'] == team_id
    return pp_df[in_match & on_team]

## Define Stats per minute played

In [None]:
# Counting stats that get a per-minute rate: stat * 60 / sp.
# (presumably `sp` is seconds played, which makes the rate "per minute" -- TODO confirm)
pm_stats = ['pts', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk','tov', 'pf']

# Rows without positive playing time get a rate of 0. Masking the denominator
# first keeps the column-wise division free of divide-by-zero.
played = pp_df['sp'] > 0
seconds_played = pp_df['sp'].where(played)

for f in pm_stats:
    pp_df[f'{f}_avg'] = (pp_df[f] * 60 / seconds_played).where(played, 0)

avg_stats = [f'{f}_avg' for f in pm_stats]

## EWM Player Stats

In [None]:
def get_player_dict(gb):
    """Build one DataFrame per player with lagged exponentially weighted stats.

    Parameters
    ----------
    gb : iterable of (player_id, DataFrame) pairs, e.g. a chunk of a groupby.
        Each frame needs `sp`, `match_id`, `date` and every column listed in
        `features` below.

    Returns
    -------
    dict mapping player_id to that player's frame, indexed by `date`, with a
    `<feature>_ewm_<span>` column per feature and span (10, 20, 40).

    The averages are computed only over rows with sp > 0 and shifted by one
    row, so each value summarises the games *before* that row. Rows without
    playing time get their values by forward fill (which fills every column
    of the frame, not just the new ones).
    """
    window_sizes = [10,20,40]
    player_dict = dict(tuple(gb))

    features = ['ts_pct',
       'efg_pct', 'threepar', 'ftr', 'orb_pct', 'drb_pct', 'trb_pct',
       'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ortg', 'drtg',
       'fg', 'fga', 'fg_pct', 'threep', 'threepa',
       'threep_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl',
       'blk', 'tov', 'pf', 'pts', 'pts_avg', 'orb_avg', 'drb_avg', 'trb_avg',
       'ast_avg', 'stl_avg', 'blk_avg', 'tov_avg', 'pf_avg']
    for w in window_sizes:
        ewm_features = [f'{f}_ewm_{w}' for f in features]
        for p in player_dict:
            played = player_dict[p][player_dict[p].sp > 0]

            # Build the EWM columns in a fresh frame instead of writing them
            # onto the filtered slice; shift(+1) excludes the current game.
            ewm_frame = (played[features]
                         .ewm(span=w, min_periods=0, adjust=False, ignore_na=False)
                         .mean()
                         .shift(+1))
            ewm_frame.columns = ewm_features
            ewm_frame['match_id'] = played['match_id']

            merged = player_dict[p].merge(ewm_frame, on='match_id', how='outer')
            # set the index to date for use in finding variable length windows later
            merged.index = merged['date']
            # DataFrame.ffill replaces the deprecated fillna(method='ffill').
            player_dict[p] = merged.ffill()
    return player_dict

In [None]:
def parallelize_gb(df, func):
    """Apply `func` to chunks of `df`'s per-player groups and merge the results.

    Parameters
    ----------
    df : DataFrame with a `player_id` column.
    func : picklable callable that takes a list of (player_id, DataFrame)
        pairs and returns a dict.

    Returns
    -------
    dict merging the dicts returned for every chunk.

    The groups are split into at most cpu_count() - 1 chunks (never fewer than
    one) and handed to a process pool. A single chunk runs in-process.
    """
    # Leave one core free so the machine stays responsive, but never ask for 0 workers.
    num_cores = max(1, multiprocessing.cpu_count() - 1)

    # Group by the scalar column name so the keys stay plain player ids
    # (a one-element list yields 1-tuples as keys on recent pandas).
    groups = list(df.groupby('player_id', group_keys=True))
    if not groups:
        return {}

    chunk_size = -(-len(groups) // num_cores)  # ceiling division
    chunks = [groups[start:start + chunk_size]
              for start in range(0, len(groups), chunk_size)]

    if len(chunks) == 1:
        list_dict = [func(chunks[0])]
    else:
        # The context manager tears the pool down even if a worker raises.
        with multiprocessing.Pool(len(chunks)) as pool:
            list_dict = pool.map(func, chunks)
    return dict(ChainMap(*list_dict))


# Build the per-player frames (keyed as get_player_dict returns them) from the
# player-performance data; the work is spread over a process pool.
player_dict = parallelize_gb(pp_df, get_player_dict)

In [None]:
# Spot check: display the frame stored under key 3.
player_dict[3]

## Player Fatigue

Exponentially weighted mean of seconds played per game over roughly the previous two weeks (the current game is excluded).

In [None]:
def get_player_fatigue(x,w,p):
    """Return the exponentially weighted mean of `sp` for player `p` over the
    rows dated from `w` days before `x.date` up to the day before it.

    `x` is a row of player_dict[p]; the frame is sliced by its date index.
    Returns NaN when the window holds no rows.
    """
    window_start = x.date - timedelta(days=w)
    window_end = x.date - timedelta(days=1)
    recent = player_dict[p].loc[window_start:window_end]
    if not len(recent):
        return np.nan
    #set ewm span=7. A player is unlikely to play more than 7 games in 14 days.
    smoothed = recent['sp'].ewm(span=7,min_periods=0,adjust=False,ignore_na=False).mean()
    return smoothed.iloc[-1]

window_sizes = [14]

# Add one sp_ewm_<w> fatigue column to every player's frame. The look-back
# passed to get_player_fatigue is w + 1 days.
for w in tqdm(window_sizes):
    for p in tqdm(player_dict):
        player_frame = player_dict[p]
        player_frame[f'sp_ewm_{w}'] = player_frame.apply(
            get_player_fatigue, axis=1, args=(w + 1, p))

In [None]:
# Spot check: the fatigue column just added to the frame stored under key 3.
player_dict[3]['sp_ewm_14']

## Insert player factors into match dataframe

Most NBA teams rotate at least 8 players. 
We will use the top 8 players in terms of time played as features

In [None]:
def insert_player_features(match_df, player_dict, pp_df):
    """Append the lagged EWM stats of each side's top 8 players to `match_df`.

    Parameters
    ----------
    match_df : DataFrame with `match_id`, `favorite_id` and `underdog_id`.
    player_dict : dict mapping player_id to a frame holding `match_id` and the
        `<feature>_ewm_<span>` columns for spans 10, 20 and 40.
    pp_df : player-performance frame used to find who played in each match.

    Returns
    -------
    A new DataFrame: `match_df` (index reset) plus columns named
    `fav_p<slot>_<feature>_ewm_<span>` / `und_p<slot>_<feature>_ewm_<span>`,
    where slot 0 is the player with the most `sp` in that match.

    Matches where either side has fewer than 8 active players get NaN in every
    new column and are counted in the printed `err` total.
    """
    window_sizes = [10,20,40]
    n_players = 8
    player_features = ['ts_pct',
       'efg_pct', 'threepar', 'ftr', 'orb_pct', 'drb_pct', 'trb_pct',
       'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct', 'ortg', 'drtg',
       'fg', 'fga', 'fg_pct', 'threep', 'threepa',
       'threep_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl',
       'blk', 'tov', 'pf', 'pts', 'pts_avg', 'orb_avg', 'drb_avg', 'trb_avg',
       'ast_avg', 'stl_avg', 'blk_avg', 'tov_avg', 'pf_avg']

    # (player slot, source column, favorite output column, underdog output column),
    # built once in the order the output columns should appear.
    column_plan = [(slot,
                    f'{f}_ewm_{w}',
                    f'fav_p{slot}_{f}_ewm_{w}',
                    f'und_p{slot}_{f}_ewm_{w}')
                   for w in window_sizes
                   for slot in range(n_players)
                   for f in player_features]

    def top_players(match_id, team_id):
        """The team's active players in the match, most `sp` first, capped at n_players."""
        return (get_active_players(match_id, team_id, pp_df)
                .sort_values(by=['sp'], ascending=False)
                .head(n_players))

    def match_rows(players, match_id):
        """Each player's row for this match, in the same order as `players`."""
        rows = []
        for player_id in players['player_id']:
            player_df = player_dict[player_id]
            rows.append(player_df[player_df['match_id'] == match_id].iloc[0])
        return rows

    d = defaultdict(list)
    err = 0
    for idx, row in match_df.iterrows():
        match_id = row['match_id']
        favorite_players = top_players(match_id, row['favorite_id'])
        underdog_players = top_players(match_id, row['underdog_id'])

        if favorite_players.shape[0] < n_players or underdog_players.shape[0] < n_players:
            # Short roster: pad every output column so all columns stay the same length.
            err += 1
            for _, _, fav_feature, und_feature in column_plan:
                d[fav_feature].append(np.nan)
                d[und_feature].append(np.nan)
            continue

        # Look each player's row up once per match rather than once per feature.
        fav_rows = match_rows(favorite_players, match_id)
        und_rows = match_rows(underdog_players, match_id)
        for slot, source, fav_feature, und_feature in column_plan:
            d[fav_feature].append(fav_rows[slot][source])
            d[und_feature].append(und_rows[slot][source])

    df = pd.DataFrame(d)
    new_df = pd.concat([match_df.reset_index(drop=True),
                      df.reset_index(drop=True)],axis=1)
    print('err:', err)
    return new_df

In [None]:
def parallelize_dataframe(df, func, player_dict, pp_df):
    """Run `func` over row-chunks of `df` and return the concatenated result.

    Parameters
    ----------
    df : DataFrame to split by rows.
    func : picklable callable invoked as
        `func(chunk, player_dict=player_dict, pp_df=pp_df)`, returning a DataFrame.
    player_dict, pp_df : forwarded unchanged to every call of `func`.

    Returns
    -------
    DataFrame with the chunk results stacked in order under a fresh 0..n-1
    index, so the row labels are unique even when `func` resets the index of
    each chunk.

    The rows are split into at most cpu_count() - 1 chunks (never fewer than
    one) and handed to a process pool. A single chunk runs in-process.
    """
    # Leave one core free so the machine stays responsive, but never ask for 0 workers.
    num_cores = max(1, multiprocessing.cpu_count() - 1)
    worker = partial(func, player_dict = player_dict, pp_df = pp_df)

    n_rows = len(df)
    chunk_size = max(1, -(-n_rows // num_cores))  # ceiling division
    df_split = [df.iloc[start:start + chunk_size]
                for start in range(0, n_rows, chunk_size)] or [df]

    if len(df_split) == 1:
        list_df = [worker(df_split[0])]
    else:
        # The context manager tears the pool down even if a worker raises.
        with multiprocessing.Pool(len(df_split)) as pool:
            list_df = pool.map(worker, df_split)
    return pd.concat(list_df, ignore_index=True)

# Replace match_df with the version carrying the per-player EWM feature columns;
# insert_player_features runs on row-chunks in a process pool.
match_df = parallelize_dataframe(match_df, insert_player_features, player_dict, pp_df)

In [None]:
# Spot check: one of the player feature columns added above.
match_df['fav_p2_pf_avg_ewm_10']

## Team Factors

## Team Fatigue (Home/Away Game Streak) and Win Streak

In [None]:
def _streaks_after_prev_match(team_id, prev_match):
    """Return (win_streak, home_streak) recorded for `team_id` after its previous match.

    `prev_match` holds at most one row; an empty frame means no earlier match
    and yields (0, 0).
    """
    if prev_match.shape[0] == 0:
        return 0, 0
    side = 'favorite' if team_id == prev_match['favorite_id'].values[0] else 'underdog'
    return (prev_match[f'post_{side}_win_streak'].values[0],
            prev_match[f'post_{side}_home_streak'].values[0])


def _extend_streak(prev_streak, positive):
    """Extend a signed streak: positive runs count up from 1, negative runs down from -1."""
    return max(1, prev_streak + 1) if positive else min(-1, prev_streak - 1)


# The loop writes with match_df.at[idx, ...], which needs unique row labels,
# so the index is reset before the loop rather than after it.
match_df = match_df.reset_index(drop=True)

# Rows are processed in order; every row reads the post_* values that earlier
# iterations stored on each team's previous match.
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
    prev_favorite_match = get_prev_matches(row['date'],
                                           row['favorite_id'], match_df).tail(1)
    prev_underdog_match = get_prev_matches(row['date'],
                                           row['underdog_id'], match_df).tail(1)

    prev_f_win_streak, prev_f_home_streak = _streaks_after_prev_match(
        row['favorite_id'], prev_favorite_match)
    prev_u_win_streak, prev_u_home_streak = _streaks_after_prev_match(
        row['underdog_id'], prev_underdog_match)

    favorite_won = bool(row['favorite_won'])
    favorite_is_home = bool(row['favorite_is_home'])

    # Streaks entering this match (usable as features) ...
    match_df.at[idx, 'prev_favorite_win_streak'] = prev_f_win_streak
    match_df.at[idx, 'prev_favorite_home_streak'] = prev_f_home_streak
    match_df.at[idx, 'prev_underdog_win_streak'] = prev_u_win_streak
    match_df.at[idx, 'prev_underdog_home_streak'] = prev_u_home_streak

    # ... and streaks after it, read back when the teams' next matches are processed.
    match_df.at[idx, 'post_favorite_win_streak'] = _extend_streak(prev_f_win_streak, favorite_won)
    match_df.at[idx, 'post_favorite_home_streak'] = _extend_streak(prev_f_home_streak, favorite_is_home)
    match_df.at[idx, 'post_underdog_win_streak'] = _extend_streak(prev_u_win_streak, not favorite_won)
    match_df.at[idx, 'post_underdog_home_streak'] = _extend_streak(prev_u_home_streak, not favorite_is_home)

streak_features = ['prev_favorite_win_streak', 
                   'prev_favorite_home_streak', 
                   'prev_underdog_win_streak', 
                   'prev_underdog_home_streak']


## FG%, 3P%, FT%, ORB, DRB, TRB, TOV, AST, STL, BLK, DRTG, ORTG, EFG 

In [None]:
# (output suffix, player-performance column, aggregation) for every team-level
# stat, in the order the columns are created. Counting stats are summed over
# the active players; percentages and ratings are averaged.
team_stat_specs = [
    ('bpm', 'bpm', 'sum'),
    ('fga', 'fga', 'sum'),
    ('fg', 'fg', 'sum'),
    ('fg_pct', 'fg_pct', 'mean'),
    ('3p', 'threep', 'sum'),
    ('3pa', 'threepa', 'sum'),
    ('3p_pct', 'threep_pct', 'mean'),
    ('ft', 'ft', 'sum'),
    ('fta', 'fta', 'sum'),
    ('ft_pct', 'ft_pct', 'mean'),
    ('orb', 'orb', 'sum'),
    ('orb_pct', 'orb_pct', 'mean'),
    ('drb', 'drb', 'sum'),
    ('drb_pct', 'drb_pct', 'mean'),
    ('trb', 'trb', 'sum'),
    ('trb_pct', 'trb_pct', 'mean'),
    ('tov', 'tov', 'sum'),
    ('tov_pct', 'tov_pct', 'mean'),
    ('ast', 'ast', 'sum'),
    ('ast_pct', 'ast_pct', 'mean'),
    ('stl', 'stl', 'sum'),
    ('stl_pct', 'stl_pct', 'mean'),
    ('blk', 'blk', 'sum'),
    ('blk_pct', 'blk_pct', 'mean'),
    ('drtg', 'drtg', 'mean'),
    ('ortg', 'ortg', 'mean'),
    ('efg_pct', 'efg_pct', 'mean'),
]

d = defaultdict(list)
for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
    active = {
        'favorite': get_active_players(row['match_id'], row['favorite_id'], pp_df),
        'underdog': get_active_players(row['match_id'], row['underdog_id'], pp_df),
    }
    for suffix, source, how in team_stat_specs:
        for side in ('favorite', 'underdog'):
            values = active[side][source]
            d[f'{side}_{suffix}'].append(values.sum() if how == 'sum' else values.mean())

    # Total sp of the favorite's active players (used for pace).
    d['sp'].append(active['favorite']['sp'].sum())

In [None]:
# Turn the per-match lists into a frame and attach it column-wise; both sides
# get a fresh positional index so the rows line up one-to-one.
df = pd.DataFrame(d)
matches_reindexed = match_df.reset_index(drop=True)
team_stats_reindexed = df.reset_index(drop=True)
match_df = pd.concat([matches_reindexed, team_stats_reindexed], axis=1)

## Possessions and Pace

In [None]:
def _possession_term(team, opponent):
    """One side's contribution to the possession estimate:
    FGA + 0.4*FTA - 1.07 * ORB/(ORB + opponent DRB) * (FGA - FG) + TOV."""
    orb_share = match_df[f'{team}_orb'] / (match_df[f'{team}_orb'] + match_df[f'{opponent}_drb'])
    return (match_df[f'{team}_fga'] + 0.4*match_df[f'{team}_fta'] -
            1.07*orb_share *
            (match_df[f'{team}_fga'] - match_df[f'{team}_fg']) + match_df[f'{team}_tov'])


favorite_term = _possession_term('favorite', 'underdog')
underdog_term = _possession_term('underdog', 'favorite')

# The estimate averages both sides' terms, so the two columns hold the same values.
match_df['favorite_possessions'] = 0.5 * (favorite_term + underdog_term)
match_df['underdog_possessions'] = 0.5 * (underdog_term + favorite_term)

# Pace: possessions per 48 units of team playing time, where sp/(60*5) is the
# favorite's summed sp spread over five players. Same value for both sides.
total_possessions = match_df['favorite_possessions'] + match_df['underdog_possessions']
pace = 48 * (total_possessions / (2*(match_df['sp']/(60*5))))
match_df['favorite_pace'] = pace
match_df['underdog_pace'] = pace


## EMA Team stats

In [None]:
def ema(current, prev_ema, window_size, smoothing=2.0):
    """Return the next exponential moving average value.

    The new observation `current` gets weight smoothing / (1 + window_size);
    the previous average `prev_ema` gets the remainder.
    """
    weight = smoothing / (1 + window_size)
    carried_over = prev_ema * (1-weight)
    return current * weight + carried_over

In [None]:
def get_prev_team_sum(team_id, home_col, prev_matches):
    """Sum `team_id`'s own values of a two-sided stat over `prev_matches`.

    Parameters
    ----------
    team_id : team whose values are summed.
    home_col : name of one side's column of the stat. Supported pairs:
        * `home_*` / `away_*` -- the side is chosen per row via `home_id`;
        * `favorite_*` / `underdog_*` (either may be passed) -- the side is
          chosen per row via `favorite_id`, so a team that was the underdog in
          an earlier match contributes its underdog value, not the favorite's.
        Any other column is summed as it is.
    prev_matches : DataFrame of earlier matches. It is not modified.

    Returns
    -------
    The sum of the selected values (missing values are skipped).
    """
    if 'home' in home_col:
        own_col = home_col
        other_col = home_col.replace('home', 'away')
        id_col = 'home_id'
    elif 'favorite' in home_col or 'underdog' in home_col:
        own_col = home_col.replace('underdog', 'favorite')
        other_col = own_col.replace('favorite', 'underdog')
        id_col = 'favorite_id'
    else:
        return prev_matches[home_col].sum()

    # Take the team's own column where it held the "own" role, else the other side's.
    is_own_side = prev_matches[id_col] == team_id
    return prev_matches[own_col].where(is_own_side, prev_matches[other_col]).sum()

In [None]:
smoothing = 2
window_sizes = [5,7,9]

# Team stats that get moving averages; each one reads the match_df columns
# 'favorite_<stat>' and 'underdog_<stat>'.
ema_stats = ['pts', 'bpm', 'fg', 'fg_pct', '3p', '3p_pct', 'ft', 'ft_pct',
             'orb', 'orb_pct', 'drb', 'drb_pct', 'trb', 'trb_pct',
             'tov', 'tov_pct', 'ast', 'ast_pct', 'stl', 'stl_pct',
             'blk', 'blk_pct', 'drtg', 'ortg', 'efg_pct', 'pace']

for w in tqdm(range(len(window_sizes))):
    window_size = window_sizes[w]

    # (column holding the average entering the match, column holding it after the match)
    ema_favorite_features = [(f'prev_favorite_{stat}_ema{window_size}',
                              f'post_favorite_{stat}_ema{window_size}')
                             for stat in ema_stats]
    ema_underdog_features = [(f[0].replace('favorite','underdog'), f[1].replace('favorite','underdog')) for f in ema_favorite_features]
    sma_favorite_features = [(f[0].replace('ema','sma'), f[1].replace('ema','sma')) for f in ema_favorite_features]
    sma_underdog_features = [(f[0].replace('favorite','underdog'), f[1].replace('favorite','underdog')) for f in sma_favorite_features]

    columns_by_side = {'favorite': (ema_favorite_features, sma_favorite_features),
                       'underdog': (ema_underdog_features, sma_underdog_features)}

    for idx, row in tqdm(match_df.iterrows(), total=match_df.shape[0]):
        # Each side's last `window_size` matches before this one, fetched once per row.
        prev_matches_by_side = {
            side: get_prev_matches(row['date'], row[f'{side}_id'], match_df).tail(window_size)
            for side in ('favorite', 'underdog')
        }

        for i, stat in enumerate(ema_stats):
            for side in ('favorite', 'underdog'):
                ema_columns, sma_columns = columns_by_side[side]
                prev_ema_col, post_ema_col = ema_columns[i]
                prev_sma_col = sma_columns[i][0]
                feature = f'{side}_{stat}'
                team_id = row[f'{side}_id']
                prev_matches = prev_matches_by_side[side]
                n_prev = len(prev_matches)

                if prev_matches.empty:
                    # Team's first match: nothing to average yet, so the
                    # post-match value is seeded with this match's stat.
                    match_df.at[idx, post_ema_col] = row[feature]
                    continue

                # Simple average over the available previous matches.
                prev_sma = get_prev_team_sum(team_id, feature, prev_matches) / n_prev
                match_df.at[idx, prev_sma_col] = prev_sma

                if n_prev < window_size:
                    # Warm-up: until a full window exists the EMA is the running mean.
                    match_df.at[idx, prev_ema_col] = prev_sma
                    match_df.at[idx, post_ema_col] = \
                        (prev_sma * n_prev + row[feature]) / (n_prev + 1)
                else:
                    # Carry on from the post-match EMA stored on the team's last
                    # match, read from whichever side the team was on then.
                    # It is taken as a scalar, not as a one-row Series.
                    last_match = prev_matches.iloc[-1]
                    last_side = 'favorite' if last_match['favorite_id'] == team_id else 'underdog'
                    prev_ema = last_match[columns_by_side[last_side][0][i][1]]

                    match_df.at[idx, prev_ema_col] = prev_ema
                    match_df.at[idx, post_ema_col] = ema(row[feature], prev_ema,
                                                         window_size, smoothing)

In [None]:
# Spot check: points and the entering 7-game points EMA for every match involving team id 7.
match_df[(match_df.favorite_id == 7) | (match_df.underdog_id == 7)][['favorite_id', 'underdog_id', 'favorite_pts', 'underdog_pts', 'prev_favorite_pts_ema7', 'prev_underdog_pts_ema7']]

In [None]:
# Persist the engineered match features and the player-performance frame as CSV
# files in the current working directory (the frame index is written too).
match_df.to_csv('nba_processed_features.csv')
pp_df.to_csv('player_df.csv')

In [3]:
# Create both Elo columns as floats, initialised to 1500.0 on every row
# (the same starting rating get_prev_elo gives a team with no previous match).
match_df['favorite_elo'] = 1500.0
match_df['underdog_elo'] = 1500.0

time: 1.48 ms (started: 2023-03-04 13:44:08 -05:00)


In [4]:
def get_prev_match(date, team_id, match_df):
    """Return the last row of `match_df` dated before `date` that involves `team_id`.

    A row involves the team when `team_id` equals either its `favorite_id`
    or its `underdog_id`. "Last" means last in the frame's current row
    order, so the result is the team's most recent match only when the
    frame is sorted chronologically.

    Returns a one-row DataFrame, or an empty DataFrame when no row qualifies.
    """
    is_earlier = match_df["date"] < date
    team_played = ((match_df["favorite_id"] == team_id)
                   | (match_df["underdog_id"] == team_id))
    return match_df.loc[is_earlier & team_played].iloc[-1:]

time: 485 µs (started: 2023-03-04 13:44:09 -05:00)


In [None]:
def get_prev_elo(team_id, season, prev_match):
    """Return the Elo rating `team_id` carries into its next match.

    Parameters
    ----------
    team_id
        Id of the team whose rating is wanted.
    season
        Season of the upcoming match; compared against `prev_match['season']`.
    prev_match : pandas.DataFrame
        The team's previous match (only the first row is read), or an empty
        frame when the team has no earlier match. Needs the columns
        `favorite_id`, `underdog_id`, `favorite_elo`, `underdog_elo`, `season`.

    Returns
    -------
    float
        1500.0 when `prev_match` is empty. Otherwise the rating stored for the
        team's side of `prev_match`; if that match belongs to a different
        season the rating is blended as 75% old rating + 25% of 1505.

    Raises
    ------
    ValueError
        If `team_id` is neither the favorite nor the underdog of `prev_match`.
        (Previously this case printed 'err' and then failed with an unrelated
        UnboundLocalError.)
    """
    # No history: start from the baseline, with no season carry-over to apply.
    if prev_match.empty:
        return 1500.0

    if team_id == prev_match['favorite_id'].values[0]:
        prev_elo = prev_match['favorite_elo'].values[0]
    elif team_id == prev_match['underdog_id'].values[0]:
        prev_elo = prev_match['underdog_elo'].values[0]
    else:
        raise ValueError(
            f'team_id {team_id!r} is neither favorite nor underdog in prev_match')

    # New season: pull the carried-over rating a quarter of the way toward 1505.
    if prev_match['season'].values[0] != season:
        prev_elo = prev_elo * 0.75 + 1505 * 0.25
    return prev_elo


def update_elo(favorite_elo, underdog_elo, movl):
    """Compute both teams' post-match Elo ratings.

    Parameters
    ----------
    favorite_elo, underdog_elo
        Ratings going into the match.
    movl
        Margin from the favorite's point of view. `movl > 0` is scored as a
        favorite win; anything else (including 0) as an underdog win.

    Returns
    -------
    tuple
        `(new_favorite_elo, new_underdog_elo)`.

    Notes
    -----
    The favorite's rating is raised by 100 points before the two ratings are
    compared (presumably a home-court style bonus; confirm that the favorite
    is always the side that should receive it). The update step is
    `20 * (|margin| + 3) ** 0.8 / (7.5 + 0.006 * gap)`, where `gap` is the
    adjusted rating difference seen from the winner's side.
    """
    rating_gap = favorite_elo + 100.0 - underdog_elo

    if movl > 0:
        favorite_score, underdog_score = 1.0, 0.0
        margin, winner_gap = movl, rating_gap
    else:
        favorite_score, underdog_score = 0.0, 1.0
        margin, winner_gap = -movl, -rating_gap

    # Step size grows with the margin and shrinks when the winner was already ahead.
    k = 20.0 * (((margin + 3) ** (0.8)) / (7.5 + 0.006 * winner_gap))

    # Logistic expectation on the adjusted gap; the two expectations sum to 1.
    favorite_expected = 1.0 / (1.0 + 10.0 ** (-rating_gap / 400.0))
    underdog_expected = 1.0 - favorite_expected

    return (favorite_elo + k * (favorite_score - favorite_expected),
            underdog_elo + k * (underdog_score - underdog_expected))

In [None]:
# Walk the matches in frame order and fill in pre-/post-match Elo for both sides.
# Ratings written for earlier rows are read back through get_prev_match, so this
# presumably relies on match_df being in chronological order. TODO confirm.
for idx, row in tqdm(match_df.iterrows(), total=len(match_df)):
    match_date, season = row['date'], row['season']

    # Rating each side brings into this match (baseline or carried over).
    prev_f_elo = get_prev_elo(
        row['favorite_id'], season,
        get_prev_match(match_date, row['favorite_id'], match_df))
    prev_u_elo = get_prev_elo(
        row['underdog_id'], season,
        get_prev_match(match_date, row['underdog_id'], match_df))

    # Post-match ratings, stored on this row for later matches to pick up.
    post_f_elo, post_u_elo = update_elo(prev_f_elo, prev_u_elo, row['favorite_movl'])
    match_df.at[idx, 'favorite_elo'] = post_f_elo
    match_df.at[idx, 'underdog_elo'] = post_u_elo

    # Pre-match ratings are kept in their own columns.
    match_df.at[idx, 'prev_favorite_elo'] = prev_f_elo
    match_df.at[idx, 'prev_underdog_elo'] = prev_u_elo

In [None]:
# Save the frames again now that the Elo columns are filled in (same files,
# default `to_csv` options, so the index is written as the first column).
match_df.to_csv(path_or_buf='nba_processed_features.csv')
pp_df.to_csv(path_or_buf='player_df.csv')

In [2]:
# Reload the checkpointed frames written by the `to_csv` cells.
# Those cells use the default `index=True`, so the first CSV column is the
# saved index. `index_col=0` restores it as the index instead of letting it
# come back as a stray 'Unnamed: 0' data column (which would otherwise end up
# among the features and be re-written on every save/load round trip).
# NOTE(review): in the recorded session this cell ran as In [2], i.e. before
# the Elo cells that appear above it. Confirm the intended cell order.
match_df = pd.read_csv('nba_processed_features.csv', index_col=0)
pp_df = pd.read_csv('player_df.csv', index_col=0)

time: 6.23 s (started: 2023-03-04 13:43:35 -05:00)


In [None]:
# Base stat names that each get a (prev, post) pair of favorite EMA columns.
ema_stat_names = [
    'pts', 'bpm',
    'fg', 'fg_pct', '3p', '3p_pct', 'ft', 'ft_pct',
    'orb', 'orb_pct', 'drb', 'drb_pct', 'trb', 'trb_pct',
    'tov', 'tov_pct', 'ast', 'ast_pct', 'stl', 'stl_pct', 'blk', 'blk_pct',
    'drtg', 'ortg', 'efg_pct', 'pace',
]

# One (prev_favorite_<stat>_ema<N>, post_favorite_<stat>_ema<N>) tuple per stat,
# in the order listed above, with N taken from `window_size`.
ema_favorite_features = [
    (f'prev_favorite_{stat}_ema{window_size}', f'post_favorite_{stat}_ema{window_size}')
    for stat in ema_stat_names
]

features = ['favorite_pace', 'underdog_pace']

#all_features = 