# 3 Pointers Made against advanced_details.csv

### Import packages

In [58]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
import itertools

pd.set_option("display.max_columns", None)

### Set working directory

In [59]:
# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

# Change working directory
os.chdir('/Users/tyler/OneDrive/Documents/Python/NBA')

# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA
Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA


## Exploratory Data Analysis

### Import data

In [60]:
df = pd.read_csv('backend/data/details/advanced_details.csv').drop(['Unnamed: 0'], axis=1)
shooting_df = pd.read_csv('backend/data/totals/game_totals.csv').drop(['Unnamed: 0'], axis=1)
shooting_df = shooting_df[['date', 'visitor', 'home', 'team', '3p']]

### Basic exploration

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479226 entries, 0 to 479225
Data columns (total 22 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   date      479226 non-null  object 
 1   visitor   479226 non-null  object 
 2   home      479226 non-null  object 
 3   team      479226 non-null  int64  
 4   starter   479226 non-null  int64  
 5   player    479226 non-null  object 
 6   mp        479226 non-null  object 
 7   ts_perc   461554 non-null  float64
 8   efg_perc  458901 non-null  float64
 9   3par      458901 non-null  float64
 10  ftr       458901 non-null  float64
 11  orb_perc  479151 non-null  float64
 12  drb_perc  479151 non-null  float64
 13  trb_perc  479151 non-null  float64
 14  ast_perc  479147 non-null  float64
 15  stl_perc  479151 non-null  float64
 16  blk_perc  479151 non-null  float64
 17  tov_perc  465166 non-null  float64
 18  usg_perc  479151 non-null  float64
 19  ortg      479151 non-null  float64
 20  drtg

In [62]:
df.sample(5)

Unnamed: 0,date,visitor,home,team,starter,player,mp,ts_perc,efg_perc,3par,ftr,orb_perc,drb_perc,trb_perc,ast_perc,stl_perc,blk_perc,tov_perc,usg_perc,ortg,drtg,bpm
278727,"Sun, Mar 22, 2015",New Orleans Pelicans,Los Angeles Clippers,1,0,Spencer Hawes,10:34,,,,,0.0,13.0,5.5,0.0,0.0,0.0,100.0,4.1,0.0,118.0,-11.9
73894,"Mon, Dec 29, 2008",Toronto Raptors,Golden State Warriors,1,0,Rob Kurz,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299001,"Mon, Jan 4, 2016",Sacramento Kings,Oklahoma City Thunder,0,0,Marco Belinelli,34:07,0.636,0.538,0.385,0.615,0.0,6.1,3.2,4.5,1.4,0.0,5.7,20.3,131.0,107.0,2.7
271000,"Wed, Feb 4, 2015",Oklahoma City Thunder,New Orleans Pelicans,0,0,Reggie Jackson,22:35,0.286,0.286,0.143,0.0,0.0,23.2,13.3,20.9,0.0,0.0,12.5,16.4,79.0,98.0,-6.7
402530,"Wed, Jan 16, 2019",Orlando Magic,Detroit Pistons,0,0,Melvin Frazier,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
def convert_mp(mp):
    if mp == '0':
        return 0
    else:
        mins = int(mp.split(':')[0])
        secs = int(mp.split(':')[1]) / 60
        return mins + secs

In [64]:
# Fill NaN
df = df.fillna(0)

# Convert 'date' column to Date object
df['date'] = pd.to_datetime(df['date'])

# Conver 'team' column to Team Name
df['team'] = np.where(df['team'], df['home'], df['visitor'])

# Convert 'minutes played' to float
df['mp'] = df['mp'].apply(lambda x: convert_mp(x))

# Set stats
stats = ['ts_perc', 'efg_perc', '3par', 'ftr', 'orb_perc', 'drb_perc', 'trb_perc', 
         'ast_perc', 'stl_perc', 'blk_perc', 'tov_perc', 'usg_perc', 'ortg', 'drtg', 'bpm', 'mp']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479226 entries, 0 to 479225
Data columns (total 22 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   date      479226 non-null  datetime64[ns]
 1   visitor   479226 non-null  object        
 2   home      479226 non-null  object        
 3   team      479226 non-null  object        
 4   starter   479226 non-null  int64         
 5   player    479226 non-null  object        
 6   mp        479226 non-null  float64       
 7   ts_perc   479226 non-null  float64       
 8   efg_perc  479226 non-null  float64       
 9   3par      479226 non-null  float64       
 10  ftr       479226 non-null  float64       
 11  orb_perc  479226 non-null  float64       
 12  drb_perc  479226 non-null  float64       
 13  trb_perc  479226 non-null  float64       
 14  ast_perc  479226 non-null  float64       
 15  stl_perc  479226 non-null  float64       
 16  blk_perc  479226 non-null  float64    

In [65]:
# Team total stats
teams_df = df.groupby(['date', 'visitor', 'home', 'team']).sum().reset_index()

In [66]:
# Rename target variable
shooting_df = shooting_df.rename({'3p': 'target'}, axis=1)

# Convert 'date' column to Date object
shooting_df['date'] = pd.to_datetime(shooting_df['date'])

# Convert 'team' column to Team Name
shooting_df['team'] = np.where(shooting_df['team'], shooting_df['home'], shooting_df['visitor'])

In [67]:
# Starters total stats
starters_df = df[df['starter'] == 1].groupby(['date', 'visitor', 'home', 'team']).aggregate(['sum', 'mean'])
cols = [col for col in starters_df.columns
        if (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
           (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0])]
starters_df = starters_df[cols]
starters_df.columns = [col[0] for col in starters_df.columns]
starters_df = starters_df.reset_index()


# Merge dataframes to have target variable
starters_df = pd.merge(starters_df, shooting_df, 
              left_on=['date', 'visitor', 'home', 'team'], right_on=['date', 'visitor', 'home', 'team'],
              how='left')

In [68]:
# Bench total stats
bench_df = df[df['starter'] == 0].groupby(['date', 'visitor', 'home', 'team']).aggregate(['sum', 'mean'])
cols = [col for col in bench_df.columns
        if (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
           (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0])]
bench_df = bench_df[cols]
bench_df.columns = [col[0] for col in bench_df.columns]
bench_df = bench_df.reset_index()

# Merge dataframes to have target variable
bench_df = pd.merge(bench_df, shooting_df, 
              left_on=['date', 'visitor', 'home', 'team'], right_on=['date', 'visitor', 'home', 'team'],
              how='left')

# Dataframe of team's last 15 performances

In [69]:
# Return ten lastest dates team played
def last_15_date(team, date):
    schedule = teams_df[teams_df['team'] == team].sort_values(by='date').reset_index()
    date_index = schedule[schedule['date'] == date].index[0]
    if date_index - 15 < 0:
        return None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
    else:
        date_1, date_2 = schedule.iloc[date_index - 1]['date'], schedule.iloc[date_index - 2]['date']
        date_3, date_4 = schedule.iloc[date_index - 3]['date'], schedule.iloc[date_index - 4]['date']
        date_5, date_6 = schedule.iloc[date_index - 5]['date'], schedule.iloc[date_index - 6]['date']
        date_7, date_8 = schedule.iloc[date_index - 7]['date'], schedule.iloc[date_index - 8]['date']
        date_9, date_10 = schedule.iloc[date_index - 9]['date'], schedule.iloc[date_index - 10]['date']
        date_11, date_12 = schedule.iloc[date_index - 11]['date'], schedule.iloc[date_index - 12]['date']
        date_13, date_14 = schedule.iloc[date_index - 13]['date'], schedule.iloc[date_index - 14]['date']
        date_15 = schedule.iloc[date_index - 15]['date']
        return date_1, date_2, date_3, date_4, date_5, date_6, date_7, date_8, date_9, date_10, date_11, date_12, date_13, date_14, date_15

teams_df['dates'] = teams_df.apply(lambda x: last_15_date(x.team, x.date), axis=1)
teams_df['date_1'], teams_df['date_2'] = teams_df['dates'].apply(lambda x: x[0]), teams_df['dates'].apply(lambda x: x[1])
teams_df['date_3'], teams_df['date_4'] = teams_df['dates'].apply(lambda x: x[2]), teams_df['dates'].apply(lambda x: x[3])
teams_df['date_5'], teams_df['date_6'] = teams_df['dates'].apply(lambda x: x[4]), teams_df['dates'].apply(lambda x: x[5])
teams_df['date_7'], teams_df['date_8'] = teams_df['dates'].apply(lambda x: x[6]), teams_df['dates'].apply(lambda x: x[7])
teams_df['date_9'], teams_df['date_10'] = teams_df['dates'].apply(lambda x: x[8]), teams_df['dates'].apply(lambda x: x[9])
teams_df['date_11'], teams_df['date_12'] = teams_df['dates'].apply(lambda x: x[10]), teams_df['dates'].apply(lambda x: x[11])
teams_df['date_13'], teams_df['date_14'] = teams_df['dates'].apply(lambda x: x[12]), teams_df['dates'].apply(lambda x: x[13])
teams_df['date_15'] = teams_df['dates'].apply(lambda x: x[14])

In [70]:
# Keep date columns in teams
cols = [col for col in teams_df.columns
        if ('date_' in col) or \
        (col in ['date', 'visitor', 'home', 'team'])]
teams_df = teams_df[cols]

# Merge dates with starters
starters_df = pd.merge(starters_df, teams_df, 
                       left_on=['date', 'visitor', 'home', 'team'], 
                       right_on=['date', 'visitor', 'home', 'team'],
                       how='left')

# Merge dates with bench
bench_df = pd.merge(bench_df, teams_df, 
                    left_on=['date', 'visitor', 'home', 'team'], 
                    right_on=['date', 'visitor', 'home', 'team'],
                    how='left')

In [71]:
# Calculate z-score
def z_score(value, mean, std):
    return (value - mean) / std

In [72]:
# Calculate perc difference
def perc_diff(value, mean):
    return (value - mean) / mean

# Starters Analysis

In [73]:
# X and y column names to merge on
y_cols = starters_df.columns
x_cols = ['date', 'team'] + stats

last_15_games = starters_df[y_cols]
X = starters_df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 5 games stats for each team)
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
for date in dates:
    last_15_games = pd.merge(last_15_games, X, left_on=['date' + date, 'team'], right_on=['date', 'team'], how='left', suffixes=('', date))

last_15_games.head()

Unnamed: 0,date,visitor,home,team,mp,ts_perc,efg_perc,3par,ftr,orb_perc,drb_perc,trb_perc,ast_perc,stl_perc,blk_perc,tov_perc,usg_perc,ortg,drtg,bpm,target,date_1,date_2,date_3,date_4,date_5,date_6,date_7,date_8,date_9,date_10,date_11,date_12,date_13,date_14,date_15,date_1.1,ts_perc_1,efg_perc_1,3par_1,ftr_1,orb_perc_1,drb_perc_1,trb_perc_1,ast_perc_1,stl_perc_1,blk_perc_1,tov_perc_1,usg_perc_1,ortg_1,drtg_1,bpm_1,mp_1,date_2.1,ts_perc_2,efg_perc_2,3par_2,ftr_2,orb_perc_2,drb_perc_2,trb_perc_2,ast_perc_2,stl_perc_2,blk_perc_2,tov_perc_2,usg_perc_2,ortg_2,drtg_2,bpm_2,mp_2,date_3.1,ts_perc_3,efg_perc_3,3par_3,ftr_3,orb_perc_3,drb_perc_3,trb_perc_3,ast_perc_3,stl_perc_3,blk_perc_3,tov_perc_3,usg_perc_3,ortg_3,drtg_3,bpm_3,mp_3,date_4.1,ts_perc_4,efg_perc_4,3par_4,ftr_4,orb_perc_4,drb_perc_4,trb_perc_4,ast_perc_4,stl_perc_4,blk_perc_4,tov_perc_4,usg_perc_4,ortg_4,drtg_4,bpm_4,mp_4,date_5.1,ts_perc_5,efg_perc_5,3par_5,ftr_5,orb_perc_5,drb_perc_5,trb_perc_5,ast_perc_5,stl_perc_5,blk_perc_5,tov_perc_5,usg_perc_5,ortg_5,drtg_5,bpm_5,mp_5,date_6.1,ts_perc_6,efg_perc_6,3par_6,ftr_6,orb_perc_6,drb_perc_6,trb_perc_6,ast_perc_6,stl_perc_6,blk_perc_6,tov_perc_6,usg_perc_6,ortg_6,drtg_6,bpm_6,mp_6,date_7.1,ts_perc_7,efg_perc_7,3par_7,ftr_7,orb_perc_7,drb_perc_7,trb_perc_7,ast_perc_7,stl_perc_7,blk_perc_7,tov_perc_7,usg_perc_7,ortg_7,drtg_7,bpm_7,mp_7,date_8.1,ts_perc_8,efg_perc_8,3par_8,ftr_8,orb_perc_8,drb_perc_8,trb_perc_8,ast_perc_8,stl_perc_8,blk_perc_8,tov_perc_8,usg_perc_8,ortg_8,drtg_8,bpm_8,mp_8,date_9.1,ts_perc_9,efg_perc_9,3par_9,ftr_9,orb_perc_9,drb_perc_9,trb_perc_9,ast_perc_9,stl_perc_9,blk_perc_9,tov_perc_9,usg_perc_9,ortg_9,drtg_9,bpm_9,mp_9,date_10.1,ts_perc_10,efg_perc_10,3par_10,ftr_10,orb_perc_10,drb_perc_10,trb_perc_10,ast_perc_10,stl_perc_10,blk_perc_10,tov_perc_10,usg_perc_10,ortg_10,drtg_10,bpm_10,mp_10,date_11.1,ts_perc_11,efg_perc_11,3par_11,ftr_11,orb_perc_11,drb_perc_11,trb_perc_11,ast_perc_11,stl_perc_11,blk_perc_11,tov_perc_11,usg_perc_11,ortg_11,drtg_11,bpm_11,mp_11,date_12.1,ts_perc_12,efg_perc_12,3par_12,ftr_12,orb_perc_12,drb_perc_12,trb_perc_12,ast_perc_12,stl_perc_12,blk_perc_12,tov_perc_12,usg_perc_12,ortg_12,drtg_12,bpm_12,mp_12,date_13.1,ts_perc_13,efg_perc_13,3par_13,ftr_13,orb_perc_13,drb_perc_13,trb_perc_13,ast_perc_13,stl_perc_13,blk_perc_13,tov_perc_13,usg_perc_13,ortg_13,drtg_13,bpm_13,mp_13,date_14.1,ts_perc_14,efg_perc_14,3par_14,ftr_14,orb_perc_14,drb_perc_14,trb_perc_14,ast_perc_14,stl_perc_14,blk_perc_14,tov_perc_14,usg_perc_14,ortg_14,drtg_14,bpm_14,mp_14,date_15.1,ts_perc_15,efg_perc_15,3par_15,ftr_15,orb_perc_15,drb_perc_15,trb_perc_15,ast_perc_15,stl_perc_15,blk_perc_15,tov_perc_15,usg_perc_15,ortg_15,drtg_15,bpm_15,mp_15
0,2006-10-31,Chicago Bulls,Miami Heat,Chicago Bulls,128.55,0.519,0.4188,0.5,2.645,6.36,12.96,9.74,9.22,2.06,1.78,15.34,19.44,537.0,366.0,2.5,7,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,
1,2006-10-31,Chicago Bulls,Miami Heat,Miami Heat,156.583333,0.4474,0.3998,1.667,2.267,3.64,14.72,9.04,14.96,1.28,1.42,19.26,22.68,375.0,583.0,-16.4,3,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,
2,2006-10-31,Phoenix Suns,Los Angeles Lakers,Los Angeles Lakers,157.6,0.5192,0.5024,1.542,2.747,5.76,22.18,13.76,19.74,1.78,0.0,20.84,18.7,526.0,535.0,7.5,6,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,
3,2006-10-31,Phoenix Suns,Los Angeles Lakers,Phoenix Suns,159.483333,0.6788,0.6594,1.34,0.547,3.66,17.0,10.5,19.72,0.26,1.44,22.4,17.24,551.0,579.0,-7.9,13,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,
4,2006-11-01,Atlanta Hawks,Philadelphia 76ers,Atlanta Hawks,164.683333,0.3112,0.2202,0.691,2.063,4.08,14.38,8.72,8.66,2.28,2.08,24.1,17.9,298.0,516.0,-26.4,4,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,


### Last 15 Performances (Unweighted)

In [74]:
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))]

last_15_games_unweighted = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_15_games_unweighted[stat] = 0
    for date in dates:
        last_15_games_unweighted[stat] = last_15_games_unweighted[stat] + last_15_games_unweighted[stat + date]
    
    last_15_games_unweighted[stat] = last_15_games_unweighted[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's performance
for stat in stats:
    last_15_games_unweighted[stat + '_std'] = 0
    for date in dates:
        last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] + \
                                                    ((last_15_games_unweighted[stat + date] - last_15_games_unweighted[stat]) ** 2)
    
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] / len(dates)
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] ** .5

# Feature engineer trends
for stat in stats:
    last_15_games_unweighted[stat + '_trend'] = 0
    for date in dates[:10]:
        last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] + \
                                        z_score(last_15_games_unweighted[stat + date], last_15_games_unweighted[stat], last_15_games_unweighted[stat + '_std']).fillna(0)
    
    last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] / len(dates[:10])

last_15_games_unweighted = last_15_games_unweighted.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

last_15_game_cols = [col 
                    for col in last_15_games_unweighted.columns
                    if (col[0] == 'target' and col[1] == 'sum') or \
                       (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                       (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                       ('_trend' in col[0] and col[1] == 'sum')]

starters_15_games = last_15_games_unweighted[last_15_game_cols].dropna(axis=0).copy()
starters_15_games.columns = [col[0] for col in starters_15_games.columns]

## Correlations

In [75]:
corr_df = pd.DataFrame()

# Correlations for last 15 game stats vs 3pt made (unweighted)
for col in starters_15_games:
    corr_p = pearsonr(starters_15_games['target'], starters_15_games[col])
    row = {'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)
    
# Print statistically significant correlations
starters_corr = corr_df[corr_df['p-value'] < .05].sort_values(['corr'], axis=0, ascending=False)
starters_corr

Unnamed: 0,corr,p-value,stat
0,1.0,0.0,target
3,0.69,0.0,3par
2,0.44,0.0,efg_perc
1,0.39,0.0,ts_perc
6,0.34,0.0,drb_perc
13,0.28,0.0,ortg
12,0.25,0.0,usg_perc
8,0.25,0.0,ast_perc
14,0.24,0.0,drtg
10,0.16,0.0,blk_perc


## Bench Analysis

In [76]:
# X and y column names to merge on
y_cols = bench_df.columns
x_cols = ['date', 'team'] + stats

last_15_games = bench_df[y_cols]
X = bench_df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 5 games stats for each team)
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
for date in dates:
    last_15_games = pd.merge(last_15_games, X, left_on=['date' + date, 'team'], right_on=['date', 'team'], how='left', suffixes=('', date))

### Last 15 Performances (Unweighted)

In [77]:
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))]

last_15_games_unweighted = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_15_games_unweighted[stat] = 0
    for date in dates:
        last_15_games_unweighted[stat] = last_15_games_unweighted[stat] + last_15_games_unweighted[stat + date]
    
    last_15_games_unweighted[stat] = last_15_games_unweighted[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's performance
for stat in stats:
    last_15_games_unweighted[stat + '_std'] = 0
    for date in dates:
        last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] + \
                                                    ((last_15_games_unweighted[stat + date] - last_15_games_unweighted[stat]) ** 2)
    
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] / len(dates)
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] ** .5

# Feature engineer trends
for stat in stats:
    last_15_games_unweighted[stat + '_trend'] = 0
    for date in dates[:10]:
        last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] + \
                                        z_score(last_15_games_unweighted[stat + date], last_15_games_unweighted[stat], last_15_games_unweighted[stat + '_std']).fillna(0)
    
    last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] / len(dates[:10])

last_15_games_unweighted = last_15_games_unweighted.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

last_15_game_cols = [col 
                    for col in last_15_games_unweighted.columns
                    if (col[0] == 'target' and col[1] == 'sum') or \
                       (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                       (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                       ('_trend' in col[0] and col[1] == 'sum')]

bench_15_games = last_15_games_unweighted[last_15_game_cols].dropna(axis=0).copy()
bench_15_games.columns = [col[0] for col in bench_15_games.columns]

## Correlations of Bench

In [78]:
corr_df = pd.DataFrame()

# Correlations for last 15 game stats vs 3pt made (unweighted)
for col in bench_15_games:
    corr_p = pearsonr(bench_15_games['target'], bench_15_games[col])
    row = {'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)
    
# Print statistically significant correlations
bench_corr = corr_df[corr_df['p-value'] < .05].sort_values(['corr'], axis=0, ascending=False)
bench_corr

Unnamed: 0,corr,p-value,stat
0,1.0,0.0,target
3,0.62,0.0,3par
13,0.38,0.0,ortg
14,0.32,0.0,drtg
2,0.28,0.0,efg_perc
16,0.26,0.0,mp
1,0.23,0.0,ts_perc
6,0.15,0.0,drb_perc
10,0.11,0.0,blk_perc
15,0.1,0.0,bpm


# Comparison of Starters to Bench

In [79]:
corr_df = pd.merge(starters_corr.drop(['p-value'], axis=1), 
                   bench_corr.drop(['p-value'], axis=1),
                   left_on=['stat'], right_on=['stat'],
                   how='outer',
                   suffixes=['_starter', '_bench'])
corr_df.sort_values(['stat'], axis=0)

Unnamed: 0,corr_starter,stat,corr_bench
1,0.69,3par,0.62
11,0.03,3par_trend,
7,0.25,ast_perc,0.08
9,0.16,blk_perc,0.11
10,0.11,bpm,0.1
4,0.34,drb_perc,0.15
18,,drb_perc_trend,-0.01
8,0.24,drtg,0.32
2,0.44,efg_perc,0.28
16,-0.24,ftr,-0.12


## Save dataframe with significantly correlated stats

In [81]:
starter_stats = starters_corr[starters_corr['corr'].abs() >= .4]['stat']
starters_df = starters_15_games[starter_stats]

bench_stats = bench_corr[bench_corr['corr'].abs() >= .4]['stat']
bench_df = bench_15_games[bench_stats]

df = pd.merge(starters_df, bench_df, 
              left_on=['date', 'visitor', 'home'], 
              right_on=['date', 'visitor', 'home'], 
              how='outer', suffixes=['_starters', '_bench'])
df.to_csv('backend/data/inputs/advanced_details.csv')