# 3 Pointers Made against game_details.csv

### Import packages

In [98]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
import itertools

pd.set_option("display.max_columns", None)

### Set working directory

In [99]:
# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

# Change working directory
os.chdir('/Users/tyler/OneDrive/Documents/Python/NBA')

# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA
Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA


## Exploratory Data Analysis

### Import data

In [100]:
df = pd.read_csv('backend/data/details/game_details.csv').drop(['Unnamed: 0'], axis=1)
shooting_df = pd.read_csv('backend/data/totals/game_totals.csv').drop(['Unnamed: 0'], axis=1)
shooting_df = shooting_df[['date', 'visitor', 'home', 'team', '3p']]

### Basic exploration

In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479226 entries, 0 to 479225
Data columns (total 26 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        479226 non-null  object 
 1   visitor     479226 non-null  object 
 2   home        479226 non-null  object 
 3   team        479226 non-null  int64  
 4   starter     479226 non-null  int64  
 5   player      479226 non-null  object 
 6   mp          479226 non-null  object 
 7   fg          479226 non-null  int64  
 8   fga         479226 non-null  int64  
 9   fg_perc     458891 non-null  float64
 10  3p          479226 non-null  int64  
 11  3pa         479226 non-null  int64  
 12  3p_perc     332949 non-null  float64
 13  ft          479226 non-null  int64  
 14  fta         479226 non-null  int64  
 15  ft_perc     311653 non-null  float64
 16  orb         479226 non-null  int64  
 17  drb         479226 non-null  int64  
 18  trb         479226 non-null  int64  
 19  as

In [102]:
df.sample(5)

Unnamed: 0,date,visitor,home,team,starter,player,mp,fg,fga,fg_perc,3p,3pa,3p_perc,ft,fta,ft_perc,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus
224551,"Fri, Nov 29, 2013",Washington Wizards,Indiana Pacers,0,1,Trevor Ariza,33:43,6,14,0.429,2,6,0.333,0,0,,0,4,4,0,1,2,2,1,14,-28.0
448841,"Mon, Dec 28, 2020",Houston Rockets,Denver Nuggets,0,1,David Nwaba,31:58,5,11,0.455,1,3,0.333,3,5,0.6,1,4,5,1,2,0,1,2,14,12.0
135017,"Thu, Dec 16, 2010",San Antonio Spurs,Denver Nuggets,1,0,Anthony Carter,9:48,0,1,0.0,0,0,,0,0,,0,1,1,2,0,0,0,1,0,-9.0
145472,"Tue, Feb 15, 2011",Miami Heat,Indiana Pacers,1,1,Danny Granger,35:54,5,13,0.385,1,5,0.2,3,5,0.6,3,6,9,0,2,0,4,3,14,-17.0
260973,"Sat, Dec 13, 2014",Brooklyn Nets,Charlotte Hornets,0,0,Jerome Jordan,7:19,0,0,,0,0,,0,0,,0,1,1,0,1,0,0,1,0,-1.0


In [103]:
def convert_mp(mp):
    if mp == '0':
        return 0
    else:
        mins = int(mp.split(':')[0])
        secs = int(mp.split(':')[1]) / 60
        return mins + secs

In [104]:
# Fill NaN
df = df.fillna(0)

# Convert 'date' column to Date object
df['date'] = pd.to_datetime(df['date'])

# Convert 'team' column to Team Name
df['team'] = np.where(df['team'], df['home'], df['visitor'])

# Convert 'minutes played' to float
df['mp'] = df['mp'].apply(lambda x: convert_mp(x))

# Set stats
stats = ['fg', 'fga', 'fg_perc', '3p', '3pa', '3p_perc', 'ft', 'fta', 'ft_perc', 
         'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'plus_minus', 'mp']

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479226 entries, 0 to 479225
Data columns (total 26 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   date        479226 non-null  datetime64[ns]
 1   visitor     479226 non-null  object        
 2   home        479226 non-null  object        
 3   team        479226 non-null  object        
 4   starter     479226 non-null  int64         
 5   player      479226 non-null  object        
 6   mp          479226 non-null  float64       
 7   fg          479226 non-null  int64         
 8   fga         479226 non-null  int64         
 9   fg_perc     479226 non-null  float64       
 10  3p          479226 non-null  int64         
 11  3pa         479226 non-null  int64         
 12  3p_perc     479226 non-null  float64       
 13  ft          479226 non-null  int64         
 14  fta         479226 non-null  int64         
 15  ft_perc     479226 non-null  float64       
 16  or

In [105]:
# Team total stats
teams_df = df.groupby(['date', 'visitor', 'home', 'team']).sum().reset_index()

In [106]:
# Rename target variable
shooting_df = shooting_df.rename({'3p': 'target'}, axis=1)

# Convert 'date' column to Date object
shooting_df['date'] = pd.to_datetime(shooting_df['date'])

# Convert 'team' column to Team Name
shooting_df['team'] = np.where(shooting_df['team'], shooting_df['home'], shooting_df['visitor'])

In [107]:
# Starters total stats
starters_df = df[df['starter'] == 1].groupby(['date', 'visitor', 'home', 'team']).aggregate(['sum', 'mean'])
cols = [col for col in starters_df.columns
        if (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
           (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0])]
starters_df = starters_df[cols]
starters_df.columns = [col[0] for col in starters_df.columns]
starters_df = starters_df.reset_index()

# Merge dataframes to have target variable
starters_df = pd.merge(starters_df, shooting_df, 
                       left_on=['date', 'visitor', 'home', 'team'], right_on=['date', 'visitor', 'home', 'team'],
                       how='left')

In [108]:
# Bench total stats
bench_df = df[df['starter'] == 0].groupby(['date', 'visitor', 'home', 'team']).aggregate(['sum', 'mean'])
cols = [col for col in bench_df.columns
        if (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
           (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0])]
bench_df = bench_df[cols]
bench_df.columns = [col[0] for col in bench_df.columns]
bench_df = bench_df.reset_index()

# Merge dataframes to have target variable
bench_df = pd.merge(bench_df, shooting_df, 
                    left_on=['date', 'visitor', 'home', 'team'], right_on=['date', 'visitor', 'home', 'team'],
                    how='left')

# Dataframe of team's last 15 performances

In [109]:
# Return ten lastest dates team played
def last_15_date(team, date):
    schedule = teams_df[teams_df['team'] == team].sort_values(by='date').reset_index()
    date_index = schedule[schedule['date'] == date].index[0]
    if date_index - 15 < 0:
        return None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
    else:
        date_1, date_2 = schedule.iloc[date_index - 1]['date'], schedule.iloc[date_index - 2]['date']
        date_3, date_4 = schedule.iloc[date_index - 3]['date'], schedule.iloc[date_index - 4]['date']
        date_5, date_6 = schedule.iloc[date_index - 5]['date'], schedule.iloc[date_index - 6]['date']
        date_7, date_8 = schedule.iloc[date_index - 7]['date'], schedule.iloc[date_index - 8]['date']
        date_9, date_10 = schedule.iloc[date_index - 9]['date'], schedule.iloc[date_index - 10]['date']
        date_11, date_12 = schedule.iloc[date_index - 11]['date'], schedule.iloc[date_index - 12]['date']
        date_13, date_14 = schedule.iloc[date_index - 13]['date'], schedule.iloc[date_index - 14]['date']
        date_15 = schedule.iloc[date_index - 15]['date']
        return date_1, date_2, date_3, date_4, date_5, date_6, date_7, date_8, date_9, date_10, date_11, date_12, date_13, date_14, date_15

teams_df['dates'] = teams_df.apply(lambda x: last_15_date(x.team, x.date), axis=1)
teams_df['date_1'], teams_df['date_2'] = teams_df['dates'].apply(lambda x: x[0]), teams_df['dates'].apply(lambda x: x[1])
teams_df['date_3'], teams_df['date_4'] = teams_df['dates'].apply(lambda x: x[2]), teams_df['dates'].apply(lambda x: x[3])
teams_df['date_5'], teams_df['date_6'] = teams_df['dates'].apply(lambda x: x[4]), teams_df['dates'].apply(lambda x: x[5])
teams_df['date_7'], teams_df['date_8'] = teams_df['dates'].apply(lambda x: x[6]), teams_df['dates'].apply(lambda x: x[7])
teams_df['date_9'], teams_df['date_10'] = teams_df['dates'].apply(lambda x: x[8]), teams_df['dates'].apply(lambda x: x[9])
teams_df['date_11'], teams_df['date_12'] = teams_df['dates'].apply(lambda x: x[10]), teams_df['dates'].apply(lambda x: x[11])
teams_df['date_13'], teams_df['date_14'] = teams_df['dates'].apply(lambda x: x[12]), teams_df['dates'].apply(lambda x: x[13])
teams_df['date_15'] = teams_df['dates'].apply(lambda x: x[14])

In [110]:
# Keep date columns in teams
cols = [col for col in teams_df.columns
        if ('date_' in col) or \
        (col in ['date', 'visitor', 'home', 'team'])]
teams_df = teams_df[cols]

# Merge dates with starters
starters_df = pd.merge(starters_df, teams_df, 
                       left_on=['date', 'visitor', 'home', 'team'], 
                       right_on=['date', 'visitor', 'home', 'team'],
                       how='left')

# Merge dates with bench
bench_df = pd.merge(bench_df, teams_df, 
                    left_on=['date', 'visitor', 'home', 'team'], 
                    right_on=['date', 'visitor', 'home', 'team'],
                    how='left')

In [111]:
# Calculate z-score
def z_score(value, mean, std):
    return (value - mean) / std

In [112]:
# Calculate perc difference
def perc_diff(value, mean):
    return (value - mean) / mean

# Starters Analysis

In [113]:
# X and y column names to merge on
y_cols = starters_df.columns
x_cols = ['date', 'team'] + stats

last_15_games = starters_df[y_cols]
X = starters_df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 5 games stats for each team)
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
for date in dates:
    last_15_games = pd.merge(last_15_games, X, left_on=['date' + date, 'team'], right_on=['date', 'team'], how='left', suffixes=('', date))

last_15_games.head()

Unnamed: 0,date,visitor,home,team,mp,fg,fga,fg_perc,3p,3pa,3p_perc,ft,fta,ft_perc,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus,target,date_1,date_2,date_3,date_4,date_5,date_6,date_7,date_8,date_9,date_10,date_11,date_12,date_13,date_14,date_15,date_1.1,fg_1,fga_1,fg_perc_1,3p_1,3pa_1,3p_perc_1,ft_1,fta_1,ft_perc_1,orb_1,drb_1,trb_1,ast_1,stl_1,blk_1,tov_1,pf_1,pts_1,plus_minus_1,mp_1,date_2.1,fg_2,fga_2,fg_perc_2,3p_2,3pa_2,3p_perc_2,ft_2,fta_2,ft_perc_2,orb_2,drb_2,trb_2,ast_2,stl_2,blk_2,tov_2,pf_2,pts_2,plus_minus_2,mp_2,date_3.1,fg_3,fga_3,fg_perc_3,3p_3,3pa_3,3p_perc_3,ft_3,fta_3,ft_perc_3,orb_3,drb_3,trb_3,ast_3,stl_3,blk_3,tov_3,pf_3,pts_3,plus_minus_3,mp_3,date_4.1,fg_4,fga_4,fg_perc_4,3p_4,3pa_4,3p_perc_4,ft_4,fta_4,ft_perc_4,orb_4,drb_4,trb_4,ast_4,stl_4,blk_4,tov_4,pf_4,pts_4,plus_minus_4,mp_4,date_5.1,fg_5,fga_5,fg_perc_5,3p_5,3pa_5,3p_perc_5,ft_5,fta_5,ft_perc_5,orb_5,drb_5,trb_5,ast_5,stl_5,blk_5,tov_5,pf_5,pts_5,plus_minus_5,mp_5,date_6.1,fg_6,fga_6,fg_perc_6,3p_6,3pa_6,3p_perc_6,ft_6,fta_6,ft_perc_6,orb_6,drb_6,trb_6,ast_6,stl_6,blk_6,tov_6,pf_6,pts_6,plus_minus_6,mp_6,date_7.1,fg_7,fga_7,fg_perc_7,3p_7,3pa_7,3p_perc_7,ft_7,fta_7,ft_perc_7,orb_7,drb_7,trb_7,ast_7,stl_7,blk_7,tov_7,pf_7,pts_7,plus_minus_7,mp_7,date_8.1,fg_8,fga_8,fg_perc_8,3p_8,3pa_8,3p_perc_8,ft_8,fta_8,ft_perc_8,orb_8,drb_8,trb_8,ast_8,stl_8,blk_8,tov_8,pf_8,pts_8,plus_minus_8,mp_8,date_9.1,fg_9,fga_9,fg_perc_9,3p_9,3pa_9,3p_perc_9,ft_9,fta_9,ft_perc_9,orb_9,drb_9,trb_9,ast_9,stl_9,blk_9,tov_9,pf_9,pts_9,plus_minus_9,mp_9,date_10.1,fg_10,fga_10,fg_perc_10,3p_10,3pa_10,3p_perc_10,ft_10,fta_10,ft_perc_10,orb_10,drb_10,trb_10,ast_10,stl_10,blk_10,tov_10,pf_10,pts_10,plus_minus_10,mp_10,date_11.1,fg_11,fga_11,fg_perc_11,3p_11,3pa_11,3p_perc_11,ft_11,fta_11,ft_perc_11,orb_11,drb_11,trb_11,ast_11,stl_11,blk_11,tov_11,pf_11,pts_11,plus_minus_11,mp_11,date_12.1,fg_12,fga_12,fg_perc_12,3p_12,3pa_12,3p_perc_12,ft_12,fta_12,ft_perc_12,orb_12,drb_12,trb_12,ast_12,stl_12,blk_12,tov_12,pf_12,pts_12,plus_minus_12,mp_12,date_13.1,fg_13,fga_13,fg_perc_13,3p_13,3pa_13,3p_perc_13,ft_13,fta_13,ft_perc_13,orb_13,drb_13,trb_13,ast_13,stl_13,blk_13,tov_13,pf_13,pts_13,plus_minus_13,mp_13,date_14.1,fg_14,fga_14,fg_perc_14,3p_14,3pa_14,3p_perc_14,ft_14,fta_14,ft_perc_14,orb_14,drb_14,trb_14,ast_14,stl_14,blk_14,tov_14,pf_14,pts_14,plus_minus_14,mp_14,date_15.1,fg_15,fga_15,fg_perc_15,3p_15,3pa_15,3p_perc_15,ft_15,fta_15,ft_perc_15,orb_15,drb_15,trb_15,ast_15,stl_15,blk_15,tov_15,pf_15,pts_15,plus_minus_15,mp_15
0,2006-10-31,Chicago Bulls,Miami Heat,Chicago Bulls,128.55,18,43,0.4022,3,7,0.12,14,18,0.8434,7,14,21,8,5,2,7,12,53,66.0,7,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,
1,2006-10-31,Chicago Bulls,Miami Heat,Miami Heat,156.583333,20,48,0.3666,3,13,0.1834,11,19,0.46,4,18,22,9,4,3,14,14,54,-95.0,3,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,
2,2006-10-31,Phoenix Suns,Los Angeles Lakers,Los Angeles Lakers,157.6,26,51,0.4522,5,10,0.55,15,20,0.46,7,26,33,24,7,0,13,10,72,-5.0,6,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,
3,2006-10-31,Phoenix Suns,Los Angeles Lakers,Phoenix Suns,159.483333,24,49,0.6158,6,18,0.1972,5,5,0.4,4,20,24,22,1,4,11,13,59,-11.0,13,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,
4,2006-11-01,Atlanta Hawks,Philadelphia 76ers,Atlanta Hawks,164.683333,15,49,0.207,3,12,0.0666,15,18,0.625,8,19,27,7,7,4,15,16,48,-83.0,4,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,,NaT,,,,,,,,,,,,,,,,,,,,


### Last 15 Performances (Unweighted)

In [114]:
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))]

last_15_games_unweighted = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_15_games_unweighted[stat] = 0
    for date in dates:
        last_15_games_unweighted[stat] = last_15_games_unweighted[stat] + last_15_games_unweighted[stat + date]
    
    last_15_games_unweighted[stat] = last_15_games_unweighted[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's performance
for stat in stats:
    last_15_games_unweighted[stat + '_std'] = 0
    for date in dates:
        last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] + \
                                                    ((last_15_games_unweighted[stat + date] - last_15_games_unweighted[stat]) ** 2)
    
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] / len(dates)
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] ** .5

# Feature engineer trends
for stat in stats:
    last_15_games_unweighted[stat + '_trend'] = 0
    for date in dates[:10]:
        last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] + \
                                        z_score(last_15_games_unweighted[stat + date], last_15_games_unweighted[stat], last_15_games_unweighted[stat + '_std']).fillna(0)
    
    last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] / len(dates[:10])

last_15_games_unweighted = last_15_games_unweighted.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

last_15_game_cols = [col 
                    for col in last_15_games_unweighted.columns
                    if (col[0] == 'target' and col[1] == 'sum') or \
                       (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                       (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                       ('_trend' in col[0] and col[1] == 'sum')]

starters_15_games = last_15_games_unweighted[last_15_game_cols].dropna(axis=0).copy()
starters_15_games.columns = [col[0] for col in starters_15_games.columns]

## Correlations

In [115]:
corr_df = pd.DataFrame()

# Correlations for last 15 game stats vs 3pt made (unweighted)
for col in starters_15_games:
    corr_p = pearsonr(starters_15_games['target'], starters_15_games[col])
    row = {'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)
    
# Print statistically significant correlations
starters_corr = corr_df[corr_df['p-value'] < .05].sort_values(['corr'], axis=0, ascending=False)
starters_corr

Unnamed: 0,corr,p-value,stat
0,1.0,0.0,target
5,0.7,0.0,3pa
4,0.68,0.0,3p
6,0.6,0.0,3p_perc
18,0.33,0.0,pts
11,0.28,0.0,drb
13,0.25,0.0,ast
2,0.24,0.0,fga
1,0.24,0.0,fg
3,0.12,0.0,fg_perc


## Bench Analysis

In [116]:
# X and y column names to merge on
y_cols = bench_df.columns
x_cols = ['date', 'team'] + stats

last_15_games = bench_df[y_cols]
X = bench_df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 5 games stats for each team)
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
for date in dates:
    last_15_games = pd.merge(last_15_games, X, left_on=['date' + date, 'team'], right_on=['date', 'team'], how='left', suffixes=('', date))

### Last 15 Performances (Unweighted)

In [117]:
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))]

last_15_games_unweighted = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_15_games_unweighted[stat] = 0
    for date in dates:
        last_15_games_unweighted[stat] = last_15_games_unweighted[stat] + last_15_games_unweighted[stat + date]
    
    last_15_games_unweighted[stat] = last_15_games_unweighted[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's performance
for stat in stats:
    last_15_games_unweighted[stat + '_std'] = 0
    for date in dates:
        last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] + \
                                                    ((last_15_games_unweighted[stat + date] - last_15_games_unweighted[stat]) ** 2)
    
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] / len(dates)
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] ** .5

# Feature engineer trends
for stat in stats:
    last_15_games_unweighted[stat + '_trend'] = 0
    for date in dates[:10]:
        last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] + \
                                        z_score(last_15_games_unweighted[stat + date], last_15_games_unweighted[stat], last_15_games_unweighted[stat + '_std']).fillna(0)
    
    last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] / len(dates[:10])

last_15_games_unweighted = last_15_games_unweighted.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

last_15_game_cols = [col 
                    for col in last_15_games_unweighted.columns
                    if (col[0] == 'target' and col[1] == 'sum') or \
                       (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                       (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                       ('_trend' in col[0] and col[1] == 'sum')]

bench_15_games = last_15_games_unweighted[last_15_game_cols].dropna(axis=0).copy()
bench_15_games.columns = [col[0] for col in bench_15_games.columns]

## Correlations of Bench

In [118]:
corr_df = pd.DataFrame()

# Correlations for last 15 game stats vs 3pt made (unweighted)
for col in bench_15_games:
    corr_p = pearsonr(bench_15_games['target'], bench_15_games[col])
    row = {'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)
    
# Print statistically significant correlations
bench_corr = corr_df[corr_df['p-value'] < .05].sort_values(['corr'], axis=0, ascending=False)
bench_corr

Unnamed: 0,corr,p-value,stat
0,1.0,0.0,target
5,0.61,0.0,3pa
4,0.58,0.0,3p
6,0.51,0.0,3p_perc
11,0.39,0.0,drb
18,0.35,0.0,pts
1,0.32,0.0,fg
13,0.32,0.0,ast
2,0.3,0.0,fga
12,0.29,0.0,trb


# Comparison of Starters to Bench

In [119]:
corr_df = pd.merge(starters_corr.drop(['p-value'], axis=1), 
                   bench_corr.drop(['p-value'], axis=1),
                   left_on=['stat'], right_on=['stat'],
                   how='outer',
                   suffixes=['_starter', '_bench'])
corr_df.sort_values(['stat'], axis=0)

Unnamed: 0,corr_starter,stat,corr_bench
2,0.68,3p,0.58
3,0.6,3p_perc,0.51
13,0.03,3p_trend,
1,0.7,3pa,0.61
12,0.04,3pa_trend,0.02
6,0.25,ast,0.32
15,0.02,ast_trend,
20,-0.09,blk,0.1
5,0.28,drb,0.39
8,0.24,fg,0.32


## Save dataframe with significantly correlated stats

In [121]:
starter_stats = starters_corr[starters_corr['corr'].abs() >= .4]['stat']
starters_df = starters_15_games[starter_stats]

bench_stats = bench_corr[bench_corr['corr'].abs() >= .4]['stat']
bench_df = bench_15_games[bench_stats]

df = pd.merge(starters_df, bench_df, 
              left_on=['date', 'visitor', 'home'], 
              right_on=['date', 'visitor', 'home'], 
              how='outer', suffixes=['_starters', '_bench'])
df.to_csv('backend/data/inputs/game_details.csv')