# 3 Pointers Made against advanced_totals.csv

### Import packages

In [16]:
import os
import numpy as np
import pandas as pd
%matplotlib inline
from scipy.stats import pearsonr, zscore
import itertools

# Lift the column display limit so wide DataFrames are shown with every column
pd.set_option("display.max_columns", None)

### Set working directory

In [17]:
# Project root that the relative 'backend/data/...' paths read later are resolved against.
# Set the NBA_PROJECT_DIR environment variable to run the notebook from another machine;
# when it is unset, the original hard-coded location is used, so behaviour is unchanged.
project_dir = os.environ.get('NBA_PROJECT_DIR', '/Users/tyler/OneDrive/Documents/Python/NBA')

# Print working directory before the change
cwd = os.getcwd()
print(f'Directory: {cwd}')

# Change working directory (raises an OSError if the directory does not exist)
os.chdir(project_dir)

# Print working directory again to confirm the change
cwd = os.getcwd()
print(f'Directory: {cwd}')

Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA
Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA


## Exploratory Data Analysis

### Import data

In [18]:
# Advanced stats: read the CSV and discard its 'Unnamed: 0' column
# (presumably an index that was written out with the file -- TODO confirm)
advanced_df = pd.read_csv('backend/data/totals/advanced_totals.csv')
advanced_df = advanced_df.drop(columns=['Unnamed: 0'])

# Game totals: same clean-up, then keep only the game identifiers and the '3p' column
shooting_df = pd.read_csv('backend/data/totals/game_totals.csv')
shooting_df = shooting_df.drop(columns=['Unnamed: 0'])
shooting_df = shooting_df[['date', 'visitor', 'home', 'team', '3p']]

In [19]:
# Left-join the advanced stats onto the shooting rows so that the target column ('3p')
# and the candidate features live in one frame; every shooting row is kept
df = shooting_df.merge(
    advanced_df,
    how='left',
    left_on=['date', 'visitor', 'home', 'team'],
    right_on=['date', 'visitor', 'home', 'team'],
)

### Basic exploration

In [20]:
# Column names, non-null counts and dtypes of the merged frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38288 entries, 0 to 38287
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      38288 non-null  object 
 1   visitor   38288 non-null  object 
 2   home      38288 non-null  object 
 3   team      38288 non-null  int64  
 4   3p        38288 non-null  int64  
 5   ts_perc   38288 non-null  float64
 6   efg_perc  38288 non-null  float64
 7   3par      38288 non-null  float64
 8   ftr       38288 non-null  float64
 9   orb_perc  38288 non-null  float64
 10  drb_perc  38288 non-null  float64
 11  trb_perc  38288 non-null  float64
 12  ast_perc  38288 non-null  float64
 13  stl_perc  38288 non-null  float64
 14  blk_perc  38288 non-null  float64
 15  tov_perc  38288 non-null  float64
 16  usg_perc  38288 non-null  float64
 17  ortg      38288 non-null  float64
 18  drtg      38288 non-null  float64
dtypes: float64(14), int64(2), object(3)
memory usage: 5.8+ MB


In [21]:
# In one pass:
#   * parse the 'date' strings into datetimes
#   * swap the numeric 'team' flag for a team name (truthy flag -> home team, otherwise visitor)
#   * rename the '3p' column to 'target'
# NOTE(review): this cell looks unsafe to re-run on its own -- once 'team' holds names, the flag
# test would presumably be truthy for every row. Re-run from the merge cell instead.
df = df.assign(
    date=pd.to_datetime(df['date']),
    team=np.where(df['team'], df['home'], df['visitor']),
).rename(columns={'3p': 'target'})

# Advanced statistics used as candidate features (order matters for later column layouts)
stats = [
    'ts_perc', 'efg_perc', '3par', 'ftr',
    'orb_perc', 'drb_perc', 'trb_perc',
    'ast_perc', 'stl_perc', 'blk_perc', 'tov_perc', 'usg_perc',
    'ortg', 'drtg',
]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38288 entries, 0 to 38287
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      38288 non-null  datetime64[ns]
 1   visitor   38288 non-null  object        
 2   home      38288 non-null  object        
 3   team      38288 non-null  object        
 4   target    38288 non-null  int64         
 5   ts_perc   38288 non-null  float64       
 6   efg_perc  38288 non-null  float64       
 7   3par      38288 non-null  float64       
 8   ftr       38288 non-null  float64       
 9   orb_perc  38288 non-null  float64       
 10  drb_perc  38288 non-null  float64       
 11  trb_perc  38288 non-null  float64       
 12  ast_perc  38288 non-null  float64       
 13  stl_perc  38288 non-null  float64       
 14  blk_perc  38288 non-null  float64       
 15  tov_perc  38288 non-null  float64       
 16  usg_perc  38288 non-null  float64       
 17  ortg      38

# Dataframe of team's last 15 performances

In [22]:
# Return the dates of the 15 games a team played immediately before a given game
def last_15_date(team, date, n_games=15):
    """Return the dates of the `n_games` games `team` played right before `date`.

    Reads the module-level ``df`` (columns 'team' and 'date').

    Parameters
    ----------
    team : value compared against ``df['team']``
    date : value compared against ``df['date']``; must be a date on which `team` played
    n_games : int, default 15
        Number of previous games to look up.

    Returns
    -------
    tuple
        `n_games` dates ordered from the most recent previous game to the oldest.
        If the team has fewer than `n_games` earlier games, a tuple of `n_games` ``None`` values.

    Raises
    ------
    IndexError
        If `team` has no game on `date`.
    """
    # The team's games in chronological order, re-indexed 0..k-1 so that index == game number
    schedule = df[df['team'] == team].sort_values(by='date').reset_index(drop=True)
    date_index = schedule[schedule['date'] == date].index[0]

    # Not enough history: return placeholders so callers can still index positions 0..n_games-1
    if date_index < n_games:
        return (None,) * n_games

    game_dates = schedule['date']
    return tuple(game_dates.iloc[date_index - offset] for offset in range(1, n_games + 1))

# For every row, look up the team's 15 previous game dates (a 15-tuple per row)
df['dates'] = df.apply(lambda row: last_15_date(row.team, row.date), axis=1)

# Spread the tuple into one column per lag: 'date_1' takes element 0, ..., 'date_15' takes element 14
for position in range(15):
    df['date_' + str(position + 1)] = df['dates'].apply(lambda previous, position=position: previous[position])

In [23]:
# Column sets: every column of df on the target side, date/team plus the stats on the feature side
y_cols = df.columns
x_cols = ['date', 'team'] + stats

last_15_games = df[y_cols]
X = df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 15 games stats for each team).
# For lag k, X's 'date' is renamed to 'date_k' (the join key) and every stat to '<stat>_k' BEFORE the
# merge, so the two frames share their key names and no suffix is needed. Relying on
# suffixes=('', '_k') instead would rename X's 'date' key to 'date_k', which collides with the
# 'date_k' column already present on the left; newer pandas releases reject such a merge.
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
for date in dates:
    lagged = X.rename(columns={name: name + date for name in x_cols if name != 'team'})
    last_15_games = pd.merge(last_15_games, lagged, on=['date' + date, 'team'], how='left')

In [24]:
# Standard score of a value
def z_score(value, mean, std):
    """Return the distance of `value` from `mean`, expressed in units of `std`."""
    deviation = value - mean
    return deviation / std

In [25]:
# Relative difference of a value from a mean
def perc_diff(value, mean):
    """Return the difference between `value` and `mean` as a fraction of `mean`."""
    difference = value - mean
    return difference / mean

### Last 15 Performances (Unweighted)

In [26]:
# Lag suffixes of the 15-game window; the trend below uses only the first 10 of them
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
trend_dates = dates[:10]
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [stat + date for stat, date in itertools.product(stats, dates)]

last_15_games_unweighted = last_15_games[cols].copy()

# Calculate the mean of each stat over the team's 15-game window.
# A plain running sum is used, so a missing value in any lag makes the mean NaN.
for stat in stats:
    running_total = 0
    for date in dates:
        running_total = running_total + last_15_games_unweighted[stat + date]

    last_15_games_unweighted[stat] = running_total / len(dates)

# Calculate the standard deviation of each stat over the same window
# (population form: the squared deviations are divided by the number of games, not by n - 1)
for stat in stats:
    squared_deviation = 0
    for date in dates:
        squared_deviation = squared_deviation + \
            ((last_15_games_unweighted[stat + date] - last_15_games_unweighted[stat]) ** 2)

    last_15_games_unweighted[stat + '_std'] = (squared_deviation / len(dates)) ** .5

# Feature engineer trends: average z-score of the first 10 lags, measured against the
# 15-game mean and std. NaN z-scores (e.g. 0/0 for a stat that never varies, or no history)
# are counted as 0.
for stat in stats:
    trend_total = 0
    for date in trend_dates:
        trend_total = trend_total + \
            z_score(last_15_games_unweighted[stat + date],
                    last_15_games_unweighted[stat],
                    last_15_games_unweighted[stat + '_std']).fillna(0)

    last_15_games_unweighted[stat + '_trend'] = trend_total / len(trend_dates)

# Collapse the per-team rows of each game into one row per game. Only the columns that the
# selection below can keep are aggregated: the string 'team' column cannot be averaged
# (older pandas dropped it silently, newer pandas raises), and the per-lag and '_std'
# columns are never selected.
# NOTE(review): groupby 'sum'/'mean' skip NaN, so a game where only one team has a full
# window appears to keep that single team's value (compare 'ortg' near 106 vs near 212 in the
# output) -- confirm whether such games should be excluded instead.
trend_cols = [stat + '_trend' for stat in stats]
per_team = last_15_games_unweighted[['date', 'visitor', 'home', 'target'] + stats + trend_cols]
last_15_games_unweighted = per_team.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

# Per game: total target, summed non-percentage stats, averaged percentage stats, summed trends
last_15_game_cols = [col
                    for col in last_15_games_unweighted.columns
                    if (col[0] == 'target' and col[1] == 'sum') or \
                       (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                       (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                       ('_trend' in col[0] and col[1] == 'sum')]

# Drop games with any missing feature, then flatten the (name, aggregation) column labels
last_15_games_unweighted = last_15_games_unweighted[last_15_game_cols].dropna(axis=0)
last_15_games_unweighted.columns = [col[0] for col in last_15_games_unweighted.columns]
last_15_games_unweighted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,ts_perc,efg_perc,3par,ftr,orb_perc,drb_perc,trb_perc,ast_perc,stl_perc,blk_perc,tov_perc,usg_perc,ortg,drtg,ts_perc_trend,efg_perc_trend,3par_trend,ftr_trend,orb_perc_trend,drb_perc_trend,trb_perc_trend,ast_perc_trend,stl_perc_trend,blk_perc_trend,tov_perc_trend,usg_perc_trend,ortg_trend,drtg_trend
date,visitor,home,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2006-11-28,Indiana Pacers,Portland Trail Blazers,13,0.5468,0.4926,0.201067,0.369,25.98,71.826667,48.606667,51.186667,7.366667,6.78,14.386667,100.0,106.546667,113.226667,-0.097118,-0.144831,-0.05304,-0.074359,-0.301369,0.1242,-0.286233,0.174165,-0.089934,0.078359,-0.11718,0.0,-0.058079,0.157559
2006-11-28,New York Knicks,Chicago Bulls,9,0.536333,0.482333,0.192533,0.3896,29.1,74.733333,51.486667,49.206667,6.966667,5.466667,14.866667,100.0,105.1,107.0,-0.033644,-0.010098,0.048868,-0.129499,-0.065472,0.073056,-0.005873,0.003482,0.123734,0.088057,-0.222667,0.0,0.139882,-0.1494
2006-11-29,Indiana Pacers,Golden State Warriors,18,0.541533,0.500067,0.502667,0.6584,27.246667,68.43,47.986667,63.093333,8.46,10.166667,14.58,100.0,212.833333,210.56,0.224904,0.269204,0.171426,-0.225155,-0.08669,0.095122,0.014912,0.505146,0.054346,0.200865,-0.066212,0.0,0.19555,-0.17583
2006-11-29,New York Knicks,Cleveland Cavaliers,13,0.535333,0.4836,0.185733,0.393333,28.06,74.78,51.073333,49.38,7.373333,5.493333,15.126667,100.0,103.966667,107.106667,0.013717,0.016305,-0.02392,-0.052209,0.027138,-0.034974,0.089373,0.137145,-0.05026,0.035379,0.090339,0.0,0.033061,-0.252895
2006-11-29,Orlando Magic,Seattle SuperSonics,8,0.549067,0.504367,0.391067,0.7238,29.463333,72.186667,50.903333,51.423333,8.66,7.746667,15.363333,100.0,215.006667,211.246667,-0.402509,-0.448691,-0.314079,0.124696,-0.330792,0.039241,-0.138943,-0.121717,-0.523082,0.118736,-0.516522,0.0,-0.125602,-0.255873


## Correlations

In [27]:
# Correlations for last 15 game stats vs 3pt made (unweighted).
# Every column is correlated with 'target', including 'target' itself (which yields 1.0).
# Rows are collected in a list and turned into a frame once: DataFrame.append in a loop
# copies the frame on every iteration and no longer exists in pandas 2.x.
records = []
for col in last_15_games_unweighted:
    corr, p_value = pearsonr(last_15_games_unweighted['target'], last_15_games_unweighted[col])
    # Keep the p-value unrounded: rounding first would turn e.g. 0.046 into 0.05 and
    # wrongly fail the '< .05' significance filter below
    records.append({'stat': col, 'corr': round(corr, 2), 'p-value': p_value})

# Explicit column order matches the layout the displayed table relies on (corr, then stat)
corr_df = pd.DataFrame(records, columns=['corr', 'p-value', 'stat'])

# Print correlations: significant ones only (p < .05), strongest positive correlation first
corr_df = corr_df[corr_df['p-value'] < .05].drop(['p-value'], axis=1).sort_values(['corr'], axis=0, ascending=False)
corr_df



Unnamed: 0,corr,stat
0,1.0,target
3,0.72,3par
2,0.51,efg_perc
6,0.45,drb_perc
1,0.45,ts_perc
13,0.29,ortg
14,0.25,drtg
10,0.22,blk_perc
8,0.18,ast_perc
17,0.04,3par_trend
