# 3 Pointers Made against shooting.csv

### Import packages

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
from scipy.stats import pearsonr
import itertools

pd.set_option("display.max_columns", None)

### Set working directory

In [2]:
# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

# Change working directory
os.chdir('/Users/tyler/OneDrive/Documents/Python/NBA')

# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA\backend\analysis\3p
Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA


## Exploratory Data Analysis

### Import data

In [45]:
df = pd.read_csv('backend/data/shooting.csv').drop(['Unnamed: 0'], axis=1)

### Basic exploration

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203702 entries, 0 to 203701
Data columns (total 17 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   date      203702 non-null  object 
 1   visitor   203702 non-null  object 
 2   home      203702 non-null  object 
 3   team      203702 non-null  int64  
 4   quarter   203680 non-null  object 
 5   fg        203662 non-null  float64
 6   fga       203662 non-null  float64
 7   fg_perc   203662 non-null  float64
 8   2p        203662 non-null  float64
 9   2pa       203662 non-null  float64
 10  2p_perc   203659 non-null  float64
 11  3p        203662 non-null  float64
 12  3pa       203662 non-null  float64
 13  3p_perc   202199 non-null  float64
 14  efg_perc  203662 non-null  float64
 15  ast       203662 non-null  float64
 16  ast_perc  203639 non-null  float64
dtypes: float64(12), int64(1), object(4)
memory usage: 26.4+ MB


In [47]:
# Convert 'date' column to Date object
df['date'] = pd.to_datetime(df['date'])


# Conver 'team' column to Team Name
df['team'] = np.where(df['team'], df['home'], df['visitor'])


# Number of effective fgs
df['efg'] = df['fg'] * df['efg_perc']


# Merge in opponents (see team defensive stats)
df = pd.merge(
    df, 
    df, 
    left_on=['date', 'visitor', 'home', 'quarter'], 
    right_on=['date', 'visitor', 'home', 'quarter'],
    suffixes=('', '_opp'),
    how='left')

df = df[df['team'] != df['team_opp']]


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203702 entries, 1 to 407402
Data columns (total 32 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   date          203702 non-null  datetime64[ns]
 1   visitor       203702 non-null  object        
 2   home          203702 non-null  object        
 3   team          203702 non-null  object        
 4   quarter       203680 non-null  object        
 5   fg            203662 non-null  float64       
 6   fga           203662 non-null  float64       
 7   fg_perc       203662 non-null  float64       
 8   2p            203662 non-null  float64       
 9   2pa           203662 non-null  float64       
 10  2p_perc       203659 non-null  float64       
 11  3p            203662 non-null  float64       
 12  3pa           203662 non-null  float64       
 13  3p_perc       202199 non-null  float64       
 14  efg_perc      203662 non-null  float64       
 15  ast           203

In [48]:
total_df = df[df['quarter'] == 'total']
q1_df = df[df['quarter'] == 'q1']
q2_df = df[df['quarter'] == 'q2']
q3_df = df[df['quarter'] == 'q3']
q4_df = df[df['quarter'] == 'q4']

In [49]:
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40190 entries, 9 to 407402
Data columns (total 32 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          40190 non-null  datetime64[ns]
 1   visitor       40190 non-null  object        
 2   home          40190 non-null  object        
 3   team          40190 non-null  object        
 4   quarter       40190 non-null  object        
 5   fg            40172 non-null  float64       
 6   fga           40172 non-null  float64       
 7   fg_perc       40172 non-null  float64       
 8   2p            40172 non-null  float64       
 9   2pa           40172 non-null  float64       
 10  2p_perc       40172 non-null  float64       
 11  3p            40172 non-null  float64       
 12  3pa           40172 non-null  float64       
 13  3p_perc       40172 non-null  float64       
 14  efg_perc      40172 non-null  float64       
 15  ast           40172 non-null  float

# Dataframe of team's last 15 performances

In [51]:
# Totals
total_df = df[df['quarter'] == 'total'].copy()

# Return ten lastest dates team played
def last_15_date(team, date):
    schedule = total_df[total_df['team'] == team].sort_values(by='date').reset_index()
    date_index = schedule[schedule['date'] == date].index[0]
    if date_index - 15 < 0:
        return None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
    else:
        date_1, date_2 = schedule.iloc[date_index - 1]['date'], schedule.iloc[date_index - 2]['date']
        date_3, date_4 = schedule.iloc[date_index - 3]['date'], schedule.iloc[date_index - 4]['date']
        date_5, date_6 = schedule.iloc[date_index - 5]['date'], schedule.iloc[date_index - 6]['date']
        date_7, date_8 = schedule.iloc[date_index - 7]['date'], schedule.iloc[date_index - 8]['date']
        date_9, date_10 = schedule.iloc[date_index - 9]['date'], schedule.iloc[date_index - 10]['date']
        date_11, date_12 = schedule.iloc[date_index - 11]['date'], schedule.iloc[date_index - 12]['date']
        date_13, date_14 = schedule.iloc[date_index - 13]['date'], schedule.iloc[date_index - 14]['date']
        date_15 = schedule.iloc[date_index - 15]['date']
        return date_1, date_2, date_3, date_4, date_5, date_6, date_7, date_8, date_9, date_10, date_11, date_12, date_13, date_14, date_15

total_df['dates'] = total_df.apply(lambda x: last_15_date(x.team, x.date), axis=1)
total_df['date_1'], total_df['date_2'] = total_df['dates'].apply(lambda x: x[0]), total_df['dates'].apply(lambda x: x[1])
total_df['date_3'], total_df['date_4'] = total_df['dates'].apply(lambda x: x[2]), total_df['dates'].apply(lambda x: x[3])
total_df['date_5'], total_df['date_6'] = total_df['dates'].apply(lambda x: x[4]), total_df['dates'].apply(lambda x: x[5])
total_df['date_7'], total_df['date_8'] = total_df['dates'].apply(lambda x: x[6]), total_df['dates'].apply(lambda x: x[7])
total_df['date_9'], total_df['date_10'] = total_df['dates'].apply(lambda x: x[8]), total_df['dates'].apply(lambda x: x[9])
total_df['date_11'], total_df['date_12'] = total_df['dates'].apply(lambda x: x[10]), total_df['dates'].apply(lambda x: x[11])
total_df['date_13'], total_df['date_14'] = total_df['dates'].apply(lambda x: x[12]), total_df['dates'].apply(lambda x: x[13])
total_df['date_15'] = total_df['dates'].apply(lambda x: x[14])

In [75]:
# Define statistics
stats = ['fg', 'fga', '2p', '2pa', '3p', '3pa', 'efg', 'ast',
         'fg_opp', 'fga_opp', '2p_opp', '2pa_opp', '3p_opp', '3pa_opp', 'efg_opp', 'ast_opp']

perc_stats = ['fg_perc', '2p_perc', '3p_perc', 'efg_perc', 'ast_perc',
              'fg_perc_opp', '2p_perc_opp', '3p_perc_opp', 'efg_perc_opp', 'ast_perc_opp']

# X and y column names to merge on
x_cols = ['date', 'team'] + stats

last_15_games = total_df.copy()
last_15_games.loc[:, 'target'] = last_15_games.loc[:, '3p']
X = total_df[x_cols].copy()

# Dataframe of target (3pt made by each team) and of variables (last 5 games stats for each team)
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
for date in dates:
    last_15_games = pd.merge(last_15_games, X, left_on=['date' + date, 'team'], right_on=['date', 'team'], how='left', suffixes=('', date))

In [69]:
# Calculate z-score
def z_score(value, mean, std):
    return (value - mean) / std

### Last 15 Performances (Unweighted)

In [80]:
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))] 

last_15 = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_15[stat] = 0
    for date in dates:
        last_15[stat] = last_15[stat] + last_15[stat + date]
    
    last_15[stat] = last_15[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's last 15 game performance
for stat in stats:
    last_15[stat + '_std'] = 0
    for date in dates:
        last_15[stat + '_std'] = last_15[stat + '_std'] + ((last_15[stat + date] - last_15[stat]) ** 2)
    
    last_15[stat + '_std'] = last_15[stat + '_std'] / len(dates)
    last_15[stat + '_std'] = last_15[stat + '_std'] ** .5

# Feature engineer trends (how a team is trending in their last 3 games)
for stat in stats:
    last_15[stat + '_trend'] = 0
    for date in dates[:3]:
        last_15[stat + '_trend'] = last_15[stat + '_trend'] + \
                                    z_score(last_15[stat + date], last_15[stat], last_15[stat + '_std']).fillna(0)
    
    last_15[stat + '_trend'] = last_15[stat + '_trend'] / len(dates[:3])

# Sum stats for opposing teams for each game
last_15 = last_15.groupby(['date', 'visitor', 'home']).sum()

# Standard deviation and trending cols
std_cols = [stat + '_std' for stat in stats]
trend_cols = [stat + '_trend' for stat in stats]

# Keep columns
last_15 = last_15[['target'] + stats + std_cols + trend_cols].dropna(axis=0)

for perc in perc_stats:
    stat = perc.split('_')[0]
    opp = perc.split('_')[-1]
    if opp == 'opp':
        if stat == 'ast' or stat == 'efg':
            last_15[perc] = last_15[stat + '_opp'] / last_15['fg_opp']
        else:
            last_15[perc] = last_15[stat + '_opp'] / last_15[stat + 'a_opp']
    else:
        if stat == 'ast' or stat == 'efg':
            last_15[perc] = last_15[stat] / last_15['fg']
        else:
            last_15[perc] = last_15[stat] / last_15[stat + 'a']

        
last_15 = last_15.dropna(axis=0)
last_15.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,fg,fga,2p,2pa,3p,3pa,efg,ast,fg_opp,fga_opp,2p_opp,2pa_opp,3p_opp,3pa_opp,efg_opp,ast_opp,fg_std,fga_std,2p_std,2pa_std,3p_std,3pa_std,efg_std,ast_std,fg_opp_std,fga_opp_std,2p_opp_std,2pa_opp_std,3p_opp_std,3pa_opp_std,efg_opp_std,ast_opp_std,fg_trend,fga_trend,2p_trend,2pa_trend,3p_trend,3pa_trend,efg_trend,ast_trend,fg_opp_trend,fga_opp_trend,2p_opp_trend,2pa_opp_trend,3p_opp_trend,3pa_opp_trend,efg_opp_trend,ast_opp_trend,fg_perc,2p_perc,3p_perc,efg_perc,ast_perc,fg_perc_opp,2p_perc_opp,3p_perc_opp,efg_perc_opp,ast_perc_opp
date,visitor,home,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
2013-12-14,Portland Trail Blazers,Philadelphia 76ers,30.0,77.866667,173.866667,61.533333,127.733333,16.333333,46.133333,39.0374,44.2,78.6,171.866667,64.733333,131.133333,13.866667,40.733333,39.601933,43.733333,6.727877,13.856302,6.313352,11.799876,5.608992,8.995166,7.420775,8.164255,9.299457,11.075411,9.41576,14.17295,5.496479,9.128756,8.898387,9.016897,-0.611969,-0.424985,-0.196195,-0.639953,-0.52837,0.044385,-0.449033,0.058451,0.054011,0.470392,0.60377,0.682649,-0.696572,-0.475748,-0.185962,-0.777196,0.447853,0.481733,0.354046,0.501336,0.567637,0.457331,0.493645,0.340426,0.503841,0.556404
2019-05-07,Portland Trail Blazers,Denver Nuggets,18.0,83.466667,180.6,61.4,120.8,22.066667,59.8,44.1842,45.266667,81.2,178.333333,61.266667,121.066667,19.933333,57.266667,42.111333,45.266667,10.048283,19.430527,9.382534,15.816384,5.775736,9.605185,8.828869,7.431495,11.17239,20.340507,10.024845,16.508976,6.302701,12.150483,9.593691,5.999619,0.35246,2.01719,0.434634,1.767643,-0.325637,1.07167,-0.66625,-0.770204,0.69225,2.141924,0.341746,1.660574,0.748227,1.164621,-0.179783,-0.925868,0.462163,0.508278,0.369008,0.529363,0.542332,0.455327,0.506057,0.348079,0.518612,0.557471
2015-03-04,Portland Trail Blazers,Los Angeles Clippers,19.0,76.533333,170.866667,58.2,115.866667,18.333333,55.0,38.701067,43.133333,73.2,170.2,56.133333,122.466667,17.066667,47.733333,35.619667,44.266667,6.966181,11.608933,6.206328,9.846857,4.947253,7.96086,6.329356,7.720339,8.92455,11.03208,11.81016,20.975288,7.491496,18.144759,7.676936,7.267344,0.832156,0.107523,0.315845,0.382472,0.828562,-0.474934,0.964972,-0.544263,0.024134,0.650085,0.903768,1.681641,-1.708037,-1.572623,-0.353632,-0.041705,0.447913,0.502301,0.333333,0.505676,0.563589,0.430082,0.458356,0.357542,0.486607,0.604736
2009-02-25,Utah Jazz,Minnesota Timberwolves,8.0,76.133333,163.533333,62.933333,124.933333,13.2,38.6,38.9644,44.866667,76.666667,158.533333,64.266667,124.066667,12.4,34.466667,40.331067,43.933333,7.148347,12.635969,7.586397,15.131034,5.219119,8.608102,6.958486,8.799793,8.077652,14.529369,6.755818,12.852417,5.720869,9.649601,6.87079,9.0682,0.790978,0.415076,0.342345,0.703647,0.485289,-0.658523,0.709012,0.614598,-0.298816,-0.523758,-0.094742,-0.103446,-0.02663,-0.437331,0.032716,0.059022,0.465552,0.503735,0.341969,0.511792,0.589317,0.4836,0.518001,0.359768,0.526057,0.573043
2007-12-20,Houston Rockets,Denver Nuggets,17.0,73.733333,165.533333,61.466667,129.2,12.266667,36.333333,36.017467,44.266667,74.333333,165.4,61.933333,131.933333,12.4,33.466667,36.904267,42.733333,10.190298,15.566623,10.365623,16.099187,4.5568,7.383342,8.146383,9.7712,10.072923,12.702231,10.198621,13.74241,4.552522,8.456636,9.54125,9.260748,-0.750752,-0.785343,-0.864481,-0.55738,0.475493,-0.207632,-0.667694,0.2887,0.331858,-0.115604,-0.013728,-0.267385,0.699254,0.268857,0.60355,0.230858,0.445429,0.475748,0.337615,0.488483,0.600362,0.449416,0.469429,0.370518,0.49647,0.574888


## Correlations

In [81]:
corr_df = pd.DataFrame()

# Correlations for last 15 game stats vs 3pt made (unweighted)
for col in last_15:
    corr_p = pearsonr(last_15['target'], last_15[col])
    row = {'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)
    
# Print correlation
corr_df = corr_df[corr_df['p-value'] < .05].drop(['p-value'], axis=1).sort_values(['corr'], axis=0, ascending=False)
corr_df

Unnamed: 0,corr,stat
0,1.0,target
6,0.74,3pa
5,0.73,3p
14,0.69,3pa_opp
13,0.68,3p_opp
7,0.53,efg
50,0.51,2p_perc
52,0.51,efg_perc
10,0.5,fga_opp
15,0.5,efg_opp


## Save dataframe with significantly correlated stats

In [84]:
stats = corr_df[corr_df['corr'].abs() >= .5]['stat']
df = last_15_games_unweighted[stats]

stats
# df.to_csv('backend/data/inputs/3p/shooting.csv')

0      target
6         3pa
5          3p
14    3pa_opp
13     3p_opp
Name: stat, dtype: object