# 3 Pointers Made against shooting.csv

### Import packages

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
from scipy.stats import pearsonr
import itertools

pd.set_option("display.max_columns", None)

### Set working directory

In [2]:
# Locate the project root instead of depending only on one machine's absolute path.
# The root is the nearest ancestor of the current directory that contains
# `backend/data` (this notebook reads `backend/data/shooting.csv` relative to it).
# The original hard-coded location is kept as a last-resort fallback.
FALLBACK_ROOT = '/Users/tyler/OneDrive/Documents/Python/NBA'
ROOT_MARKER = os.path.join('backend', 'data')


def find_project_root(start, marker=ROOT_MARKER, fallback=FALLBACK_ROOT):
    """Return the closest ancestor of `start` (inclusive) that contains `marker`.

    Walks upwards one directory at a time; returns `fallback` when the top of
    the filesystem is reached without finding `marker`.
    """
    current = os.path.abspath(start)
    while True:
        if os.path.isdir(os.path.join(current, marker)):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # reached the filesystem root without a match
            return fallback
        current = parent


# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

# Change working directory (idempotent: re-running from the root stays at the root)
os.chdir(find_project_root(cwd))

# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA\backend\analysis\3p
Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA


## Exploratory Data Analysis

### Import data

In [3]:
# Load the shooting data and drop the unnamed leading column (presumably a saved index)
df = (
    pd.read_csv('backend/data/shooting.csv')
    .drop(columns='Unnamed: 0')
)

### Basic exploration

In [4]:
# Summarize column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204706 entries, 0 to 204705
Data columns (total 17 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   date      204706 non-null  object 
 1   visitor   204706 non-null  object 
 2   home      204706 non-null  object 
 3   team      204706 non-null  int64  
 4   quarter   204684 non-null  object 
 5   fg        204682 non-null  float64
 6   fga       204682 non-null  float64
 7   fg_perc   204682 non-null  float64
 8   2p        204682 non-null  float64
 9   2pa       204682 non-null  float64
 10  2p_perc   204679 non-null  float64
 11  3p        204682 non-null  float64
 12  3pa       204682 non-null  float64
 13  3p_perc   203219 non-null  float64
 14  efg_perc  204682 non-null  float64
 15  ast       204682 non-null  float64
 16  ast_perc  204659 non-null  float64
dtypes: float64(12), int64(1), object(4)
memory usage: 26.6+ MB


In [5]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,date,visitor,home,team,quarter,fg,fga,fg_perc,2p,2pa,2p_perc,3p,3pa,3p_perc,efg_perc,ast,ast_perc
0,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,q1,5.0,20.0,0.25,5.0,16.0,0.313,0.0,4.0,0.0,0.25,3.0,0.6
1,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,q2,15.0,19.0,0.789,12.0,16.0,0.75,3.0,3.0,1.0,0.868,10.0,0.667
2,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,q3,8.0,21.0,0.381,5.0,16.0,0.313,3.0,5.0,0.6,0.452,4.0,0.5
3,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,q4,11.0,19.0,0.579,10.0,18.0,0.556,1.0,1.0,1.0,0.605,5.0,0.455
4,"Tue, Oct 31, 2006",Chicago Bulls,Miami Heat,0,total,39.0,79.0,0.494,32.0,66.0,0.485,7.0,13.0,0.538,0.538,22.0,0.564


In [6]:
# Parse the 'date' strings into datetime values
df['date'] = pd.to_datetime(df['date'])

# Convert the 'team' flag to a team name: truthy flag -> home side, otherwise the visitor
df['team'] = np.where(df['team'].astype(bool), df['home'], df['visitor'])

# Attach the opponent's stats (suffix '_opp') with a self-join on the game/quarter keys,
# then discard the rows in which a team was paired with itself
game_keys = ['date', 'visitor', 'home', 'quarter']
df = df.merge(df, how='left', on=game_keys, suffixes=('', '_opp'))
df = df.loc[df['team'].ne(df['team_opp'])]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204706 entries, 1 to 409410
Data columns (total 30 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   date          204706 non-null  datetime64[ns]
 1   visitor       204706 non-null  object        
 2   home          204706 non-null  object        
 3   team          204706 non-null  object        
 4   quarter       204684 non-null  object        
 5   fg            204682 non-null  float64       
 6   fga           204682 non-null  float64       
 7   fg_perc       204682 non-null  float64       
 8   2p            204682 non-null  float64       
 9   2pa           204682 non-null  float64       
 10  2p_perc       204679 non-null  float64       
 11  3p            204682 non-null  float64       
 12  3pa           204682 non-null  float64       
 13  3p_perc       203219 non-null  float64       
 14  efg_perc      204682 non-null  float64       
 15  ast           204

In [7]:
# One frame per value of 'quarter': the game totals plus each of the four quarters
total_df, q1_df, q2_df, q3_df, q4_df = (
    df[df['quarter'] == period] for period in ('total', 'q1', 'q2', 'q3', 'q4')
)

In [8]:
# Summarize column dtypes and non-null counts of the rows whose quarter is 'total'
total_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40376 entries, 9 to 409410
Data columns (total 30 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          40376 non-null  datetime64[ns]
 1   visitor       40376 non-null  object        
 2   home          40376 non-null  object        
 3   team          40376 non-null  object        
 4   quarter       40376 non-null  object        
 5   fg            40374 non-null  float64       
 6   fga           40374 non-null  float64       
 7   fg_perc       40374 non-null  float64       
 8   2p            40374 non-null  float64       
 9   2pa           40374 non-null  float64       
 10  2p_perc       40374 non-null  float64       
 11  3p            40374 non-null  float64       
 12  3pa           40374 non-null  float64       
 13  3p_perc       40374 non-null  float64       
 14  efg_perc      40374 non-null  float64       
 15  ast           40374 non-null  float

# Dataframe of each team's last 15 performances

In [9]:
# Totals: an independent copy of the 'total' rows, so columns can be added to it
total_df = df.loc[df['quarter'].eq('total')].copy()

# Return the dates of the n (default fifteen) games a team played immediately before `date`
def last_15_date(team, date, n=15):
    """Look up the dates of a team's `n` previous games.

    Reads the module-level `total_df` frame (columns 'team' and 'date').

    Parameters
    ----------
    team
        Value compared against ``total_df['team']``.
    date
        Value compared against ``total_df['date']``; it must occur in the
        team's schedule.
    n : int, default 15
        How many previous games to return.

    Returns
    -------
    tuple
        `n` dates ordered from the most recent previous game to the oldest.
        When the team has fewer than `n` earlier games, a tuple of `n`
        ``None`` values.

    Raises
    ------
    IndexError
        If `date` is not found in the team's schedule.
    """
    schedule = total_df[total_df['team'] == team].sort_values(by='date').reset_index()
    date_index = schedule[schedule['date'] == date].index[0]
    if date_index - n < 0:
        # Not enough history: keep the tuple length fixed so callers can index it
        return (None,) * n
    return tuple(schedule['date'].iloc[date_index - k] for k in range(1, n + 1))

# Store each game's tuple of previous-game dates, then spread it over date_1 ... date_15
total_df['dates'] = total_df.apply(lambda game: last_15_date(game.team, game.date), axis=1)
for offset in range(1, 16):
    total_df[f'date_{offset}'] = total_df['dates'].apply(lambda prior, i=offset - 1: prior[i])

In [10]:
# Define statistics
stats = ['fg', 'fga', '2p', '2pa', '3p', '3pa', 'ast',
         'fg_opp', 'fga_opp', '2p_opp', '2pa_opp', '3p_opp', '3pa_opp', 'ast_opp']

perc_stats = ['fg_perc', '2p_perc', '3p_perc', 'efg_perc', 'ast_perc',
              'fg_perc_opp', '2p_perc_opp', '3p_perc_opp', 'efg_perc_opp', 'ast_perc_opp']

# Columns looked up from each previous game: the merge keys plus the counting stats
x_cols = ['date', 'team'] + stats

last_15_games = total_df.copy()
# Target: 3-pointers made by the team in the current game
last_15_games.loc[:, 'target'] = last_15_games.loc[:, '3p']
X = total_df[x_cols].copy()

# Dataframe of target (3pt made by each team) and of variables (stats of each team's
# last 15 games). For every previous game k, the lookup columns are renamed up front
# (date -> date_k, fg -> fg_k, ...) so both sides share the join key. Merging with
# left_on='date_k' / right_on='date' and suffixes=('', '_k') would instead rename the
# right-hand 'date' to 'date_k' and duplicate the label already present on the left
# (duplicate column labels; newer pandas releases may refuse such a merge).
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
for date in dates:
    previous_game = X.rename(columns={col: col + date for col in x_cols if col != 'team'})
    last_15_games = pd.merge(last_15_games, previous_game, on=['date' + date, 'team'], how='left')

In [11]:
# Calculate z-score
def z_score(value, mean, std):
    """Return how many standard deviations `value` lies from `mean`.

    Works on anything supporting `-` and `/` (scalars as well as pandas
    objects, which are combined element-wise). No guard is applied for a
    zero `std`.
    """
    deviation = value - mean
    return deviation / std

### Last 15 Performances (Unweighted)

In [12]:
# Define statistics
stats = ['fg', 'fga', '2p', '2pa', '3p', '3pa', 'ast',
         'fg_opp', 'fga_opp', '2p_opp', '2pa_opp', '3p_opp', '3pa_opp', 'ast_opp']

perc_stats = ['fg_perc', '2p_perc', '3p_perc', 'efg_perc', 'ast_perc',
              'fg_perc_opp', '2p_perc_opp', '3p_perc_opp', 'efg_perc_opp', 'ast_perc_opp']

dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']

last_15 = last_15_games.copy()

# Mean of each stat over the team's previous 15 games.
# This overwrites the current-game value stored under the same column name.
for stat in stats:
    last_15[stat] = 0
    for date in dates:
        last_15[stat] = last_15[stat] + last_15[stat + date]

    last_15[stat] = last_15[stat] / len(dates)

# Standard deviation of each stat over the same 15 games
# (population form: squared deviations divided by the number of games)
for stat in stats:
    last_15[stat + '_std'] = 0
    for date in dates:
        last_15[stat + '_std'] = last_15[stat + '_std'] + ((last_15[stat + date] - last_15[stat]) ** 2)

    last_15[stat + '_std'] = last_15[stat + '_std'] / len(dates)
    last_15[stat + '_std'] = last_15[stat + '_std'] ** .5

# Feature engineer trends: mean z-score of the 3 most recent games against the
# 15-game mean/std (missing z-scores count as 0)
for stat in stats:
    last_15[stat + '_trend'] = 0
    for date in dates[:3]:
        last_15[stat + '_trend'] = \
                            last_15[stat + '_trend'] + \
                            z_score(last_15[stat + date], last_15[stat], last_15[stat + '_std']).fillna(0)

    last_15[stat + '_trend'] = last_15[stat + '_trend'] / len(dates[:3])

# Standard deviation and trending cols
std_cols = [stat + '_std' for stat in stats]
trend_cols = [stat + '_trend' for stat in stats]

# Sum stats for opposing teams for each game, keeping only the columns used below.
# Selecting them before the aggregation avoids summing the date/text columns as well,
# which is wasted work and is rejected for datetime columns when `numeric_only`
# defaults to False.
# NOTE(review): sum() skips NaN, so a game in which one side has no 15-game history
# (or no recorded target) still yields non-null totals and survives the dropna below.
# Pass min_count=2 to sum() if such games should be excluded - TODO confirm intent.
keep_cols = ['target'] + stats + std_cols + trend_cols
last_15 = last_15.groupby(['date', 'visitor', 'home'])[keep_cols].sum()

# Recompute the percentage stats from the summed counts.
# `side` is '' for the team's own columns and '_opp' for the opponent columns.
for perc in perc_stats:
    stat = perc.split('_')[0]
    side = '_opp' if perc.endswith('_opp') else ''
    if stat == 'ast':
        # Assists per made field goal
        last_15[perc] = last_15['ast' + side] / last_15['fg' + side]
    elif stat == 'efg':
        # Effective field-goal share: made 3-pointers weighted by an extra one half
        last_15[perc] = (last_15['fg' + side] + (.5 * last_15['3p' + side])) / last_15['fga' + side]
    else:
        # Makes divided by attempts (attempt columns carry an 'a' after the stat name)
        last_15[perc] = last_15[stat + side] / last_15[stat + 'a' + side]


last_15 = last_15.dropna(axis=0)
last_15.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,fg,fga,2p,2pa,3p,3pa,ast,fg_opp,fga_opp,2p_opp,2pa_opp,3p_opp,3pa_opp,ast_opp,fg_std,fga_std,2p_std,2pa_std,3p_std,3pa_std,ast_std,fg_opp_std,fga_opp_std,2p_opp_std,2pa_opp_std,3p_opp_std,3pa_opp_std,ast_opp_std,fg_trend,fga_trend,2p_trend,2pa_trend,3p_trend,3pa_trend,ast_trend,fg_opp_trend,fga_opp_trend,2p_opp_trend,2pa_opp_trend,3p_opp_trend,3pa_opp_trend,ast_opp_trend,fg_perc,2p_perc,3p_perc,efg_perc,ast_perc,fg_perc_opp,2p_perc_opp,3p_perc_opp,efg_perc_opp,ast_perc_opp
date,visitor,home,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1
2022-03-16,Dallas Mavericks,Brooklyn Nets,21.0,80.933333,171.133333,54.333333,100.866667,26.6,70.266667,45.333333,79.466667,172.266667,56.666667,107.666667,22.8,64.6,47.266667,9.426165,9.051244,8.546743,10.200618,6.807747,8.089314,9.628179,9.524607,11.54959,9.953878,15.473788,6.650503,11.384821,8.182081,0.214923,0.19429,0.912617,0.01528,-0.755843,0.219591,0.48371,-1.579232,0.677698,-1.18118,0.009282,-0.428568,0.819719,-0.478853,0.472926,0.538665,0.378558,0.550643,0.560132,0.4613,0.526316,0.352941,0.527477,0.594799
2022-03-16,Denver Nuggets,Washington Wizards,20.0,83.933333,171.4,59.533333,104.466667,24.4,66.933333,55.066667,83.733333,176.466667,60.6,111.6,23.133333,64.866667,49.333333,9.00075,9.932933,8.702596,9.915427,5.434399,10.620829,9.380423,10.36875,12.386655,10.426656,15.567022,6.614035,9.689282,7.028032,-0.711107,-1.578965,-0.011892,0.278258,-1.154243,-1.472528,-0.715998,0.899695,0.668529,0.136793,-0.161767,1.112602,1.065127,0.481109,0.489693,0.569879,0.364542,0.560871,0.656076,0.474499,0.543011,0.356629,0.540045,0.589172
2022-03-16,Los Angeles Lakers,Minnesota Timberwolves,27.0,84.266667,176.933333,56.466667,99.6,27.8,77.333333,51.066667,85.0,179.866667,60.333333,110.4,24.666667,69.466667,51.8,10.212163,13.955389,9.792829,15.146753,7.183855,11.181222,8.843471,13.078591,14.430134,11.480014,15.408622,6.105162,10.651684,10.212108,-0.500532,-0.420322,-0.868477,-0.350233,0.610167,0.078928,-0.791867,0.394293,0.676776,0.49298,0.451194,-0.058087,0.441978,0.791668,0.476262,0.566934,0.359483,0.554823,0.606013,0.472572,0.546498,0.355086,0.541142,0.609412
2022-03-16,Milwaukee Bucks,Sacramento Kings,34.0,85.133333,176.333333,60.466667,108.866667,24.666667,67.466667,49.6,87.933333,180.733333,59.8,106.4,28.133333,74.333333,53.533333,9.581951,10.069974,9.338423,10.969029,4.624754,8.170212,9.430921,8.551803,10.185051,10.577072,12.829455,7.017409,12.135128,8.213951,-0.449858,0.264791,-0.598456,0.264136,0.681709,0.011952,-0.042249,-1.152635,-0.867391,-0.666168,-1.691999,-0.325259,0.946494,-0.980733,0.482798,0.555419,0.365613,0.552741,0.582616,0.486536,0.56203,0.378475,0.564367,0.608795
2022-03-16,Oklahoma City Thunder,San Antonio Spurs,32.0,85.466667,185.2,63.2,116.866667,22.266667,68.333333,52.933333,89.733333,184.533333,62.933333,114.133333,26.8,70.4,52.666667,10.25929,12.179594,9.80824,15.323906,6.354208,9.333803,8.366935,9.905591,14.22373,9.858095,17.503026,7.914361,15.263408,10.394503,-0.589376,0.127777,-1.051806,-0.704373,0.630805,1.411506,0.249433,0.506036,0.020438,-0.691332,-0.65483,1.522519,0.698484,0.669093,0.461483,0.540787,0.325854,0.521598,0.619345,0.486272,0.551402,0.380682,0.558887,0.586924
2022-03-16,Philadelphia 76ers,Cleveland Cavaliers,21.0,75.866667,166.4,53.6,104.2,22.266667,62.2,46.866667,80.666667,172.466667,56.866667,107.733333,23.8,64.733333,48.733333,10.236627,10.883712,9.693616,14.873819,6.243257,10.788072,8.602013,8.938504,12.927132,7.357648,16.000167,8.878595,13.413087,8.938878,-0.91018,1.216321,-0.999249,1.015836,0.081037,-0.041923,-0.247934,1.286063,0.954538,0.815494,0.159479,0.497133,0.609855,1.411211,0.455929,0.514395,0.357985,0.522837,0.61775,0.467723,0.527847,0.367662,0.536722,0.604132
2022-03-16,Phoenix Suns,Houston Rockets,29.0,85.666667,178.533333,59.666667,106.333333,26.0,72.2,54.333333,84.466667,178.866667,60.066667,109.133333,24.4,69.733333,49.733333,10.951952,13.443009,11.807589,14.440545,6.224529,9.807223,7.623691,9.311272,11.760574,9.311244,13.246857,7.069238,10.321161,8.398227,1.580199,0.458542,1.690525,1.095707,-0.160438,-0.686681,1.179363,0.200501,0.260973,-0.370983,-0.316116,0.789319,0.693982,-0.123807,0.479836,0.561129,0.360111,0.552651,0.634241,0.472233,0.550397,0.349904,0.54044,0.588792
2022-03-16,Portland Trail Blazers,New York Knicks,27.0,76.266667,176.933333,52.733333,106.666667,23.533333,70.266667,43.933333,80.666667,174.4,53.866667,98.466667,26.8,75.933333,51.933333,11.476196,14.163039,10.165441,16.008976,6.32591,12.280986,7.746501,9.958866,14.965299,9.244948,16.205589,6.465289,11.044928,11.474634,0.168031,-0.100119,0.976466,1.224681,-1.167678,-1.776917,0.409841,-0.263117,1.240186,0.78532,1.049677,-1.620585,0.02049,-0.43769,0.431047,0.494375,0.334915,0.497551,0.576049,0.462538,0.547055,0.352941,0.539373,0.643802
2022-03-16,Toronto Raptors,Los Angeles Clippers,22.0,82.2,180.333333,59.6,118.0,22.6,62.333333,45.533333,80.133333,172.466667,55.466667,103.266667,24.666667,69.2,50.533333,11.373343,11.736108,8.29218,10.796251,6.395286,10.168754,11.162867,8.580494,9.810483,9.814302,12.949416,4.79039,9.953214,9.258342,0.297115,1.185115,0.258193,0.602173,0.180784,0.710873,0.576301,0.057465,-0.588028,0.244585,0.203793,-0.425527,-0.808741,0.446722,0.455823,0.505085,0.362567,0.518484,0.553933,0.464631,0.537121,0.356455,0.536142,0.630616
2022-03-17,Detroit Pistons,Orlando Magic,0.0,79.733333,179.333333,55.6,109.333333,24.133333,70.0,49.8,82.133333,176.8,57.666667,108.933333,24.466667,67.866667,49.733333,8.065075,14.046907,8.151181,13.555814,5.660427,7.05008,7.880728,11.591336,14.21724,10.742432,14.984558,6.057209,11.287801,11.10955,-1.034457,-0.904224,-1.86593,-1.600638,0.938248,1.254576,0.085424,-0.44812,-0.422909,-0.407232,-0.464994,0.122858,-0.004063,-0.025272,0.44461,0.508537,0.344762,0.511896,0.624582,0.464555,0.529376,0.360511,0.533748,0.605519


## Correlations

In [13]:
# Correlations for last 15 game stats vs 3pt made (unweighted).
# Rows are collected in a list and converted to a frame once: DataFrame.append was
# removed in pandas 2.0 and copies the whole frame on every call.
corr_rows = []
for col in last_15:
    corr, p_value = pearsonr(last_15['target'], last_15[col])
    # The correlation is rounded for display; the p-value stays unrounded so that the
    # significance filter below is not distorted (e.g. 0.046 rounding up to 0.05).
    corr_rows.append({'corr': round(corr, 2), 'p-value': p_value, 'stat': col})

corr_df = pd.DataFrame(corr_rows, columns=['corr', 'p-value', 'stat'])

# Print correlation: significant stats only (p < .05), strongest positive first
corr_df = corr_df[corr_df['p-value'] < .05].drop(['p-value'], axis=1).sort_values(['corr'], axis=0, ascending=False)
corr_df

Unnamed: 0,corr,stat
0,1.0,target
5,0.74,3p
6,0.74,3pa
13,0.7,3pa_opp
12,0.68,3p_opp
46,0.52,efg_perc
44,0.52,2p_perc
9,0.5,fga_opp
19,0.49,3p_std
2,0.49,fga


## Save dataframe with significantly and strongly correlated stats (|corr| ≥ 0.6)

In [14]:
# Names of the stats whose absolute correlation with the target is at least .6
strong = corr_df['corr'].abs() >= .6
stats = corr_df.loc[strong, 'stat']

# Keep only those columns and write them out
df = last_15[stats]

df.to_csv('backend/data/inputs/3p/shooting.csv')