# 3 Pointers Made against advanced_totals.csv

### Import packages

In [55]:
import os
import numpy as np
import pandas as pd
%matplotlib inline
from scipy.stats import pearsonr, zscore
import itertools

pd.set_option("display.max_columns", None)

### Set working directory

In [56]:
# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

# Change working directory
os.chdir('/Users/tyler/OneDrive/Documents/Python/NBA')

# Print working directory
cwd = os.getcwd()
print(f'Directory: {cwd}')

Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA
Directory: C:\Users\tyler\OneDrive\Documents\Python\NBA


## Exploratory Data Analysis

### Import data

In [57]:
advanced_df = pd.read_csv('backend/data/totals/advanced_totals.csv').drop(['Unnamed: 0'], axis=1)
shooting_df = pd.read_csv('backend/data/totals/game_totals.csv').drop(['Unnamed: 0'], axis=1)
shooting_df = shooting_df[['date', 'visitor', 'home', 'team', '3p']]

In [58]:
# Merge dataframes to have target variable
df = pd.merge(shooting_df, advanced_df, 
              left_on=['date', 'visitor', 'home', 'team'], right_on=['date', 'visitor', 'home', 'team'],
              how='left')

### Basic exploration

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38288 entries, 0 to 38287
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      38288 non-null  object 
 1   visitor   38288 non-null  object 
 2   home      38288 non-null  object 
 3   team      38288 non-null  int64  
 4   3p        38288 non-null  int64  
 5   ts_perc   38288 non-null  float64
 6   efg_perc  38288 non-null  float64
 7   3par      38288 non-null  float64
 8   ftr       38288 non-null  float64
 9   orb_perc  38288 non-null  float64
 10  drb_perc  38288 non-null  float64
 11  trb_perc  38288 non-null  float64
 12  ast_perc  38288 non-null  float64
 13  stl_perc  38288 non-null  float64
 14  blk_perc  38288 non-null  float64
 15  tov_perc  38288 non-null  float64
 16  usg_perc  38288 non-null  float64
 17  ortg      38288 non-null  float64
 18  drtg      38288 non-null  float64
dtypes: float64(14), int64(2), object(3)
memory usage: 5.8+ MB


In [60]:
# Convert 'date' column to Date object
df['date'] = pd.to_datetime(df['date'])

# Convert 'team' column to Team Name
df['team'] = np.where(df['team'], df['home'], df['visitor'])

# Rename target variable
df = df.rename({'3p': 'target'}, axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38288 entries, 0 to 38287
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      38288 non-null  datetime64[ns]
 1   visitor   38288 non-null  object        
 2   home      38288 non-null  object        
 3   team      38288 non-null  object        
 4   target    38288 non-null  int64         
 5   ts_perc   38288 non-null  float64       
 6   efg_perc  38288 non-null  float64       
 7   3par      38288 non-null  float64       
 8   ftr       38288 non-null  float64       
 9   orb_perc  38288 non-null  float64       
 10  drb_perc  38288 non-null  float64       
 11  trb_perc  38288 non-null  float64       
 12  ast_perc  38288 non-null  float64       
 13  stl_perc  38288 non-null  float64       
 14  blk_perc  38288 non-null  float64       
 15  tov_perc  38288 non-null  float64       
 16  usg_perc  38288 non-null  float64       
 17  ortg      38

# Dataframe of team's last 15 performances

In [61]:
# Return ten lastest dates team played
def last_15_date(team, date):
    schedule = df[df['team'] == team].sort_values(by='date').reset_index()
    date_index = schedule[schedule['date'] == date].index[0]
    if date_index - 15 < 0:
        return None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
    else:
        date_1, date_2 = schedule.iloc[date_index - 1]['date'], schedule.iloc[date_index - 2]['date']
        date_3, date_4 = schedule.iloc[date_index - 3]['date'], schedule.iloc[date_index - 4]['date']
        date_5, date_6 = schedule.iloc[date_index - 5]['date'], schedule.iloc[date_index - 6]['date']
        date_7, date_8 = schedule.iloc[date_index - 7]['date'], schedule.iloc[date_index - 8]['date']
        date_9, date_10 = schedule.iloc[date_index - 9]['date'], schedule.iloc[date_index - 10]['date']
        date_11, date_12 = schedule.iloc[date_index - 11]['date'], schedule.iloc[date_index - 12]['date']
        date_13, date_14 = schedule.iloc[date_index - 13]['date'], schedule.iloc[date_index - 14]['date']
        date_15 = schedule.iloc[date_index - 15]['date']
        return date_1, date_2, date_3, date_4, date_5, date_6, date_7, date_8, date_9, date_10, date_11, date_12, date_13, date_14, date_15

df['dates'] = df.apply(lambda x: last_15_date(x.team, x.date), axis=1)
df['date_1'], df['date_2'] = df['dates'].apply(lambda x: x[0]), df['dates'].apply(lambda x: x[1])
df['date_3'], df['date_4'] = df['dates'].apply(lambda x: x[2]), df['dates'].apply(lambda x: x[3])
df['date_5'], df['date_6'] = df['dates'].apply(lambda x: x[4]), df['dates'].apply(lambda x: x[5])
df['date_7'], df['date_8'] = df['dates'].apply(lambda x: x[6]), df['dates'].apply(lambda x: x[7])
df['date_9'], df['date_10'] = df['dates'].apply(lambda x: x[8]), df['dates'].apply(lambda x: x[9])
df['date_11'], df['date_12'] = df['dates'].apply(lambda x: x[10]), df['dates'].apply(lambda x: x[11])
df['date_13'], df['date_14'] = df['dates'].apply(lambda x: x[12]), df['dates'].apply(lambda x: x[13])
df['date_15'] = df['dates'].apply(lambda x: x[14])

In [62]:
# X and y column names to merge on
y_cols = df.columns
x_cols = ['date', 'team', 'ts_perc', 'efg_perc', '3par', 'ftr', 
          'orb_perc', 'drb_perc', 'trb_perc', 'ast_perc', 'stl_perc', 'blk_perc',
          'tov_perc', 'usg_perc', 'ortg', 'drtg']

last_15_games = df[y_cols]
X = df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 15 games stats for each team)
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
for date in dates:
    last_15_games = pd.merge(last_15_games, X, left_on=['date' + date, 'team'], right_on=['date', 'team'], how='left', suffixes=('', date))

## Feature Engineering

In [63]:
# Create season variable
def season(month, year):
    if (month in list(range(10, 13)) and year == 2006) or (month in list(range(1, 7)) and year == 2007):
        return 2006
    elif (month in list(range(10, 13)) and year == 2007) or (month in list(range(1, 7)) and year == 2008):
        return 2007
    elif (month in list(range(10, 13)) and year == 2008) or (month in list(range(1, 7)) and year == 2009):
        return 2008
    elif (month in list(range(10, 13)) and year == 2009) or (month in list(range(1, 7)) and year == 2010):
        return 2009
    elif (month in list(range(10, 13)) and year == 2010) or (month in list(range(1, 7)) and year == 2011):
        return 2010
    elif (month in list(range(10, 11)) and year == 2011) or (month in list(range(1, 7)) and year == 2012):
        return 2011
    elif (month in list(range(10, 13)) and year == 2012) or (month in list(range(1, 7)) and year == 2013):
        return 2012
    elif (month in list(range(10, 13)) and year == 2013) or (month in list(range(1, 7)) and year == 2014):
        return 2013
    elif (month in list(range(10, 13)) and year == 2014) or (month in list(range(1, 7)) and year == 2015):
        return 2014
    elif (month in list(range(10, 13)) and year == 2015) or (month in list(range(1, 7)) and year == 2016):
        return 2015
    elif (month in list(range(10, 13)) and year == 2016) or (month in list(range(1, 7)) and year == 2017):
        return 2016
    elif (month in list(range(10, 13)) and year == 2017) or (month in list(range(1, 7)) and year == 2018):
        return 2017
    elif (month in list(range(10, 13)) and year == 2018) or (month in list(range(1, 7)) and year == 2019):
        return 2018
    elif (month in list(range(10, 13)) and year == 2019) or (month in list(range(1, 11)) and year == 2020):
        return 2019
    elif (month in list(range(12, 13)) and year == 2020) or (month in list(range(1, 8)) and year == 2021):
        return 2020

df['season'] = df['date'].apply(lambda x: season(x.month, x.year))
last_15_games['season'] = last_15_games['date'].apply(lambda x: season(x.month, x.year))
last_15_games.sample()

Unnamed: 0,date,visitor,home,team,target,ts_perc,efg_perc,3par,ftr,orb_perc,drb_perc,trb_perc,ast_perc,stl_perc,blk_perc,tov_perc,usg_perc,ortg,drtg,dates,date_1,date_2,date_3,date_4,date_5,date_6,date_7,date_8,date_9,date_10,date_11,date_12,date_13,date_14,date_15,date_1.1,ts_perc_1,efg_perc_1,3par_1,ftr_1,orb_perc_1,drb_perc_1,trb_perc_1,ast_perc_1,stl_perc_1,blk_perc_1,tov_perc_1,usg_perc_1,ortg_1,drtg_1,date_2.1,ts_perc_2,efg_perc_2,3par_2,ftr_2,orb_perc_2,drb_perc_2,trb_perc_2,ast_perc_2,stl_perc_2,blk_perc_2,tov_perc_2,usg_perc_2,ortg_2,drtg_2,date_3.1,ts_perc_3,efg_perc_3,3par_3,ftr_3,orb_perc_3,drb_perc_3,trb_perc_3,ast_perc_3,stl_perc_3,blk_perc_3,tov_perc_3,usg_perc_3,ortg_3,drtg_3,date_4.1,ts_perc_4,efg_perc_4,3par_4,ftr_4,orb_perc_4,drb_perc_4,trb_perc_4,ast_perc_4,stl_perc_4,blk_perc_4,tov_perc_4,usg_perc_4,ortg_4,drtg_4,date_5.1,ts_perc_5,efg_perc_5,3par_5,ftr_5,orb_perc_5,drb_perc_5,trb_perc_5,ast_perc_5,stl_perc_5,blk_perc_5,tov_perc_5,usg_perc_5,ortg_5,drtg_5,date_6.1,ts_perc_6,efg_perc_6,3par_6,ftr_6,orb_perc_6,drb_perc_6,trb_perc_6,ast_perc_6,stl_perc_6,blk_perc_6,tov_perc_6,usg_perc_6,ortg_6,drtg_6,date_7.1,ts_perc_7,efg_perc_7,3par_7,ftr_7,orb_perc_7,drb_perc_7,trb_perc_7,ast_perc_7,stl_perc_7,blk_perc_7,tov_perc_7,usg_perc_7,ortg_7,drtg_7,date_8.1,ts_perc_8,efg_perc_8,3par_8,ftr_8,orb_perc_8,drb_perc_8,trb_perc_8,ast_perc_8,stl_perc_8,blk_perc_8,tov_perc_8,usg_perc_8,ortg_8,drtg_8,date_9.1,ts_perc_9,efg_perc_9,3par_9,ftr_9,orb_perc_9,drb_perc_9,trb_perc_9,ast_perc_9,stl_perc_9,blk_perc_9,tov_perc_9,usg_perc_9,ortg_9,drtg_9,date_10.1,ts_perc_10,efg_perc_10,3par_10,ftr_10,orb_perc_10,drb_perc_10,trb_perc_10,ast_perc_10,stl_perc_10,blk_perc_10,tov_perc_10,usg_perc_10,ortg_10,drtg_10,date_11.1,ts_perc_11,efg_perc_11,3par_11,ftr_11,orb_perc_11,drb_perc_11,trb_perc_11,ast_perc_11,stl_perc_11,blk_perc_11,tov_perc_11,usg_perc_11,ortg_11,drtg_11,date_12.1,ts_perc_12,efg_perc_12,3par_12,ftr_12,orb_perc_12,drb_perc_12,trb_perc_12,ast_perc_12,stl_perc_12,blk_perc_12,tov_perc_12,usg_perc_12,ortg_12,drtg_12,date_13.1,ts_perc_13,efg_perc_13,3par_13,ftr_13,orb_perc_13,drb_perc_13,trb_perc_13,ast_perc_13,stl_perc_13,blk_perc_13,tov_perc_13,usg_perc_13,ortg_13,drtg_13,date_14.1,ts_perc_14,efg_perc_14,3par_14,ftr_14,orb_perc_14,drb_perc_14,trb_perc_14,ast_perc_14,stl_perc_14,blk_perc_14,tov_perc_14,usg_perc_14,ortg_14,drtg_14,date_15.1,ts_perc_15,efg_perc_15,3par_15,ftr_15,orb_perc_15,drb_perc_15,trb_perc_15,ast_perc_15,stl_perc_15,blk_perc_15,tov_perc_15,usg_perc_15,ortg_15,drtg_15,season
28051,2017-03-31,Dallas Mavericks,Memphis Grizzlies,Memphis Grizzlies,11,0.591,0.54,0.36,0.267,20.0,86.7,55.3,65.7,4.6,9.5,13.4,100.0,113.3,103.0,"(2017-03-29 00:00:00, 2017-03-27 00:00:00, 201...",2017-03-29,2017-03-27,2017-03-26,2017-03-23,2017-03-21,2017-03-18,2017-03-16,2017-03-15,2017-03-13,2017-03-11,2017-03-09,2017-03-06,2017-03-04,2017-03-03,2017-02-28,2017-03-29,0.573,0.534,0.364,0.205,22.5,73.9,50.0,53.7,8.8,5.0,7.7,100.0,121.2,106.9,2017-03-27,0.438,0.372,0.289,0.322,40.4,76.5,55.6,48.4,4.9,4.8,6.4,100.0,109.7,111.0,2017-03-26,0.513,0.494,0.341,0.176,30.2,77.4,50.0,55.3,12.5,0.0,12.4,100.0,106.8,120.4,2017-03-23,0.532,0.5,0.372,0.192,25.6,84.2,54.5,52.9,4.7,7.8,13.3,100.0,105.7,113.9,2017-03-21,0.484,0.462,0.443,0.165,9.3,75.5,44.6,53.1,7.4,5.1,15.0,100.0,87.1,100.9,2017-03-18,0.58,0.507,0.395,0.408,17.5,75.7,45.5,62.5,7.9,6.4,10.0,100.0,116.8,107.8,2017-03-16,0.515,0.478,0.304,0.196,34.8,80.0,55.8,74.4,7.6,11.4,9.9,100.0,112.5,99.4,2017-03-15,0.527,0.488,0.365,0.212,31.1,73.1,53.6,72.2,2.3,4.8,10.6,100.0,113.6,105.5,2017-03-13,0.634,0.608,0.342,0.291,31.4,76.5,53.6,73.2,13.0,0.0,11.9,100.0,133.4,109.8,2017-03-11,0.449,0.406,0.322,0.256,24.5,82.4,47.1,64.7,11.5,7.1,11.5,100.0,94.2,112.0,2017-03-09,0.529,0.488,0.241,0.265,13.0,80.0,42.0,70.3,6.3,4.2,8.9,100.0,103.5,120.3,2017-03-06,0.579,0.506,0.325,0.506,19.0,80.6,47.4,67.6,7.1,4.7,13.7,100.0,111.2,124.5,2017-03-04,0.508,0.485,0.289,0.216,34.0,74.4,51.7,60.5,8.2,16.7,10.9,100.0,111.1,126.5,2017-03-03,0.545,0.518,0.341,0.268,15.9,80.0,46.4,42.1,4.3,0.0,9.8,100.0,108.2,112.5,2017-02-28,0.658,0.628,0.291,0.337,33.3,77.1,54.9,50.0,6.4,4.8,9.2,100.0,137.7,118.6,2016.0


In [64]:
# Calculate z-score
def z_score(value, mean, std):
    return (value - mean) / std

In [73]:
# Calculate perc difference
def perc_diff(value, mean):
    return (value - mean) / mean

### Last Performance

In [74]:
stats = ['ts_perc', 'efg_perc', '3par', 'ftr', 'orb_perc', 'drb_perc', 'trb_perc', 
         'ast_perc', 'stl_perc', 'blk_perc', 'tov_perc', 'usg_perc', 'ortg', 'drtg']
dates = ['_1']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))]

last_game = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_game[stat] = 0
    for date in dates:
        last_game[stat] = last_game[stat] + last_game[stat + date]
    
    last_game[stat] = last_game[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's performance
for stat in stats:
    last_game[stat + '_std'] = 0
    for date in dates:
        last_game[stat + '_std'] = last_game[stat + '_std'] + ((last_game[stat + date] - last_game[stat]) ** 2)
    
    last_game[stat + '_std'] = last_game[stat + '_std'] / len(dates)
    last_game[stat + '_std'] = last_game[stat + '_std'] ** .5

# Feature engineer trends
for stat in stats:
    last_game[stat + '_trend'] = 0
    for date in dates:
        last_game[stat + '_trend'] = last_game[stat + '_trend'] + \
                                        z_score(last_game[stat + date], last_game[stat], last_game[stat + '_std']).fillna(0)
    
    last_game[stat + '_trend'] = last_game[stat + '_trend'] / 1


# Aggregate stats for the entire game
last_game = last_game.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

last_game_cols = [col 
                  for col in last_game.columns
                  if (col[0] == 'target' and col[1] == 'sum') or \
                     (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                     (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                     ('_trend' in col[0] and col[1] == 'sum')]

last_game = last_game[last_game_cols].dropna(axis=0)
last_game.columns = [col[0] for col in last_game.columns]
last_game.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,ts_perc,efg_perc,3par,ftr,orb_perc,drb_perc,trb_perc,ast_perc,stl_perc,blk_perc,tov_perc,usg_perc,ortg,drtg,ts_perc_trend,efg_perc_trend,3par_trend,ftr_trend,orb_perc_trend,drb_perc_trend,trb_perc_trend,ast_perc_trend,stl_perc_trend,blk_perc_trend,tov_perc_trend,usg_perc_trend,ortg_trend,drtg_trend
date,visitor,home,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2021-03-24,Cleveland Cavaliers,Chicago Bulls,19,0.482,0.443,0.448,0.365,24.2,76.1,47.95,53.75,9.1,6.2,9.2,100.0,203.3,243.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2016-01-24,Los Angeles Clippers,Toronto Raptors,20,0.6345,0.602,0.636,0.596,13.35,72.35,47.8,58.55,7.65,8.5,10.95,100.0,241.2,188.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-03-04,New York Knicks,Sacramento Kings,23,0.522,0.4665,0.556,0.655,16.5,75.4,45.35,65.8,7.15,13.7,9.15,100.0,207.4,233.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-02-19,San Antonio Spurs,Philadelphia 76ers,7,0.418,0.387,0.38,0.367,26.95,74.4,48.0,54.5,9.45,6.7,8.05,100.0,184.0,211.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-02-06,Sacramento Kings,New Orleans Hornets,11,0.538,0.495,0.391,0.464,36.2,71.05,53.95,52.35,8.65,6.25,15.1,100.0,219.9,214.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Last 5 Performances

In [75]:
stats = ['ts_perc', 'efg_perc', '3par', 'ftr', 'orb_perc', 'drb_perc', 'trb_perc', 
         'ast_perc', 'stl_perc', 'blk_perc', 'tov_perc', 'usg_perc', 'ortg', 'drtg']
dates = ['_1', '_2', '_3', '_4', '_5']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))]

last_5_games = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_5_games[stat] = 0
    for date in dates:
        last_5_games[stat] = last_5_games[stat] + last_5_games[stat + date]
    
    last_5_games[stat] = last_5_games[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's performance
for stat in stats:
    last_5_games[stat + '_std'] = 0
    for date in dates:
        last_5_games[stat + '_std'] = last_5_games[stat + '_std'] + ((last_5_games[stat + date] - last_5_games[stat]) ** 2)
    
    last_5_games[stat + '_std'] = last_5_games[stat + '_std'] / len(dates)
    last_5_games[stat + '_std'] = last_5_games[stat + '_std'] ** .5

# Feature engineer trends
for stat in stats:
    last_5_games[stat + '_trend'] = 0
    for date in dates[:3]:
        last_5_games[stat + '_trend'] = last_5_games[stat + '_trend'] + \
                                        z_score(last_5_games[stat + date], last_5_games[stat], last_5_games[stat + '_std']).fillna(0)
    
    last_5_games[stat + '_trend'] = last_5_games[stat + '_trend'] / len(dates[:3])

last_5_games = last_5_games.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

last_5_game_cols = [col 
                    for col in last_5_games.columns
                    if (col[0] == 'target' and col[1] == 'sum') or \
                       (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                       (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                       ('_trend' in col[0] and col[1] == 'sum')]

last_5_games = last_5_games[last_5_game_cols].dropna(axis=0)
last_5_games.columns = [col[0] for col in last_5_games.columns]
last_5_games.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,ts_perc,efg_perc,3par,ftr,orb_perc,drb_perc,trb_perc,ast_perc,stl_perc,blk_perc,tov_perc,usg_perc,ortg,drtg,ts_perc_trend,efg_perc_trend,3par_trend,ftr_trend,orb_perc_trend,drb_perc_trend,trb_perc_trend,ast_perc_trend,stl_perc_trend,blk_perc_trend,tov_perc_trend,usg_perc_trend,ortg_trend,drtg_trend
date,visitor,home,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2006-11-28,Indiana Pacers,Portland Trail Blazers,13,0.5616,0.5164,0.2052,0.305,24.02,73.16,48.28,53.78,6.36,6.94,12.4,100.0,111.02,116.4,-0.390923,0.098177,0.107177,-0.007204,-0.125713,-0.403272,-0.639427,-0.397628,-0.637117,-0.276467,-0.260813,0.0,0.054566,0.516929
2006-11-28,New York Knicks,Chicago Bulls,9,0.5308,0.4608,0.1732,0.4702,32.84,76.14,53.56,51.6,6.3,6.14,14.88,100.0,106.42,109.64,0.413701,0.629101,-0.656284,0.199701,-0.121708,0.319809,0.339298,-0.096004,-0.684225,-0.416442,0.67849,0.0,-0.256286,-0.334917
2006-11-29,Indiana Pacers,Golden State Warriors,18,0.5416,0.5002,0.4998,0.6122,28.55,69.23,48.74,62.95,8.2,11.65,14.82,100.0,213.46,209.74,0.278346,0.104221,-0.030751,0.47154,0.287844,-0.724103,-0.501442,0.281636,-0.249886,0.226984,-0.478366,0.0,0.712593,0.454615
2006-11-29,New York Knicks,Cleveland Cavaliers,13,0.525,0.471,0.1504,0.441,30.68,74.52,52.24,51.94,7.02,6.78,17.26,100.0,100.98,106.94,0.51943,0.700768,-0.289398,0.690871,-0.159448,-0.330892,0.113362,0.679677,0.025872,0.18698,0.542461,0.0,0.013928,-0.512166
2006-11-29,Orlando Magic,Seattle SuperSonics,8,0.5152,0.4686,0.3858,0.7212,26.72,72.98,49.33,52.04,8.85,7.16,13.54,100.0,205.94,203.46,-0.269911,-0.592873,-0.669436,-0.604163,1.059785,-0.273651,0.186773,-0.136605,0.58199,0.198311,-0.801117,0.0,0.310698,0.140401


### Last 10 Perfomances

In [76]:
stats = ['ts_perc', 'efg_perc', '3par', 'ftr', 'orb_perc', 'drb_perc', 'trb_perc', 
         'ast_perc', 'stl_perc', 'blk_perc', 'tov_perc', 'usg_perc', 'ortg', 'drtg']
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))]

last_10_games = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_10_games[stat] = 0
    for date in dates:
        last_10_games[stat] = last_10_games[stat] + last_10_games[stat + date]
    
    last_10_games[stat] = last_10_games[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's performance
for stat in stats:
    last_10_games[stat + '_std'] = 0
    for date in dates:
        last_10_games[stat + '_std'] = last_10_games[stat + '_std'] + ((last_10_games[stat + date] - last_10_games[stat]) ** 2)
    
    last_10_games[stat + '_std'] = last_10_games[stat + '_std'] / len(dates)
    last_10_games[stat + '_std'] = last_10_games[stat + '_std'] ** .5

# Feature engineer trends
for stat in stats:
    last_10_games[stat + '_trend'] = 0
    for date in dates[:5]:
        last_10_games[stat + '_trend'] = last_10_games[stat + '_trend'] + \
                                        z_score(last_10_games[stat + date], last_10_games[stat], last_10_games[stat + '_std']).fillna(0)
    
    last_10_games[stat + '_trend'] = last_10_games[stat + '_trend'] / len(dates[:5])

last_10_games = last_10_games.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

last_10_game_cols = [col 
                    for col in last_10_games.columns
                    if (col[0] == 'target' and col[1] == 'sum') or \
                       (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                       (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                       ('_trend' in col[0] and col[1] == 'sum')]

last_10_games = last_10_games[last_10_game_cols].dropna(axis=0)
last_10_games.columns = [col[0] for col in last_10_games.columns]
last_10_games.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,ts_perc,efg_perc,3par,ftr,orb_perc,drb_perc,trb_perc,ast_perc,stl_perc,blk_perc,tov_perc,usg_perc,ortg,drtg,ts_perc_trend,efg_perc_trend,3par_trend,ftr_trend,orb_perc_trend,drb_perc_trend,trb_perc_trend,ast_perc_trend,stl_perc_trend,blk_perc_trend,tov_perc_trend,usg_perc_trend,ortg_trend,drtg_trend
date,visitor,home,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2006-11-28,Indiana Pacers,Portland Trail Blazers,13,0.543,0.4869,0.1983,0.3608,23.49,72.53,47.17,53.19,7.06,7.04,13.95,100.0,105.97,115.06,0.653618,0.806574,0.169609,-0.537559,0.059818,0.136034,0.221091,0.06509,-0.217245,-0.030642,-0.432588,0.0,0.498813,0.101636
2006-11-28,New York Knicks,Chicago Bulls,9,0.5352,0.4819,0.195,0.3747,28.64,75.22,51.46,49.24,7.4,5.83,14.14,100.0,106.01,105.34,-0.121486,-0.462283,-0.401017,0.737796,0.585674,0.241524,0.391772,0.231112,-0.287195,0.069979,0.19972,0.0,0.059706,0.327538
2006-11-29,Indiana Pacers,Golden State Warriors,18,0.5487,0.5096,0.5142,0.635,26.505,68.785,47.85,65.5,8.535,10.585,14.47,100.0,214.94,208.72,-0.151411,-0.263342,-0.155228,-0.327635,0.679009,0.088181,0.28675,-0.5385,-0.30474,0.416208,0.180081,0.0,-0.187256,0.02551
2006-11-29,New York Knicks,Cleveland Cavaliers,13,0.5358,0.4843,0.1845,0.3873,28.26,74.55,51.52,50.71,7.19,5.64,15.45,100.0,104.22,104.3,-0.301169,-0.292099,-0.611719,0.417702,0.318807,-0.00589,0.136887,0.119988,-0.048155,0.267688,0.500342,0.0,-0.377866,0.210001
2006-11-29,Orlando Magic,Seattle SuperSonics,8,0.5374,0.49135,0.3744,0.7329,28.365,72.35,50.47,51.075,7.785,7.935,14.15,100.0,213.59,208.62,-0.880163,-1.00124,0.217928,-0.130526,-0.449444,0.146812,-0.365813,0.267239,0.811587,-0.414502,-0.294776,0.0,-0.692439,-0.469184


### Last 15 Performances (Unweighted)

In [77]:
stats = ['ts_perc', 'efg_perc', '3par', 'ftr', 'orb_perc', 'drb_perc', 'trb_perc', 
         'ast_perc', 'stl_perc', 'blk_perc', 'tov_perc', 'usg_perc', 'ortg', 'drtg']
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))]

last_15_games_unweighted = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_15_games_unweighted[stat] = 0
    for date in dates:
        last_15_games_unweighted[stat] = last_15_games_unweighted[stat] + last_15_games_unweighted[stat + date]
    
    last_15_games_unweighted[stat] = last_15_games_unweighted[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's performance
for stat in stats:
    last_15_games_unweighted[stat + '_std'] = 0
    for date in dates:
        last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] + \
                                                    ((last_15_games_unweighted[stat + date] - last_15_games_unweighted[stat]) ** 2)
    
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] / len(dates)
    last_15_games_unweighted[stat + '_std'] = last_15_games_unweighted[stat + '_std'] ** .5

# Feature engineer trends
for stat in stats:
    last_15_games_unweighted[stat + '_trend'] = 0
    for date in dates[:10]:
        last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] + \
                                        z_score(last_15_games_unweighted[stat + date], last_15_games_unweighted[stat], last_15_games_unweighted[stat + '_std']).fillna(0)
    
    last_15_games_unweighted[stat + '_trend'] = last_15_games_unweighted[stat + '_trend'] / len(dates[:10])

last_15_games_unweighted = last_10_games.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

last_15_game_cols = [col 
                    for col in last_15_games_unweighted.columns
                    if (col[0] == 'target' and col[1] == 'sum') or \
                       (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                       (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                       ('_trend' in col[0] and col[1] == 'sum')]

last_15_games_unweighted = last_15_games_unweighted[last_15_game_cols].dropna(axis=0)
last_15_games_unweighted.columns = [col[0] for col in last_15_games_unweighted.columns]
last_15_games_unweighted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,ts_perc,efg_perc,3par,ftr,orb_perc,drb_perc,trb_perc,ast_perc,stl_perc,blk_perc,tov_perc,usg_perc,ortg,drtg,ts_perc_trend,efg_perc_trend,3par_trend,ftr_trend,orb_perc_trend,drb_perc_trend,trb_perc_trend,ast_perc_trend,stl_perc_trend,blk_perc_trend,tov_perc_trend,usg_perc_trend,ortg_trend,drtg_trend
date,visitor,home,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2006-11-28,Indiana Pacers,Portland Trail Blazers,13,0.543,0.4869,0.1983,0.3608,23.49,72.53,47.17,53.19,7.06,7.04,13.95,100.0,105.97,115.06,0.653618,0.806574,0.169609,-0.537559,0.059818,0.136034,0.221091,0.06509,-0.217245,-0.030642,-0.432588,0.0,0.498813,0.101636
2006-11-28,New York Knicks,Chicago Bulls,9,0.5352,0.4819,0.195,0.3747,28.64,75.22,51.46,49.24,7.4,5.83,14.14,100.0,106.01,105.34,-0.121486,-0.462283,-0.401017,0.737796,0.585674,0.241524,0.391772,0.231112,-0.287195,0.069979,0.19972,0.0,0.059706,0.327538
2006-11-29,Indiana Pacers,Golden State Warriors,18,0.5487,0.5096,0.5142,0.635,26.505,68.785,47.85,65.5,8.535,10.585,14.47,100.0,214.94,208.72,-0.151411,-0.263342,-0.155228,-0.327635,0.679009,0.088181,0.28675,-0.5385,-0.30474,0.416208,0.180081,0.0,-0.187256,0.02551
2006-11-29,New York Knicks,Cleveland Cavaliers,13,0.5358,0.4843,0.1845,0.3873,28.26,74.55,51.52,50.71,7.19,5.64,15.45,100.0,104.22,104.3,-0.301169,-0.292099,-0.611719,0.417702,0.318807,-0.00589,0.136887,0.119988,-0.048155,0.267688,0.500342,0.0,-0.377866,0.210001
2006-11-29,Orlando Magic,Seattle SuperSonics,8,0.5374,0.49135,0.3744,0.7329,28.365,72.35,50.47,51.075,7.785,7.935,14.15,100.0,213.59,208.62,-0.880163,-1.00124,0.217928,-0.130526,-0.449444,0.146812,-0.365813,0.267239,0.811587,-0.414502,-0.294776,0.0,-0.692439,-0.469184


### Last 15 Performances (Weighted)

In [78]:
stats = ['ts_perc', 'efg_perc', '3par', 'ftr', 'orb_perc', 'drb_perc', 'trb_perc', 
         'ast_perc', 'stl_perc', 'blk_perc', 'tov_perc', 'usg_perc', 'ortg', 'drtg']
dates = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']
cols = ['date', 'visitor', 'home', 'team', 'target'] + \
    [tup[0] + tup[1] for tup in list(itertools.product(stats, dates))]

last_15_games_weighted = last_15_games[cols].copy()

# Calculate mean for each stat over a team's last performance
for stat in stats:
    last_15_games_weighted[stat] = 0
    for date in dates:
        last_15_games_weighted[stat] = last_15_games_weighted[stat] + last_15_games_weighted[stat + date]
    
    last_15_games_weighted[stat] = last_15_games_weighted[stat] / len(dates)
    
# Calculate standard deviation for each stat over a team's performance
for stat in stats:
    last_15_games_weighted[stat + '_std'] = 0
    for date in dates:
        last_15_games_weighted[stat + '_std'] = last_15_games_weighted[stat + '_std'] + \
                                                ((last_15_games_weighted[stat + date] - last_15_games_weighted[stat]) ** 2)
    
    last_15_games_weighted[stat + '_std'] = last_15_games_weighted[stat + '_std'] / len(dates)
    last_15_games_weighted[stat + '_std'] = last_15_games_weighted[stat + '_std'] ** .5

# Feature engineer trends
for stat in stats:
    last_15_games_weighted[stat + '_trend'] = 0
    for date in dates[:10]:
        last_15_games_weighted[stat + '_trend'] = last_15_games_weighted[stat + '_trend'] + \
                                        z_score(last_15_games_weighted[stat + date], last_15_games_weighted[stat], last_15_games_weighted[stat + '_std']).fillna(0)
    
    last_15_games_weighted[stat + '_trend'] = last_15_games_weighted[stat + '_trend'] / len(dates[:10])

last_15_games_weighted = last_10_games.groupby(['date', 'visitor', 'home']).aggregate(['mean', 'sum'])

last_15_game_cols = [col 
                    for col in last_15_games_weighted.columns
                    if (col[0] == 'target' and col[1] == 'sum') or \
                       (col[0] in stats and col[1] == 'sum' and '_perc' not in col[0]) or \
                       (col[0] in stats and col[1] == 'mean' and '_perc' in col[0]) or \
                       ('_trend' in col[0] and col[1] == 'sum')]

last_15_games_weighted = last_15_games_weighted[last_15_game_cols].dropna(axis=0)
last_15_games_weighted.columns = [col[0] for col in last_15_games_weighted.columns]
last_15_games_weighted.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target,ts_perc,efg_perc,3par,ftr,orb_perc,drb_perc,trb_perc,ast_perc,stl_perc,blk_perc,tov_perc,usg_perc,ortg,drtg,ts_perc_trend,efg_perc_trend,3par_trend,ftr_trend,orb_perc_trend,drb_perc_trend,trb_perc_trend,ast_perc_trend,stl_perc_trend,blk_perc_trend,tov_perc_trend,usg_perc_trend,ortg_trend,drtg_trend
date,visitor,home,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2010-11-30,Los Angeles Lakers,Memphis Grizzlies,15,0.54115,0.4992,0.4092,0.5616,28.535,71.44,50.06,55.46,8.45,8.07,12.88,100.0,219.67,214.27,0.029006,-0.007441,-0.037897,0.098015,-0.552853,-0.065078,-0.261325,0.164732,0.132536,1.037541,-0.464341,0.0,-0.091041,-0.832292
2020-02-24,Memphis Grizzlies,Los Angeles Clippers,23,0.5601,0.52435,0.6325,0.5394,22.075,77.365,50.68,58.09,6.7,10.775,11.735,100.0,221.7,218.2,-0.546208,-0.657352,-0.422602,0.478973,0.640129,0.327998,0.141161,-0.514459,0.082433,-0.44291,0.19801,0.0,-0.444085,0.264745
2020-02-10,Atlanta Hawks,Orlando Magic,34,0.55335,0.5114,0.7861,0.5335,22.985,79.415,49.47,62.385,7.695,8.86,11.54,100.0,220.29,230.27,0.361035,0.20017,0.369379,0.228052,-1.285604,-0.873984,-0.90534,-0.210988,0.153551,0.683393,0.562533,0.0,-0.293516,-0.107418
2014-04-26,Miami Heat,Charlotte Bobcats,16,0.55355,0.51865,0.6024,0.611,21.525,80.11,50.805,58.81,7.7,6.405,12.835,100.0,214.17,210.98,-0.482686,-0.339689,0.297805,0.148623,-0.206379,0.557895,-0.052366,-0.477468,-0.682438,0.356174,0.192014,0.0,-0.660432,0.415024
2020-09-08,Miami Heat,Milwaukee Bucks,20,0.57935,0.5406,0.8958,0.6196,20.32,81.57,51.375,62.34,7.975,8.18,12.875,100.0,224.26,218.32,-0.113499,-0.229242,-0.153328,0.273646,0.600141,-0.422987,0.122751,-0.200523,-0.299437,0.263673,-0.539608,0.0,0.372257,0.364434


## Correlations of performances (last 1, last 5, last 10)

In [80]:
corr_df = pd.DataFrame()

# Correlations for last 15 game stats vs 3pt made (unweighted)
for col in last_15_games_weighted:
    corr_p = pearsonr(last_15_games_weighted['target'], last_15_games_weighted[col])
    row = {'last': '15_weighted', 'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)

# Correlations for last 15 game stats vs 3pt made (unweighted)
for col in last_15_games_unweighted:
    corr_p = pearsonr(last_15_games_unweighted['target'], last_15_games_unweighted[col])
    row = {'last': '15_unweighted', 'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)

# Correlations for last 10 game stats vs 3pt made 
for col in last_10_games:
    corr_p = pearsonr(last_10_games['target'], last_10_games[col])
    row = {'last': 10, 'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)
    
# Correlations for last 10 game stats vs 3pt made 
for col in last_5_games:
    corr_p = pearsonr(last_5_games['target'], last_5_games[col])
    row = {'last': 5, 'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)
    
# Correlations for last game stats vs 3pt made 
for col in last_game:
    corr_p = pearsonr(last_game['target'], last_game[col])
    row = {'last': 1, 'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)}
    corr_df = corr_df.append(row, ignore_index=True)
    
# Print each correlation
for stat in stats:
    print(f'Stat: {stat}')
    print(corr_df[(corr_df['stat'] == stat) & (corr_df['p-value'] < .05)].set_index(['last']).drop(['stat'], axis=1))
    print('\n')



Stat: ts_perc
               corr  p-value
last                        
15_weighted    0.43      0.0
15_unweighted  0.43      0.0
10             0.43      0.0
5              0.36      0.0
1              0.22      0.0


Stat: efg_perc
               corr  p-value
last                        
15_weighted    0.48      0.0
15_unweighted  0.48      0.0
10             0.48      0.0
5              0.42      0.0
1              0.25      0.0


Stat: 3par
               corr  p-value
last                        
15_weighted    0.72      0.0
15_unweighted  0.72      0.0
10             0.72      0.0
5              0.71      0.0
1              0.65      0.0


Stat: ftr
               corr  p-value
last                        
15_weighted   -0.32      0.0
15_unweighted -0.32      0.0
10            -0.32      0.0
5             -0.28      0.0
1             -0.16      0.0


Stat: orb_perc
               corr  p-value
last                        
15_weighted   -0.45      0.0
15_unweighted -0.45      0.0