In [1]:
import pandas as pd
import scipy.stats
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline

In [2]:
data = pd.read_csv('shot_logs.csv')
data.head(2)

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,WIN_LOSE,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148


In [3]:
def prepareDF(df):
    # Create location, win_lose dummy variable
    df = df.join(pd.get_dummies(df[['LOCATION','WIN_LOSE']]))
    
    # replace GAME_CLOCK with seconds 
    GAME_CLOCK1 = []
    for i in df['GAME_CLOCK']:
        a = i.split(':')
        #b = (int(a[0])*60 + int(a[1]))/720
        b = (float(a[0])*60 + float(a[1]))
        GAME_CLOCK1.append(b)
    df['GAME_CLOCK'] = GAME_CLOCK1
    
    #fill empty shot clocl with game clock values 
    df['SHOT_CLOCK'] = df['SHOT_CLOCK'].fillna(value=df['GAME_CLOCK'])
    df=df.drop(data[data.SHOT_CLOCK > 24].index)
    #data['SHOT_CLOCK'].max()
    
    # normalize numerica variable
    numeric_col=['GAME_CLOCK','FINAL_MARGIN','SHOT_NUMBER','PERIOD','SHOT_CLOCK','DRIBBLES','TOUCH_TIME','SHOT_DIST','PTS_TYPE','CLOSE_DEF_DIST','FGM','PTS']
    x = df[numeric_col].values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df[numeric_col] = pd.DataFrame(x_scaled)
    
    df=df.dropna(axis=0)

    return df

In [4]:
train_data = prepareDF(data)
train_data.head(2)

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,WIN_LOSE,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id,LOCATION_A,LOCATION_H,WIN_LOSE_L,WIN_LOSE_W
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,0.726415,0.0,0.0,0.095833,0.015169,0.0625,...,101187,0.024436,1.0,0.666667,brian roberts,203148,1,0,0,1
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,0.726415,0.027027,0.0,0.019444,0.004775,0.0,...,202711,0.114662,0.0,0.0,brian roberts,203148,1,0,0,1


In [5]:
def prepareTrainData(y):
    col_list = list(train_data)
    remove_list = ['GAME_ID', 'MATCHUP', 'LOCATION', 'WIN_LOSE', 'SHOT_RESULT', 
                   'CLOSEST_DEFENDER', 'CLOSEST_DEFENDER_PLAYER_ID', 'player_name', 
                   'player_id', 'LOCATION_A', 'WIN_LOSE_L', 'PTS']
    for remove_item in remove_list:
        col_list.remove(remove_item)
    col_list.remove(y)
    X = train_data.loc[:, col_list]
    Y = train_data[y]
    
    return X, Y, col_list

#### We use 'SHOT_CLOCK' as our continuous outcome variable

In [6]:
con_X, con_Y, con_list = prepareTrainData('SHOT_CLOCK')
con_X_train, con_X_test, con_Y_train, con_Y_test = train_test_split(con_X, con_Y, test_size=0.2)

con_X_train.head()

Unnamed: 0,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST,FGM,LOCATION_H,WIN_LOSE_W
19203,0.537736,0.108108,0.166667,0.829167,0.125,0.885411,0.09322,0.0,0.056391,1.0,0,1
10331,0.433962,0.108108,0.0,0.211111,0.03125,0.874801,0.557203,1.0,0.048872,1.0,0,0
34620,0.462264,0.027027,0.5,0.630556,0.03125,0.884881,0.074153,0.0,0.033835,1.0,1,0
77389,0.424528,0.135135,0.333333,0.613889,0.03125,0.875332,0.358051,0.0,0.095865,0.0,0,0
13969,0.754717,0.108108,0.5,0.779167,0.125,0.892308,0.059322,0.0,0.022556,1.0,1,1


In [7]:
for col in con_list:
    print(col, scipy.stats.ttest_ind(con_X_train[col], con_Y_train, equal_var=False))

FINAL_MARGIN Ttest_indResult(statistic=1056.339433712531, pvalue=0.0)
SHOT_NUMBER Ttest_indResult(statistic=270.12304171232455, pvalue=0.0)
PERIOD Ttest_indResult(statistic=346.14075411891173, pvalue=0.0)
GAME_CLOCK Ttest_indResult(statistic=497.5879861838346, pvalue=0.0)
DRIBBLES Ttest_indResult(statistic=92.56234219139502, pvalue=0.0)
TOUCH_TIME Ttest_indResult(statistic=3680.348594008115, pvalue=0.0)
SHOT_DIST Ttest_indResult(statistic=416.15208735273126, pvalue=0.0)
PTS_TYPE Ttest_indResult(statistic=171.6744235004446, pvalue=0.0)
CLOSE_DEF_DIST Ttest_indResult(statistic=187.9493985561325, pvalue=0.0)
FGM Ttest_indResult(statistic=271.18550907917523, pvalue=0.0)
LOCATION_H Ttest_indResult(statistic=300.51347458253383, pvalue=0.0)
WIN_LOSE_W Ttest_indResult(statistic=304.1467846782004, pvalue=0.0)


#### We use 'FGM' as our binary outcome variable

In [8]:
bin_X, bin_Y, bin_list = prepareTrainData('FGM')
bin_X_train, bin_X_test, bin_Y_train, bin_Y_test = train_test_split(bin_X, bin_Y, test_size=0.2)

bin_X_train.head()

Unnamed: 0,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSE_DEF_DIST,LOCATION_H,WIN_LOSE_W
71600,0.575472,0.054054,0.166667,0.168056,0.033708,0.03125,0.872149,0.04661,0.0,0.084586,1,1
32985,0.622642,0.054054,0.0,0.581944,0.017978,0.0,0.872679,0.389831,0.0,0.116541,1,1
49690,0.584906,0.0,0.166667,0.305556,0.031039,0.03125,0.879576,0.53178,1.0,0.109023,0,1
104824,0.566038,0.513514,0.333333,0.320833,0.029213,0.0,0.874271,0.036017,0.0,0.157895,0,1
108220,0.45283,0.351351,0.5,0.904167,0.008567,0.125,0.900796,0.180085,0.0,0.06015,1,0


In [9]:
for col in bin_list:
    print(col, scipy.stats.ttest_ind(bin_X_train[col], bin_Y_train, equal_var=False))

FINAL_MARGIN Ttest_indResult(statistic=30.49102809411541, pvalue=2.2196567951522425e-203)
SHOT_NUMBER Ttest_indResult(statistic=-189.58280898859581, pvalue=0.0)
PERIOD Ttest_indResult(statistic=-125.13997734917115, pvalue=0.0)
GAME_CLOCK Ttest_indResult(statistic=19.61175850444102, pvalue=1.5383462740699607e-85)
SHOT_CLOCK Ttest_indResult(statistic=-272.45836749500523, pvalue=0.0)
DRIBBLES Ttest_indResult(statistic=-244.88085925964353, pvalue=0.0)
TOUCH_TIME Ttest_indResult(statistic=275.8993054523821, pvalue=0.0)
SHOT_DIST Ttest_indResult(statistic=-99.71878990827878, pvalue=0.0)
PTS_TYPE Ttest_indResult(statistic=-90.75526792122756, pvalue=0.0)
CLOSE_DEF_DIST Ttest_indResult(statistic=-240.20485603964875, pvalue=0.0)
LOCATION_H Ttest_indResult(statistic=20.94385993514797, pvalue=2.701224215216206e-97)
WIN_LOSE_W Ttest_indResult(statistic=23.517073761059443, pvalue=3.96515089999937e-122)


### Linear Regression on continuous variable

In [10]:
linear = LinearRegression()
lin_score = cross_val_score(linear, con_X_train, con_Y_train, cv=5)
print("Linear Regression: 5-fold Cross-Validation: Mean Accuracy: %f" % (lin_score.mean()))


Linear Regression: 5-fold Cross-Validation: Mean Accuracy: 0.008782


In [11]:
lin_results = sm.OLS(con_Y_train, con_X_train).fit()
print(lin_results.summary())

                            OLS Regression Results                            
Dep. Variable:             SHOT_CLOCK   R-squared:                       0.114
Model:                            OLS   Adj. R-squared:                  0.114
Method:                 Least Squares   F-statistic:                     1102.
Date:                Fri, 15 Mar 2019   Prob (F-statistic):               0.00
Time:                        14:24:10   Log-Likelihood:             1.2342e+05
No. Observations:              102455   AIC:                        -2.468e+05
Df Residuals:                  102443   BIC:                        -2.467e+05
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
FINAL_MARGIN      -0.0018      0.003     -0.

### Logistic Regression on binary variable

In [12]:
logistic = LogisticRegression()
log_score = cross_val_score(logistic, bin_X_train, bin_Y_train, cv=5)
print("Logistic Regression: 5-fold Cross-Validation: Mean Accuracy: %f" % (log_score.mean()))




Logistic Regression: 5-fold Cross-Validation: Mean Accuracy: 0.606461


In [13]:
log_results = sm.Logit(bin_Y_train, bin_X_train).fit()
print(log_results.summary())

Optimization terminated successfully.
         Current function value: 0.660473
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                    FGM   No. Observations:               102455
Model:                          Logit   Df Residuals:                   102443
Method:                           MLE   Df Model:                           11
Date:                Fri, 15 Mar 2019   Pseudo R-squ.:                 0.04110
Time:                        14:24:15   Log-Likelihood:                -67669.
converged:                       True   LL-Null:                       -70569.
                                        LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
FINAL_MARGIN       0.8293      0.089      9.327      0.000       0.655       1.004
SHOT_NUMBER      