In [50]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv("../data/shot_logs.csv")

In [3]:
print "Analyzing the data before doing cool data science is must"
print "# of rows:: {0}, # of columns:: {1}".format(data.shape[0], data.shape[1])
print "Type of each column: \n", data.dtypes


Analyzing the data before doing cool data science is must
# of rows:: 128069, # of columns:: 21
Type of each column: 
GAME_ID                         int64
MATCHUP                        object
LOCATION                       object
W                              object
FINAL_MARGIN                    int64
SHOT_NUMBER                     int64
PERIOD                          int64
GAME_CLOCK                     object
SHOT_CLOCK                    float64
DRIBBLES                        int64
TOUCH_TIME                    float64
SHOT_DIST                     float64
PTS_TYPE                        int64
SHOT_RESULT                    object
CLOSEST_DEFENDER               object
CLOSEST_DEFENDER_PLAYER_ID      int64
CLOSE_DEF_DIST                float64
FGM                             int64
PTS                             int64
player_name                    object
player_id                       int64
dtype: object


In [4]:
print "# of distinct values in the feature columns"
for column_name in data.columns:
    print column_name, "--> {0}".format(len(data[column_name].unique()))


# of distinct values in the feature columns
GAME_ID --> 904
MATCHUP --> 1808
LOCATION --> 2
W --> 2
FINAL_MARGIN --> 88
SHOT_NUMBER --> 38
PERIOD --> 7
GAME_CLOCK --> 719
SHOT_CLOCK --> 242
DRIBBLES --> 33
TOUCH_TIME --> 313
SHOT_DIST --> 448
PTS_TYPE --> 2
SHOT_RESULT --> 2
CLOSEST_DEFENDER --> 473
CLOSEST_DEFENDER_PLAYER_ID --> 474
CLOSE_DEF_DIST --> 299
FGM --> 2
PTS --> 3
player_name --> 281
player_id --> 281


In [5]:
print "Stats on Data"
data.describe()



Stats on Data




Unnamed: 0,GAME_ID,FINAL_MARGIN,SHOT_NUMBER,PERIOD,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,PTS_TYPE,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_id
count,128069.0,128069.0,128069.0,128069.0,122502.0,128069.0,128069.0,128069.0,128069.0,128069.0,128069.0,128069.0,128069.0,128069.0
mean,21400450.0,0.208723,6.506899,2.469427,12.453344,2.023355,2.765901,13.571504,2.26467,159038.487284,4.123015,0.452139,0.997314,157238.251247
std,257.8773,13.233267,4.71326,1.139919,5.763265,3.47776,3.043682,8.888964,0.441159,78791.172947,2.756446,0.497706,1.130978,79362.389336
min,21400000.0,-53.0,1.0,1.0,0.0,0.0,-163.6,0.0,2.0,708.0,0.0,0.0,0.0,708.0
25%,21400230.0,-8.0,3.0,1.0,,0.0,0.9,4.7,2.0,101249.0,2.3,0.0,0.0,101162.0
50%,21400450.0,1.0,5.0,2.0,,1.0,1.6,13.7,2.0,201949.0,3.7,0.0,0.0,201939.0
75%,21400670.0,9.0,9.0,3.0,,2.0,3.7,22.5,3.0,203079.0,5.3,1.0,2.0,202704.0
max,21400910.0,53.0,38.0,7.0,24.0,32.0,24.9,47.2,3.0,530027.0,53.2,1.0,3.0,204060.0


In [6]:
data.isnull().any()

GAME_ID                       False
MATCHUP                       False
LOCATION                      False
W                             False
FINAL_MARGIN                  False
SHOT_NUMBER                   False
PERIOD                        False
GAME_CLOCK                    False
SHOT_CLOCK                     True
DRIBBLES                      False
TOUCH_TIME                    False
SHOT_DIST                     False
PTS_TYPE                      False
SHOT_RESULT                   False
CLOSEST_DEFENDER              False
CLOSEST_DEFENDER_PLAYER_ID    False
CLOSE_DEF_DIST                False
FGM                           False
PTS                           False
player_name                   False
player_id                     False
dtype: bool

In [7]:
data.head()

Unnamed: 0,GAME_ID,MATCHUP,LOCATION,W,FINAL_MARGIN,SHOT_NUMBER,PERIOD,GAME_CLOCK,SHOT_CLOCK,DRIBBLES,...,SHOT_DIST,PTS_TYPE,SHOT_RESULT,CLOSEST_DEFENDER,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,PTS,player_name,player_id
0,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,1,1,1:09,10.8,2,...,7.7,2,made,"Anderson, Alan",101187,1.3,1,2,brian roberts,203148
1,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,2,1,0:14,3.4,0,...,28.2,3,missed,"Bogdanovic, Bojan",202711,6.1,0,0,brian roberts,203148
2,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,3,1,0:00,,3,...,10.1,2,missed,"Bogdanovic, Bojan",202711,0.9,0,0,brian roberts,203148
3,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,4,2,11:47,10.3,2,...,17.2,2,missed,"Brown, Markel",203900,3.4,0,0,brian roberts,203148
4,21400899,"MAR 04, 2015 - CHA @ BKN",A,W,24,5,2,10:34,10.9,2,...,3.7,2,missed,"Young, Thaddeus",201152,1.1,0,0,brian roberts,203148


In [8]:
data.columns


Index([u'GAME_ID', u'MATCHUP', u'LOCATION', u'W', u'FINAL_MARGIN',
       u'SHOT_NUMBER', u'PERIOD', u'GAME_CLOCK', u'SHOT_CLOCK', u'DRIBBLES',
       u'TOUCH_TIME', u'SHOT_DIST', u'PTS_TYPE', u'SHOT_RESULT',
       u'CLOSEST_DEFENDER', u'CLOSEST_DEFENDER_PLAYER_ID', u'CLOSE_DEF_DIST',
       u'FGM', u'PTS', u'player_name', u'player_id'],
      dtype='object')

In [9]:
import pprint
pprint.pprint(list(data.columns))

['GAME_ID',
 'MATCHUP',
 'LOCATION',
 'W',
 'FINAL_MARGIN',
 'SHOT_NUMBER',
 'PERIOD',
 'GAME_CLOCK',
 'SHOT_CLOCK',
 'DRIBBLES',
 'TOUCH_TIME',
 'SHOT_DIST',
 'PTS_TYPE',
 'SHOT_RESULT',
 'CLOSEST_DEFENDER',
 'CLOSEST_DEFENDER_PLAYER_ID',
 'CLOSE_DEF_DIST',
 'FGM',
 'PTS',
 'player_name',
 'player_id']


In [10]:
print "Original Data", data.shape  


Original Data (128069, 21)


In [11]:
cleaned_data = data.dropna()
print "After Removing Nulls", cleaned_data.shape

After Removing Nulls (122502, 21)


In [12]:
dataForAnalysis = cleaned_data.copy()

In [13]:
del dataForAnalysis["GAME_ID"]  
del dataForAnalysis['MATCHUP']
del dataForAnalysis['GAME_CLOCK']
del dataForAnalysis["FINAL_MARGIN"]
del dataForAnalysis["PTS"]
del dataForAnalysis["player_name"]
del dataForAnalysis["CLOSEST_DEFENDER"]
del dataForAnalysis["W"] ### Match Result
del dataForAnalysis['SHOT_RESULT']  ### Duplicate information , captured in FGM.

In [14]:
print "Categorizing the time when the shot was made into 3 brackets \n \
        [1-9) on the clock -- 0 \
        [9-17) on the clock -- 1 \
        [17-24] on the clock -- 2"

Categorizing the time when the shot was made into 3 brackets 
         [1-9) on the clock -- 0         [9-17) on the clock -- 1         [17-24] on the clock -- 2


In [15]:
def cat_shot_clock(time):
    if time > 0 and time < 9:
        return 0
    elif time >=9 and time < 17:
        return 1
    else:
        return 2

In [16]:
dataForAnalysis["SHOT_CLOCK_CAT"] = dataForAnalysis.SHOT_CLOCK.map(cat_shot_clock)
del dataForAnalysis["SHOT_CLOCK"]

In [17]:
print "Bining the shot distance into 7 categories using numpy linspace"

Bining the shot distance into 7 categories using numpy linspace


In [18]:
#shot_dist_bins = np.linspace(dataForAnalysis.SHOT_DIST.min(), dataForAnalysis.SHOT_DIST.max(), 7)
#dataForAnalysis["SHOT_DIST_CAT"] = np.digitize(dataForAnalysis.SHOT_DIST, shot_dist_bins)
dataForAnalysis["SHOT_DIST_CAT"] = pd.cut(dataForAnalysis.SHOT_DIST, 7, labels = range(1,8))
del dataForAnalysis["SHOT_DIST"]

In [19]:
print "Bining the dribbles into 4 categories using numpy linspace"


Bining the dribbles into 4 categories using numpy linspace


In [20]:
#dribble_bins = np.linspace(dataForAnalysis.DRIBBLES.min(), dataForAnalysis.DRIBBLES.max(), 4)
#dataForAnalysis["DRIBBLES_CAT"] = np.digitize(dataForAnalysis.DRIBBLES, dribble_bins)
dataForAnalysis["DRIBBLES_CAT"] = pd.cut(dataForAnalysis.DRIBBLES, 4, labels = range(1,5))
del dataForAnalysis["DRIBBLES"]

In [21]:
print "Categorizing the touch time when the shot was made into 3 brackets \n \
        <=2 on the clock -- 0 \
        [3-7) on the clock -- 1 \
        >=7 on the clock -- 2"

Categorizing the touch time when the shot was made into 3 brackets 
         <=2 on the clock -- 0         [3-7) on the clock -- 1         >=7 on the clock -- 2


In [22]:
def touch_time_cat(touch_time):
    if touch_time <=2:
        return 0
    elif touch_time > 2 and touch_time <=6:
        return 1
    else:
        return 2  

In [23]:
dataForAnalysis["TOUCH_TIME_CAT"] = dataForAnalysis.TOUCH_TIME.map(touch_time_cat)
del dataForAnalysis["TOUCH_TIME"]

In [24]:
#close_def_dist_bins = np.linspace(dataForAnalysis.CLOSE_DEF_DIST.min(), dataForAnalysis.CLOSE_DEF_DIST.max(), 11)
#dataForAnalysis["CLOSE_DEF_DIST_CAT"] = np.digitize(dataForAnalysis.CLOSE_DEF_DIST, close_def_dist_bins)
dataForAnalysis["CLOSE_DEF_DIST_CAT"] = pd.cut(dataForAnalysis.CLOSE_DEF_DIST, 11, labels = range(1,12))
del dataForAnalysis["CLOSE_DEF_DIST"]

In [25]:
#dataForAnalysis = pd.get_dummies(dataForAnalysis,columns = ["LOCATION"], drop_first=True)
#dataForAnalysis['LOCATION_H'] = dataForAnalysis['LOCATION_H'].astype(int)
le = LabelEncoder()
dataForAnalysis["IS_HOME"] = dataForAnalysis[["LOCATION"]].apply(le.fit_transform)
del dataForAnalysis["LOCATION"]

In [26]:
binaryEncoder = ce.BinaryEncoder(cols = ["SHOT_NUMBER", "player_id", "CLOSEST_DEFENDER_PLAYER_ID"])
dataForAnalysis = binaryEncoder.fit_transform(dataForAnalysis)

In [27]:
dataForAnalysis.columns
dataForAnalysis.dtypes

SHOT_NUMBER_0                      int64
SHOT_NUMBER_1                      int64
SHOT_NUMBER_2                      int64
SHOT_NUMBER_3                      int64
SHOT_NUMBER_4                      int64
SHOT_NUMBER_5                      int64
player_id_0                        int64
player_id_1                        int64
player_id_2                        int64
player_id_3                        int64
player_id_4                        int64
player_id_5                        int64
player_id_6                        int64
player_id_7                        int64
player_id_8                        int64
CLOSEST_DEFENDER_PLAYER_ID_0       int64
CLOSEST_DEFENDER_PLAYER_ID_1       int64
CLOSEST_DEFENDER_PLAYER_ID_2       int64
CLOSEST_DEFENDER_PLAYER_ID_3       int64
CLOSEST_DEFENDER_PLAYER_ID_4       int64
CLOSEST_DEFENDER_PLAYER_ID_5       int64
CLOSEST_DEFENDER_PLAYER_ID_6       int64
CLOSEST_DEFENDER_PLAYER_ID_7       int64
CLOSEST_DEFENDER_PLAYER_ID_8       int64
PERIOD          

In [28]:
features_X = dataForAnalysis.ix[:, dataForAnalysis.columns != 'FGM']
predict_Y = dataForAnalysis.FGM

In [29]:
features_X.head()

Unnamed: 0,SHOT_NUMBER_0,SHOT_NUMBER_1,SHOT_NUMBER_2,SHOT_NUMBER_3,SHOT_NUMBER_4,SHOT_NUMBER_5,player_id_0,player_id_1,player_id_2,player_id_3,...,CLOSEST_DEFENDER_PLAYER_ID_7,CLOSEST_DEFENDER_PLAYER_ID_8,PERIOD,PTS_TYPE,SHOT_CLOCK_CAT,SHOT_DIST_CAT,DRIBBLES_CAT,TOUCH_TIME_CAT,CLOSE_DEF_DIST_CAT,IS_HOME
0,0,1,0,0,0,0,1,0,0,0,...,1,0,1,2,1,2,1,0,1,0
1,0,1,1,0,0,0,1,0,0,0,...,0,0,1,3,0,5,1,0,2,0
3,0,0,0,0,1,1,1,0,0,0,...,0,1,2,2,1,3,1,0,1,0
4,0,1,1,0,1,1,1,0,0,0,...,0,1,2,2,1,1,1,1,1,0
5,0,0,1,0,1,0,1,0,0,0,...,0,1,2,2,1,3,1,1,1,0


In [30]:
predict_Y.head()

0    1
1    0
3    0
4    0
5    0
Name: FGM, dtype: int64

In [31]:
print "Benchmarking using Dummy Classifies, any model used should beat dummy"
dummy_class = DummyClassifier(strategy='most_frequent')
dummy_cross_val_scores = cross_val_score(dummy_class, features_X, predict_Y, cv = 11)
print "Mean accuracy across 11 folds for Dummy model is ", np.mean(dummy_cross_val_scores)

Benchmarking using Dummy Classifies, any model used should beat dummy
Mean accuracy across 11 folds for Dummy model is  0.543844181224


In [39]:
log_class = LogisticRegression()
log_cross_val_scores = cross_val_score(log_class, features_X, predict_Y, cv = 11, scoring = 'roc_auc')
print "Mean Auc score across 11 folds for Logistic Model is ", np.mean(log_cross_val_scores)

Mean Auc score across 11 folds for Logistic Model is  0.619335630621


In [51]:
rf_class = RandomForestClassifier(n_estimators = 51, max_depth = 7)
rf_cross_val_scroes = cross_val_score(rf_class, features_X, predict_Y, cv = 11, scoring='roc_auc')
print "Mean auc score across 11 folds for RandomForest Model is ", np.mean(rf_cross_val_scroes)

Mean auc score across 11 folds for RandomForest Model is  0.617969835008
