In [29]:
import pandas as pd
import numpy as np
import plotly as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn import tree

In [2]:
# pull shot data - note that this is from github - https://github.com/sealneaward/nba-movement-data

raw = pd.read_csv('shot_data.csv')
dropped = raw.drop(columns = ['EVENT_TYPE', 'GAME_EVENT_ID', 'GAME_DATE', 'GAME_ID'])

dropped.head(10)

Unnamed: 0,ACTION_TYPE,EVENTTIME,GRID_TYPE,HTM,LOC_X,LOC_Y,MINUTES_REMAINING,PERIOD,PLAYER_ID,PLAYER_NAME,...,SHOT_DISTANCE,SHOT_MADE_FLAG,SHOT_TIME,SHOT_TYPE,SHOT_ZONE_AREA,SHOT_ZONE_BASIC,SHOT_ZONE_RANGE,TEAM_ID,TEAM_NAME,VTM
0,Jump Shot,215,Shot Chart Detail,LAL,-23.9825,157.0968,3,1,101138,Brandon Bass,...,15,0,218.5,2PT Field Goal,Center(C),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,OKC
1,Dunk Shot,191,Shot Chart Detail,LAL,224.3221,24.9662,3,1,101138,Brandon Bass,...,0,1,204.83,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,OKC
2,Jump Shot,518,Shot Chart Detail,LAL,-5.4122,173.0601,8,2,101138,Brandon Bass,...,16,1,520.65,2PT Field Goal,Center(C),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,OKC
3,Jump Shot,120,Shot Chart Detail,LAL,-25.9351,158.757,2,3,101138,Brandon Bass,...,15,0,123.26,2PT Field Goal,Center(C),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,OKC
4,Jump Shot,646,Shot Chart Detail,LAL,178.335,20.1712,10,1,201579,Roy Hibbert,...,15,1,650.49,2PT Field Goal,Right Side(R),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,OKC
5,Hook Shot,552,Shot Chart Detail,LAL,100.3073,98.9265,9,1,201579,Roy Hibbert,...,5,0,546.23,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,1610612747,Los Angeles Lakers,OKC
6,Hook Shot,430,Shot Chart Detail,LAL,111.916,222.6307,7,1,201579,Roy Hibbert,...,4,0,449.9,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,1610612747,Los Angeles Lakers,OKC
7,Hook Shot,590,Shot Chart Detail,LAL,-92.6563,65.3523,9,3,201579,Roy Hibbert,...,5,0,593.22,2PT Field Goal,Center(C),In The Paint (Non-RA),Less Than 8 ft.,1610612747,Los Angeles Lakers,OKC
8,Jump Shot,401,Shot Chart Detail,LAL,-216.1135,-10.5376,6,1,1626204,Larry Nance Jr.,...,13,1,408.84,2PT Field Goal,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,OKC
9,Layup Shot,273,Shot Chart Detail,LAL,-219.9258,16.3056,4,2,1626204,Larry Nance Jr.,...,1,0,297.98,2PT Field Goal,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,OKC


Get the shots grouped by player in a dictionary

In [3]:

playerDict = {} # first is player name, second is their shots

# get list of ids
playerIds = dropped.PLAYER_ID.unique()

# make dictionary with empty arrays for each player's shots
for pid in playerIds:
    playerDict[pid] = []

# for each shot, add it to the player ID's array
for index, row in dropped.iterrows():
    innerPid = row['PLAYER_ID']
    shotMade = row['SHOT_MADE_FLAG']
    
    playerDict[innerPid].append(shotMade)

In [4]:
# try 2
playerAvgGrouped = dropped.groupby('PLAYER_ID', as_index=False)['SHOT_MADE_FLAG'].mean()
playerAverages = playerAvgGrouped.set_index(playerAvgGrouped['PLAYER_ID'])

In [None]:
shotAvgDict = {}

# get average of python list
def avg(l): 
    return sum(l) / len(l) 

for p in playerDict.keys():
    shotAvgDict[p] = avg(playerDict[p])
    
shotAvgDict

In [32]:
# copy
full = dropped.sample(frac = 1)
full['PLAYER_AVG'] = ''

# add average -> This takes a bit for me. About 9 mins on my machine
# Will go to OH to try to speed it up, but until then, we can just use the outputted CSV 
for index, row in full.iterrows():
    full.loc[full['PLAYER_ID'] == row['PLAYER_ID'], 'PLAYER_AVG'] = shotAvgDict[row['PLAYER_ID']]

In [34]:
full.to_csv('fullDF.csv')

In [3]:
fullDF = pd.read_csv('fullDF.csv')

In [4]:
fullDF = fullDF.loc[:, ~fullDF.columns.str.contains('^Unnamed')]
fullDF.drop(columns = ['EVENTTIME', 'GRID_TYPE'], inplace = True)

In [5]:
# separate target and features
target = fullDF['SHOT_MADE_FLAG']
features = fullDF.drop(columns = ['SHOT_MADE_FLAG'])
print(sum(target)/len(target))

0.4483052553068062


In [25]:
justNumeric = features.drop(columns = ["TEAM_NAME", "ACTION_TYPE", "HTM", "PLAYER_NAME", "SHOT_TYPE", "SHOT_ZONE_AREA", "SHOT_ZONE_BASIC", "SHOT_ZONE_RANGE", "PLAYER_ID", 'TEAM_ID', 'TEAM_ID', 'VTM'])
justNumeric

ohe = OneHotEncoder()
ohe.fit(features)
oheFeat = ohe.transform(features)

In [26]:
# split with everything 
X_train, X_test, y_train, y_test = train_test_split(oheFeat, target, test_size=0.25) #, random_state = 4400

In [8]:
# split with just the numeric
X_train, X_test, y_train, y_test = train_test_split(justNumeric, target, test_size=0.25) #, random_state = 4400

In [30]:
# scale
scaler = MaxAbsScaler()
scaler.fit(X_train)
x_train_scaled = scaler.transform(X_train)
x_test_scaled = scaler.transform(X_test)

In [31]:
# make classifier and fit it
decisionTree = tree.DecisionTreeClassifier(max_depth = 2, random_state = 4400)
decisionTree = decisionTree.fit(X_train, y_train)

# make random forest classifiers
randomForest = RandomForestClassifier(n_estimators=10, random_state=4400)
randomForest.fit(X_train, y_train)

# make adaboost
adaboost = AdaBoostClassifier(n_estimators=10, random_state=4400)
adaboost.fit(X_train, y_train)

loc = [decisionTree, randomForest, adaboost]

KeyboardInterrupt: 

In [17]:
# predict the classifications

for c in loc:    
    # for training
    c.predict(X_train)
    trainAcc = c.score(X_train, y_train)
    trainError = 1-trainAcc

    # for testing 
    c.predict(X_test)
    testAcc = c.score(X_test, y_test)
    testError = 1-testAcc

    # print results
    print('---------------------\n') 
    print("Training accuracy: %s" % (trainAcc))
    print("Training Error: %s" % (trainError))
    print("Testing accuracy: %s" % (testAcc))
    print("Testing Error: %s\n" % (testError))

---------------------

Training accuracy: 0.6212312549329124
Training Error: 0.3787687450670876
Testing accuracy: 0.6239522659468675
Testing Error: 0.3760477340531325

---------------------

Training accuracy: 0.9799210734017364
Training Error: 0.020078926598263602
Testing accuracy: 0.5871572666571956
Testing Error: 0.4128427333428044

---------------------

Training accuracy: 0.6215627466456196
Training Error: 0.3784372533543804
Testing accuracy: 0.6245205284841597
Testing Error: 0.37547947151584027

---------------------

Training accuracy: 0.598342541436464
Training Error: 0.40165745856353596
Testing accuracy: 0.5985698726144812
Testing Error: 0.40143012738551875



In [11]:
# for training
rfc50.predict(X_train)
trainAcc = rfc50.score(X_train, y_train)
trainError = 1-trainAcc
    
# for testing 
dt.predict(X_test)
testAcc = rfc50.score(X_test, y_test)
testError = 1-testAcc

# print results
print("Training accuracy: %s" % (trainAcc))
print("Training Error: %s" % (trainError))
print("Testing accuracy: %s" % (testAcc))
print("Testing Error: %s" % (testError))

NameError: name 'rfc50' is not defined

In [12]:
stumpAccs = {}

features

for column in justNumeric.columns:
    colName = column
    singCol = X_train[column].values
    dtStump = tree.DecisionTreeClassifier(max_depth = 1, random_state = 4400)
    dtStump.fit(singCol.reshape(-1, 1), y_train)
    
    singColtest = X_test[column].values
    dtStump.predict(singColtest.reshape(-1, 1))
    acc = dtStump.score(singColtest.reshape(-1, 1), y_test)
    stumpAccs[colName] = acc
    
#stumpAccsSorted = dict(sorted(stumpAccs.items(), key=operator.itemgetter(1),reverse=True))

for key, val in stumpAccs.items():
    print("Column: %s \nAccuracy: %s\n-----------------\n" % (key, val))
    
# for column in oheFeat.columns:

63350
63350
63350
63350
63350
63350
63350
63350
63350
63350
Column: LOC_X 
Accuracy: 0.5523511862480466
-----------------

Column: LOC_Y 
Accuracy: 0.5523511862480466
-----------------

Column: MINUTES_REMAINING 
Accuracy: 0.5523511862480466
-----------------

Column: PERIOD 
Accuracy: 0.5523511862480466
-----------------

Column: QUARTER 
Accuracy: 0.5523511862480466
-----------------

Column: SECONDS_REMAINING 
Accuracy: 0.5523511862480466
-----------------

Column: SHOT_ATTEMPTED_FLAG 
Accuracy: 0.5523511862480466
-----------------

Column: SHOT_DISTANCE 
Accuracy: 0.6239522659468675
-----------------

Column: SHOT_TIME 
Accuracy: 0.5523511862480466
-----------------

Column: PLAYER_AVG 
Accuracy: 0.5611119003646351
-----------------

