In [1]:
#This file was written on Kaggle notebooks and is designed to work with shot data from the 2014-2015 NBA season
#Dataset link: https://www.kaggle.com/fudgele/nba-shot-made-missed-predictor-2014-2015-season/edit
#The most accurate result I got was about 60% using a fully connected (dense) neural network. In the context of machine learning, this is relatively inaccurate.
#However, given how random and unpredictable a basketball shot is, I'd say 60% accuracy in predicting whether a shot was made or missed is a success.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
#Printing the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

/kaggle/input/nba-shot-logs/shot_logs.csv


In [2]:
#Importing packages
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from keras.layers import Dense
from keras import Sequential
from keras.utils import to_categorical

In [3]:
#Creating dataframe from the shot log CSV (the path printed by the directory walk in the first cell)
df = pd.read_csv('/kaggle/input/nba-shot-logs/shot_logs.csv')

In [4]:
#Exploratory data analysis - there are a lot of variables so I will only test the ones I am not sure about

#Home vs away games
def field_goal_percentage(shot_results):
    """Return the fraction of entries in ``shot_results`` that equal "made".

    shot_results: pandas Series of SHOT_RESULT strings ("made" / "missed").
    Returns NaN for an empty Series instead of raising ZeroDivisionError.
    """
    shots_taken = len(shot_results)
    if shots_taken == 0:
        return float("nan")
    #Counting matches directly avoids replacing strings with ints, which relies on
    #pandas silently downcasting an object column (deprecated behaviour).
    shots_made = int((shot_results == "made").sum())
    return shots_made / shots_taken

#One pass per location code instead of a copy-pasted block for each
for location_code, location_name in (("H", "home"), ("A", "away")):
    location = df.loc[df.LOCATION == location_code, "SHOT_RESULT"]
    print("The field goal percentage at " + location_name + " games was: " + str(field_goal_percentage(location)))

The field goal percentage at home games was: 0.45617355397753934
The field goal percentage at away games was: 0.4481172526701489


In [5]:
#Quarters (are players fatiguing?)

#Looping over the four quarters instead of repeating the same block four times.
#Shots are counted by comparing against "made" directly, which avoids replacing
#strings with ints (that relies on pandas silently downcasting an object column).
for quarter_number, quarter_name in ((1, "first"), (2, "second"), (3, "third"), (4, "fourth")):
    period = df.loc[df.PERIOD == quarter_number, "SHOT_RESULT"]
    shots_taken = len(period)
    shots_made = int((period == "made").sum())
    #Guarding against a quarter with no shots so the cell cannot divide by zero
    percentage = shots_made / shots_taken if shots_taken else float("nan")
    print("The field goal percentage in the " + quarter_name + " quarter was: " + str(percentage))

The field goal percentage in the first quarter was: 0.46052825299608374
The field goal percentage in the second quarter was: 0.4511073899718808


The field goal percentage in the third quarter was: 0.45714197013442615
The field goal percentage in the fourth quarter was: 0.4400988909109638


In [6]:
#Keeping only the columns that the later cells work with
columns_to_keep = [
    'LOCATION',
    'PERIOD',
    'GAME_CLOCK',
    'SHOT_CLOCK',
    'DRIBBLES',
    'TOUCH_TIME',
    'SHOT_DIST',
    'CLOSEST_DEFENDER_PLAYER_ID',
    'CLOSE_DEF_DIST',
    'FGM',
    'player_id',
    'FINAL_MARGIN',
]
df = df[columns_to_keep]

In [7]:
#Switching string in game clock ("minutes:seconds") to an int, in terms of seconds left

#Splitting the whole column on ":" at once: column 0 holds the minutes, column 1 the seconds
clock_parts = df['GAME_CLOCK'].str.split(":", expand=True)

#Multiplying minutes by 60 and adding the seconds to get the total seconds left
seconds_left = 60 * clock_parts[0].astype("int64") + clock_parts[1].astype("int64")

#Swapping GAME_CLOCK for SECONDS_LEFT (added as the last column).
#assign() aligns on df's own index, so rows cannot be mismatched even if df
#does not carry a default 0..n-1 index.
df = df.drop(columns=['GAME_CLOCK']).assign(SECONDS_LEFT=seconds_left)
df

Unnamed: 0,LOCATION,PERIOD,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,player_id,FINAL_MARGIN,SECONDS_LEFT
0,A,1,10.8,2,1.9,7.7,101187,1.3,1,203148,24,69
1,A,1,3.4,0,0.8,28.2,202711,6.1,0,203148,24,14
2,A,1,,3,2.7,10.1,202711,0.9,0,203148,24,0
3,A,2,10.3,2,1.9,17.2,203900,3.4,0,203148,24,707
4,A,2,10.9,2,2.7,3.7,201152,1.1,0,203148,24,634
...,...,...,...,...,...,...,...,...,...,...,...,...
128064,A,3,18.3,5,6.2,8.7,203935,0.8,0,101127,-16,112
128065,A,4,19.8,4,5.2,0.6,202323,0.6,1,101127,-16,688
128066,A,4,23.0,2,4.2,16.9,201977,4.2,1,101127,-16,670
128067,A,4,9.1,4,4.5,18.3,202340,3.0,0,101127,-16,157


In [8]:
#Switching location column to binary (away "A" = 0, home "H" = 1)
#map() builds the integer column in one step; chained replace() on a string column
#relies on pandas silently downcasting the result, which is deprecated.
#NOTE: any value other than "A" or "H" becomes NaN here.
location_column = df['LOCATION'].map({"A": 0, "H": 1})

#Replacing the original string column with the encoded one (added as the last column).
#assign() aligns on df's own index, so no separate dataframe/concat step is needed.
df = df.drop(columns=['LOCATION']).assign(LOCATION=location_column)
df

Unnamed: 0,PERIOD,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,player_id,FINAL_MARGIN,SECONDS_LEFT,LOCATION
0,1,10.8,2,1.9,7.7,101187,1.3,1,203148,24,69,0
1,1,3.4,0,0.8,28.2,202711,6.1,0,203148,24,14,0
2,1,,3,2.7,10.1,202711,0.9,0,203148,24,0,0
3,2,10.3,2,1.9,17.2,203900,3.4,0,203148,24,707,0
4,2,10.9,2,2.7,3.7,201152,1.1,0,203148,24,634,0
...,...,...,...,...,...,...,...,...,...,...,...,...
128064,3,18.3,5,6.2,8.7,203935,0.8,0,101127,-16,112,0
128065,4,19.8,4,5.2,0.6,202323,0.6,1,101127,-16,688,0
128066,4,23.0,2,4.2,16.9,201977,4.2,1,101127,-16,670,0
128067,4,9.1,4,4.5,18.3,202340,3.0,0,101127,-16,157,0


In [9]:
#Removing incomplete rows. dropna() without arguments drops every row that has a NaN in ANY column,
#not only SHOT_CLOCK; in the preview above the missing value shows up in SHOT_CLOCK (row 2).
#NOTE: the index is not reset here, so the row labels have gaps after this step.
df = df.dropna()
df

Unnamed: 0,PERIOD,SHOT_CLOCK,DRIBBLES,TOUCH_TIME,SHOT_DIST,CLOSEST_DEFENDER_PLAYER_ID,CLOSE_DEF_DIST,FGM,player_id,FINAL_MARGIN,SECONDS_LEFT,LOCATION
0,1,10.8,2,1.9,7.7,101187,1.3,1,203148,24,69,0
1,1,3.4,0,0.8,28.2,202711,6.1,0,203148,24,14,0
3,2,10.3,2,1.9,17.2,203900,3.4,0,203148,24,707,0
4,2,10.9,2,2.7,3.7,201152,1.1,0,203148,24,634,0
5,2,9.1,2,4.4,18.4,101114,2.6,0,203148,24,495,0
...,...,...,...,...,...,...,...,...,...,...,...,...
128063,2,15.3,2,1.6,8.9,203096,5.7,1,101127,-16,305,0
128064,3,18.3,5,6.2,8.7,203935,0.8,0,101127,-16,112,0
128065,4,19.8,4,5.2,0.6,202323,0.6,1,101127,-16,688,0
128066,4,23.0,2,4.2,16.9,201977,4.2,1,101127,-16,670,0


In [10]:
#sns.kdeplot(df.DRIBBLES, df.SHOT_RESULT, cmap="Reds", shade=True)


In [11]:
#Dropping the columns that are not used as model inputs
unused_columns = ['SECONDS_LEFT', 'DRIBBLES', 'LOCATION', 'player_id', 'CLOSEST_DEFENDER_PLAYER_ID', 'PERIOD']
df = df.drop(columns=unused_columns)

In [12]:
#Separating the target (FGM) from the remaining feature columns
y = df['FGM']
x = df.drop(columns=['FGM'])

In [13]:
#Splitting data: 80% of the rows for training, the remaining 20% for testing.
#random_state is fixed so that the split, and every accuracy reported below,
#is the same each time the notebook is re-run.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, train_size=0.8, random_state=0
)

In [14]:
#Fixing the row indexes after splitting the data.
#All four pieces are reset together: resetting only x_train would leave y_train with the
#old row labels, so any index-aligned pandas operation between them would pair the wrong rows.
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
x_train

Unnamed: 0,SHOT_CLOCK,TOUCH_TIME,SHOT_DIST,CLOSE_DEF_DIST,FINAL_MARGIN
0,14.3,10.0,3.5,0.1,2
1,21.9,1.1,22.3,4.6,14
2,14.9,5.2,2.2,2.0,-1
3,18.1,0.2,5.9,2.4,4
4,5.9,0.5,22.4,4.3,12
...,...,...,...,...,...
97996,8.3,1.5,23.9,2.8,-37
97997,14.2,4.0,4.7,2.4,-4
97998,19.7,0.8,0.9,1.8,-6
97999,15.1,2.8,9.6,3.8,1


In [15]:
#Logistic regression model
from sklearn.linear_model import LogisticRegression

#fit() returns the fitted estimator, so building and training are chained here
LogReg = LogisticRegression(random_state=0, solver='liblinear').fit(x_train, y_train)

#Predicted labels for the held-out test rows
y_pred_logistic_regression = LogReg.predict(x_test)

In [16]:
#Accuracy on the training rows vs. accuracy on the held-out test rows
logreg_train_accuracy = LogReg.score(x_train, y_train)
logreg_test_accuracy = accuracy_score(y_test, y_pred_logistic_regression)
print("Training accuracy is: " + str(logreg_train_accuracy))
print("Testing accuracy is: " + str(logreg_test_accuracy))

Training accuracy is: 0.6064529953775982
Testing accuracy is: 0.6069548181706869


In [17]:
#Decision tree model
from sklearn.tree import DecisionTreeClassifier

#random_state is fixed (as for the logistic regression) so the fitted tree is the same on every run
#NOTE: the tree is grown without any depth limit; the recorded run shows ~1.00 training
#accuracy against ~0.54 testing accuracy, i.e. it memorises the training rows.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(x_train, y_train)

#Predicted labels for the held-out test rows
y_pred_decision_tree = clf.predict(x_test)

In [18]:
#Accuracy on the training rows vs. accuracy on the held-out test rows
tree_train_accuracy = clf.score(x_train, y_train)
tree_test_accuracy = accuracy_score(y_test, y_pred_decision_tree)
print("Training accuracy is: " + str(tree_train_accuracy))
print("Testing accuracy is: " + str(tree_test_accuracy))

Training accuracy is: 0.9999387761349374
Testing accuracy is: 0.5431206889514714


In [None]:
#Random forest model
from sklearn.ensemble import RandomForestClassifier

#random_state is fixed (as for the logistic regression) so the bootstrap samples and
#feature choices, and therefore the reported accuracies, are the same on every run
rfc = RandomForestClassifier(random_state=0)

rfc.fit(x_train, y_train)

#Predicted labels for the held-out test rows
y_pred_rfc = rfc.predict(x_test)

In [None]:
#Accuracy on the training rows vs. accuracy on the held-out test rows
forest_train_accuracy = rfc.score(x_train, y_train)
forest_test_accuracy = accuracy_score(y_test, y_pred_rfc)
print("Training accuracy is: " + str(forest_train_accuracy))
print("Testing accuracy is: " + str(forest_test_accuracy))

In [None]:
#Naive Bayes model (Gaussian)
from sklearn.naive_bayes import GaussianNB

#fit() returns the fitted estimator, so building and training are chained here
gnb = GaussianNB().fit(x_train, y_train)

#Predicted labels for the held-out test rows
y_pred_gnb = gnb.predict(x_test)

In [None]:
#Accuracy on the training rows vs. accuracy on the held-out test rows
bayes_train_accuracy = gnb.score(x_train, y_train)
bayes_test_accuracy = accuracy_score(y_test, y_pred_gnb)
print("Training accuracy is: " + str(bayes_train_accuracy))
print("Testing accuracy is: " + str(bayes_test_accuracy))

In [None]:
#Neural network model: a stack of fully connected (Dense) layers that halves in width
#from 256 units down to 2, followed by a single sigmoid unit for the made/missed output
model = Sequential()
for units in (256, 128, 64, 32, 16, 8, 4, 2):
    model.add(Dense(units, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

#Binary cross-entropy matches the single sigmoid output and the 0/1 FGM target
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x_train, y_train, epochs=3, batch_size=15)

#Sequential.predict_classes() no longer exists in current Keras/TensorFlow releases.
#For a single sigmoid output it is equivalent to thresholding the predicted probability
#at 0.5, which is done explicitly here (result keeps the same (n, 1) int32 shape).
y_pred_nn = (model.predict(x_test) > 0.5).astype("int32")

In [None]:
#Share of held-out test rows whose predicted class matches the true label
nn_test_accuracy = accuracy_score(y_test, y_pred_nn)
print("Testing accuracy is: " + str(nn_test_accuracy))