In [1]:
#Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import json
import csv
import numpy

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [2]:
#Read in csv files
player_data_df = pd.read_csv("raw_data/player_data.csv",encoding = "ISO-8859-1",low_memory=False)
seasons_stats_df = pd.read_csv("raw_data/seasons_stats.csv",encoding = "ISO-8859-1",low_memory=False)

In [3]:
#Display player data
player_data_df.head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University


In [4]:
#Rename column to merge on
seasons_stats_df = seasons_stats_df.rename(columns = {'Player':'name'})

#Display player's season stats
seasons_stats_df.head()

Unnamed: 0.1,Unnamed: 0,Year,name,Pos,Age,Tm,G,GS,MP,PER,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,...,0.705,,,,176.0,,,,217.0,458.0
1,1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,...,0.708,,,,109.0,,,,99.0,279.0
2,2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,...,0.698,,,,140.0,,,,192.0,438.0
3,3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,...,0.559,,,,20.0,,,,29.0,63.0
4,4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,...,0.548,,,,20.0,,,,27.0,59.0


In [5]:
#Locate all active players
player_active_df = player_data_df.loc[player_data_df["year_end"]==2018]

#Display active players
player_active_df.head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
9,Alex Abrines,2017,2018,G-F,6-6,190.0,"August 1, 1993",
14,Quincy Acy,2013,2018,F,6-7,240.0,"October 6, 1990",Baylor University
21,Steven Adams,2014,2018,C,7-0,255.0,"July 20, 1993",University of Pittsburgh
23,Bam Adebayo,2018,2018,C-F,6-10,243.0,"July 18, 1997",University of Kentucky
26,Arron Afflalo,2008,2018,G,6-5,210.0,"October 15, 1985","University of California, Los Angeles"


In [6]:
#Check number of players
len(player_active_df)

471

In [7]:
#Merge dataframes
players_merged_df = pd.merge(seasons_stats_df,player_active_df,on= "name")

#Display
players_merged_df.head()

Unnamed: 0.1,Unnamed: 0,Year,name,Pos,Age,Tm,G,GS,MP,PER,...,TOV,PF,PTS,year_start,year_end,position,height,weight,birth_date,college
0,6698,1982.0,Larry Nance,PF,22.0,PHO,80.0,0.0,1186.0,14.7,...,104.0,169.0,529.0,2016,2018,F,6-9,230.0,"January 1, 1993",University of Wyoming
1,7071,1983.0,Larry Nance,PF,23.0,PHO,82.0,82.0,2914.0,18.7,...,190.0,254.0,1370.0,2016,2018,F,6-9,230.0,"January 1, 1993",University of Wyoming
2,7426,1984.0,Larry Nance,PF,24.0,PHO,82.0,82.0,2899.0,19.4,...,177.0,274.0,1451.0,2016,2018,F,6-9,230.0,"January 1, 1993",University of Wyoming
3,7772,1985.0,Larry Nance,PF,25.0,PHO,61.0,55.0,2202.0,20.6,...,136.0,185.0,1211.0,2016,2018,F,6-9,230.0,"January 1, 1993",University of Wyoming
4,8154,1986.0,Larry Nance,PF,26.0,PHO,73.0,69.0,2484.0,20.5,...,210.0,247.0,1474.0,2016,2018,F,6-9,230.0,"January 1, 1993",University of Wyoming


In [8]:
#Locate stats for the years 2013-2016
data_to_predict_df = players_merged_df.loc[(players_merged_df['Year']> 2012) & (players_merged_df['Year']< 2017)]

#Display
data_to_predict_df.head()

Unnamed: 0.1,Unnamed: 0,Year,name,Pos,Age,Tm,G,GS,MP,PER,...,TOV,PF,PTS,year_start,year_end,position,height,weight,birth_date,college
15,23910,2016.0,Larry Nance,PF,23.0,LAL,63.0,22.0,1266.0,13.4,...,41.0,124.0,349.0,2016,2018,F,6-9,230.0,"January 1, 1993",University of Wyoming
34,22503,2014.0,Tim Hardaway,SG,21.0,NYK,81.0,1.0,1875.0,12.7,...,47.0,144.0,824.0,2014,2018,G,6-6,205.0,"March 16, 1992",University of Michigan
35,23113,2015.0,Tim Hardaway,SG,22.0,NYK,70.0,30.0,1681.0,12.1,...,82.0,119.0,804.0,2014,2018,G,6-6,205.0,"March 16, 1992",University of Michigan
36,23721,2016.0,Tim Hardaway,SG,23.0,ATL,51.0,1.0,864.0,11.7,...,23.0,48.0,326.0,2014,2018,G,6-6,205.0,"March 16, 1992",University of Michigan
56,21769,2013.0,Vince Carter,SF,36.0,DAL,81.0,3.0,2093.0,17.8,...,106.0,226.0,1088.0,1999,2018,G-F,6-6,220.0,"January 26, 1977",University of North Carolina


In [9]:
#Clean dataframe by replacing na with 0
data_to_predict_df = data_to_predict_df.fillna(0)

In [10]:
#Export dataframe as csv
data_to_predict_df.to_csv('data_to_predict.csv')

In [11]:
#Create list of header names
original_headers = list(data_to_predict_df.columns.values)

#Display for reference
original_headers

['Unnamed: 0',
 'Year',
 'name',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'PER',
 'TS%',
 '3PAr',
 'FTr',
 'ORB%',
 'DRB%',
 'TRB%',
 'AST%',
 'STL%',
 'BLK%',
 'TOV%',
 'USG%',
 'blanl',
 'OWS',
 'DWS',
 'WS',
 'WS/48',
 'blank2',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'year_start',
 'year_end',
 'position',
 'height',
 'weight',
 'birth_date',
 'college']

In [12]:
#Define variable to predict
class_column = 'PTS'

In [13]:
#Define variables used to make prediction
feature_columns = ['Age', 'MP', 'FG', '3P', '3PA','eFG%', 'ORB'
                   ,'TOV', 'PF','TS%','weight']

In [14]:
#Pull above variables from data
nba_feature = data_to_predict_df[feature_columns]
nba_class = data_to_predict_df[class_column]

In [15]:
#Split data into random train and test subsets
train_feature, test_feature, train_class, test_class = train_test_split(nba_feature, nba_class,
    train_size=0.75, test_size=0.25, random_state=0)

### Prediction Method 1

In [16]:
#Train the data
linearsvm = LinearSVC(random_state=0).fit(train_feature, train_class)

In [17]:
#Predict
prediction1 = linearsvm.predict(test_feature)

In [18]:
#Display prediction outcome/accuracy
pd.crosstab(test_class, prediction1, rownames=['Actual'], colnames=['Predicted'], margins=True).head()

Predicted,8.0,9.0,13.0,22.0,23.0,25.0,33.0,53.0,62.0,244.0,308.0,593.0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2.0,1,0,0,0,0,0,0,0,0,0,0,0,1
4.0,1,0,0,0,0,0,0,0,0,0,0,0,1
5.0,1,0,0,0,0,0,0,0,0,0,0,0,1
7.0,1,0,0,0,0,0,0,0,0,0,0,0,1
10.0,1,0,0,0,0,0,0,0,0,0,0,0,1


In [19]:
#Evaluate scores using cross validation
scores = cross_val_score(linearsvm, nba_feature, nba_class, cv=6)



In [20]:
#Display prediction conclusion
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))

Cross-validation scores: [0.00249066 0.00729927 0.         0.11111111 0.         0.        ]
Average cross-validation score: 0.02


### Prediction Method 2

In [21]:
#Train the data
nb = GaussianNB().fit(train_feature, train_class)

In [22]:
#Predict
prediction2= nb.predict(test_feature)

In [23]:
#Display prediction outcome/accuracy
pd.crosstab(test_class, prediction2, rownames=['Actual'], colnames=['Predicted'], margins=True).head()

Predicted,13.0,21.0,22.0,29.0,34.0,53.0,58.0,83.0,98.0,104.0,...,1134.0,1149.0,1249.0,1289.0,1377.0,1417.0,1446.0,1463.0,1920.0,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4.0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5.0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7.0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [24]:
#Evaluate scores using cross validation
scores = cross_val_score(nb, nba_feature, nba_class, cv=6)

#Display prediction conclusion
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))



Cross-validation scores: [0.00996264 0.02189781 0.06493506 0.33333333 1.         1.        ]
Average cross-validation score: 0.41


In [25]:
#Train data used for prediction output
train_class_df = pd.DataFrame(train_class,columns=[class_column])     
train_data_df = pd.merge(train_class_df, train_feature, left_index=True, right_index=True)
train_data_df.to_csv('train_data.csv', index=False)

train_data_df.head()

Unnamed: 0,PTS,Age,MP,FG,3P,3PA,eFG%,ORB,TOV,PF,TS%,weight
982,1021.0,27.0,2214.0,330.0,35.0,124.0,0.452,38.0,148.0,90.0,0.539,190.0
2286,100.0,21.0,368.0,44.0,0.0,0.0,0.367,26.0,25.0,44.0,0.379,257.0
1435,1209.0,26.0,2556.0,413.0,162.0,404.0,0.554,30.0,111.0,119.0,0.601,210.0
2320,245.0,21.0,614.0,113.0,5.0,21.0,0.54,41.0,27.0,51.0,0.551,230.0
1956,1562.0,22.0,3167.0,553.0,185.0,503.0,0.501,42.0,243.0,172.0,0.546,195.0


In [26]:
#Prediction method 1 output
temp_df = pd.DataFrame(test_class,columns=[class_column])
temp_df['Predicted PTS']=pd.Series(prediction1, index=temp_df.index)
test_data_df = pd.merge(temp_df, test_feature, left_index=True, right_index=True)
test_data_df.to_csv('test_data_1.csv', index=False)

test_data_df.head()

Unnamed: 0,PTS,Predicted PTS,Age,MP,FG,3P,3PA,eFG%,ORB,TOV,PF,TS%,weight
2021,940.0,593.0,23.0,2208.0,375.0,73.0,246.0,0.435,41.0,113.0,155.0,0.46,225.0
2236,932.0,22.0,21.0,2439.0,359.0,105.0,297.0,0.538,54.0,98.0,143.0,0.566,210.0
1590,914.0,593.0,27.0,2048.0,300.0,76.0,226.0,0.464,42.0,150.0,165.0,0.534,200.0
1033,685.0,593.0,26.0,1605.0,289.0,19.0,65.0,0.466,75.0,75.0,115.0,0.491,221.0
1245,904.0,22.0,26.0,1530.0,338.0,76.0,271.0,0.45,35.0,161.0,63.0,0.493,190.0


In [27]:
#Prediction method 2 output
temp_df = pd.DataFrame(test_class,columns=[class_column])
temp_df['Predicted PTS']=pd.Series(prediction2, index=temp_df.index)
temp_df['Name'] = pd.Series(data_to_predict_df['name'])
temp_df['Year'] = pd.Series(data_to_predict_df['Year'])
test_data_df = pd.merge(temp_df, test_feature, left_index=True, right_index=True)
test_data_df.to_csv('test_data_2.csv', index=False)

test_data_df.head()

Unnamed: 0,PTS,Predicted PTS,Name,Year,Age,MP,FG,3P,3PA,eFG%,ORB,TOV,PF,TS%,weight
2021,940.0,813.0,Dion Waiters,2015.0,23.0,2208.0,375.0,73.0,246.0,0.435,41.0,113.0,155.0,0.46,225.0
2236,932.0,894.0,Gary Harris,2016.0,21.0,2439.0,359.0,105.0,297.0,0.538,54.0,98.0,143.0,0.566,210.0
1590,914.0,813.0,Jeremy Lin,2016.0,27.0,2048.0,300.0,76.0,226.0,0.464,42.0,150.0,165.0,0.534,200.0
1033,685.0,749.0,Thaddeus Young,2015.0,26.0,1605.0,289.0,19.0,65.0,0.466,75.0,75.0,115.0,0.491,221.0
1245,904.0,1134.0,Derrick Rose,2015.0,26.0,1530.0,338.0,76.0,271.0,0.45,35.0,161.0,63.0,0.493,190.0


In [28]:
len(test_data_df)

292