In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sklearn
from sklearn import linear_model

In [2]:
# import scraped NBA stats from Excel file

xls = pd.ExcelFile('nba_draft.xlsx')

map = {}
for sheet_name in xls.sheet_names:
    map[sheet_name] = xls.parse(sheet_name)
    

In [3]:
new_frame = pd.DataFrame(columns=map['draft_data_2019'].columns)
del map['draft_data_2020']
# looping through each dataframe item in map, and only acquiring the rows for players who play Small Forward
for key, value in map.items():
    test = value.loc[value['position'] == 'Center']
    test = test[['Pk', 'college_G', 'college_MP', 'college_FG', 'college_FGA',
       'college_3P', 'college_3PA', 'college_FT', 'college_FTA', 'college_ORB',
       'college_TRB', 'college_AST', 'college_STL', 'college_BLK',
       'college_TOV', 'college_PF', 'college_PTS', 'college_FG%',
       'college_3P%', 'college_FT%', 'college_MP.1', 'college_PTS%',
       'college_TRB.1', 'college_AST.1']]
    # reset indices and renumber
    test.reset_index(drop=True,inplace=True)
    test.index = test.index + 1
    test.Pk = test.index
    # drop any rows that are missing data
    test = test.dropna()
    if test.empty is False:
        new_frame = new_frame.append(test)

new_frame = new_frame[['Pk', 'college_G', 'college_MP', 'college_FG', 'college_FGA',
       'college_3P', 'college_3PA', 'college_FT', 'college_FTA', 'college_ORB',
       'college_TRB', 'college_AST', 'college_STL', 'college_BLK',
       'college_TOV', 'college_PF', 'college_PTS', 'college_FG%',
       'college_3P%', 'college_FT%', 'college_MP.1', 'college_PTS%',
       'college_TRB.1', 'college_AST.1']]        
        
X = new_frame.loc[:,new_frame.columns.str.startswith('college')]
y = new_frame['Pk'].astype(int)


In [4]:
new_frame

Unnamed: 0,Pk,college_G,college_MP,college_FG,college_FGA,college_3P,college_3PA,college_FT,college_FTA,college_ORB,...,college_TOV,college_PF,college_PTS,college_FG%,college_3P%,college_FT%,college_MP.1,college_PTS%,college_TRB.1,college_AST.1
1,1,36,989,179,293,0,1,90,143,108,...,90,94,448,0.611,0.000,0.629,27.5,12.4,8.4,1.0
2,2,38,893,206,369,1,6,162,268,154,...,78,122,575,0.558,0.167,0.604,23.5,15.1,9.8,1.0
3,3,103,1264,343,726,13,45,168,259,129,...,123,87,867,0.472,0.289,0.649,35.1,8.4,6.3,1.5
4,4,65,1164,345,635,9,33,243,359,74,...,189,87,942,0.543,0.273,0.677,34.2,14.5,8.2,3.2
7,7,101,941,415,798,4,16,210,364,106,...,155,103,1044,0.520,0.250,0.577,26.9,10.3,7.7,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,10,132,3295,532,990,46,113,138,173,311,...,90,331,1248,0.537,0.407,0.798,25.0,9.5,7.8,0.8
2,2,71,1301,249,500,34,91,202,279,142,...,83,169,734,0.498,0.374,0.724,18.3,10.3,5.3,0.3
3,3,65,1496,181,396,26,86,157,257,127,...,90,147,545,0.457,0.302,0.611,23.0,8.4,6.2,1.0
4,4,64,1692,291,489,4,13,184,241,145,...,148,179,770,0.595,0.308,0.763,26.4,12.0,8.7,1.4


In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
# create an instance of the model
lin_reg_mod = linear_model.LinearRegression()
test_set_rmse = 0
test_set_r2 = 0
# number of splits/ folds
n = 5

kf = KFold(n_splits=n, shuffle=True, random_state=1)

# initialize variable for sum of correct prediction fractions
sum_of_c_fraction = 0
for train_index, test_index in kf.split(X):
    
    # initialize variable for counting number of correct predictions
    c = 0
    
    lin_reg_mod.fit(X.iloc[train_index],y.iloc[train_index])
    
    y_pred = lin_reg_mod.predict(X.iloc[test_index])
    
    # compare final prediction values against true values
    final_predictions = pd.DataFrame(columns = ['True Ranking','Predicted Ranking'])
    y_test = list(y.iloc[test_index])
    y_pred = list(y_pred)
    for i in range(0,len(y_test)):
        new_row = {'True Ranking': y_test[i],'Predicted Ranking': int(y_pred[i])}
        if y_test[i] == int(y_pred[i]):
            
            # increment c value if prediction is correct
            c = c + 1
        final_predictions = final_predictions.append(new_row, ignore_index=True)
        
    print(final_predictions)
    print(str(c/n))
    print(r2_score(y_test,y_pred))
    sum_of_c_fraction = sum_of_c_fraction + (c/len(y_test))
    # check the predictions against the actual values by using the root mean square deviation 
    # and coefficient of determination metrics
    test_set_rmse = test_set_rmse + (np.sqrt(mean_squared_error(y_test, y_pred)))
    test_set_r2 = test_set_r2 + r2_score(y_test, y_pred)

print('Average RMSE: ' + str(test_set_rmse/n))
print('Average R2: ' + str(test_set_r2/n))
print('Accuracy: ' + str((1/n)*int(sum_of_c_fraction)))

   True Ranking Predicted Ranking
0             3                10
1             2                 4
2             1                 4
3             9                 3
4            10                 7
5             2                 3
6             4                 3
7             6                 6
8             3                 3
9            10                 6
10            5                 5
11            8                 8
12           10                 5
13            5                 1
14           10                 1
15            6                 5
0.8
-0.46307040243665076
   True Ranking Predicted Ranking
0             8                 9
1             2                12
2            12                 7
3             7                 3
4             5                -1
5             6                 3
6             5                 4
7             7                 5
8             9                 7
9             1                 0
10            9        

In [7]:
# dataframe showing all the features and their estimated coefficients obtained from the linear regression
coeff_df = pd.DataFrame(X.columns)
coeff_df.columns = ['Features']
coeff_df['Coefficient Estimate'] = pd.Series(lin_reg_mod.coef_)
coeff_df

Unnamed: 0,Features,Coefficient Estimate
0,college_G,0.037887
1,college_MP,0.002815
2,college_FG,-0.024697
3,college_FGA,-0.037017
4,college_3P,0.109258
5,college_3PA,-0.066158
6,college_FT,-0.006269
7,college_FTA,-0.042488
8,college_ORB,-0.016898
9,college_TRB,-0.003387
