In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sklearn
from sklearn import linear_model

In [2]:
# Open the scraped NBA draft workbook and read each worksheet into a
# DataFrame, keyed by the worksheet's name.
xls = pd.ExcelFile('nba_draft.xlsx')

# NOTE(review): the name `map` shadows the Python builtin; it is kept because
# later cells look the sheets up through this name.
map = {name: xls.parse(name) for name in xls.sheet_names}
    

In [3]:
# Build the modelling table from every sheet in `map`.
#
# For each sheet: keep the rows whose 'position' value is 'Small' (per the
# original author's note these are the Small Forwards), renumber them from 1
# and store that number in 'Pk', then drop rows with any missing value.
# Non-empty results are stacked into `new_frame`, from which the feature
# matrix X (all 'college*' columns) and the target y ('Pk' as int) are taken.

# Columns kept for modelling: the target 'Pk' followed by the college stats.
model_columns = ['Pk', 'college_G', 'college_MP', 'college_FG', 'college_FGA',
       'college_3P', 'college_3PA', 'college_FT', 'college_FTA', 'college_ORB',
       'college_TRB', 'college_AST', 'college_STL', 'college_BLK',
       'college_TOV', 'college_PF', 'college_PTS', 'college_FG%',
       'college_3P%', 'college_FT%', 'college_MP.1', 'college_PTS%',
       'college_TRB.1', 'college_AST.1']

small_forward_frames = []
for sheet_name, sheet_df in map.items():
    # .copy() makes an independent frame, so the 'Pk' assignment below writes
    # to our own data instead of a slice of the sheet (this is what raised
    # SettingWithCopyWarning before).
    selected = sheet_df.loc[sheet_df['position'] == 'Small'].copy()
    # reset indices and renumber starting at 1
    selected = selected.reset_index(drop=True)
    selected.index = selected.index + 1
    # 'Pk' becomes the 1-based position within the filtered sheet. This is
    # done before dropna(), so rows removed below leave gaps in the numbering.
    selected['Pk'] = selected.index
    # drop any rows that are missing data
    selected = selected.dropna()
    if not selected.empty:
        small_forward_frames.append(selected)

# Concatenate once instead of DataFrame.append inside the loop: append was
# removed in pandas 2.0 and re-copied the accumulated frame on every pass.
if small_forward_frames:
    new_frame = pd.concat(small_forward_frames)
else:
    # No sheet produced usable rows: fall back to an empty frame with the
    # 2019 sheet's columns, as the original initialisation did.
    new_frame = pd.DataFrame(columns=map['draft_data_2019'].columns)

new_frame = new_frame[model_columns]

X = new_frame.loc[:, new_frame.columns.str.startswith('college')]
y = new_frame['Pk'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [4]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

In [11]:
# K-fold cross-validation of a linear regression that predicts y from X.
# For every fold the model is refit on the training rows, the held-out rows
# are predicted, a true-vs-predicted table is printed, and RMSE / R^2 are
# accumulated so their averages over the folds can be reported at the end.

# create an instance of the model
lin_reg_mod = linear_model.LinearRegression()
# running totals of the per-fold metrics (divided by n after the loop)
test_set_rmse = 0
test_set_r2 = 0
# number of splits/ folds
n = 5

# shuffle with a fixed seed so the fold assignment is reproducible
kf = KFold(n_splits=n, shuffle=True, random_state=1)
for train_index, test_index in kf.split(X):
    lin_reg_mod.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = lin_reg_mod.predict(X.iloc[test_index])
    y_test = list(y.iloc[test_index])

    # compare final prediction values against true values.
    # The table is built in one constructor call rather than appended row by
    # row: DataFrame.append was removed in pandas 2.0 and was quadratic.
    # astype(int) truncates toward zero, the same as int() on each value.
    final_predictions = pd.DataFrame({
        'True Ranking': y_test,
        'Predicted Ranking': np.asarray(y_pred).astype(int),
    })
    print(final_predictions)

    # check the predictions against the actual values by using the root mean
    # square deviation and coefficient of determination metrics; both are
    # computed on the raw (untruncated) predictions
    test_set_rmse = test_set_rmse + np.sqrt(mean_squared_error(y_test, y_pred))
    test_set_r2 = test_set_r2 + r2_score(y_test, y_pred)

print('Average RMSE: ' + str(test_set_rmse/n))
print('Average R2: ' + str(test_set_r2/n))

   True Ranking Predicted Ranking
0             1                 2
1             3                 1
2             7                 3
3             3                 6
4             5                 2
5             8                 3
6             9                 5
7             2                 6
8             3                 4
9             2                 6
10            7                 4
11            3                -5
12            4                 6
13            1                 4
14            5                 5
15            6                 3
   True Ranking Predicted Ranking
0             3                 1
1             6                 5
2             3                 4
3             2                 1
4             2                 2
5             4                 3
6             7                 6
7             4                 6
8             1                 4
9             5                 4
10            2                 3
11            

In [6]:
# Tabulate each feature name next to the coefficient the fitted linear
# regression assigned to it. `lin_reg_mod` holds whatever fit was run last,
# so these coefficients come from that fit only.
coeff_df = pd.DataFrame({
    'Features': X.columns,
    'Coefficient Estimate': lin_reg_mod.coef_,
})
coeff_df

Unnamed: 0,Features,Coefficient Estimate
0,college_G,-0.00278
1,college_MP,-0.000681
2,college_FG,-0.017956
3,college_FGA,0.011271
4,college_3P,0.035337
5,college_3PA,-0.013826
6,college_FT,0.001278
7,college_FTA,0.000203
8,college_ORB,0.003436
9,college_TRB,0.003081
