In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sklearn
from sklearn import linear_model

In [2]:
# import scraped NBA stats from Excel file

xls = pd.ExcelFile('nba_draft.xlsx')

map = {}
for sheet_name in xls.sheet_names:
    map[sheet_name] = xls.parse(sheet_name)
    

In [3]:
new_frame = pd.DataFrame(columns=map['draft_data_2019'].columns)

# looping through each dataframe item in map, and only acquiring the rows for players who play Small Forward
for key, value in map.items():
    test = value.loc[value['position'] == 'Small']
    test.reset_index(drop=True,inplace=True)
    test.index = test.index + 1
    test.Pk = test.index
    # drop any rows that are missing data
    test = test.dropna()
    if test.empty is False:
        new_frame = new_frame.append(test)
        
new_frame = new_frame[['Pk', 'college_MP',
       'college_TRB', 'college_AST', 
        'college_PTS']]

X = new_frame.loc[:,new_frame.columns.str.startswith('college')]
y = new_frame['Pk'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [4]:
# splits the dataset into 80% train data and 20% test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [5]:
# create an instance of the model
lin_reg_mod = linear_model.LinearRegression()

In [6]:
# fit the model on the training data
lin_reg_mod.fit(X_train, y_train)

LinearRegression()

In [7]:
# dataframe showing all the features and their estimated coefficients obtained from the linear regression
coeff_df = pd.DataFrame(X.columns)
coeff_df.columns = ['Features']
coeff_df['Coefficient Estimate'] = pd.Series(lin_reg_mod.coef_)
coeff_df

Unnamed: 0,Features,Coefficient Estimate
0,college_MP,0.000372
1,college_TRB,0.003011
2,college_AST,-0.002098
3,college_PTS,-0.000426


In [8]:
# making predictions on the testing set
y_pred = lin_reg_mod.predict(X_test)

In [9]:
# check the predictions against the actual values by using the root mean square deviation 
# and coefficient of determination metrics
from sklearn.metrics import mean_squared_error, r2_score
test_set_rmse = (np.sqrt(mean_squared_error(y_test, y_pred)))
test_set_r2 = r2_score(y_test, y_pred)

In [10]:
# the lower the rmse value is, the better the fit
print(test_set_rmse)
# the closer the value is to 1, the better the fit
print(test_set_r2)

2.497101802966432
-0.05644768900120467


In [11]:
# compare final prediction values against true values
final_predictions = pd.DataFrame(columns = ['True Ranking','Predicted Ranking'])
y_test = list(y_test)
y_pred = list(y_pred)
for i in range(0,len(y_test)):
    new_row = {'True Ranking': y_test[i],'Predicted Ranking': int(y_pred[i])}
    final_predictions = final_predictions.append(new_row, ignore_index=True)
    
final_predictions

Unnamed: 0,True Ranking,Predicted Ranking
0,5,4
1,1,4
2,6,5
3,9,4
4,7,5
5,4,4
6,6,4
7,2,3
8,5,4
9,9,3
