# Predicting Position of NBA Players
### Based on in game stats

In [1]:
import pandas as pd

### Creating Dataframe

In [2]:
shot_df = pd.read_csv("./NBA_Shot_dist - Sheet2.csv")
guard_df = pd.read_csv("./NBA_Shot_dist - guard_list.csv")
forward_df = pd.read_csv("./NBA_Shot_dist - forwards_list.csv")
center_df = pd.read_csv("./NBA_Shot_dist - center_list.csv")

shot_df.columns = ['name', 'team', 'szn', 'type', 'games', 
              '0to8_make', '8to16_make', '16to24_make', '24plus_make', 'bcMake',
              '0to8_att', '8to16_att', '16to24_att', '24plus_att', 'bcatt', 
              'avg_dist', 'avg_madeDist', 'avg_missDist']

position_df = pd.concat([guard_df, forward_df, center_df])
position_df = position_df.groupby(['name'], as_index = False).sum()

In [3]:
df = pd.merge(shot_df, position_df, how='inner')

### Deleting Columns

In [4]:
del df['0to8_att']
del df['8to16_att']
del df['16to24_att']
del df['24plus_att']
del df['bcatt']
del df['bcMake']
del df['avg_missDist']
del df['avg_dist']
del df['team']
del df['szn']
del df['type']

df

Unnamed: 0,name,games,0to8_make,8to16_make,16to24_make,24plus_make,avg_madeDist,guard,forward,center
0,James Harden,60,0.579,0.437,0.353,0.376,13.13,1,0,0
1,Anthony Davis,59,0.670,0.412,0.369,0.344,6.61,0,1,1
2,Giannis Antetokounmpo,64,0.668,0.359,0.345,0.297,5.46,0,1,0
3,LeBron James,68,0.698,0.430,0.374,0.372,7.96,1,1,0
4,Damian Lillard,61,0.530,0.421,0.469,0.379,14.02,1,0,0
5,Kevin Durant,59,0.714,0.486,0.485,0.433,13.37,0,1,0
6,Stephen Curry,48,0.603,0.537,0.605,0.432,16.56,1,0,0
7,Russell Westbrook,68,0.579,0.381,0.395,0.284,8.85,1,0,0
8,DeMarcus Cousins,46,0.573,0.431,0.276,0.358,8.98,0,0,1
9,Devin Booker,54,0.519,0.392,0.451,0.387,13.67,1,0,0


### Creating X and y - Train and Test Sets

In [5]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, 1:7]

y = df.iloc[:, 7:10]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)


### Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn import metrics


""" RANDOM FOREST PREDICTION """

# X, y = make_classification(n_samples=100, n_features=5, n_informative=5, n_redundant=0, random_state=0, shuffle=False)

clf = RandomForestClassifier(n_estimators=120, max_depth=15, random_state=0)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)


print(metrics.accuracy_score(y_test, y_pred), " -- Accuracy with Random Forests\n")

print("-------------- Importances per column with RF")

print((clf.feature_importances_), "\n")

0.4166666666666667  -- Accuracy with Random Forests

-------------- Importances per column with RF
[0.15766021 0.17872376 0.12944837 0.12772594 0.17372677 0.23271496] 



In [7]:
X_test, y_pred

(     games  0to8_make  8to16_make  16to24_make  24plus_make  avg_madeDist
 506      4      0.000       0.000        0.000        0.000          0.00
 352     27      0.640       0.571        0.571        0.000          6.85
 169     62      0.564       0.300        0.351        0.345         10.19
 435      4      1.000       0.000        0.000        0.500         15.33
 364      5      1.000       0.000        0.500        0.667         13.86
 48      12      0.500       0.348        0.300        0.312         12.41
 118     64      0.602       0.419        0.714        0.448          5.36
 108     63      0.626       0.533        0.500        0.345         12.98
 246     70      0.491       0.413        0.416        0.371         14.95
 257     60      0.602       0.468        0.500        0.284         12.26
 319     58      0.661       0.389        0.294        0.286          7.58
 424     29      0.381       0.000        0.200        0.333         14.80
 394     28      0.596   

### Splitting Y's up
* y1 = guard
* y2 = forward
* y3 = center

In [8]:
y1 = y.iloc[:,0]
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=.3)
y2 = y.iloc[:,1]
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=.3)
y3 = y.iloc[:,2]
X_train, X_test, y3_train, y3_test = train_test_split(X, y3, test_size=.3)




In [9]:
""" RANDOM FOREST PREDICTION """

# X, y = make_classification(n_samples=100, n_features=5, n_informative=5, n_redundant=0, random_state=0, shuffle=False)

clf = RandomForestClassifier(n_estimators=120, max_depth=15, random_state=0)

clf.fit(X_train, y1_train)

y1_pred = clf.predict(X_test)


print(metrics.accuracy_score(y1_test, y1_pred), " -- Accuracy with Random Forests\n")

print("-------------- Importances per column with RF")

print((clf.feature_importances_), "\n")

0.5128205128205128  -- Accuracy with Random Forests

-------------- Importances per column with RF
[0.18064216 0.15680362 0.15619497 0.14045883 0.17268555 0.19321487] 



### Logistic Regression

In [12]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, LinearRegression

clf = LogisticRegression()

clf.fit(X_train, y1_train)

y1_pred = clf.predict(X_test)

print(metrics.accuracy_score(y1_test, y1_pred), " -- Accuracy with Logistic Regression\n")


0.4935897435897436  -- Accuracy with Logistic Regression



### Linear Regression

In [None]:
regr = linear_model.LinearRegression()

regr.fit(X_train,y1_train)

y1_pred = regr.predict(X_test)


print(metrics.accuracy_score(y1_test, y1_pred), " -- Accuracy with Linear Regression\n")