# Predicting Position of NBA Players
### Based on in game stats

In [1]:
import pandas as pd

### Creating Dataframe

In [12]:
shot_df = pd.read_csv("./csvData/NBA_Shot_dist - Sheet2.csv")
guard_df = pd.read_csv("./csvData/NBA_Shot_dist - guard_list.csv")
forward_df = pd.read_csv("./csvData/NBA_Shot_dist - forwards_list.csv")
center_df = pd.read_csv("./csvData/NBA_Shot_dist - center_list.csv")
heightWeight_df = pd.read_csv("./csvData/playerHeight.csv")
salary_df = pd.read_csv("./csvData/playerSalaries.csv")

import numpy as np

for col in salary_df.select_dtypes([np.object]):

    salary_df[col] = salary_df[col].str.lstrip('$')
salary_df = salary_df.replace(regex={',':''})
salary_df = salary_df.iloc[:,[1,2]]
salary_df.columns = ['name', 'salary']

heightWeight_df = heightWeight_df.iloc[:,[0,7,9]]

shot_df.columns = ['name', 'team', 'szn', 'type', 'games', 
              '0to8_make', '8to16_make', '16to24_make', '24plus_make', 'bcMake',
              '0to8_att', '8to16_att', '16to24_att', '24plus_att', 'bcatt', 
              'avg_dist', 'avg_madeDist', 'avg_missDist']

gen_df = pd.read_csv("./csvData/NBA_Shot_dist - gen_stats.csv")
gen_df = gen_df.iloc[:, [0,19,20,22,23]]
gen_df.columns = ['name','reb','ast','stl','blk',]

position_df = pd.concat([guard_df, forward_df, center_df])
position_df = position_df.groupby(['name'], as_index = False).sum()

In [13]:
df = pd.merge(heightWeight_df, shot_df, how='inner')
df = pd.merge(df,gen_df, how='inner')
df = pd.merge(df,salary_df, how='inner')
df = pd.merge(df, position_df, how='inner')

### Deleting Columns

In [14]:
del df['0to8_att']
del df['8to16_att']
del df['16to24_att']
del df['24plus_att']
del df['bcatt']
del df['bcMake']
del df['avg_missDist']
del df['avg_dist']
del df['team']
del df['szn']
del df['type']
print(df)

                      name  heightMeters  weightKilograms  games  0to8_make  \
0             Alex Abrines          1.98             86.2     64      0.477   
1               Quincy Acy          2.01            108.9     59      0.419   
2             Steven Adams          2.13            115.7     64      0.658   
3            Arron Afflalo          1.96             95.3     46      0.571   
4             Cole Aldrich          2.11            113.4     16      0.273   
5        LaMarcus Aldridge          2.11            117.9     62      0.631   
6               Tony Allen          1.93             96.6     20      0.536   
7          Al-Farouq Aminu          2.06             99.8     55      0.490   
8          Justin Anderson          1.98            103.4     28      0.559   
9            Kyle Anderson          2.06            104.3     61      0.593   
10           Ryan Anderson          2.08            108.9     57      0.571   
11   Giannis Antetokounmpo          2.11            

### Creating X and y - Train and Test Sets

In [15]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, 1:13]

y = df.iloc[:, 14:17]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)


### Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn import metrics


""" RANDOM FOREST PREDICTION """

# X, y = make_classification(n_samples=100, n_features=5, n_informative=5, n_redundant=0, random_state=0, shuffle=False)

clf = RandomForestClassifier(n_estimators=120, max_depth=15, random_state=0)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)


print(metrics.accuracy_score(y_test, y_pred), " -- Accuracy with Random Forests\n")

print("-------------- Importances per column with RF")

print((clf.feature_importances_), "\n")

0.6944444444444444  -- Accuracy with Random Forests

-------------- Importances per column with RF
[0.29006461 0.22221994 0.02722407 0.04426088 0.02469042 0.03090448
 0.04589267 0.0960454  0.06754865 0.06845396 0.02892628 0.05376863] 



### Splitting Y's up
* y1 = guard
* y2 = forward
* y3 = center

In [17]:
y1 = y.iloc[:,0]
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=.3)
#y2 = y.iloc[:,1]
#X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=.3)
#y3 = y.iloc[:,2]
#X_train, X_test, y3_train, y3_test = train_test_split(X, y3, test_size=.3)




In [18]:
""" RANDOM FOREST PREDICTION """

# X, y = make_classification(n_samples=100, n_features=5, n_informative=5, n_redundant=0, random_state=0, shuffle=False)

clf = RandomForestClassifier(n_estimators=120, max_depth=15, random_state=0)

clf.fit(X_train, y1_train)

y1_pred = clf.predict(X_test)


print(metrics.accuracy_score(y1_test, y1_pred), " -- Accuracy with Random Forests\n")

print("-------------- Importances per column with RF")

print((clf.feature_importances_), "\n")

0.9351851851851852  -- Accuracy with Random Forests

-------------- Importances per column with RF
[0.35066265 0.21517331 0.02063886 0.03027038 0.01727586 0.02003009
 0.02556424 0.07345014 0.09454572 0.09030768 0.02539937 0.0366817 ] 



### Using KMeans to determine how many groups to make
* Make 20 models
* Group by group number
* sum by position
* get score of model
* check w salaries

In [19]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Use silhouette score to find optimal number of clusters to segment the data
num_clusters = np.arange(2,10)
results = {}
for size in num_clusters:
    model = KMeans(n_clusters = size).fit(X)
    predictions = model.predict(X)
    results[size] = silhouette_score(X, predictions)

best_size = max(results, key=results.get)

In [20]:
kmeans = KMeans(n_clusters= 20)
kmeans = kmeans.fit(X)

labels = kmeans.predict(X)
centroids = kmeans.cluster_centers_


In [21]:
X['group'] = labels
X['name'] = df['name']
X['guard'] = df['guard']
X['forward'] = df['forward']
X['center'] = df['center']
X['salary'] = df['salary']
X.groupby('group')['guard','forward','center'].sum()

Unnamed: 0_level_0,guard,forward,center
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,24,6,0
1,11,1,0
2,1,10,26
3,6,4,0
4,7,8,2
5,8,16,2
6,3,21,4
7,3,11,4
8,0,9,6
9,0,0,4
