# Predicting Position of NBA Players
### Based on in game stats

In [1]:
import pandas as pd
import numpy as np

### Creating Dataframe

In [2]:
shot_df = pd.read_csv("./csvData/NBA_Shot_dist - Sheet2.csv")
guard_df = pd.read_csv("./csvData/NBA_Shot_dist - guard_list.csv")
forward_df = pd.read_csv("./csvData/NBA_Shot_dist - forwards_list.csv")
center_df = pd.read_csv("./csvData/NBA_Shot_dist - center_list.csv")
heightWeight_df = pd.read_csv("./csvData/playerHeight.csv")
salary_df = pd.read_csv("./csvData/playerSalaries.csv")
dribble_df = pd.read_csv("./csvData/NBA_Shot_dist - dribble stats.csv")

for col in salary_df.select_dtypes([np.object]):

    salary_df[col] = salary_df[col].str.lstrip('$')
salary_df = salary_df.replace(regex={',':''})
salary_df = salary_df.iloc[:,[1,2]]
salary_df.columns = ['name', 'salary']

heightWeight_df = heightWeight_df.iloc[:,[0,7,9]]

shot_df.columns = ['name', 'team', 'szn', 'type', 'games', 
              '0to8_make', '8to16_make', '16to24_make', '24plus_make', 'bcMake',
              '0to8_att', '8to16_att', '16to24_att', '24plus_att', 'bcatt', 
              'avg_dist', 'avg_madeDist', 'avg_missDist']

gen_df = pd.read_csv("./csvData/NBA_Shot_dist - gen_stats.csv")
gen_df = gen_df.iloc[:, [0,19,20,22,23]]
gen_df.columns = ['name','reb','ast','stl','blk',]

position_df = pd.concat([guard_df, forward_df, center_df])
position_df = position_df.groupby(['name'], as_index = False).sum()

dribble_df = dribble_df.iloc[:,[0,11,14,15]]
dribble_df.columns = ['name', 'avgDrib', 'post', 'paint']

In [3]:
df = pd.merge(heightWeight_df, shot_df, how='inner')
df = pd.merge(df,gen_df, how='inner')
df = pd.merge(df,dribble_df, how='inner')
df = pd.merge(df,salary_df, how='inner')
df = pd.merge(df, position_df, how='inner')
df = df.loc[df['games'] >= 26]

### Deleting Columns

In [4]:
del df['0to8_att']
del df['8to16_att']
del df['16to24_att']
del df['24plus_att']
del df['bcatt']
del df['bcMake']
del df['avg_missDist']
del df['avg_dist']
del df['team']
del df['szn']
del df['type']


### Creating X and y - Train and Test Sets

In [5]:
from sklearn.model_selection import train_test_split

x = df.iloc[:, 1:16]
x['heightMeters'] = x['heightMeters']**3
x['weightKilograms'] = x['weightKilograms']**3
x['avg_madeDist'] = x['avg_madeDist']**3
x['reb'] = x['reb']**2
x['avgDrib'] = x['avgDrib']**2
x['games'] = (x['games'])/82 

y = df.iloc[:, 17:20]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)


### Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn import metrics


""" RANDOM FOREST PREDICTION """

# X, y = make_classification(n_samples=100, n_features=5, n_informative=5, n_redundant=0, random_state=0, shuffle=False)

clf = RandomForestClassifier(n_estimators=120, max_depth=15, random_state=0)

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)


print(metrics.accuracy_score(y_test, y_pred), " -- Accuracy with Random Forests\n")

print("-------------- Importances per column with RF")

print((clf.feature_importances_), "\n")

0.75  -- Accuracy with Random Forests

-------------- Importances per column with RF
[0.19953178 0.15490382 0.02513541 0.02728829 0.02062952 0.0256401
 0.03792075 0.06791733 0.04936105 0.03733992 0.02349625 0.03439978
 0.12319537 0.0513891  0.12185153] 



### Splitting Y's up
* y1 = guard
* y2 = forward
* y3 = center

In [7]:
y1 = y.iloc[:,0]
x_train, x_test, y1_train, y1_test = train_test_split(x, y1, test_size=.3)
#y2 = y.iloc[:,1]
#X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=.3)
#y3 = y.iloc[:,2]
#X_train, X_test, y3_train, y3_test = train_test_split(X, y3, test_size=.3)




In [8]:
""" RANDOM FOREST PREDICTION """

# X, y = make_classification(n_samples=100, n_features=5, n_informative=5, n_redundant=0, random_state=0, shuffle=False)

clf = RandomForestClassifier(n_estimators=120, max_depth=15, random_state=0)

clf.fit(x_train, y1_train)

y1_pred = clf.predict(x_test)


print(metrics.accuracy_score(y1_test, y1_pred), " -- Accuracy with Random Forests\n")

print("-------------- Importances per column with RF")

print((clf.feature_importances_), "\n")

0.9772727272727273  -- Accuracy with Random Forests

-------------- Importances per column with RF
[0.20858943 0.18358284 0.01402156 0.02177928 0.01565438 0.02105792
 0.01844995 0.04378352 0.04582511 0.03537132 0.02541001 0.02874664
 0.14450258 0.07126682 0.12195866] 



### Using KMeans to determine how many groups to make
* Make 20 models
* Group by group number
* sum by position
* get score of model
* check w salaries

In [9]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Use silhouette score to find optimal number of clusters to segment the data
num_clusters = np.arange(2,10)
results = {}
for size in num_clusters:
    model = KMeans(n_clusters = size).fit(x)
    predictions = model.predict(x)
    results[size] = silhouette_score(x, predictions)

best_size = max(results, key=results.get)
best_size

2

In [10]:
kmeans = KMeans(n_clusters= 2)
kmeans = kmeans.fit(x)

labels = kmeans.predict(x)
centroids = kmeans.cluster_centers_

In [11]:
x['group'] = labels

df["salary"] = round(df.salary.astype(pd.np.number))
salaries = df.iloc[:,16].values
x = df[["name", "guard", "forward", "center"]]
x['salary'] = salaries / 1e6
x["group"] = labels

group1 = x.iloc[:,5]==1
group1 = x.loc[group1,:]

guards1 = group1.iloc[:,1]==1
guards1 = group1.loc[guards1,:]

guards1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,name,guard,forward,center,salary,group
8,Justin Anderson,1,1,0,1.57944,1
49,Jimmy Butler,1,0,0,18.696918,1
157,Josh Huestis,1,0,0,1.471382,1
166,LeBron James,1,1,0,33.285709,1
174,Joe Johnson,1,1,0,10.254904,1
299,Wayne Selden,1,1,0,1.312611,1
303,Ben Simmons,1,1,0,6.16884,1
312,Lance Stephenson,1,0,0,4.18,1


In [14]:
group0 = x.iloc[:,5]==0
group0 = x.loc[group0,:]

centers0 = group0.iloc[:,3]==1
centers0 = group0.loc[centers0,:]

centers0

Unnamed: 0,name,guard,forward,center,salary,group
29,Davis Bertans,0,1,1,1.312611,0
190,Skal Labissiere,0,1,1,1.312611,0
192,Joffrey Lauvergne,0,0,1,1.524305,0
213,Thon Maker,0,1,1,3.086784,0
250,Lucas Nogueira,0,0,1,2.947304,0
350,Brandan Wright,0,1,1,5.95576,0


In [12]:
df_groups = x.groupby('group', as_index=False)['guard', 'forward', 'center'].sum()
df_group_salary = x.groupby('group', as_index=False)['salary'].mean()
df_groups = pd.merge(df_groups, df_group_salary, how='inner')
df_groups

Unnamed: 0,group,guard,forward,center,salary
0,0,131,70,6,8.080828
1,1,8,75,68,9.942495
