# Predict position of NBA players

In [119]:
import pandas as pd

shot_stats = pd.read_csv('./csvData/NBA_Shot_dist - Sheet2.csv')
guards = pd.read_csv('./csvData/NBA_Shot_dist - guard_list.csv')
forwards = pd.read_csv('./csvData/NBA_Shot_dist - forwards_list.csv')
centers = pd.read_csv('./csvData/NBA_Shot_dist - center_list.csv')
playerHeight = pd.read_csv('./csvData/playerHeight.csv')

playerHeight = playerHeight.iloc[:, [0, 7, 9]]

shot_stats.columns = ['name', 'team', 'szn', 'type', 'games', '0to8_makePrecent', '8to16_makePrecent',
              '16to24_makePrecent', '24plus_makePrecent', 'bcMake', '0to8_att', 
              '8to16_att', '16to24_att', '24plus_att', 'bc_att', 'avgDist', 
              'avgMakeDist', 'avgMissDist']

position_df = pd.concat([guards, forwards, centers])

position_df = position_df.groupby(['name'], as_index=False).sum()

In [120]:
# merge
df = pd.merge(playerHeight, shot_stats, how='inner')
df = pd.merge(df, position_df, how='inner')


# Delete columns out of DF

In [121]:
del df['team']
del df['szn']
del df['type']
del df['0to8_att']
del df['8to16_att']
del df['16to24_att']
del df['24plus_att']
del df['avgMissDist']
del df['avgDist']
del df['bc_att']
del df['bcMake']

# Make X and y - Test and Train sets

In [122]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, 1:9]
y = df.iloc[:, 9:12]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

# Normalization && Tunning

In [123]:
X['heightMeters'] = X['heightMeters']**2
X['weightKilograms'] = X['weightKilograms']**2
X['avgMakeDist'] = X['avgMakeDist']**2
X['games'] = (X['games'])/82 



# Use random forest to predict on all three positions

In [124]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics

""" RANDOM FOREST PREDICTION """
# X, y = make_classification(n_samples=100, n_features=5, n_informative=5, n_redundant=0, random_state=0, shuffle=False)
clf = RandomForestClassifier(n_estimators=120, max_depth=25, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred), " -- Accuracy with Random Forests\n")
print("-------------- Importances per column with RF")
print((clf.feature_importances_), "\n")



0.7017543859649122  -- Accuracy with Random Forests

-------------- Importances per column with RF
[0.33540101 0.24113354 0.04844777 0.07467626 0.05160191 0.05580068
 0.07513222 0.11780661] 



# Spliting Y's up
* Y1 = Guards
* Y2 = Forwards
* Y3 = Centers

In [127]:
y1 = y.iloc[:, 0]
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=.3)

y2 = y.iloc[:, 0]
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=.3)

y3 = y.iloc[:, 0]
X_train, X_test, y3_train, y3_test = train_test_split(X, y3, test_size=.3)

In [128]:
""" RANDOM FOREST PREDICTION """
# X, y = make_classification(n_samples=100, n_features=5, n_informative=5, n_redundant=0, random_state=0, shuffle=False)
clf = RandomForestClassifier(n_estimators=120, max_depth=15, random_state=0)
clf.fit(X_train, y1_train)
y1_pred = clf.predict(X_test)

print(metrics.accuracy_score(y1_test, y1_pred), " -- Accuracy with Random Forests\n")
print("-------------- Importances per column with RF")
print((clf.feature_importances_), "\n")

0.4473684210526316  -- Accuracy with Random Forests

-------------- Importances per column with RF
[0.08545149 0.12110838 0.13152989 0.14686236 0.12106277 0.12709884
 0.10804134 0.15884492] 



# Kmeans method & Clulstering
##### how many clusters should this dataset have? 
* make 20 models
* group them by model
* sum positions in each model 
* get deviation score, as getting all positions in one group would be 100%
* sumation of salaries to see if best players are put in the same group
* Keep in mind - Fx(model_n) = groupBy(Gn) . count (positions) 

In [129]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
kmeans = kmeans.fit(X)

labels = kmeans.predict(X)
centroids = kmeans.cluster_centers_

from sklearn.metrics import silhouette_score

# Use silhouette score to find optimal number of clusters to segment the data
num_clusters = np.arange(2,10)
results = {}
for size in num_clusters:
    model = KMeans(n_clusters = size).fit(X)
    predictions = model.predict(X)
    results[size] = silhouette_score(X, predictions)

best_size = max(results, key=results.get)



print(centroids)
print(best_size)

[[3.69345276e+00 7.57410638e+03 5.86134050e-01 5.20590551e-01
  3.83023622e-01 3.63157480e-01 3.39779528e-01 1.77091969e+02]
 [4.40660286e+00 1.29660376e+04 5.92682927e-01 5.79428571e-01
  3.79695238e-01 3.33409524e-01 2.20447619e-01 8.03965295e+01]
 [4.09292041e+00 1.01807280e+04 5.78646093e-01 5.64836735e-01
  3.53285714e-01 3.26931973e-01 3.02639456e-01 1.36052876e+02]]
2


In [None]:
kmeans = KMeans(n_clusters=20)
kmeans = kmeans.fit(df)

labels = kmeans.predict(df)
centroids = kmeans.cluster_centers_

