In [None]:
#REFERENCE

#PCA
#https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
#https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html


#CLUSTERING


In [9]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import math

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import cluster
from sklearn.decomposition import PCA

from scipy.spatial import distance




In [2]:
## LOADING CLEANED DATA

# Read the cleaned team table; the path is relative to the notebook's working directory.
cleaned_team_2018 = pd.read_csv(filepath_or_buffer='teams_cleaned.csv')
# cleaned_team_2018   # uncomment to display the loaded frame

In [3]:
# FEATURE SELECTION
#
# Columns deliberately left out (author's rationale):
#   - MOV: correlated with W
#   - ORB, DRB, opp ORB, opp DRB: redundant with ORB% and DRB%
#   - TS%: reflected in shooting stats
#   - PTS and opp PTS: reflected in ORtg and DRtg
#   - ORtg: strongly correlated with wins

# 'Team' (text identifier) followed by the retained feature columns.
# The order here is the column order of the resulting frame.
selected_columns = (
    ['Team', 'FTA', 'FT%', 'AST', 'STL', 'BLK', 'TOV', 'PF']
    + ['opp_FTA', 'opp_PF']
    + [
        'Dist.',
        'Pct_FGA_by_dist_0-3',
        'Pct_FGA_by_dist_3-10',
        'Pct_FGA_by_dist_10-16',
        'Pct_FGA_by_dist_16-3pt',
        'Pct_FGA_by_dist_3P',
        'FGP_by_dist_2P',
        'FGP_by_dist_3P',
        '3FG_%assisted',
        '3FG_corner_%3PA',
    ]
    + [
        'opp_Dist.',
        'opp_Pct_FGA_by_dist_0-3',
        'opp_Pct_FGA_by_dist_3P',
        'opp_3FG_corner_%3PA',
    ]
    + ['Age', 'DRtg', 'Pace', 'ORB%', 'DRB%']
)
cleaned_selected = cleaned_team_2018[selected_columns]

# Alternative, wider column list that the author left commented out:
# ['Team', 'FTA', 'FT%','AST', 'STL', 'BLK', 'TOV', 'PF',
#  'opp_FTA', 'opp_FT%','opp_AST', 'opp_STL', 'opp_BLK', 'opp_TOV', 'opp_PF',
#  'Dist.','Pct_FGA_by_dist_0-3','Pct_FGA_by_dist_3-10','Pct_FGA_by_dist_10-16','Pct_FGA_by_dist_16-3pt','Pct_FGA_by_dist_3P','FGP_by_dist_2P','FGP_by_dist_3P','3FG_%assisted','3FG_corner_%3PA','3FG_corner_3P%',
#  'opp_Dist.','opp_Pct_FGA_by_dist_0-3','opp_Pct_FGA_by_dist_3-10','opp_Pct_FGA_by_dist_10-16','opp_Pct_FGA_by_dist_16-3pt','Pct_FGA_by_dist_3P','opp_FGP_by_dist_2P','opp_FGP_by_dist_3P','opp_3FG_%assisted','opp_3FG_corner_%3PA','opp_3FG_corner_3P%',
#  'Age','ORtg','DRtg', 'Pace',  'ORB%', 'DRB%']


In [4]:
# STANDARDIZING FEATURES
## of non-text columns (excludes the Team column)

# Take the numeric columns once so the scaled frame can reuse their labels and
# index. Previously the column names were a hand-maintained copy of the
# feature-selection list, which mislabels columns (or raises on a length
# mismatch) as soon as the two lists drift apart.
numeric_features = cleaned_selected.select_dtypes(exclude='object')

# StandardScaler rescales each column independently; fit_transform returns a
# bare ndarray, so labels and index are re-attached explicitly.
x_standardized = pd.DataFrame(
    StandardScaler().fit_transform(numeric_features),
    columns=numeric_features.columns,
    index=numeric_features.index,
)

# Adds Team column back. The assignment aligns on the index, which is why the
# original index is preserved above instead of being reset to 0..n-1.
x_standardized['Team'] = cleaned_selected['Team']

In [None]:
## GENERATE PRINCIPAL COMPONENTS

# FINDING VARIANCE REPRESENTED BY PC
# Fit a PCA with no component limit on the numeric (non-object) standardized columns.
standardized_numeric = x_standardized.select_dtypes(exclude='object')
pca = PCA()
principalComponents = pca.fit_transform(standardized_numeric)
principalDf = pd.DataFrame(data=principalComponents)

# Two-decimal, non-scientific array printing for the reports below
# (this changes numpy's print options for the rest of the session).
np.set_printoptions(precision=2, suppress=True)

# Each component's variance as a fraction of the summed variance.
total_variance = sum(pca.explained_variance_)
variance_share = pca.explained_variance_ / total_variance

print('Total Variance: ' + str(total_variance))
print('Percentage of Var represented by PC: ')
print(variance_share)
print('Cumulative Percentage of Var represented by PC: ')
print(np.cumsum(variance_share))




In [None]:
## Number of Cluster Analysis PCA = 2

# DECIDED ON # of Components = 2 to represent 29% of variance (author's note)
n_components = 2
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(x_standardized.select_dtypes(exclude='object'))
# Column labels are generated from n_components so they cannot fall out of
# step with the number of components actually fitted.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component {}'.format(i) for i in range(1, n_components + 1)],
)

# For each candidate k record the inertia (elbow plot) and print the mean
# silhouette score.
Sum_of_squared_distances = []
K = range(2, 10)
for k in K:
    # KMeans starts from random centroids; without a fixed random_state the
    # inertia and silhouette values change on every run. n_init is pinned
    # because its default differs between scikit-learn releases.
    km = cluster.KMeans(n_clusters=k, n_init=10, random_state=42)
    km = km.fit(principalDf)
    Sum_of_squared_distances.append(km.inertia_)
    cluster_labels = km.labels_
    silhouette_avg = silhouette_score(principalDf, cluster_labels)
    print("For n_clusters =", k,
          "The average silhouette_score is :", silhouette_avg)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
## Number of Cluster Analysis PCA = 3

# DECIDED ON # of Components = 3 to represent > 40% of variance (author's note)
n_components = 3
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(x_standardized.select_dtypes(exclude='object'))
# Column labels are generated from n_components so they cannot fall out of
# step with the number of components actually fitted.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component {}'.format(i) for i in range(1, n_components + 1)],
)

# For each candidate k record the inertia (elbow plot) and print the mean
# silhouette score.
Sum_of_squared_distances = []
K = range(2, 10)
for k in K:
    # KMeans starts from random centroids; without a fixed random_state the
    # inertia and silhouette values change on every run. n_init is pinned
    # because its default differs between scikit-learn releases.
    km = cluster.KMeans(n_clusters=k, n_init=10, random_state=42)
    km = km.fit(principalDf)
    Sum_of_squared_distances.append(km.inertia_)
    cluster_labels = km.labels_
    silhouette_avg = silhouette_score(principalDf, cluster_labels)
    print("For n_clusters =", k,
          "The average silhouette_score is :", silhouette_avg)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
## Number of Cluster Analysis PCA = 4

# DECIDED ON # of Components = 4 to represent > 50% of variance (author's note)
n_components = 4
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(x_standardized.select_dtypes(exclude='object'))
# Column labels are generated from n_components so they cannot fall out of
# step with the number of components actually fitted.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component {}'.format(i) for i in range(1, n_components + 1)],
)

# For each candidate k record the inertia (elbow plot) and print the mean
# silhouette score.
Sum_of_squared_distances = []
K = range(2, 10)
for k in K:
    # KMeans starts from random centroids; without a fixed random_state the
    # inertia and silhouette values change on every run. n_init is pinned
    # because its default differs between scikit-learn releases.
    km = cluster.KMeans(n_clusters=k, n_init=10, random_state=42)
    km = km.fit(principalDf)
    Sum_of_squared_distances.append(km.inertia_)
    cluster_labels = km.labels_
    silhouette_avg = silhouette_score(principalDf, cluster_labels)
    print("For n_clusters =", k,
          "The average silhouette_score is :", silhouette_avg)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
## Number of Cluster Analysis PCA = 6

# NOTE(review): the original note read "# of Components = 7 to represent > 70%
# of variance", but this cell (and the final cluster analysis) fits 6
# components. TODO confirm the variance share actually covered by 6.
n_components = 6
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(x_standardized.select_dtypes(exclude='object'))
# Column labels are generated from n_components so they cannot fall out of
# step with the number of components actually fitted.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component {}'.format(i) for i in range(1, n_components + 1)],
)

# For each candidate k record the inertia (elbow plot) and print the mean
# silhouette score.
Sum_of_squared_distances = []
K = range(2, 10)
for k in K:
    # KMeans starts from random centroids; without a fixed random_state the
    # inertia and silhouette values change on every run. n_init is pinned
    # because its default differs between scikit-learn releases.
    km = cluster.KMeans(n_clusters=k, n_init=10, random_state=42)
    km = km.fit(principalDf)
    Sum_of_squared_distances.append(km.inertia_)
    cluster_labels = km.labels_
    silhouette_avg = silhouette_score(principalDf, cluster_labels)
    print("For n_clusters =", k,
          "The average silhouette_score is :", silhouette_avg)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [5]:
## CLUSTER ANALYSIS

# Project the numeric standardized features onto 6 principal components.
n_components = 6
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(x_standardized.select_dtypes(exclude='object'))
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component {}'.format(i) for i in range(1, n_components + 1)],
)

# Cluster in PC space. The seed is fixed so that cluster ids (which are written
# to CSV further down the notebook) are the same on every Restart & Run All;
# an unseeded KMeans relabels and may regroup teams between runs. n_init is
# pinned because its default differs between scikit-learn releases.
# NOTE: outputs saved from earlier, unseeded runs may show different cluster ids.
k = 5
kmeans = cluster.KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(principalDf)  # fitted before the non-numeric columns are attached below

labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# Attach identifiers and outcome columns; these assignments align on the index.
principalDf['Team'] = x_standardized['Team']
principalDf['Cluster'] = labels
principalDf['Playoff'] = cleaned_team_2018['Playoff']
principalDf['Tier'] = cleaned_team_2018['Tier']

principalDf

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,Team,Cluster,Playoff,Tier
0,-0.337543,3.369483,5.438103,-2.01175,-2.255166,-1.583024,Golden State Warriors*,4,1,Contender
1,-4.678415,5.265613,-2.848188,-0.129952,-1.280539,1.097578,Houston Rockets*,1,1,Contender
2,-0.444827,0.552624,1.437173,-0.454464,-0.721711,-2.442391,New Orleans Pelicans*,4,1,Mid-Tier
3,-3.566499,0.818564,2.132181,0.054788,1.579893,1.091094,Toronto Raptors*,1,1,Contender
4,-0.927349,2.279253,-2.198682,-1.508565,-2.935543,-0.720554,Cleveland Cavaliers*,3,1,Contender
5,-0.593124,-0.615679,0.239126,0.556178,0.721333,-2.807333,Denver Nuggets,4,0,Mid-Tier
6,0.352919,-0.217585,3.00581,1.972097,0.046445,-1.244201,Philadelphia 76ers*,4,1,Contender
7,0.812319,-1.644622,-1.082849,-3.498467,1.16218,-1.180531,Minnesota Timberwolves*,2,1,Mid-Tier
8,-2.772538,-2.472777,-2.611797,-0.224492,-0.569619,-1.771862,Los Angeles Clippers,0,0,Mid-Tier
9,1.479087,1.628226,-3.795788,0.439421,1.873014,-2.451889,Charlotte Hornets,3,0,Rebuilding


In [6]:
# PLOT CLUSTERS

# DF creation (adding PC and cluster labels).
# Work on a copy: plain assignment would only alias cleaned_team_2018, so the
# column additions below would silently modify the loaded frame that earlier
# cells read from.
teams_clustered = cleaned_team_2018.copy()
teams_clustered['Cluster'] = principalDf['Cluster']
teams_clustered['PC1'] = principalDf['principal component 1']
teams_clustered['PC2'] = principalDf['principal component 2']

# Written with to_csv defaults, so the index is saved as the first column.
teams_clustered.to_csv('teams_clustered_2018.csv')

#sns.scatterplot(x="PC1", y="PC2", hue="Cluster", data=teams_clustered)

In [None]:
## PLOT PRINCIPAL COMPONENTS

# Other component pairs the author left commented out (each on its own
# plt.figure()): PC1 vs PC3, PC2 vs PC3, PC1 vs PC4, PC1 vs PC5.
# To view one, change the x / y column names below.

# One point per row of principalDf, coloured by Team, legend suppressed.
fig, ax = plt.subplots()
sns.scatterplot(
    data=principalDf,
    x="principal component 1",
    y="principal component 2",
    hue="Team",
    legend=False,
    ax=ax,
)





In [None]:
## EUCLIDEAN DISTANCE

# Distance between rows 0 and 1 of principalDf, using only its first two
# columns (positions 0 and 1).
a = tuple(principalDf.iloc[0, col] for col in (0, 1))
b = tuple(principalDf.iloc[1, col] for col in (0, 1))
dst = distance.euclidean(a, b)

In [10]:
def euclidean_distance(df, row1, row2, n_components=6):
    """Return the Euclidean distance between two rows of ``df``.

    Only the first ``n_components`` columns (by position) are used as
    coordinates, so trailing non-numeric columns are ignored.

    The previous version accepted ``df`` but read the global ``principalDf``
    instead, so the argument had no effect; the frame passed in is now the
    one that is measured.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose leading ``n_components`` columns are numeric.
    row1, row2 : int
        Positional (``iloc``) row indices of the two points.
    n_components : int, default 6
        Number of leading columns to treat as coordinates. The default matches
        the six column positions (0-5) that were previously hard-coded.

    Returns
    -------
    float
        Result of ``scipy.spatial.distance.euclidean`` for the two rows.
    """
    # Cast to float explicitly: a row taken from a frame that also holds text
    # columns can come back with object dtype.
    a = np.asarray(df.iloc[row1, :n_components], dtype=float)
    b = np.asarray(df.iloc[row2, :n_components], dtype=float)
    return distance.euclidean(a, b)
    
    

In [13]:
# Distance between the rows at positions 1 and 0 of principalDf.
euclidean_distance(df=principalDf, row1=1, row2=0)

10.137938336730999