In [10]:
import pandas as pd
import numpy as np

from sklearn.cluster import AffinityPropagation
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabaz_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize

from agglomerative import get_data
from cluster_visual import plot_tsne_scatter

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [11]:
# Constants - The master stat (feature) list, and the corresponding player data.
stats_list = ['MP_x','FG','FGA','2P','2PA','3P','3PA','FT','FTA','ORB', 'DRB', 'TRB', 'AST', 'STL','BLK','TOV','PF','PTS','FG%','2P%','3P%', 'eFG%', 'FT%', 'TS%_x', 'PER', '3PAr','FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%','ORtg', 'DRtg', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM','VORP', 'dist', 'spd', 'tchs', 'pass', 'sast', 'ftast', 'dfgm','dfga']
df = pd.read_csv('../data/final_stats.csv')
df.set_index('player_year',inplace=True)
df.fillna(value=0,inplace=True)

In [112]:
# After lot's of feature space randomization and analysis, let's use what we've learned to craft a feature space
# manually.  Here's our (tentative) new stats list:
new_stats_list = ['MP_x','2P','3P','3PA','FTr','TS%_x','ORB','TRB','AST%','STL%','BLK%','3PAr','DWS','DBPM','dist','dfga']

In [29]:
# Let's optimize a Gaussian Mixture model:
X = df[new_stats_list]

cluster_list = range(10,25)
pca_components = [2,3,4,5]
covar_list = ['full','diag','tied','spherical']

for k in cluster_list:
    print "{} clusters".format(k)
    for com in pca_components:
        for covar in covar_list:
            gmm = GaussianMixture(n_components=k,covariance_type=covar,n_init=10,random_state=23)
            pca = PCA(n_components=com,whiten=True,svd_solver='full')
            reduced = pca.fit_transform(X)
            gmm.fit(reduced)
            
            try:
                silhouette = silhouette_score(reduced,gmm.predict(reduced))
                calinski = calinski_harabaz_score(reduced,gmm.predict(reduced))
            except:
                continue

            X['label'] = gmm.predict(reduced)
            if X.loc['Nikola Jokic/2016-17']['label'] == X.loc['Marc Gasol/2016-17']['label'] and silhouette > 0.4:
                print("{} components| covariance type: {} | silhouette: {} | calinski: {}".format(com,covar,silhouette,calinski))
                print("Silhouette * Calinski = {}".format(silhouette*calinski))

10 clusters


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


11 clusters
12 clusters
13 clusters
14 clusters
15 clusters
16 clusters
4 components| covariance type: spherical | silhouette: 0.429010789479 | calinski: 428.670202253
Silhouette * Calinski = 183.904141894
17 clusters
3 components| covariance type: spherical | silhouette: 0.401774966444 | calinski: 643.364230862
Silhouette * Calinski = 258.487642266
4 components| covariance type: spherical | silhouette: 0.402868787223 | calinski: 397.510004555
Silhouette * Calinski = 160.144373444
18 clusters
19 clusters
3 components| covariance type: diag | silhouette: 0.417491565731 | calinski: 619.465446424
Silhouette * Calinski = 258.621599144
20 clusters
3 components| covariance type: diag | silhouette: 0.407771819001 | calinski: 567.390821846
Silhouette * Calinski = 231.365987508
3 components| covariance type: tied | silhouette: 0.440264333264 | calinski: 704.509470467
Silhouette * Calinski = 310.170392293
21 clusters
22 clusters
2 components| covariance type: diag | silhouette: 0.403898802186 | 

We start seeing our scores show up around k=16, let's tighten those parameter ranges a little bit, and up the number of initializations to truly tune this thing:

In [30]:
# Let's optimize a Gaussian Mixture model:
X = df[new_stats_list]

cluster_list = range(16,24)
pca_components = [2,3,4]
covar_list = ['full','diag','tied','spherical']

for k in cluster_list:
    print "{} clusters".format(k)
    for com in pca_components:
        for covar in covar_list:
            gmm = GaussianMixture(n_components=k,covariance_type=covar,n_init=100,random_state=23)
            pca = PCA(n_components=com,whiten=True,svd_solver='full')
            reduced = pca.fit_transform(X)
            gmm.fit(reduced)
            
            try:
                silhouette = silhouette_score(reduced,gmm.predict(reduced))
                calinski = calinski_harabaz_score(reduced,gmm.predict(reduced))
            except:
                continue

            X['label'] = gmm.predict(reduced)
            if X.loc['Nikola Jokic/2016-17']['label'] == X.loc['Marc Gasol/2016-17']['label'] and silhouette > 0.4:
                print("{} components| covariance type: {} | silhouette: {} | calinski: {}".format(com,covar,silhouette,calinski))
                print("Silhouette * Calinski = {}".format(silhouette*calinski))

16 clusters


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


17 clusters
18 clusters
19 clusters
3 components| covariance type: full | silhouette: 0.457492373491 | calinski: 598.867680962
Silhouette * Calinski = 273.97739677
20 clusters
3 components| covariance type: full | silhouette: 0.484977693537 | calinski: 802.964913312
Silhouette * Calinski = 389.42007165
3 components| covariance type: diag | silhouette: 0.494256365039 | calinski: 817.685890538
Silhouette * Calinski = 404.146456001
3 components| covariance type: tied | silhouette: 0.504493446295 | calinski: 817.770171525
Silhouette * Calinski = 412.55969211
3 components| covariance type: spherical | silhouette: 0.499597150193 | calinski: 800.564464287
Silhouette * Calinski = 399.959724903
21 clusters
22 clusters
23 clusters


Alright - our best results were 20 clusters, 3 PCA components, and tied covariance type - let's run with those!

In [113]:
X = df[new_stats_list]
pca = PCA(n_components=3,whiten=True,svd_solver='full')
gmm = GaussianMixture(n_components=20,covariance_type='tied',n_init=200,verbose=1)

reduced = pca.fit_transform(X)
gmm.fit(reduced)

print "silhouette score = {}".format(silhouette_score(reduced,gmm.predict(reduced)))
print "calinski-harabasz score = {}".format(calinski_harabaz_score(reduced,gmm.predict(reduced)))

Initialization 0
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 1
  Iteration 0
Initialization converged: True
Initialization 2
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 3
  Iteration 0
Initialization converged: True
Initialization 4
  Iteration 0
Initialization converged: True
Initialization 5
  Iteration 0
Initialization converged: True
Initialization 6
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 7
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 8
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 9
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 10
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 11
  Iteration 0
Initialization converged: True
Initialization 12
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 13
  Iteration 0
  Iteration 10
Initialization converged

  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 121
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 122
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 123
  Iteration 0
Initialization converged: True
Initialization 124
  Iteration 0
Initialization converged: True
Initialization 125
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 126
  Iteration 0
Initialization converged: True
Initialization 127
  Iteration 0
Initialization converged: True
Initialization 128
  Iteration 0
Initialization converged: True
Initialization 129
  Iteration 0
Initialization converged: True
Initialization 130
  Iteration 0
  Iteration 10
Initialization converged: True
Initialization 131
  Iteration 0
Initialization converged: True
Initialization 132
  Iteration 0
Initialization converged: True
Initialization 133
  Iteration 0
Initialization converged: True
Initialization 134
  Iteration 0
  Iteration 10


In [127]:
X['label'] = gmm.predict(reduced)
X['label'].value_counts()
X[X.label == 11]
# X.loc['Nikola Jokic/2016-17']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,MP_x,2P,3P,3PA,FTr,TS%_x,ORB,TRB,AST%,STL%,BLK%,3PAr,DWS,DBPM,dist,dfga,label
player_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Marc Gasol/2015-16,34.4,6.6,0.0,0.1,0.347,0.528,1.2,7.3,18.9,1.4,3.6,0.004,1.9,1.7,1.333537,4.390244,11
David West/2014-15,28.7,6.1,0.1,0.4,0.235,0.508,2.1,8.5,20.1,1.3,2.0,0.029,2.8,2.4,1.391341,3.0,11
Pau Gasol/2015-16,31.8,7.0,0.4,1.1,0.29,0.529,2.4,12.5,21.7,0.9,4.7,0.069,3.9,3.5,1.503939,7.717172,11
Evan Turner/2015-16,28.0,5.1,0.3,1.3,0.238,0.513,0.8,6.3,23.9,1.7,1.0,0.11,3.0,1.3,1.569333,2.466667,11
Josh Smith/2014-15,27.7,5.4,1.1,3.6,0.258,0.467,2.4,8.3,20.5,1.9,3.9,0.231,3.4,2.6,1.6983,6.51,11
DeMarcus Cousins/2016-17,34.2,7.5,1.9,5.3,0.468,0.562,2.2,11.6,25.8,2.0,3.3,0.254,3.8,2.6,1.733457,5.876543,11
Julius Randle/2016-17,28.8,6.1,0.3,1.1,0.365,0.543,2.5,10.7,19.3,1.1,1.4,0.082,1.5,0.7,1.74481,4.670886,11
Nikola Vucevic/2016-17,28.8,7.7,0.4,1.2,0.155,0.498,2.9,13.0,16.9,1.8,2.8,0.073,3.2,2.4,1.78,7.151899,11
Marc Gasol/2016-17,34.2,6.1,1.5,3.8,0.286,0.554,0.9,6.6,24.8,1.4,3.7,0.231,3.5,2.2,1.835909,5.136364,11
Mason Plumlee/2016-17,26.5,5.6,0.0,0.1,0.482,0.558,2.8,10.2,19.8,1.6,3.4,0.01,2.5,2.6,1.83625,6.9625,11


In [131]:
X.to_csv('../data/gaussian_final.csv')