In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

In [2]:
# Constants - The master stat (feature) list, and the corresponding player data.
stats_list = ['MP_x','FG','FGA','2P','2PA','3P','3PA','FT','FTA','ORB', 'DRB', 'TRB', 'AST', 'STL','BLK','TOV','PF','PTS','FG%','2P%','3P%', 'eFG%', 'FT%', 'TS%_x', 'PER', '3PAr','FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%','ORtg', 'DRtg', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM','VORP', 'dist', 'spd', 'tchs', 'pass', 'sast', 'ftast', 'dfgm','dfga']
df = pd.read_csv('../data/final_stats.csv')
df.set_index('player_year',inplace=True)
df.fillna(value=0,inplace=True)

In [30]:
feature_space = ['MP_x','2P','3P','3PA','FTr','TS%_x','ORB','TRB','AST%','STL%','BLK%','3PAr','DWS','DBPM','dist','dfga']

In [14]:
# Optimizing Gaussian Mixture Model
X = df[feature_space]

cluster_list = range(5,25)
covar_list = ['full','tied','diag','spherical']
comp_list = [2,3,4,5]

for k in cluster_list:
    print ('n clusters = {}'.format(k))
    max_score = [0,0,0,0]
    for covar in covar_list:
        for c in comp_list:
            pca = PCA(n_components=c,whiten=True,svd_solver='full')
            reduced = pca.fit_transform(X)
            
            gmm = GaussianMixture(n_components=k,covariance_type=covar,n_init=100)
            gmm.fit(reduced)
            labels = gmm.predict(reduced)
            try:
                score = silhouette_score(reduced,labels)
            except:
                continue
            if score > max_score[0]:
                max_score = [score,'covar type:{}'.format(covar),'{} pca components'.format(c)]
                print ('new max score: {}'.format(max_score))

n clusters = 5
new max score: [0.38256595355625328, 'covar type:full', '2 pca components']
new max score: [0.39785849961855618, 'covar type:tied', '2 pca components']
n clusters = 6
new max score: [0.33725247236732986, 'covar type:full', '2 pca components']
new max score: [0.34307604122257329, 'covar type:tied', '2 pca components']
n clusters = 7
new max score: [0.34190531469808544, 'covar type:full', '2 pca components']
n clusters = 8
new max score: [0.33654106119757538, 'covar type:full', '2 pca components']
n clusters = 9
new max score: [0.35309072934042185, 'covar type:full', '2 pca components']
n clusters = 10
new max score: [0.35308795641786439, 'covar type:full', '2 pca components']
n clusters = 11
new max score: [0.3502438278376156, 'covar type:full', '2 pca components']
new max score: [0.35052649425673904, 'covar type:diag', '2 pca components']
n clusters = 12
new max score: [0.33458295647438874, 'covar type:full', '2 pca components']
new max score: [0.34012283766456575, 'cova

In [15]:
gmm = GaussianMixture(n_components=12,covariance_type='full')
gmm_pca = GaussianMixture(n_components=13,covariance_type='full',n_init=100)

pca = PCA(n_components=2)
reduced = pca.fit_transform(X)

gmm.fit(X)
gmm_pca.fit(reduced)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=None, n_components=13, n_init=100, precisions_init=None,
        random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None)

In [16]:
silhouette_score(X,gmm.predict(X))

-0.020533796388575339

In [17]:
silhouette_score(reduced,gmm_pca.predict(reduced))

0.32711881152878819

In [19]:
X['label'] = gmm_pca.predict(reduced)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [20]:
X.loc['Nikola Jokic/2016-17']

MP_x     27.900000
tchs     66.315068
2P        7.900000
3P        0.800000
3PA       2.500000
FTr       0.267000
TS%_x     0.640000
ORB       3.700000
TRB      12.700000
AST%     28.800000
STL%      1.500000
BLK%      2.100000
3PAr      0.163000
DWS       2.000000
DBPM      2.200000
dist      1.858630
dfga      7.342466
label     4.000000
Name: Nikola Jokic/2016-17, dtype: float64

In [23]:
X['label'].value_counts()

3     130
11    119
1     107
6     103
5      98
7      78
8      59
2      51
12     46
4      44
9      43
0      27
10      6
Name: label, dtype: int64

In [26]:
X[X['label'] == 1]

Unnamed: 0_level_0,MP_x,tchs,2P,3P,3PA,FTr,TS%_x,ORB,TRB,AST%,STL%,BLK%,3PAr,DWS,DBPM,dist,dfga,label
player_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Marc Gasol/2015-16,34.4,44.304878,6.6,0.0,0.1,0.347,0.528,1.2,7.3,18.9,1.4,3.6,0.004,1.9,1.7,1.333537,4.390244,1
David West/2014-15,28.7,48.170732,6.1,0.1,0.4,0.235,0.508,2.1,8.5,20.1,1.3,2.0,0.029,2.8,2.4,1.391341,3.000000,1
Jared Sullinger/2015-16,23.6,44.597701,6.1,0.5,1.9,0.200,0.476,3.6,12.7,15.5,1.9,1.9,0.128,3.6,2.3,1.458621,5.149425,1
Trevor Booker/2016-17,24.7,49.797468,5.7,0.5,1.6,0.186,0.554,2.9,11.7,12.9,2.1,1.2,0.132,2.2,1.8,1.546076,2.911392,1
Derrick Favors/2015-16,32.0,49.400000,7.6,0.0,0.1,0.313,0.551,3.1,9.1,8.9,2.0,3.9,0.005,3.4,2.3,1.563412,5.258824,1
Pau Gasol/2016-17,25.4,50.913580,5.5,1.2,2.3,0.305,0.578,2.4,11.1,14.7,0.8,3.5,0.172,3.1,2.8,1.592963,6.259259,1
Luc Mbah a Moute/2014-15,28.6,44.487805,3.6,1.2,3.8,0.256,0.466,1.5,6.2,9.8,2.1,0.9,0.318,2.0,0.1,1.597683,4.012195,1
Nerlens Noel/2015-16,29.3,44.144578,5.6,0.0,0.0,0.387,0.544,2.8,9.9,10.5,2.9,3.9,0.003,2.8,3.4,1.635060,6.710843,1
Nikola Vucevic/2015-16,31.3,49.126582,9.4,0.0,0.2,0.143,0.531,3.1,10.2,15.9,1.3,2.8,0.009,2.4,1.1,1.660380,6.278481,1
Marvin Williams/2015-16,28.9,45.644860,2.9,2.3,5.8,0.193,0.585,2.0,8.0,7.6,1.2,2.7,0.506,3.2,1.0,1.667757,3.691589,1


In [27]:
X.head()

Unnamed: 0_level_0,MP_x,tchs,2P,3P,3PA,FTr,TS%_x,ORB,TRB,AST%,STL%,BLK%,3PAr,DWS,DBPM,dist,dfga,label
player_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Nick Collison/2015-16,11.8,7.333333,2.6,0.0,0.1,0.303,0.498,3.5,8.8,9.6,1.1,1.7,0.018,0.7,1.3,0.381348,0.985816,6
Jimmer Fredette/2014-15,10.2,8.77907,3.8,0.6,3.4,0.271,0.479,0.6,2.8,17.2,1.5,0.3,0.289,0.2,-2.9,0.432442,0.27907,0
Brandon Bass/2016-17,11.1,10.397727,6.6,0.1,0.2,0.473,0.65,2.7,8.0,5.7,1.2,1.6,0.016,0.6,-0.8,0.463523,0.852273,6
Tyler Zeller/2016-17,10.3,10.795455,5.3,0.0,0.1,0.247,0.508,2.9,8.5,12.2,0.7,3.3,0.006,0.6,0.8,0.4725,1.477273,6
Justin Holiday/2014-15,11.1,8.861386,3.1,1.9,6.0,0.191,0.498,0.7,4.0,10.1,3.0,1.4,0.464,1.0,0.5,0.483663,0.514851,6


In [29]:
X.to_csv('../data/final_gaussian.csv')