In [10]:
import pandas as pd
import numpy as np

from sklearn.cluster import AffinityPropagation
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabaz_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize

from agglomerative import get_data
from cluster_visual import plot_tsne_scatter

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [11]:
# Constants: the master list of stat (feature) column names, and the player
# data frame those columns are drawn from.
stats_list = [
    'MP_x', 'FG', 'FGA', '2P', '2PA', '3P', '3PA', 'FT', 'FTA',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'FG%', '2P%', '3P%', 'eFG%', 'FT%', 'TS%_x',
    'PER', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'ORtg', 'DRtg', 'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
    'dist', 'spd', 'tchs', 'pass', 'sast', 'ftast', 'dfgm', 'dfga',
]

# Read the stats table, index it by the 'player_year' column, and replace
# every missing value with 0.
df = (
    pd.read_csv('../data/final_stats.csv')
    .set_index('player_year')
    .fillna(value=0)
)

In [12]:
# After lots of feature-space randomization and analysis, use what we've
# learned to hand-craft a feature space. This is the (tentative) new stats list:
new_stats_list = [
    'MP_x', '2PA', '3PA', 'FTr', 'TS%_x',
    'ORB', 'TRB', 'AST%', 'STL%', 'BLK%',
    '3PAr', 'DWS', 'DBPM', 'dist', 'dfga',
]

In [14]:
# Let's optimize a Gaussian Mixture model.
#
# Grid-search over the number of mixture components (k), the number of PCA
# components and the covariance type. A configuration is reported only when it
# assigns 'Nikola Jokic/2016-17' and 'Marc Gasol/2016-17' to the same cluster.
X = df[new_stats_list]

cluster_list = range(10,25)
pca_components = [2,3,4,5]
covar_list = ['full','diag','tied','spherical']
gmm_seed = 0  # fixed seed so re-running the cell reproduces the same clusterings

for k in cluster_list:
    print("n clusters = {}".format(k))
    for com in pca_components:
        # The projection depends only on `com`, so compute it once per PCA size
        # instead of once per covariance type.
        pca = PCA(n_components=com,whiten=True,svd_solver='full')
        reduced = pca.fit_transform(X)

        for covar in covar_list:
            gmm = GaussianMixture(n_components=k,covariance_type=covar,random_state=gmm_seed)
            gmm.fit(reduced)

            # Predict in the PCA space the model was fit on; predicting on the
            # raw feature frame fails because the column counts differ.
            cluster_ids = gmm.predict(reduced)

            try:
                silhouette = silhouette_score(reduced,cluster_ids)
                calinski = calinski_harabaz_score(reduced,cluster_ids)
            except ValueError:
                # The scores are undefined when the model collapses to a single
                # populated cluster; skip that configuration.
                continue

            # Keep labels in their own Series rather than writing a column into
            # X, which would leak the labels into the next PCA fit.
            labels = pd.Series(cluster_ids,index=X.index)
            if labels.loc['Nikola Jokic/2016-17'] == labels.loc['Marc Gasol/2016-17']:
                print("{} components| covariance type: {} | silhouette: {} | calinski: {}".format(com,covar,silhouette,calinski))