In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

In [4]:
# Constants - The master stat (feature) list, and the corresponding player data.
stats_list = ['MP_x','FG','FGA','2P','2PA','3P','3PA','FT','FTA','ORB', 'DRB', 'TRB', 'AST', 'STL','BLK','TOV','PF','PTS','FG%','2P%','3P%', 'eFG%', 'FT%', 'TS%_x', 'PER', '3PAr','FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%','ORtg', 'DRtg', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM','VORP', 'dist', 'spd', 'tchs', 'pass', 'sast', 'ftast', 'dfgm','dfga']
df = pd.read_csv('../data/final_stats.csv')
df.set_index('player_year',inplace=True)
df.fillna(value=0,inplace=True)

In [5]:
feature_space = ['MP_x','tchs','2P','3P','3PA','FTr','TS%_x','ORB','TRB','AST%','STL%','BLK%','3PAr','DWS','DBPM','dist','dfga']

In [7]:
# Optimizing Agglomerative Clustering
X = df[feature_space]

cluster_list = range(5,25)
affinity_list = ['manhattan','l1','l2','cosine','euclidean']
comp_list = [2,3,4,5]
linkage_list = ['complete','average']

for k in cluster_list:
    print ('n clusters = {}'.format(k))
    max_score = [0,0,0,0]
    for affinity in affinity_list:
        for c in comp_list:
            for l in linkage_list:
                pca = PCA(n_components=c,whiten=True,svd_solver='full')
                reduced = pca.fit_transform(X)

                agg = AgglomerativeClustering(n_clusters=k,affinity=affinity,linkage=l)
                agg.fit(reduced)
                labels = agg.labels_
                try:
                    score = silhouette_score(reduced,labels)
                except:
                    continue
                if score > max_score[0]:
                    max_score = [score,'affinity:{}'.format(affinity),'{} linkage'.format(l),'{} pca components'.format(c)]
                    print ('new max score: {}'.format(max_score))

n clusters = 5
new max score: [0.34719174155773647, 'affinity:manhattan', 'complete linkage', '2 pca components']
new max score: [0.36175239685887317, 'affinity:l2', 'average linkage', '2 pca components']
n clusters = 6
new max score: [0.34207234958094279, 'affinity:manhattan', 'complete linkage', '2 pca components']
new max score: [0.3799916244496353, 'affinity:l2', 'average linkage', '2 pca components']
n clusters = 7
new max score: [0.31519347724543539, 'affinity:manhattan', 'complete linkage', '2 pca components']
new max score: [0.31700525748588121, 'affinity:l2', 'complete linkage', '2 pca components']
new max score: [0.37581178978782481, 'affinity:l2', 'average linkage', '2 pca components']
n clusters = 8
new max score: [0.31472730319411596, 'affinity:manhattan', 'complete linkage', '2 pca components']
new max score: [0.364180846591244, 'affinity:l2', 'average linkage', '2 pca components']
n clusters = 9
new max score: [0.32391961839158978, 'affinity:manhattan', 'complete linkage

In [8]:
pca = PCA(n_components=2,whiten=True,svd_solver='full')
agg = AgglomerativeClustering(n_clusters=13,affinity='l2',linkage='average')
reduced = pca.fit_transform(X)

agg.fit(X)

AgglomerativeClustering(affinity='l2', compute_full_tree='auto',
            connectivity=None, linkage='average',
            memory=Memory(cachedir=None), n_clusters=13,
            pooling_func=<function mean at 0x10f344840>)

In [9]:
X['label'] = agg.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [12]:
X.loc['Nikola Jokic/2016-17']
X['label'].value_counts()

3     346
7     229
2      70
0      66
4      65
1      60
5      43
12     10
10      8
9       6
11      3
8       3
6       2
Name: label, dtype: int64

In [19]:
X[X['label'] == 5]

Unnamed: 0_level_0,MP_x,tchs,2P,3P,3PA,FTr,TS%_x,ORB,TRB,AST%,STL%,BLK%,3PAr,DWS,DBPM,dist,dfga,label
player_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Steve Blake/2015-16,17.0,24.62037,1.5,1.9,5.5,0.083,0.51,0.3,3.2,29.5,1.2,0.4,0.629,0.8,-1.8,0.703333,0.583333,5
Tim Frazier/2015-16,14.5,25.549296,3.9,0.8,2.3,0.323,0.499,1.3,5.3,33.3,2.2,0.2,0.21,0.5,-0.7,0.742958,0.549296,5
Shelvin Mack/2014-15,15.1,23.388889,3.3,1.7,5.5,0.106,0.489,0.3,3.4,28.6,1.8,0.2,0.435,0.8,-2.2,0.770333,0.5,5
Marcelo Huertas/2015-16,16.4,23.317073,3.4,0.7,2.5,0.126,0.488,0.7,3.7,33.9,1.5,0.6,0.265,0.1,-2.9,0.771463,0.536585,5
Mario Chalmers/2015-16,22.5,28.0,3.6,1.2,3.9,0.59,0.561,0.6,4.1,26.6,3.3,0.9,0.336,1.5,-0.3,0.777647,0.537815,5
David West/2016-17,12.6,25.240964,5.6,0.1,0.3,0.222,0.571,2.0,8.6,24.1,2.4,4.4,0.032,1.7,3.9,0.789759,3.39759,5
Manu Ginobili/2015-16,19.6,24.924528,4.0,2.2,5.7,0.257,0.573,0.8,4.6,23.6,3.0,0.8,0.411,2.3,1.3,0.873962,0.698113,5
DJ Augustin/2015-16,19.0,27.466667,2.4,2.1,5.1,0.366,0.575,0.2,2.9,25.3,1.6,0.3,0.48,0.6,-2.7,0.886,0.533333,5
Nick Calathes/2014-15,14.4,28.2625,4.0,0.5,1.8,0.182,0.455,0.8,4.6,26.6,3.9,0.5,0.174,1.5,1.8,0.889375,0.65,5
Aaron Brooks/2015-16,16.1,29.662651,4.0,2.1,6.0,0.136,0.494,0.7,3.3,26.0,1.4,0.7,0.394,0.7,-2.8,0.918554,0.710843,5
