In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

In [2]:
# Constants - the master stat (feature) list, followed by the player data
# table those column names refer to.
stats_list = [
    'MP_x', 'FG', 'FGA', '2P', '2PA', '3P', '3PA', 'FT', 'FTA',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'FG%', '2P%', '3P%', 'eFG%', 'FT%', 'TS%_x', 'PER', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'ORtg', 'DRtg', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
    'dist', 'spd', 'tchs', 'pass', 'sast', 'ftast', 'dfgm', 'dfga',
]

# Load the CSV, key every row by its 'player_year' value, and replace any
# missing entries with 0 so downstream numeric code never sees NaN.
df = (
    pd.read_csv('../data/final_stats.csv')
    .set_index('player_year')
    .fillna(value=0)
)

In [3]:
# Columns of `df` that are fed to the clustering search (a subset of stats_list).
feature_space = [
    'MP_x', 'tchs',
    '2P', '3P', '3PA', 'FTr', 'TS%_x',
    'ORB', 'TRB',
    'AST%', 'STL%', 'BLK%', '3PAr',
    'DWS', 'DBPM',
    'dist', 'dfga',
]

In [12]:
# Optimizing BIRCH
# Coarse grid search over (threshold, branching factor, PCA component count).
# Each combination is scored with the silhouette coefficient and every new
# best score is printed as it is found.
# NOTE(review): normalize() rescales each row (sample) to unit norm; it does
# not standardize the feature columns - confirm that is the intended scaling.
X = df[feature_space]
X = normalize(X)

threshold_list = np.linspace(0.1,4,50)
# Birch requires branching_factor > 1, so the grid starts at 10 rather than 0
# (a 0 entry can never be fitted and would only be skipped).
branch_list = np.arange(10,120,10)
comp_list = [2,3,4,5]
max_score = [0,0,0,0]

# The PCA projection depends only on the number of components, not on the
# BIRCH parameters, so fit it once per component count instead of once per
# grid point. svd_solver='full' is deterministic, so scores are unaffected.
reduced_by_comp = {}
for c in comp_list:
    pca = PCA(n_components=c,whiten=True,svd_solver='full')
    reduced_by_comp[c] = pca.fit_transform(X)

for t in threshold_list:
    print ('threshold = {}'.format(t))
    for b in branch_list:
        for c in comp_list:
            reduced = reduced_by_comp[c]

            birch = Birch(threshold=t,branching_factor=b,n_clusters=None)
            try:
                birch.fit(reduced)
                score = silhouette_score(reduced,birch.labels_)
            except ValueError:
                # silhouette_score is only defined for 2..n_samples-1 distinct
                # labels; skip parameter combinations that collapse to a single
                # cluster (or are otherwise rejected). Only ValueError is
                # caught so that Ctrl-C and genuine bugs still surface.
                continue
            if score > max_score[0]:
                max_score = [score,'threshold:{}'.format(t),'branching:{}'.format(b),'{} pca components'.format(c)]
                print ('new max score: {}'.format(max_score) + '{} clusters'.format(len(np.unique(birch.labels_))))

threshold = 0.1
new max score: [0.28789277833066979, 'threshold:0.1', 'branching:10', '2 pca components']284 clusters
new max score: [0.30235196022519839, 'threshold:0.1', 'branching:20', '2 pca components']272 clusters
new max score: [0.31526837476948594, 'threshold:0.1', 'branching:30', '2 pca components']264 clusters
new max score: [0.32591314329101095, 'threshold:0.1', 'branching:50', '2 pca components']256 clusters
new max score: [0.32954721528174646, 'threshold:0.1', 'branching:90', '2 pca components']251 clusters
threshold = 0.17959183673469387
threshold = 0.25918367346938775
threshold = 0.3387755102040816
threshold = 0.41836734693877553
threshold = 0.49795918367346936
threshold = 0.5775510204081632
threshold = 0.6571428571428571
threshold = 0.736734693877551
threshold = 0.8163265306122448
threshold = 0.8959183673469387
new max score: [0.36438150449398515, 'threshold:0.8959183673469387', 'branching:10', '2 pca components']6 clusters
threshold = 0.9755102040816326
threshold = 1.0

In [15]:
# Optimizing BIRCH, round 2
# Finer grid search: thresholds restricted to [0.8, 0.9] and small branching
# factors, reporting the best silhouette score separately for each threshold.
# NOTE(review): normalize() rescales each row (sample) to unit norm; it does
# not standardize the feature columns - confirm that is the intended scaling.
X = df[feature_space]
X = normalize(X)

threshold_list = np.linspace(0.8,0.9,20)
# Birch requires branching_factor > 1, so the grid starts at 2 rather than 1
# (a value of 1 can never be fitted and would only be skipped).
branch_list = np.arange(2,21,1)
comp_list = [2,3,4,5]

# The PCA projection depends only on the number of components, not on the
# BIRCH parameters, so fit it once per component count instead of once per
# grid point. svd_solver='full' is deterministic, so scores are unaffected.
reduced_by_comp = {}
for c in comp_list:
    pca = PCA(n_components=c,whiten=True,svd_solver='full')
    reduced_by_comp[c] = pca.fit_transform(X)

for t in threshold_list:
    max_score = [0,0,0,0] # reset high score for each threshold value

    print ('threshold = {}'.format(t))
    for b in branch_list:
        for c in comp_list:
            reduced = reduced_by_comp[c]

            birch = Birch(threshold=t,branching_factor=b,n_clusters=None)
            try:
                birch.fit(reduced)
                score = silhouette_score(reduced,birch.labels_)
            except ValueError:
                # silhouette_score is only defined for 2..n_samples-1 distinct
                # labels; skip parameter combinations that collapse to a single
                # cluster (or are otherwise rejected). Only ValueError is
                # caught so that Ctrl-C and genuine bugs still surface.
                continue
            if score > max_score[0]:
                max_score = [score,'threshold:{}'.format(t),'branching:{}'.format(b),'{} pca components'.format(c)]
                print ('new max score: {}'.format(max_score) + '{} clusters'.format(len(np.unique(birch.labels_))))

threshold = 0.8
new max score: [0.31503884977845314, 'threshold:0.8', 'branching:2', '2 pca components']8 clusters
new max score: [0.34288276239147858, 'threshold:0.8', 'branching:6', '2 pca components']7 clusters
new max score: [0.34366334062058729, 'threshold:0.8', 'branching:7', '2 pca components']7 clusters
threshold = 0.8052631578947369
new max score: [0.30697432259376611, 'threshold:0.8052631578947369', 'branching:2', '2 pca components']9 clusters
new max score: [0.31676503194669742, 'threshold:0.8052631578947369', 'branching:3', '2 pca components']8 clusters
new max score: [0.32395892515083563, 'threshold:0.8052631578947369', 'branching:7', '2 pca components']7 clusters
threshold = 0.8105263157894738
new max score: [0.32404443548203077, 'threshold:0.8105263157894738', 'branching:2', '2 pca components']10 clusters
threshold = 0.8157894736842106
new max score: [0.3133007348925545, 'threshold:0.8157894736842106', 'branching:2', '2 pca components']10 clusters
new max score: [0.31668