# Clustering for FIFA Index dataset

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn import metrics
from collections import defaultdict

def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    The values are tallied in a single pass. When several values share the
    highest count, the one that appears first in ``lst`` is returned, so the
    result does not depend on set iteration order (relies on insertion-ordered
    dicts, i.e. Python 3.7+).

    Parameters
    ----------
    lst : iterable of hashable
        Values to tally.

    Returns
    -------
    object
        The most frequent element of ``lst``.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    # Local import: the notebook's import cell only brings in defaultdict.
    from collections import Counter

    counts = Counter(lst)
    if not counts:
        raise ValueError('most_common() arg is an empty sequence')
    # most_common(1) yields [(value, count)] for the top entry.
    return counts.most_common(1)[0][0]

# Part1: Use Kmeans++ to cluster all the players (Only numeric values)

In [2]:
# Load the featured player table; the file's first column is used as the row index.
csv = pd.read_csv('FIFIndex_BPL_players_featured.csv', index_col=0)
print(csv.shape)
csv.head(10)

(34922, 60)


Unnamed: 0,PlayerID,Name,UpdateDate,Country,IsHomeGrown,OverallScore,PotentialScore,PotentialGrowth(%),Height,Weight,...,FKAcc,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes,GoodAtAttack,GoodBallSense
0,Mesut Özil 10/15/1988,Mesut Özil,Dec_12_2016,Germany,0.0,89,0.864865,0.0,0.5,0.431818,...,0.853659,0.658824,0.816092,0.058824,0.057471,0.144444,0.104651,0.146067,0.0,0.0
1,Harry Kane 07/28/1993,Harry Kane,Dec_12_2016,England,1.0,84,0.891892,0.152174,0.625,0.659091,...,0.695122,0.835294,0.735632,0.152941,0.08046,0.1,0.116279,0.11236,0.0,0.0
2,Coutinho 06/12/1992,Coutinho,Dec_12_2016,Brazil,0.0,85,0.891892,0.130435,0.2,0.25,...,0.853659,0.623529,0.793103,0.152941,0.126437,0.066667,0.093023,0.05618,0.0,0.0
3,Sergio Agüero 06/02/1988,Sergio Agüero,Dec_12_2016,Argentina,0.0,89,0.864865,0.0,0.225,0.386364,...,0.768293,0.811765,0.908046,0.117647,0.137931,0.155556,0.05814,0.146067,1.0,0.0
4,Nemanja Matić 08/01/1988,Nemanja Matić,Dec_12_2016,Serbia,0.0,84,0.72973,0.0,0.775,0.613636,...,0.707317,0.682353,0.781609,0.152941,0.068966,0.155556,0.127907,0.089888,0.0,1.0
5,Vincent Kompany 04/10/1986,Vincent Kompany,Dec_12_2016,Belgium,0.0,85,0.756757,0.0,0.725,0.636364,...,0.52439,0.611765,0.45977,0.082353,0.103448,0.088889,0.046512,0.05618,0.0,0.0
6,Cesc Fàbregas 05/04/1987,Cesc Fàbregas,Dec_12_2016,Spain,0.0,86,0.783784,0.0,0.3,0.386364,...,0.878049,0.811765,0.862069,0.164706,0.057471,0.1,0.081395,0.157303,0.0,0.0
7,Riyad Mahrez 02/21/1991,Riyad Mahrez,Dec_12_2016,Algeria,0.0,84,0.810811,0.086957,0.4,0.113636,...,0.768293,0.717647,0.747126,0.117647,0.16092,0.088889,0.139535,0.05618,0.0,0.0
8,Granit Xhaka 09/27/1992,Granit Xhaka,Dec_12_2016,Switzerland,0.0,84,0.810811,0.086957,0.55,0.568182,...,0.743902,0.564706,0.505747,0.070588,0.068966,0.077778,0.116279,0.134831,0.0,0.0
9,Claudio Bravo 04/13/1983,Claudio Bravo,Dec_12_2016,Chile,0.0,85,0.756757,0.0,0.525,0.522727,...,0.621951,0.141176,0.057471,0.905882,0.942529,0.933333,1.0,0.94382,0.0,0.0


In [3]:
# Build the feature matrix: keep only the numeric columns and leave out
# OverallScore, which is held back as the label used to interpret the clusters.
# select_dtypes is the public counterpart of the private
# DataFrame._get_numeric_data(); 'bool' is listed so boolean columns, which
# that helper also keeps, are not lost.
numerics = csv.select_dtypes(include=['number', 'bool'])
# The positional axis argument (drop(label, 1)) was removed in pandas 2.0,
# so the column is named explicitly.
numerics = numerics.drop(columns='OverallScore')
numerics.head(10)

Unnamed: 0,IsHomeGrown,PotentialScore,PotentialGrowth(%),Height,Weight,IsPhysicalAnomaly,Age,NumberOfPositisions,SwitchedTeams,Contract,...,FKAcc,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes,GoodAtAttack,GoodBallSense
0,0.0,0.864865,0.0,0.5,0.431818,0.0,0.478261,0.25,0.0,0.166667,...,0.853659,0.658824,0.816092,0.058824,0.057471,0.144444,0.104651,0.146067,0.0,0.0
1,1.0,0.891892,0.152174,0.625,0.659091,0.0,0.26087,0.0,0.0,0.833333,...,0.695122,0.835294,0.735632,0.152941,0.08046,0.1,0.116279,0.11236,0.0,0.0
2,0.0,0.891892,0.130435,0.2,0.25,1.0,0.304348,0.5,0.0,0.5,...,0.853659,0.623529,0.793103,0.152941,0.126437,0.066667,0.093023,0.05618,0.0,0.0
3,0.0,0.864865,0.0,0.225,0.386364,1.0,0.478261,0.0,0.0,0.5,...,0.768293,0.811765,0.908046,0.117647,0.137931,0.155556,0.05814,0.146067,1.0,0.0
4,0.0,0.72973,0.0,0.775,0.613636,1.0,0.478261,0.25,0.0,0.333333,...,0.707317,0.682353,0.781609,0.152941,0.068966,0.155556,0.127907,0.089888,0.0,1.0
5,0.0,0.756757,0.0,0.725,0.636364,1.0,0.565217,0.0,0.0,0.333333,...,0.52439,0.611765,0.45977,0.082353,0.103448,0.088889,0.046512,0.05618,0.0,0.0
6,0.0,0.783784,0.0,0.3,0.386364,0.0,0.521739,0.25,0.0,0.333333,...,0.878049,0.811765,0.862069,0.164706,0.057471,0.1,0.081395,0.157303,0.0,0.0
7,0.0,0.810811,0.086957,0.4,0.113636,1.0,0.347826,0.25,0.0,0.5,...,0.768293,0.717647,0.747126,0.117647,0.16092,0.088889,0.139535,0.05618,0.0,0.0
8,0.0,0.810811,0.086957,0.55,0.568182,0.0,0.304348,0.0,0.0,0.666667,...,0.743902,0.564706,0.505747,0.070588,0.068966,0.077778,0.116279,0.134831,0.0,0.0
9,0.0,0.756757,0.0,0.525,0.522727,0.0,0.695652,0.0,0.0,0.5,...,0.621951,0.141176,0.057471,0.905882,0.942529,0.933333,1.0,0.94382,0.0,0.0


In [25]:
# Choose number of clusters
k = 2
# random_state pins the k-means++ seeding so that reruns produce the same
# clusters; with n_init=1 there is only one initialisation to rely on.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=42)
result_k = kmeans.fit_predict(numerics)

# See what players are in each cluster.
# `numerics` is a column subset of `csv`, so the i-th label belongs to the
# i-th row of `csv`. Rows are paired positionally: feeding index *labels* to
# .iloc is only correct when the index happens to be exactly 0..n-1, and a
# per-row DataFrame lookup is needlessly slow on ~35k rows.
player_info_cols = ['PlayerID', 'UpdateDate', 'OverallScore', 'PreferredPositions']
cluster_dict_k = defaultdict(list)
for cluster_label, player_info in zip(result_k, csv[player_info_cols].values.tolist()):
    cluster_dict_k[cluster_label].append(player_info)

cluster_dict_k[0][0:20]

[['Claudio Bravo 04/13/1983', 'Dec_12_2016', 85, 'GK'],
 ['Thibaut Courtois 05/11/1992', 'Dec_12_2016', 89, 'GK'],
 ['Hugo Lloris 12/26/1986', 'Dec_12_2016', 88, 'GK'],
 ['Petr Čech 05/20/1982', 'Dec_12_2016', 88, 'GK'],
 ['De Gea 11/07/1990', 'Dec_12_2016', 90, 'GK'],
 ['Loris Karius 06/22/1993', 'Dec_12_2016', 82, 'GK'],
 ['Asmir Begović 06/20/1987', 'Dec_12_2016', 83, 'GK'],
 ['Steve Mandanda 03/28/1985', 'Dec_12_2016', 84, 'GK'],
 ['Victor Valdés 01/14/1982', 'Dec_12_2016', 82, 'GK'],
 ['Jack Butland 03/10/1993', 'Dec_12_2016', 82, 'GK'],
 ['Kasper Schmeichel 11/05/1986', 'Dec_12_2016', 82, 'GK'],
 ['Ron-Robert Zieler 02/12/1989', 'Dec_12_2016', 81, 'GK'],
 ['Adrián 01/31/1987', 'Dec_12_2016', 80, 'GK'],
 ['Michel Vorm 10/20/1983', 'Dec_12_2016', 80, 'GK'],
 ['Łukasz Fabiański 04/18/1985', 'Dec_12_2016', 80, 'GK'],
 ['Ben Foster 04/03/1983', 'Dec_12_2016', 80, 'GK'],
 ['Fraser Forster 03/17/1988', 'Dec_12_2016', 79, 'GK'],
 ['Sergio Romero 02/22/1987', 'Dec_12_2016', 79, 'GK'],
 ['

In [26]:
# Summarise the OverallScore values found inside each k-means cluster.
for cluster_id in range(k):
    # OverallScore is the third field of every player record.
    scores = [record[2] for record in cluster_dict_k[cluster_id]]

    lowest = min(scores)
    highest = max(scores)

    print('Result for cluster {}:'.format(cluster_id))
    print('The maximum OverallScore is {}'.format(highest))
    print('The minimum OverallScore is {}'.format(lowest))
    print('The most common OverallScore is {}'.format(most_common(scores)))
    print('')

Result for cluster 0:
The maximum OverallScore is 90
The minimum OverallScore is 48
The most common OverallScore is 73

Result for cluster 1:
The maximum OverallScore is 90
The minimum OverallScore is 48
The most common OverallScore is 77



In [27]:
# For each cluster, list the three features with the largest centroid
# coordinates, used here as a rough indicator of what characterises it.
# argsort is ascending, so every row is reversed to put the largest first.
desc_order_centroids = np.argsort(kmeans.cluster_centers_, axis=1)[:, ::-1]

# Column labels of the feature matrix, in the same order as the centroid axes.
features = numerics.columns

for cluster_id in range(k):
    top_three = desc_order_centroids[cluster_id, :3]
    print("Cluster {}:".format(cluster_id))
    for feature_pos in top_three:
        print(' {}'.format(features[feature_pos]))
    print()

Cluster 0:
 GKDiving
 GKReflexes
 GKPositioning

Cluster 1:
 BallControl
 Stamina
 Acceleration



In [28]:
# Collect the distinct preferred positions that occur in each cluster.
for cluster_id in range(k):
    positions = {
        position
        for record in cluster_dict_k[cluster_id]
        # The fourth field may hold several positions joined by '/'.
        for position in (record[3].split('/') if '/' in record[3] else [record[3]])
    }

    print('Result for cluster {}:'.format(cluster_id))
    print('Positions are: ')
    print(positions)
    print('')

Result for cluster 0:
Positions are: 
{'GK'}

Result for cluster 1:
Positions are: 
{'CB', 'CDM', 'RW', 'LB', 'CAM', 'CF', 'ST', 'LM', 'RB', 'LW', 'RWB', 'RM', 'LWB', 'CM'}



# Part2: Use DBSCAN to cluster all the players (Only numeric values)

In [31]:
# Density-based clustering; eps is the neighbourhood radius.
dbscan = DBSCAN(eps=1.1)
# np.int was removed in NumPy 1.24; the builtin int is what it aliased.
result_d = dbscan.fit_predict(numerics).astype(int)

# See what players are in each cluster (DBSCAN files noise points under -1).
# `numerics` is a column subset of `csv`, so the i-th label belongs to the
# i-th row of `csv`; rows are paired positionally rather than by passing
# index labels to .iloc.
cluster_dict_d = defaultdict(list)
for cluster_label, player_info in zip(result_d, csv[['PlayerID', 'UpdateDate', 'OverallScore']].values.tolist()):
    cluster_dict_d[cluster_label].append(player_info)

# .get avoids creating an empty entry for label 0 if no cluster was found.
cluster_dict_d.get(0, [])[0:20]

[['Mesut Özil 10/15/1988', 'Dec_12_2016', 89],
 ['Harry Kane 07/28/1993', 'Dec_12_2016', 84],
 ['Coutinho 06/12/1992', 'Dec_12_2016', 85],
 ['Sergio Agüero 06/02/1988', 'Dec_12_2016', 89],
 ['Nemanja Matić 08/01/1988', 'Dec_12_2016', 84],
 ['Vincent Kompany 04/10/1986', 'Dec_12_2016', 85],
 ['Cesc Fàbregas 05/04/1987', 'Dec_12_2016', 86],
 ['Riyad Mahrez 02/21/1991', 'Dec_12_2016', 84],
 ['Granit Xhaka 09/27/1992', 'Dec_12_2016', 84],
 ['Henrikh Mkhitaryan 01/21/1989', 'Dec_12_2016', 85],
 ['David Silva 01/08/1986', 'Dec_12_2016', 87],
 ['Eden Hazard 01/07/1991', 'Dec_12_2016', 89],
 ['Paul Pogba 03/15/1993', 'Dec_12_2016', 88],
 ['Diego Costa 10/07/1988', 'Dec_12_2016', 86],
 ['Romelu Lukaku 05/13/1993', 'Dec_12_2016', 84],
 ['Toby Alderweireld 03/02/1989', 'Dec_12_2016', 85],
 ['Laurent Koscielny 09/10/1985', 'Dec_12_2016', 85],
 ['Alexis Sánchez 12/19/1988', 'Dec_12_2016', 87],
 ['İlkay Gündoğan 10/24/1990', 'Dec_12_2016', 85],
 ['Santi Cazorla 12/13/1984', 'Dec_12_2016', 86]]

In [32]:
# Summarise the OverallScore values of every DBSCAN cluster.
# The cluster labels are read from the dictionary keys, skipping the noise
# bucket (-1). The earlier `range(len(cluster_dict_d) - 2)` left out the last
# cluster, and one more whenever no point had been labelled as noise.
for i in sorted(label for label in cluster_dict_d if label != -1):
    # OverallScore is the third field of every player record.
    score = [item[2] for item in cluster_dict_d[i]]
    if not score:
        # Defensive: an empty entry has no min/max to report.
        continue

    minScore = min(score)
    maxScore = max(score)

    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(maxScore))
    print('The minimum OverallScore is ' + str(minScore))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

Result for cluster 0:
The maximum OverallScore is 90
The minimum OverallScore is 48
The most common OverallScore is 77

Result for cluster 1:
The maximum OverallScore is 90
The minimum OverallScore is 48
The most common OverallScore is 73

Result for cluster 2:
The maximum OverallScore is 83
The minimum OverallScore is 72
The most common OverallScore is 75

Result for cluster 3:
The maximum OverallScore is 84
The minimum OverallScore is 83
The most common OverallScore is 84

Result for cluster 4:
The maximum OverallScore is 82
The minimum OverallScore is 82
The most common OverallScore is 82

Result for cluster 5:
The maximum OverallScore is 81
The minimum OverallScore is 75
The most common OverallScore is 81

Result for cluster 6:
The maximum OverallScore is 81
The minimum OverallScore is 78
The most common OverallScore is 81

Result for cluster 7:
The maximum OverallScore is 78
The minimum OverallScore is 72
The most common OverallScore is 72

Result for cluster 8:
The maximum Overal

# Part3: Use Hierarchical clustering for all the players (Only numeric values)

In [None]:
# Agglomerative clustering into k clusters (k as set earlier in the notebook).
# NOTE(review): this cell has no recorded output; agglomerative clustering on
# the full table may be slow or memory-hungry - confirm that it completes.
HC = AgglomerativeClustering(n_clusters=k)
result_h = HC.fit_predict(numerics)

# See what players are in each cluster.
# `numerics` is a column subset of `csv`, so the i-th label belongs to the
# i-th row of `csv`; rows are paired positionally rather than by passing
# index labels to .iloc.
cluster_dict_h = defaultdict(list)
for cluster_label, player_info in zip(result_h, csv[['PlayerID', 'UpdateDate', 'OverallScore']].values.tolist()):
    cluster_dict_h[cluster_label].append(player_info)

cluster_dict_h[0][0:20]

In [None]:
# Summarise the OverallScore values of every hierarchical cluster.
# The labels are taken from the dictionary itself: the model is built with
# n_clusters=k, so a hard-coded range(5) would hit empty entries (and make
# min() raise) whenever k is smaller than 5, and miss clusters when larger.
for i in sorted(cluster_dict_h):
    # OverallScore is the third field of every player record.
    score = [item[2] for item in cluster_dict_h[i]]

    minScore = min(score)
    maxScore = max(score)

    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(maxScore))
    print('The minimum OverallScore is ' + str(minScore))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

# Part4: Use Kmeans++ to cluster all the players (includes categorical values)

In [9]:
# Load the table used for this part; the file's first column is used as the row index.
csv_cate = pd.read_csv('FIFIndex_BPL_players_numeric_featured.csv', index_col=0)
print(csv_cate.shape)
csv_cate.head(10)

(23790, 1536)


Unnamed: 0,PlayerID,UpdateDate,IsHomeGrown,OverallScore,PotentialScore,PotentialGrowth(%),Height,Weight,IsPhysicalAnomaly,Age,...,StartYear_12/23/2014,StartYear_12/23/2016,StartYear_12/24/2015,Contract_0.0,Contract_0.166666666667,Contract_0.333333333333,Contract_0.5,Contract_0.666666666667,Contract_0.833333333333,Contract_1.0
0,Mesut Özil 10/15/1988,Dec_12_2016,0.0,89,0.864865,0.0,0.5,0.431818,0.0,0.478261,...,0,0,0,0,1,0,0,0,0,0
1,Harry Kane 07/28/1993,Dec_12_2016,1.0,84,0.891892,0.152174,0.625,0.659091,0.0,0.26087,...,0,0,0,0,0,0,0,0,1,0
2,Coutinho 06/12/1992,Dec_12_2016,0.0,85,0.891892,0.130435,0.2,0.25,1.0,0.304348,...,0,0,0,0,0,0,1,0,0,0
3,Sergio Agüero 06/02/1988,Dec_12_2016,0.0,89,0.864865,0.0,0.225,0.386364,1.0,0.478261,...,0,0,0,0,0,0,1,0,0,0
4,Nemanja Matić 08/01/1988,Dec_12_2016,0.0,84,0.72973,0.0,0.775,0.613636,1.0,0.478261,...,0,0,0,0,0,1,0,0,0,0
5,Vincent Kompany 04/10/1986,Dec_12_2016,0.0,85,0.756757,0.0,0.725,0.636364,1.0,0.565217,...,0,0,0,0,0,1,0,0,0,0
6,Cesc Fàbregas 05/04/1987,Dec_12_2016,0.0,86,0.783784,0.0,0.3,0.386364,0.0,0.521739,...,0,0,0,0,0,1,0,0,0,0
7,Riyad Mahrez 02/21/1991,Dec_12_2016,0.0,84,0.810811,0.086957,0.4,0.113636,1.0,0.347826,...,0,0,0,0,0,0,1,0,0,0
8,Granit Xhaka 09/27/1992,Dec_12_2016,0.0,84,0.810811,0.086957,0.55,0.568182,0.0,0.304348,...,0,0,0,0,0,0,0,1,0,0
9,Claudio Bravo 04/13/1983,Dec_12_2016,0.0,85,0.756757,0.0,0.525,0.522727,0.0,0.695652,...,0,0,0,0,0,0,1,0,0,0


In [12]:
# Build the feature matrix for this part. The categorical columns appear to be
# one-hot encoded already (0/1 columns such as Contract_*), so they are numeric
# and stay in; only the non-numeric columns and OverallScore, which is used as
# the label, are left out.
# select_dtypes is the public counterpart of the private
# DataFrame._get_numeric_data(); 'bool' is listed so boolean columns, which
# that helper also keeps, are not lost.
numerics_cate = csv_cate.select_dtypes(include=['number', 'bool'])
# The positional axis argument (drop(label, 1)) was removed in pandas 2.0,
# so the column is named explicitly.
numerics_cate = numerics_cate.drop(columns='OverallScore')
print(numerics_cate.shape)
numerics_cate.head(10)

(23790, 1533)


Unnamed: 0,IsHomeGrown,PotentialScore,PotentialGrowth(%),Height,Weight,IsPhysicalAnomaly,Age,NumberOfPositisions,SwitchedTeams,YearsLeftInContract,...,StartYear_12/23/2014,StartYear_12/23/2016,StartYear_12/24/2015,Contract_0.0,Contract_0.166666666667,Contract_0.333333333333,Contract_0.5,Contract_0.666666666667,Contract_0.833333333333,Contract_1.0
0,0.0,0.864865,0.0,0.5,0.431818,0.0,0.478261,0.25,0.0,0.263158,...,0,0,0,0,1,0,0,0,0,0
1,1.0,0.891892,0.152174,0.625,0.659091,0.0,0.26087,0.0,0.0,0.631579,...,0,0,0,0,0,0,0,0,1,0
2,0.0,0.891892,0.130435,0.2,0.25,1.0,0.304348,0.5,0.0,0.368421,...,0,0,0,0,0,0,1,0,0,0
3,0.0,0.864865,0.0,0.225,0.386364,1.0,0.478261,0.0,0.0,0.473684,...,0,0,0,0,0,0,1,0,0,0
4,0.0,0.72973,0.0,0.775,0.613636,1.0,0.478261,0.25,0.0,0.263158,...,0,0,0,0,0,1,0,0,0,0
5,0.0,0.756757,0.0,0.725,0.636364,1.0,0.565217,0.0,0.0,0.578947,...,0,0,0,0,0,1,0,0,0,0
6,0.0,0.783784,0.0,0.3,0.386364,0.0,0.521739,0.25,0.0,0.263158,...,0,0,0,0,0,1,0,0,0,0
7,0.0,0.810811,0.086957,0.4,0.113636,1.0,0.347826,0.25,0.0,0.315789,...,0,0,0,0,0,0,1,0,0,0
8,0.0,0.810811,0.086957,0.55,0.568182,0.0,0.304348,0.0,0.0,0.263158,...,0,0,0,0,0,0,0,1,0,0
9,0.0,0.756757,0.0,0.525,0.522727,0.0,0.695652,0.0,0.0,0.210526,...,0,0,0,0,0,0,1,0,0,0


In [14]:
# Number of clusters: 5, chosen because OverallScore spans roughly 40-90
# (presumably one cluster per ~10-point band - confirm the intent).
k = 5
# random_state pins the k-means++ seeding so that reruns produce the same
# clusters; with n_init=1 there is only one initialisation to rely on.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=42)
result_k = kmeans.fit_predict(numerics_cate)

# See what players are in each cluster.
# The labels come from `numerics_cate`, so the player details must be read
# from `csv_cate` (same rows, same order). Reading them from `csv`, a
# different table with a different number of rows, pairs labels with the
# wrong players. Rows are matched positionally.
cluster_dict_k = defaultdict(list)
for cluster_label, player_info in zip(result_k, csv_cate[['PlayerID', 'UpdateDate', 'OverallScore']].values.tolist()):
    cluster_dict_k[cluster_label].append(player_info)

cluster_dict_k[0][0:50]

[['Harry Kane 07/28/1993 Dec_12_2016', 'Dec_12_2016', 84],
 ['Coutinho 06/12/1992 Dec_12_2016', 'Dec_12_2016', 85],
 ['Sergio Agüero 06/02/1988 Dec_12_2016', 'Dec_12_2016', 89],
 ['Cesc Fàbregas 05/04/1987 Dec_12_2016', 'Dec_12_2016', 86],
 ['Henrikh Mkhitaryan 01/21/1989 Dec_12_2016', 'Dec_12_2016', 85],
 ['Eden Hazard 01/07/1991 Dec_12_2016', 'Dec_12_2016', 89],
 ['Paul Pogba 03/15/1993 Dec_12_2016', 'Dec_12_2016', 88],
 ['Diego Costa 10/07/1988 Dec_12_2016', 'Dec_12_2016', 86],
 ['Alexis Sánchez 12/19/1988 Dec_12_2016', 'Dec_12_2016', 87],
 ['İlkay Gündoğan 10/24/1990 Dec_12_2016', 'Dec_12_2016', 85],
 ['Santi Cazorla 12/13/1984 Dec_12_2016', 'Dec_12_2016', 86],
 ['Zlatan Ibrahimović 10/03/1981 Dec_12_2016', 'Dec_12_2016', 90],
 ['Dimitri Payet 03/29/1987 Dec_12_2016', 'Dec_12_2016', 85],
 ['Kevin De Bruyne 06/28/1991 Dec_12_2016', 'Dec_12_2016', 88],
 ['Willian 08/09/1988 Dec_12_2016', 'Dec_12_2016', 85],
 ['Islam Slimani 06/18/1988 Dec_12_2016', 'Dec_12_2016', 83],
 ['Ander Herrer

In [16]:
# Summarise the OverallScore values of every cluster found above.
# The labels are taken from the dictionary itself rather than a hard-coded
# range(5), so the loop stays correct if the number of clusters is changed.
for i in sorted(cluster_dict_k):
    # OverallScore is the third field of every player record.
    score = [item[2] for item in cluster_dict_k[i]]

    minScore = min(score)
    maxScore = max(score)

    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(maxScore))
    print('The minimum OverallScore is ' + str(minScore))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

Result for cluster 0:
The maximum OverallScore is 90
The minimum OverallScore is 61
The most common OverallScore is 77

Result for cluster 1:
The maximum OverallScore is 86
The minimum OverallScore is 59
The most common OverallScore is 77

Result for cluster 2:
The maximum OverallScore is 77
The minimum OverallScore is 48
The most common OverallScore is 58

Result for cluster 3:
The maximum OverallScore is 90
The minimum OverallScore is 48
The most common OverallScore is 73

Result for cluster 4:
The maximum OverallScore is 89
The minimum OverallScore is 58
The most common OverallScore is 76



# Part5: Use PCA and Kmeans++ to do clustering (Only numeric values)

In [35]:
# Many of the variables are correlated. It may be useful to perform PCA
pca = PCA(n_components=11)
pca.fit(numerics.values)
# Share of the total variance captured by each retained component.
print(pca.explained_variance_ratio_)
print()
# Report the number of components actually fitted: the message used to say
# "first 10" although 11 components are kept and summed.
print('The first {} principal components explain:'.format(len(pca.explained_variance_ratio_)))
print(sum(pca.explained_variance_ratio_), 'of total variance')

[ 0.38363045  0.12814671  0.11010836  0.0872837   0.05163355  0.04098174
  0.0292438   0.02657965  0.02007613  0.01434518  0.01416295]

The first 10 principal components explain:
0.906192221131 of total variance


In [36]:
# Project the features onto the principal components fitted in the previous
# cell, so the clustered data is exactly the projection whose explained
# variance was reported, instead of fitting a second PCA.
reduced_data = pca.transform(numerics.values)
# random_state pins the k-means++ seeding so that reruns produce the same clusters.
kmeans = KMeans(init='k-means++', n_clusters=10, max_iter=100, n_init=10, random_state=42)
result_k = kmeans.fit_predict(reduced_data)

# See what players are in each cluster.
# `reduced_data` keeps the row order of `numerics`, itself a column subset of
# `csv`, so the i-th label belongs to the i-th row of `csv`; rows are paired
# positionally rather than by passing index labels to .iloc.
cluster_dict_k = defaultdict(list)
for cluster_label, player_info in zip(result_k, csv[['PlayerID', 'UpdateDate', 'OverallScore']].values.tolist()):
    cluster_dict_k[cluster_label].append(player_info)

cluster_dict_k[0][0:30]

[['Callum McManaman 04/25/1991', 'Dec_12_2016', 73],
 ['Demarai Gray 06/28/1996', 'Dec_12_2016', 71],
 ['Josh Onomah 04/27/1997', 'Dec_12_2016', 71],
 ['Lewis Grabban 01/12/1988', 'Dec_12_2016', 70],
 ['Junior Stanislas 11/26/1989', 'Dec_12_2016', 70],
 ['Duncan Watmore 03/08/1994', 'Dec_12_2016', 69],
 ['Dominic Solanke 09/14/1997', 'Dec_12_2016', 69],
 ['Lloyd Isgrove 01/12/1993', 'Dec_12_2016', 67],
 ['Ashley Fletcher 10/02/1995', 'Dec_12_2016', 67],
 ['Gedion Zelalem 01/26/1997', 'Dec_12_2016', 67],
 ['James Wilson 12/01/1995', 'Dec_12_2016', 69],
 ['Angeliño 01/04/1997', 'Dec_12_2016', 65],
 ['Oviemuno Ejaria 11/18/1997', 'Dec_12_2016', 66],
 ['Conor McAleny 08/12/1992', 'Dec_12_2016', 65],
 ['Thierry Ambrose 03/28/1997', 'Dec_12_2016', 66],
 ['Lynden Gooch 12/24/1995', 'Dec_12_2016', 66],
 ['Jeff Reine-Adelaïde 01/17/1998', 'Dec_12_2016', 64],
 ['Ryan Seager 02/05/1996', 'Dec_12_2016', 63],
 ['Carlos de Pena 03/11/1992', 'Dec_12_2016', 62],
 ['Zachary Elbouzedi 04/05/1998', 'Dec_

In [37]:
# Summarise the OverallScore values of every cluster found in the PCA space.
# The labels are taken from the dictionary itself rather than a hard-coded
# range(10), so the loop stays correct if n_clusters is changed.
for i in sorted(cluster_dict_k):
    # OverallScore is the third field of every player record.
    score = [item[2] for item in cluster_dict_k[i]]

    minScore = min(score)
    maxScore = max(score)

    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(maxScore))
    print('The minimum OverallScore is ' + str(minScore))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

Result for cluster 0:
The maximum OverallScore is 76
The minimum OverallScore is 51
The most common OverallScore is 59

Result for cluster 1:
The maximum OverallScore is 90
The minimum OverallScore is 48
The most common OverallScore is 73

Result for cluster 2:
The maximum OverallScore is 88
The minimum OverallScore is 70
The most common OverallScore is 82

Result for cluster 3:
The maximum OverallScore is 77
The minimum OverallScore is 48
The most common OverallScore is 58

Result for cluster 4:
The maximum OverallScore is 90
The minimum OverallScore is 61
The most common OverallScore is 78

Result for cluster 5:
The maximum OverallScore is 86
The minimum OverallScore is 63
The most common OverallScore is 77

Result for cluster 6:
The maximum OverallScore is 84
The minimum OverallScore is 51
The most common OverallScore is 60

Result for cluster 7:
The maximum OverallScore is 82
The minimum OverallScore is 61
The most common OverallScore is 75

Result for cluster 8:
The maximum Overal