# Clustering for FIFA Index dataset

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn import metrics
from collections import defaultdict

def most_common(lst):
    """Return the value that occurs most often in ``lst``.

    Parameters
    ----------
    lst : iterable of hashable values
        The observations to tally (lists, tuples, generators, ... all work).

    Returns
    -------
    The most frequent value. When several values share the highest count,
    the one that appears first in ``lst`` is returned, so the result is
    deterministic.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    # Local import: only ``defaultdict`` is imported from collections at file level.
    from collections import Counter

    # One pass over the data instead of calling ``lst.count`` per distinct value.
    counts = Counter(lst)
    if not counts:
        raise ValueError('most_common() arg is an empty sequence')
    return counts.most_common(1)[0][0]

# Part1: Use Kmeans++ to cluster all the players (Only numeric values)

In [2]:
# Load the featured player table; the first CSV column becomes the row index.
csv = pd.read_csv('FIFIndex_BPL_players_featured.csv', index_col=0)
print(csv.shape)
csv.head(10)

(34922, 60)


Unnamed: 0,PlayerID,Name,UpdateDate,Country,IsHomeGrown,OverallScore,PotentialScore,PotentialGrowth(%),Height,Weight,...,FKAcc,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes,GoodAtAttack,GoodBallSense
0,Mesut Özil 10/15/1988 Dec_12_2016,Mesut Özil,Dec_12_2016,Germany,0.0,89,0.864865,0.0,0.5,0.431818,...,0.853659,0.658824,0.816092,0.058824,0.057471,0.144444,0.104651,0.146067,0.0,0.0
1,Harry Kane 07/28/1993 Dec_12_2016,Harry Kane,Dec_12_2016,England,1.0,84,0.891892,0.152174,0.625,0.659091,...,0.695122,0.835294,0.735632,0.152941,0.08046,0.1,0.116279,0.11236,0.0,0.0
2,Coutinho 06/12/1992 Dec_12_2016,Coutinho,Dec_12_2016,Brazil,0.0,85,0.891892,0.130435,0.2,0.25,...,0.853659,0.623529,0.793103,0.152941,0.126437,0.066667,0.093023,0.05618,0.0,0.0
3,Sergio Agüero 06/02/1988 Dec_12_2016,Sergio Agüero,Dec_12_2016,Argentina,0.0,89,0.864865,0.0,0.225,0.386364,...,0.768293,0.811765,0.908046,0.117647,0.137931,0.155556,0.05814,0.146067,1.0,0.0
4,Nemanja Matić 08/01/1988 Dec_12_2016,Nemanja Matić,Dec_12_2016,Serbia,0.0,84,0.72973,0.0,0.775,0.613636,...,0.707317,0.682353,0.781609,0.152941,0.068966,0.155556,0.127907,0.089888,0.0,1.0
5,Vincent Kompany 04/10/1986 Dec_12_2016,Vincent Kompany,Dec_12_2016,Belgium,0.0,85,0.756757,0.0,0.725,0.636364,...,0.52439,0.611765,0.45977,0.082353,0.103448,0.088889,0.046512,0.05618,0.0,0.0
6,Cesc Fàbregas 05/04/1987 Dec_12_2016,Cesc Fàbregas,Dec_12_2016,Spain,0.0,86,0.783784,0.0,0.3,0.386364,...,0.878049,0.811765,0.862069,0.164706,0.057471,0.1,0.081395,0.157303,0.0,0.0
7,Riyad Mahrez 02/21/1991 Dec_12_2016,Riyad Mahrez,Dec_12_2016,Algeria,0.0,84,0.810811,0.086957,0.4,0.113636,...,0.768293,0.717647,0.747126,0.117647,0.16092,0.088889,0.139535,0.05618,0.0,0.0
8,Granit Xhaka 09/27/1992 Dec_12_2016,Granit Xhaka,Dec_12_2016,Switzerland,0.0,84,0.810811,0.086957,0.55,0.568182,...,0.743902,0.564706,0.505747,0.070588,0.068966,0.077778,0.116279,0.134831,0.0,0.0
9,Claudio Bravo 04/13/1983 Dec_12_2016,Claudio Bravo,Dec_12_2016,Chile,0.0,85,0.756757,0.0,0.525,0.522727,...,0.621951,0.141176,0.057471,0.905882,0.942529,0.933333,1.0,0.94382,0.0,0.0


In [3]:
# Ignore all categorical values, and drop OverallScore because it will be used
# as the label when the clusters are inspected.
# select_dtypes is the public equivalent of the private _get_numeric_data();
# 'bool' is listed explicitly so boolean columns are still kept.
numerics = csv.select_dtypes(include=['number', 'bool'])
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
numerics = numerics.drop(columns='OverallScore')
numerics.head(10)

Unnamed: 0,IsHomeGrown,PotentialScore,PotentialGrowth(%),Height,Weight,IsPhysicalAnomaly,Age,NumberOfPositisions,SwitchedTeams,Contract,...,FKAcc,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes,GoodAtAttack,GoodBallSense
0,0.0,0.864865,0.0,0.5,0.431818,0.0,0.478261,0.25,0.0,0.166667,...,0.853659,0.658824,0.816092,0.058824,0.057471,0.144444,0.104651,0.146067,0.0,0.0
1,1.0,0.891892,0.152174,0.625,0.659091,0.0,0.26087,0.0,0.0,0.833333,...,0.695122,0.835294,0.735632,0.152941,0.08046,0.1,0.116279,0.11236,0.0,0.0
2,0.0,0.891892,0.130435,0.2,0.25,1.0,0.304348,0.5,0.0,0.5,...,0.853659,0.623529,0.793103,0.152941,0.126437,0.066667,0.093023,0.05618,0.0,0.0
3,0.0,0.864865,0.0,0.225,0.386364,1.0,0.478261,0.0,0.0,0.5,...,0.768293,0.811765,0.908046,0.117647,0.137931,0.155556,0.05814,0.146067,1.0,0.0
4,0.0,0.72973,0.0,0.775,0.613636,1.0,0.478261,0.25,0.0,0.333333,...,0.707317,0.682353,0.781609,0.152941,0.068966,0.155556,0.127907,0.089888,0.0,1.0
5,0.0,0.756757,0.0,0.725,0.636364,1.0,0.565217,0.0,0.0,0.333333,...,0.52439,0.611765,0.45977,0.082353,0.103448,0.088889,0.046512,0.05618,0.0,0.0
6,0.0,0.783784,0.0,0.3,0.386364,0.0,0.521739,0.25,0.0,0.333333,...,0.878049,0.811765,0.862069,0.164706,0.057471,0.1,0.081395,0.157303,0.0,0.0
7,0.0,0.810811,0.086957,0.4,0.113636,1.0,0.347826,0.25,0.0,0.5,...,0.768293,0.717647,0.747126,0.117647,0.16092,0.088889,0.139535,0.05618,0.0,0.0
8,0.0,0.810811,0.086957,0.55,0.568182,0.0,0.304348,0.0,0.0,0.666667,...,0.743902,0.564706,0.505747,0.070588,0.068966,0.077778,0.116279,0.134831,0.0,0.0
9,0.0,0.756757,0.0,0.525,0.522727,0.0,0.695652,0.0,0.0,0.5,...,0.621951,0.141176,0.057471,0.905882,0.942529,0.933333,1.0,0.94382,0.0,0.0


In [None]:
# K-means++
# Determine number of clusters using Silhouette Score
def kmeans_silhouette_eval(X, max_clusters, random_state=None):
    """Plot the silhouette score of K-means++ for k = 2 .. ``max_clusters``.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``KMeans.fit_predict`` and to
        ``metrics.silhouette_score``.
    max_clusters : int
        Largest number of clusters to evaluate (inclusive).
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default ``None`` keeps the original
        unseeded behaviour; pass an int to get a reproducible curve.

    Returns
    -------
    None. The curve is drawn on the current matplotlib axes.
    """
    # The silhouette score is undefined for a single cluster, so start at k = 2.
    cluster_counts = range(2, max_clusters + 1)
    scores = []
    for n_clusters in cluster_counts:
        kmeans = KMeans(init='k-means++', n_clusters=n_clusters,
                        random_state=random_state)
        labels = kmeans.fit_predict(X)
        scores.append(metrics.silhouette_score(X, labels, metric='euclidean'))
    plt.plot(cluster_counts, scores)
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    
# Determine number of clusters using error function
def kmeans_error_eval(X, max_clusters, random_state=None):
    """Plot the K-means++ inertia for k = 2 .. ``max_clusters`` (elbow plot).

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``KMeans.fit``.
    max_clusters : int
        Largest number of clusters to evaluate (inclusive).
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default ``None`` keeps the original
        unseeded behaviour; pass an int to get a reproducible curve.

    Returns
    -------
    None. The curve is drawn on the current matplotlib axes.
    """
    cluster_counts = range(2, max_clusters + 1)
    errors = []
    for n_clusters in cluster_counts:
        kmeans = KMeans(init='k-means++', n_clusters=n_clusters,
                        random_state=random_state)
        # Only the fitted model's inertia_ is needed; the labels are not used.
        kmeans.fit(X)
        errors.append(kmeans.inertia_)
    plt.plot(cluster_counts, errors)
    plt.xlabel('Number of clusters')
    plt.ylabel('Error')

# Draw each diagnostic curve (silhouette, then inertia) on its own 8x5 figure
# for up to 3 clusters, then render both.
for evaluate in (kmeans_silhouette_eval, kmeans_error_eval):
    plt.figure(figsize=(8, 5))
    evaluate(numerics, 3)
plt.show()

In [51]:
# Use 5 clusters because OverallScore ranges from roughly 40 to 90
# (presumably one cluster per ~10-point band -- TODO confirm the rationale).
k = 5
# n_init=1 runs a single k-means++ initialisation, so the clustering depends
# entirely on the seed; random_state pins it so cluster ids are stable on re-run.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=0)
result_k = kmeans.fit_predict(numerics)

# See what players are in each cluster: label -> list of [PlayerID, OverallScore].
# `numerics` is a column subset of `csv`, so labels and rows line up by position
# and the two columns can be zipped directly (no per-row iloc lookups needed).
cluster_dict_k = defaultdict(list)
for cluster_label, player_id, overall_score in zip(result_k, csv['PlayerID'], csv['OverallScore']):
    cluster_dict_k[cluster_label].append([player_id, overall_score])

cluster_dict_k[0][0:50]

[['Claudio Bravo 04/13/1983 Dec_12_2016', 85],
 ['Thibaut Courtois 05/11/1992 Dec_12_2016', 89],
 ['Hugo Lloris 12/26/1986 Dec_12_2016', 88],
 ['Petr Čech 05/20/1982 Dec_12_2016', 88],
 ['De Gea 11/07/1990 Dec_12_2016', 90],
 ['Loris Karius 06/22/1993 Dec_12_2016', 82],
 ['Asmir Begović 06/20/1987 Dec_12_2016', 83],
 ['Steve Mandanda 03/28/1985 Dec_12_2016', 84],
 ['Victor Valdés 01/14/1982 Dec_12_2016', 82],
 ['Jack Butland 03/10/1993 Dec_12_2016', 82],
 ['Kasper Schmeichel 11/05/1986 Dec_12_2016', 82],
 ['Ron-Robert Zieler 02/12/1989 Dec_12_2016', 81],
 ['Adrián 01/31/1987 Dec_12_2016', 80],
 ['Michel Vorm 10/20/1983 Dec_12_2016', 80],
 ['Łukasz Fabiański 04/18/1985 Dec_12_2016', 80],
 ['Ben Foster 04/03/1983 Dec_12_2016', 80],
 ['Fraser Forster 03/17/1988 Dec_12_2016', 79],
 ['Sergio Romero 02/22/1987 Dec_12_2016', 79],
 ['David Ospina 08/31/1988 Dec_12_2016', 79],
 ['Willy Caballero 09/28/1981 Dec_12_2016', 78],
 ['Simon Mignolet 03/06/1988 Dec_12_2016', 78],
 ['Tom Heaton 04/15/19

In [52]:
# Summarise the OverallScore distribution of every K-means cluster.
# Iterating over the labels actually present avoids hard-coding the number of
# clusters and avoids creating empty entries in the defaultdict.
for i in sorted(cluster_dict_k):
    # Each entry is [PlayerID, OverallScore]; keep only the score.
    score = [item[1] for item in cluster_dict_k[i]]

    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(max(score)))
    print('The minimum OverallScore is ' + str(min(score)))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

Result for cluster 0:
The maximum OverallScore is 90
The minimum OverallScore is 48
The most common OverallScore is 73

Result for cluster 1:
The maximum OverallScore is 88
The minimum OverallScore is 67
The most common OverallScore is 75

Result for cluster 2:
The maximum OverallScore is 73
The minimum OverallScore is 48
The most common OverallScore is 60

Result for cluster 3:
The maximum OverallScore is 90
The minimum OverallScore is 61
The most common OverallScore is 76

Result for cluster 4:
The maximum OverallScore is 86
The minimum OverallScore is 53
The most common OverallScore is 76



In [53]:
# Investigate the most influential features for each cluster:
# for every centroid, order the feature indices from the largest coordinate
# to the smallest.
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Column names of the clustered feature matrix, indexed like the centroid coordinates.
features = numerics.columns

# One row per centroid, so the loop follows the fitted model instead of a literal 5.
for i, ranked_features in enumerate(desc_order_centroids):
    print("Cluster {}:".format(i))
    # Report the three features with the highest centroid values.
    for ind in ranked_features[:3]:
        print(' {}'.format(features[ind]))
    print()

Cluster 0:
 GKDiving
 GKReflexes
 GKPositioning

Cluster 1:
 Stamina
 SlideTackle
 StandTackle

Cluster 2:
 IsHomeGrown
 Acceleration
 SprintSpeed

Cluster 3:
 BallControl
 AttPosition
 Dribbling

Cluster 4:
 SlideTackle
 StandTackle
 Marking



In [50]:
# Many of the variables are correlated, so a PCA may be useful:
# fit two principal components and report how much variance they capture.
pca = PCA(n_components=2).fit(numerics.values)
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio)
print()
print('The first 2 principal components explain:')
print(sum(variance_ratio), 'of total variance')

[ 0.38908821  0.13003069]

The first 2 principal components explain:
0.519118894243 of total variance


# Part2: Use DBSCAN to cluster all the players (Only numeric values)

In [12]:
dbscan = DBSCAN(eps=1)
dbscan.fit(numerics)
# `np.int` was removed in NumPy 1.24; the builtin `int` is the replacement.
result_d = dbscan.labels_.astype(int)

# See what players are in each cluster: label -> list of [PlayerID, OverallScore].
# `numerics` is a column subset of `csv`, so labels and rows line up by position
# and the two columns can be zipped directly (no per-row iloc lookups needed).
cluster_dict_d = defaultdict(list)
for cluster_label, player_id, overall_score in zip(result_d, csv['PlayerID'], csv['OverallScore']):
    cluster_dict_d[cluster_label].append([player_id, overall_score])

cluster_dict_d[0][0:50]

[['Mesut Özil 10/15/1988 Dec_12_2016', 89],
 ['Harry Kane 07/28/1993 Dec_12_2016', 84],
 ['Cesc Fàbregas 05/04/1987 Dec_12_2016', 86],
 ['Granit Xhaka 09/27/1992 Dec_12_2016', 84],
 ['Laurent Koscielny 09/10/1985 Dec_12_2016', 85],
 ['Kevin De Bruyne 06/28/1991 Dec_12_2016', 88],
 ['Islam Slimani 06/18/1988 Dec_12_2016', 83],
 ['José Fonte 12/22/1983 Dec_12_2016', 83],
 ['Christian Eriksen 02/14/1992 Dec_12_2016', 84],
 ['Oscar 09/09/1991 Dec_12_2016', 83],
 ['Azpilicueta 08/28/1989 Dec_12_2016', 84],
 ['Roberto Pereyra 01/07/1991 Dec_12_2016', 81],
 ['Pablo Zabaleta 01/16/1985 Dec_12_2016', 82],
 ['Lucas Pérez 09/10/1988 Dec_12_2016', 81],
 ['Roberto Firmino 10/02/1991 Dec_12_2016', 82],
 ['Mamadou Sakho 02/13/1990 Dec_12_2016', 82],
 ['Gylfi Sigurðsson 09/09/1989 Dec_12_2016', 82],
 ['Bacary Sagna 02/14/1983 Dec_12_2016', 82],
 ['Deulofeu 03/13/1994 Dec_12_2016', 81],
 ['Daley Blind 03/09/1990 Dec_12_2016', 81],
 ['Nathaniel Clyne 04/05/1991 Dec_12_2016', 81],
 ['Ross Barkley 12/05/1

In [13]:
# Summarise the OverallScore distribution of every DBSCAN cluster.
# DBSCAN assigns the label -1 to noise points, so that key is skipped; all
# remaining labels are reported (deriving the range from len(...) - 2 dropped
# the highest-numbered cluster). Empty entries, which a defaultdict lookup on
# a missing label can create, are skipped as well.
dbscan_labels = sorted(
    label for label, members in cluster_dict_d.items() if label != -1 and members
)
for i in dbscan_labels:
    # Each entry is [PlayerID, OverallScore]; keep only the score.
    score = [item[1] for item in cluster_dict_d[i]]

    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(max(score)))
    print('The minimum OverallScore is ' + str(min(score)))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

Result for cluster 0:
The maximum OverallScore is 89
The minimum OverallScore is 48
The most common OverallScore is 77

Result for cluster 1:
The maximum OverallScore is 89
The minimum OverallScore is 51
The most common OverallScore is 76

Result for cluster 2:
The maximum OverallScore is 90
The minimum OverallScore is 65
The most common OverallScore is 77

Result for cluster 3:
The maximum OverallScore is 84
The minimum OverallScore is 76
The most common OverallScore is 78

Result for cluster 4:
The maximum OverallScore is 90
The minimum OverallScore is 54
The most common OverallScore is 75

Result for cluster 5:
The maximum OverallScore is 86
The minimum OverallScore is 70
The most common OverallScore is 76

Result for cluster 6:
The maximum OverallScore is 88
The minimum OverallScore is 75
The most common OverallScore is 88

Result for cluster 7:
The maximum OverallScore is 89
The minimum OverallScore is 50
The most common OverallScore is 73

Result for cluster 8:
The maximum Overal

# Part3: Use Hierarchical clustering for all the players (Only numeric values)

In [None]:
# Agglomerative (hierarchical) clustering with the same number of clusters `k`
# that was used for the K-means run.
HC = AgglomerativeClustering(n_clusters=k)
result_h = HC.fit_predict(numerics)

# See what players are in each cluster: label -> list of [PlayerID, OverallScore].
# `numerics` is a column subset of `csv`, so labels and rows line up by position
# and the two columns can be zipped directly (no per-row iloc lookups needed).
cluster_dict_h = defaultdict(list)
for cluster_label, player_id, overall_score in zip(result_h, csv['PlayerID'], csv['OverallScore']):
    cluster_dict_h[cluster_label].append([player_id, overall_score])

cluster_dict_h[0][0:50]

In [None]:
# Summarise the OverallScore distribution of every hierarchical cluster.
# Iterating over the labels actually present keeps this in step with the
# n_clusters used for the model instead of a hard-coded 5, and avoids creating
# empty entries in the defaultdict.
for i in sorted(cluster_dict_h):
    # Each entry is [PlayerID, OverallScore]; keep only the score.
    score = [item[1] for item in cluster_dict_h[i]]

    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(max(score)))
    print('The minimum OverallScore is ' + str(min(score)))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')