# Clustering FIFAIndex players separated by position

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn import metrics
from collections import defaultdict

def most_common(lst):
    """Return the value that occurs most often in ``lst``.

    Parameters
    ----------
    lst : iterable of hashable values
        The values to inspect (e.g. a list of OverallScore integers).

    Returns
    -------
    The most frequent value. When several values share the highest count,
    the one that appears first in ``lst`` is returned, so the result is
    deterministic.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    # Count every value in a single pass instead of calling lst.count()
    # once per distinct value.
    counts = {}
    for value in lst:
        counts[value] = counts.get(value, 0) + 1
    # dicts keep insertion order and max() returns the first maximal key,
    # so ties resolve to the earliest value in lst.
    return max(counts, key=counts.get)

In [2]:
# Load the player feature table; the first CSV column becomes the row index.
# NOTE(review): the name `csv` would shadow the standard-library `csv` module
# if it were ever imported in this notebook.
csv = pd.read_csv('FIFIndex_BPL_players_featured.csv', sep=',', index_col=0)
print(csv.shape)
csv.head(10)

(34922, 60)


Unnamed: 0,PlayerID,Name,UpdateDate,Country,IsHomeGrown,OverallScore,PotentialScore,PotentialGrowth(%),Height,Weight,...,FKAcc,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes,GoodAtAttack,GoodBallSense
0,Mesut Özil 10/15/1988,Mesut Özil,Dec_12_2016,Germany,0,89,0.864865,0.0,0.5,0.431818,...,0.853659,0.658824,0.816092,0.058824,0.057471,0.144444,0.104651,0.146067,0,0
1,Harry Kane 07/28/1993,Harry Kane,Dec_12_2016,England,1,84,0.891892,0.152174,0.625,0.659091,...,0.695122,0.835294,0.735632,0.152941,0.08046,0.1,0.116279,0.11236,1,0
2,Coutinho 06/12/1992,Coutinho,Dec_12_2016,Brazil,0,85,0.891892,0.130435,0.2,0.25,...,0.853659,0.623529,0.793103,0.152941,0.126437,0.066667,0.093023,0.05618,1,0
3,Sergio Agüero 06/02/1988,Sergio Agüero,Dec_12_2016,Argentina,0,89,0.864865,0.0,0.225,0.386364,...,0.768293,0.811765,0.908046,0.117647,0.137931,0.155556,0.05814,0.146067,1,0
4,Nemanja Matić 08/01/1988,Nemanja Matić,Dec_12_2016,Serbia,0,84,0.72973,0.0,0.775,0.613636,...,0.707317,0.682353,0.781609,0.152941,0.068966,0.155556,0.127907,0.089888,0,1
5,Vincent Kompany 04/10/1986,Vincent Kompany,Dec_12_2016,Belgium,0,85,0.756757,0.0,0.725,0.636364,...,0.52439,0.611765,0.45977,0.082353,0.103448,0.088889,0.046512,0.05618,0,0
6,Cesc Fàbregas 05/04/1987,Cesc Fàbregas,Dec_12_2016,Spain,0,86,0.783784,0.0,0.3,0.386364,...,0.878049,0.811765,0.862069,0.164706,0.057471,0.1,0.081395,0.157303,0,1
7,Riyad Mahrez 02/21/1991,Riyad Mahrez,Dec_12_2016,Algeria,0,84,0.810811,0.086957,0.4,0.113636,...,0.768293,0.717647,0.747126,0.117647,0.16092,0.088889,0.139535,0.05618,0,0
8,Granit Xhaka 09/27/1992,Granit Xhaka,Dec_12_2016,Switzerland,0,84,0.810811,0.086957,0.55,0.568182,...,0.743902,0.564706,0.505747,0.070588,0.068966,0.077778,0.116279,0.134831,0,0
9,Claudio Bravo 04/13/1983,Claudio Bravo,Dec_12_2016,Chile,0,85,0.756757,0.0,0.525,0.522727,...,0.621951,0.141176,0.057471,0.905882,0.942529,0.933333,1.0,0.94382,0,0


# Part 1: Separate goalkeepers from the dataset and cluster them with k-means++

In [21]:
# Keep only the rows whose PreferredPositions value is exactly 'GK'.
gkcsv = csv[csv.PreferredPositions == 'GK']
# Feature matrix for clustering: every numeric/boolean column except
# OverallScore, which later cells use only to describe the clusters.
# select_dtypes is the public replacement for the private _get_numeric_data().
numeric = gkcsv.select_dtypes(include=['number', 'bool']).drop('OverallScore', axis=1)
print(gkcsv.shape)

(4088, 60)


In [22]:
# Choose number of clusters
k = 5
# A fixed random_state makes the cluster labels reproducible across kernel
# restarts; with n_init=1 a single random k-means++ initialisation decides
# the whole result.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=42)
result_k = kmeans.fit_predict(numeric)

# See what players are in each cluster.
# `numeric` was derived from `gkcsv` without reordering, so the i-th label in
# result_k belongs to the i-th row of gkcsv. The identifying columns are paired
# with the labels positionally, which does not depend on the index labels of
# the full `csv` frame and avoids building a row Series per lookup.
info_columns = ['PlayerID', 'UpdateDate', 'OverallScore', 'PreferredPositions']
cluster_dict_k = defaultdict(list)
for cluster_label, player_info in zip(result_k, gkcsv[info_columns].values.tolist()):
    cluster_dict_k[cluster_label].append(player_info)

cluster_dict_k[0][0:10]

[['Pau López 12/13/1994', 'Dec_12_2016', 73, 'GK'],
 ['Mark Birighitti 04/17/1991', 'Dec_12_2016', 68, 'GK'],
 ['Giedrius Arlauskis 12/01/1987', 'Dec_12_2016', 69, 'GK'],
 ['Daniel Bachmann 07/09/1994', 'Dec_12_2016', 62, 'GK'],
 ['Ian Lawlor 10/27/1994', 'Dec_12_2016', 61, 'GK'],
 ['Maksymilian Stryjek 07/18/1996', 'Dec_12_2016', 58, 'GK'],
 ['Daniel Iversen 07/19/1997', 'Dec_12_2016', 56, 'GK'],
 ['Jordan Holmes 05/01/1997', 'Dec_12_2016', 54, 'GK'],
 ['Conor Mitchell 05/09/1996', 'Dec_12_2016', 54, 'GK'],
 ['Nathan Gartside 03/08/1998', 'Dec_12_2016', 54, 'GK']]

In [23]:
# Summarise the OverallScore spread inside every cluster.
# Each stored player record is [PlayerID, UpdateDate, OverallScore,
# PreferredPositions], so position 2 holds the score.
for cluster_id in range(k):
    scores = [record[2] for record in cluster_dict_k[cluster_id]]

    lowest, highest = min(scores), max(scores)

    print('Result for cluster ', cluster_id, ':', sep='')
    print('The maximum OverallScore is ', highest, sep='')
    print('The minimum OverallScore is ', lowest, sep='')
    print('The most common OverallScore is ', most_common(scores), sep='')
    print('')

Result for cluster 0:
The maximum OverallScore is 73
The minimum OverallScore is 54
The most common OverallScore is 54

Result for cluster 1:
The maximum OverallScore is 89
The minimum OverallScore is 59
The most common OverallScore is 73

Result for cluster 2:
The maximum OverallScore is 90
The minimum OverallScore is 68
The most common OverallScore is 75

Result for cluster 3:
The maximum OverallScore is 68
The minimum OverallScore is 48
The most common OverallScore is 50

Result for cluster 4:
The maximum OverallScore is 84
The minimum OverallScore is 64
The most common OverallScore is 76



In [25]:
# For every centroid, order the feature positions from the largest to the
# smallest centroid coordinate.
# NOTE(review): this ranks by raw centroid value, so features on a larger
# scale (or 0/1 flags) can dominate -- confirm all features share a scale.
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Column labels of the clustered feature matrix
features = numeric.axes[1]

for cluster_id in range(k):
    top_three = desc_order_centroids[cluster_id, :3]
    print("Cluster {}:".format(cluster_id))
    for feature_pos in top_three:
        print(' {}'.format(features[feature_pos]))
    print()

Cluster 0:
 GKReflexes
 GKDiving
 GKPositioning

Cluster 1:
 IsPhysicalAnomaly
 GKDiving
 GKReflexes

Cluster 2:
 GKReflexes
 GKDiving
 GKPositioning

Cluster 3:
 IsHomeGrown
 IsPhysicalAnomaly
 Height

Cluster 4:
 IsHomeGrown
 GKPositioning
 GKReflexes



# Part 2: Separate defenders from the dataset and cluster them with k-means++

In [6]:
# Position labels treated as defenders. A player is selected when the
# PreferredPositions string matches one of these entries exactly.
# (Each label is listed once; a repeated entry has no effect on isin().)
defend = [
    'CB', 'RB', 'LCB', 'LB',
    'LB/CB', 'CB/RB', 'LB/RB', 'RB/RM', 'CB/CDM',
    'LB/CB/RB', 'LB/CB/CDM', 'CB/RB/CDM', 'RW/RB/RWB', 'LW/LB/LWB',
    'RW/RB/RM/RWB', 'RW/CB/RB/RWB',
]
defendcsv = csv[csv.PreferredPositions.isin(defend)]
print(defendcsv.shape)

(11328, 60)


In [7]:
# Feature matrix for clustering the defenders: every numeric/boolean column
# except OverallScore. select_dtypes is the public replacement for the private
# _get_numeric_data().
numeric = defendcsv.select_dtypes(include=['number', 'bool']).drop('OverallScore', axis=1)

In [8]:
# Cluster the defenders, reusing the number of clusters `k` chosen in Part 1.
# A fixed random_state makes the cluster labels reproducible across kernel
# restarts; with n_init=1 a single random k-means++ initialisation decides
# the whole result.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=42)
result_k = kmeans.fit_predict(numeric)

# See what players are in each cluster.
# `numeric` was derived from `defendcsv` without reordering, so the i-th label
# in result_k belongs to the i-th row of defendcsv. The identifying columns are
# paired with the labels positionally, which does not depend on the index
# labels of the full `csv` frame and avoids building a row Series per lookup.
info_columns = ['PlayerID', 'UpdateDate', 'OverallScore', 'PreferredPositions']
cluster_dict_k = defaultdict(list)
for cluster_label, player_info in zip(result_k, defendcsv[info_columns].values.tolist()):
    cluster_dict_k[cluster_label].append(player_info)

cluster_dict_k[0][0:10]

[['Laurent Koscielny 09/10/1985', 'Dec_12_2016', 85, 'CB'],
 ['Nicolás Otamendi 02/12/1988', 'Dec_12_2016', 84, 'CB'],
 ['José Fonte 12/22/1983', 'Dec_12_2016', 83, 'CB'],
 ['Eric Bailly 04/12/1994', 'Dec_12_2016', 82, 'CB'],
 ['Azpilicueta 08/28/1989', 'Dec_12_2016', 84, 'LB/CB/RB'],
 ['Shkodran Mustafi 04/17/1992', 'Dec_12_2016', 83, 'CB'],
 ['David Luiz 04/22/1987', 'Dec_12_2016', 84, 'CB'],
 ['Pablo Zabaleta 01/16/1985', 'Dec_12_2016', 82, 'RB'],
 ['Mamadou Sakho 02/13/1990', 'Dec_12_2016', 82, 'CB'],
 ['Séamus Coleman 10/11/1988', 'Dec_12_2016', 82, 'RB']]

In [9]:
# Summarise the OverallScore spread inside every defender cluster.
# Each stored player record is [PlayerID, UpdateDate, OverallScore,
# PreferredPositions], so position 2 holds the score.
for cluster_id in range(k):
    scores = [record[2] for record in cluster_dict_k[cluster_id]]

    lowest, highest = min(scores), max(scores)

    print('Result for cluster ', cluster_id, ':', sep='')
    print('The maximum OverallScore is ', highest, sep='')
    print('The minimum OverallScore is ', lowest, sep='')
    print('The most common OverallScore is ', most_common(scores), sep='')
    print('')

Result for cluster 0:
The maximum OverallScore is 85
The minimum OverallScore is 60
The most common OverallScore is 75

Result for cluster 1:
The maximum OverallScore is 83
The minimum OverallScore is 67
The most common OverallScore is 73

Result for cluster 2:
The maximum OverallScore is 68
The minimum OverallScore is 51
The most common OverallScore is 59

Result for cluster 3:
The maximum OverallScore is 86
The minimum OverallScore is 69
The most common OverallScore is 76

Result for cluster 4:
The maximum OverallScore is 77
The minimum OverallScore is 48
The most common OverallScore is 58



In [11]:
# For every centroid, order the feature positions from the largest to the
# smallest centroid coordinate.
# NOTE(review): this ranks by raw centroid value, so features on a larger
# scale (or 0/1 flags) can dominate -- confirm all features share a scale.
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Column labels of the clustered feature matrix
features = numeric.axes[1]

for cluster_id in range(k):
    top_three = desc_order_centroids[cluster_id, :3]
    print("Cluster {}:".format(cluster_id))
    for feature_pos in top_three:
        print(' {}'.format(features[feature_pos]))
    print()

Cluster 0:
 SlideTackle
 Marking
 StandTackle

Cluster 1:
 IsHomeGrown
 SlideTackle
 Marking

Cluster 2:
 IsPhysicalAnomaly
 IsHomeGrown
 SlideTackle

Cluster 3:
 IsPhysicalAnomaly
 Marking
 SlideTackle

Cluster 4:
 IsHomeGrown
 SlideTackle
 StandTackle



# Part 3: Separate midfielders from the dataset and cluster them with k-means++

In [12]:
# Position labels treated as midfielders. A player is selected when the
# PreferredPositions string matches one of these entries exactly.
# (Each label is listed once; a repeated entry has no effect on isin().)
# NOTE(review): 'LW' is also listed in `offend` (Part 4), so players whose
# only position is 'LW' are clustered in both groups -- confirm whether 'LM'
# was intended here.
midf = [
    'LW', 'LDM', 'LAM', 'LCM', 'CAM', 'CDM', 'CM', 'RM',
    'LW/CAM', 'LM/RM', 'CAM/CM', 'CDM/CM', 'LM/CM',
    'LM/ST/RM', 'CB/CDM/CM', 'CDM/RM/CM', 'LM/CAM/RM', 'LW/LM/CAM',
    'LW/LM/RW', 'CDM/CAM/CM', 'LM/CDM/CAM',
    'LM/CDM/CAM/CM', 'LM/CDM/RM/CM', 'LM/CAM/RM/CM', 'LM/RW/CAM/RM',
]
midfcsv = csv[csv.PreferredPositions.isin(midf)]
print(midfcsv.shape)

(9588, 60)


In [13]:
# Feature matrix for clustering the midfielders: every numeric/boolean column
# except OverallScore. select_dtypes is the public replacement for the private
# _get_numeric_data().
numeric = midfcsv.select_dtypes(include=['number', 'bool']).drop('OverallScore', axis=1)

In [14]:
# Cluster the midfielders, reusing the number of clusters `k` chosen in Part 1.
# A fixed random_state makes the cluster labels reproducible across kernel
# restarts; with n_init=1 a single random k-means++ initialisation decides
# the whole result.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=42)
result_k = kmeans.fit_predict(numeric)

# See what players are in each cluster.
# `numeric` was derived from `midfcsv` without reordering, so the i-th label
# in result_k belongs to the i-th row of midfcsv. The identifying columns are
# paired with the labels positionally, which does not depend on the index
# labels of the full `csv` frame and avoids building a row Series per lookup.
info_columns = ['PlayerID', 'UpdateDate', 'OverallScore', 'PreferredPositions']
cluster_dict_k = defaultdict(list)
for cluster_label, player_info in zip(result_k, midfcsv[info_columns].values.tolist()):
    cluster_dict_k[cluster_label].append(player_info)

cluster_dict_k[0][0:10]

[['Lewis Cook 02/03/1997', 'Dec_12_2016', 71, 'CDM/CAM/CM'],
 ['Adam Forshaw 10/08/1991', 'Dec_12_2016', 71, 'CDM/CM'],
 ['Harrison Reed 01/27/1995', 'Dec_12_2016', 68, 'CDM'],
 ['Emerson Hyndman 04/09/1996', 'Dec_12_2016', 65, 'CM'],
 ['Brahim Díaz 03/03/1999', 'Dec_12_2016', 64, 'CAM'],
 ['Jake Hesketh 03/27/1996', 'Dec_12_2016', 63, 'CAM'],
 ['Oliver Shenton 11/06/1997', 'Dec_12_2016', 62, 'CM'],
 ['Aleix García 06/28/1997', 'Dec_12_2016', 63, 'CDM/CM'],
 ['Joe Ward 10/24/1996', 'Dec_12_2016', 62, 'CM'],
 ['Will Patching 10/18/1998', 'Dec_12_2016', 60, 'CM']]

In [14]:
# Summarise the OverallScore spread inside every midfielder cluster.
# Each stored player record is [PlayerID, UpdateDate, OverallScore,
# PreferredPositions], so position 2 holds the score.
for cluster_id in range(k):
    scores = [record[2] for record in cluster_dict_k[cluster_id]]

    lowest, highest = min(scores), max(scores)

    print('Result for cluster ', cluster_id, ':', sep='')
    print('The maximum OverallScore is ', highest, sep='')
    print('The minimum OverallScore is ', lowest, sep='')
    print('The most common OverallScore is ', most_common(scores), sep='')
    print('')

Result for cluster 0:
The maximum OverallScore is 80
The minimum OverallScore is 75
The most common OverallScore is 78

Result for cluster 1:
The maximum OverallScore is 67
The minimum OverallScore is 60
The most common OverallScore is 62

Result for cluster 2:
The maximum OverallScore is 74
The minimum OverallScore is 68
The most common OverallScore is 72

Result for cluster 3:
The maximum OverallScore is 89
The minimum OverallScore is 81
The most common OverallScore is 81

Result for cluster 4:
The maximum OverallScore is 59
The minimum OverallScore is 48
The most common OverallScore is 58



In [15]:
# For every centroid, order the feature positions from the largest to the
# smallest centroid coordinate.
# NOTE(review): this ranks by raw centroid value, so features on a larger
# scale (or 0/1 flags) can dominate -- confirm all features share a scale.
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Column labels of the clustered feature matrix
features = numeric.axes[1]

for cluster_id in range(k):
    top_three = desc_order_centroids[cluster_id, :3]
    print("Cluster {}:".format(cluster_id))
    for feature_pos in top_three:
        print(' {}'.format(features[feature_pos]))
    print()

Cluster 0:
 IsPhysicalAnomaly
 Balance
 ShortPass

Cluster 1:
 ShortPass
 Stamina
 BallControl

Cluster 2:
 Acceleration
 ShortPass
 IsHomeGrown

Cluster 3:
 GoodBallSense
 ShortPass
 Stamina

Cluster 4:
 BallControl
 Dribbling
 Acceleration



# Part 4: Separate attackers from the dataset and cluster them with k-means++

In [16]:
# Position labels treated as attackers. A player is selected when the
# PreferredPositions string matches one of these entries exactly.
# NOTE(review): 'LW' is also listed in `midf` (Part 3), so players whose only
# position is 'LW' are clustered in both groups -- confirm this is intended.
offend = ['ST', 'LS', 'RS', 'LW', 'RW', 'LM/ST', 'RW/RM']
offendcsv = csv.loc[csv['PreferredPositions'].isin(offend)]
print(offendcsv.shape)

(4538, 60)


In [17]:
# Feature matrix for clustering the attackers: every numeric/boolean column
# except OverallScore. select_dtypes is the public replacement for the private
# _get_numeric_data().
numeric = offendcsv.select_dtypes(include=['number', 'bool']).drop('OverallScore', axis=1)

In [18]:
# Cluster the attackers, reusing the number of clusters `k` chosen in Part 1.
# A fixed random_state makes the cluster labels reproducible across kernel
# restarts; with n_init=1 a single random k-means++ initialisation decides
# the whole result.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=42)
result_k = kmeans.fit_predict(numeric)

# See what players are in each cluster.
# `numeric` was derived from `offendcsv` without reordering, so the i-th label
# in result_k belongs to the i-th row of offendcsv. The identifying columns are
# paired with the labels positionally, which does not depend on the index
# labels of the full `csv` frame and avoids building a row Series per lookup.
info_columns = ['PlayerID', 'UpdateDate', 'OverallScore', 'PreferredPositions']
cluster_dict_k = defaultdict(list)
for cluster_label, player_info in zip(result_k, offendcsv[info_columns].values.tolist()):
    cluster_dict_k[cluster_label].append(player_info)

cluster_dict_k[0][0:10]

[['Raheem Sterling 12/08/1994', 'Dec_12_2016', 82, 'LM/ST'],
 ['Jermain Defoe 10/07/1982', 'Dec_12_2016', 80, 'ST'],
 ['Nathan Dyer 11/29/1987', 'Dec_12_2016', 77, 'RW/RM'],
 ['Saido Berahino 08/04/1993', 'Dec_12_2016', 76, 'ST'],
 ['Connor Wickham 03/31/1993', 'Dec_12_2016', 74, 'ST'],
 ['Fraizer Campbell 09/13/1987', 'Dec_12_2016', 71, 'ST'],
 ['Will Keane 01/11/1993', 'Dec_12_2016', 67, 'ST'],
 ['Dominic Calvert-Lewin 03/16/1997', 'Dec_12_2016', 63, 'LM/ST'],
 ['Shayon Harrison 07/13/1997', 'Dec_12_2016', 60, 'ST'],
 ['Jerome Sinclair 09/20/1996', 'Dec_12_2016', 59, 'ST']]

In [19]:
# Summarise the OverallScore spread inside every attacker cluster.
# Each stored player record is [PlayerID, UpdateDate, OverallScore,
# PreferredPositions], so position 2 holds the score.
for cluster_id in range(k):
    scores = [record[2] for record in cluster_dict_k[cluster_id]]

    lowest, highest = min(scores), max(scores)

    print('Result for cluster ', cluster_id, ':', sep='')
    print('The maximum OverallScore is ', highest, sep='')
    print('The minimum OverallScore is ', lowest, sep='')
    print('The most common OverallScore is ', most_common(scores), sep='')
    print('')

Result for cluster 0:
The maximum OverallScore is 82
The minimum OverallScore is 53
The most common OverallScore is 76

Result for cluster 1:
The maximum OverallScore is 90
The minimum OverallScore is 72
The most common OverallScore is 81

Result for cluster 2:
The maximum OverallScore is 76
The minimum OverallScore is 51
The most common OverallScore is 57

Result for cluster 3:
The maximum OverallScore is 69
The minimum OverallScore is 53
The most common OverallScore is 62

Result for cluster 4:
The maximum OverallScore is 83
The minimum OverallScore is 67
The most common OverallScore is 74



In [20]:
# For every centroid, order the feature positions from the largest to the
# smallest centroid coordinate.
# NOTE(review): this ranks by raw centroid value, so features on a larger
# scale (or 0/1 flags) can dominate -- confirm all features share a scale.
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Column labels of the clustered feature matrix
features = numeric.axes[1]

for cluster_id in range(k):
    top_three = desc_order_centroids[cluster_id, :3]
    print("Cluster {}:".format(cluster_id))
    for feature_pos in top_three:
        print(' {}'.format(features[feature_pos]))
    print()

Cluster 0:
 IsPhysicalAnomaly
 IsHomeGrown
 Acceleration

Cluster 1:
 GoodAtAttack
 Finishing
 AttPosition

Cluster 2:
 IsHomeGrown
 Acceleration
 SprintSpeed

Cluster 3:
 Acceleration
 SprintSpeed
 Finishing

Cluster 4:
 AttPosition
 Finishing
 BallControl

