# Clustering based on the seperated FIFAIndex player by positions

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn import metrics
from collections import defaultdict

def most_common(lst):
    return max(set(lst), key=lst.count)

In [2]:
csv = pd.read_table('FIFIndex_BPL_players_featured.csv', sep=',', index_col = 0)
print(csv.shape)
csv.head(10)

(34922, 60)


Unnamed: 0,PlayerID,Name,UpdateDate,Country,IsHomeGrown,OverallScore,PotentialScore,PotentialGrowth(%),Height,Weight,...,FKAcc,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes,GoodAtAttack,GoodBallSense
0,Mesut Özil 10/15/1988,Mesut Özil,Dec_12_2016,Germany,0,89,0.864865,0.0,0.5,0.431818,...,0.853659,0.658824,0.816092,0.058824,0.057471,0.144444,0.104651,0.146067,0,0
1,Harry Kane 07/28/1993,Harry Kane,Dec_12_2016,England,1,84,0.891892,0.152174,0.625,0.659091,...,0.695122,0.835294,0.735632,0.152941,0.08046,0.1,0.116279,0.11236,1,0
2,Coutinho 06/12/1992,Coutinho,Dec_12_2016,Brazil,0,85,0.891892,0.130435,0.2,0.25,...,0.853659,0.623529,0.793103,0.152941,0.126437,0.066667,0.093023,0.05618,1,0
3,Sergio Agüero 06/02/1988,Sergio Agüero,Dec_12_2016,Argentina,0,89,0.864865,0.0,0.225,0.386364,...,0.768293,0.811765,0.908046,0.117647,0.137931,0.155556,0.05814,0.146067,1,0
4,Nemanja Matić 08/01/1988,Nemanja Matić,Dec_12_2016,Serbia,0,84,0.72973,0.0,0.775,0.613636,...,0.707317,0.682353,0.781609,0.152941,0.068966,0.155556,0.127907,0.089888,0,1
5,Vincent Kompany 04/10/1986,Vincent Kompany,Dec_12_2016,Belgium,0,85,0.756757,0.0,0.725,0.636364,...,0.52439,0.611765,0.45977,0.082353,0.103448,0.088889,0.046512,0.05618,0,0
6,Cesc Fàbregas 05/04/1987,Cesc Fàbregas,Dec_12_2016,Spain,0,86,0.783784,0.0,0.3,0.386364,...,0.878049,0.811765,0.862069,0.164706,0.057471,0.1,0.081395,0.157303,0,1
7,Riyad Mahrez 02/21/1991,Riyad Mahrez,Dec_12_2016,Algeria,0,84,0.810811,0.086957,0.4,0.113636,...,0.768293,0.717647,0.747126,0.117647,0.16092,0.088889,0.139535,0.05618,0,0
8,Granit Xhaka 09/27/1992,Granit Xhaka,Dec_12_2016,Switzerland,0,84,0.810811,0.086957,0.55,0.568182,...,0.743902,0.564706,0.505747,0.070588,0.068966,0.077778,0.116279,0.134831,0,0
9,Claudio Bravo 04/13/1983,Claudio Bravo,Dec_12_2016,Chile,0,85,0.756757,0.0,0.525,0.522727,...,0.621951,0.141176,0.057471,0.905882,0.942529,0.933333,1.0,0.94382,0,0


# Part1: Seperate GoalKeeper from the dataset and use Kmeans++ to cluster

In [3]:
gkcsv = csv[csv.PreferredPositions == 'GK']
numeric = gkcsv._get_numeric_data()
print(gkcsv.shape)

(4088, 60)


In [4]:
#Choose number of clusters
k = 5
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
result_k = kmeans.fit_predict(numeric)

# See what players are in each cluster
cluster_dict_k = defaultdict(list)
for cluster_label, ind in zip(result_k, range(len(result_k))):
    cluster_dict_k[cluster_label].append([gkcsv.iloc[csv.axes[0][ind]]['PlayerID'], gkcsv.iloc[csv.axes[0][ind]]['UpdateDate'], gkcsv.iloc[csv.axes[0][ind]]['OverallScore'], gkcsv.iloc[csv.axes[0][ind]]['PreferredPositions']])

cluster_dict_k[0][0:10]

[['Loris Karius 06/22/1993', 'Dec_12_2016', 82, 'GK'],
 ['Asmir Begović 06/20/1987', 'Dec_12_2016', 83, 'GK'],
 ['Victor Valdés 01/14/1982', 'Dec_12_2016', 82, 'GK'],
 ['Jack Butland 03/10/1993', 'Dec_12_2016', 82, 'GK'],
 ['Kasper Schmeichel 11/05/1986', 'Dec_12_2016', 82, 'GK'],
 ['Ron-Robert Zieler 02/12/1989', 'Dec_12_2016', 81, 'GK'],
 ['Adrián 01/31/1987', 'Dec_12_2016', 80, 'GK'],
 ['Michel Vorm 10/20/1983', 'Dec_12_2016', 80, 'GK'],
 ['Łukasz Fabiański 04/18/1985', 'Dec_12_2016', 80, 'GK'],
 ['Ben Foster 04/03/1983', 'Dec_12_2016', 80, 'GK']]

In [5]:
for i in range(k):
    score = []
    
    for item in cluster_dict_k[i]:
        score.append(item[2])
    
    minScore = min(score)
    maxScore = max(score)
    
    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(maxScore))
    print('The minimum OverallScore is ' + str(minScore))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

Result for cluster 0:
The maximum OverallScore is 83
The minimum OverallScore is 77
The most common OverallScore is 80

Result for cluster 1:
The maximum OverallScore is 59
The minimum OverallScore is 48
The most common OverallScore is 54

Result for cluster 2:
The maximum OverallScore is 76
The minimum OverallScore is 70
The most common OverallScore is 73

Result for cluster 3:
The maximum OverallScore is 69
The minimum OverallScore is 60
The most common OverallScore is 68

Result for cluster 4:
The maximum OverallScore is 90
The minimum OverallScore is 84
The most common OverallScore is 88



In [8]:
# Investigate the most influential features for each cluster
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

#
features = numeric.axes[1]

for i in range(k):
    print("Cluster {}:".format(i))
    for ind in desc_order_centroids[i, 1:4]:
        print(' {}'.format(features[ind]))
    print()

Cluster 0:
 GKReflexes
 GKDiving
 GKPositioning

Cluster 1:
 Height
 GKDiving
 GKReflexes

Cluster 2:
 GKPositioning
 GKDiving
 GKReflexes

Cluster 3:
 GKDiving
 GKReflexes
 GKPositioning

Cluster 4:
 GKReflexes
 GKDiving
 GKPositioning



# Part2: Seperate Defender from the dataset and use Kmeans++ to cluster

In [9]:
defend = ['CB', 'RB', 'LCB', 'RB', 'LB', 'LB/CB', 'CB/RB', 'LB/RB', 'RB/RM', 'CB/CDM', 'LB/CB/RB', 'LB/CB/CDM', 'CB/RB/CDM', 'RW/RB/RWB', 'LW/LB/LWB', 'RW/RB/RM/RWB', 'RW/CB/RB/RWB']
defendcsv = csv[csv.PreferredPositions.isin(defend)]
print(defendcsv.shape)

(11328, 60)


In [10]:
numeric = defendcsv._get_numeric_data()

In [11]:
#Choose number of clusters
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
result_k = kmeans.fit_predict(numeric)

# See what players are in each cluster
cluster_dict_k = defaultdict(list)
for cluster_label, ind in zip(result_k, range(len(result_k))):
    cluster_dict_k[cluster_label].append([defendcsv.iloc[csv.axes[0][ind]]['PlayerID'], defendcsv.iloc[csv.axes[0][ind]]['UpdateDate'], defendcsv.iloc[csv.axes[0][ind]]['OverallScore'], defendcsv.iloc[csv.axes[0][ind]]['PreferredPositions']])

cluster_dict_k[0][0:10]

[['Vincent Kompany 04/10/1986', 'Dec_12_2016', 85, 'CB'],
 ['Toby Alderweireld 03/02/1989', 'Dec_12_2016', 85, 'CB'],
 ['Laurent Koscielny 09/10/1985', 'Dec_12_2016', 85, 'CB'],
 ['Nicolás Otamendi 02/12/1988', 'Dec_12_2016', 84, 'CB'],
 ['Jan Vertonghen 04/24/1987', 'Dec_12_2016', 83, 'CB'],
 ['José Fonte 12/22/1983', 'Dec_12_2016', 83, 'CB'],
 ['Joel Matip 08/08/1991', 'Dec_12_2016', 83, 'CB'],
 ['Eric Bailly 04/12/1994', 'Dec_12_2016', 82, 'CB'],
 ['Ashley Williams 08/23/1984', 'Dec_12_2016', 83, 'CB'],
 ['Per Mertesacker 09/29/1984', 'Dec_12_2016', 83, 'CB']]

In [12]:
for i in range(k):
    score = []
    
    for item in cluster_dict_k[i]:
        score.append(item[2])
    
    minScore = min(score)
    maxScore = max(score)
    
    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(maxScore))
    print('The minimum OverallScore is ' + str(minScore))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

Result for cluster 0:
The maximum OverallScore is 86
The minimum OverallScore is 79
The most common OverallScore is 79

Result for cluster 1:
The maximum OverallScore is 66
The minimum OverallScore is 60
The most common OverallScore is 60

Result for cluster 2:
The maximum OverallScore is 78
The minimum OverallScore is 73
The most common OverallScore is 75

Result for cluster 3:
The maximum OverallScore is 73
The minimum OverallScore is 67
The most common OverallScore is 72

Result for cluster 4:
The maximum OverallScore is 59
The minimum OverallScore is 48
The most common OverallScore is 58



In [13]:
# Investigate the most influential features for each cluster
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

#
features = numeric.axes[1]

for i in range(k):
    print("Cluster {}:".format(i))
    for ind in desc_order_centroids[i, 1:4]:
        print(' {}'.format(features[ind]))
    print()

Cluster 0:
 SlideTackle
 Marking
 StandTackle

Cluster 1:
 SlideTackle
 StandTackle
 Marking

Cluster 2:
 SlideTackle
 Marking
 StandTackle

Cluster 3:
 SlideTackle
 Marking
 StandTackle

Cluster 4:
 IsHomeGrown
 SlideTackle
 StandTackle



# Part3: Seperate Midfield from the dataset and use Kmeans++ to cluster

In [14]:
midf = ['LW', 'LDM', 'LAM', 'LCM', 'CAM', 'CDM', 'CM', 'RM', 'LW/CAM', 'LM/RM', 'CAM/CM', 'CDM/CM', 'LM/CM', 'LM/ST/RM', 'CB/CDM/CM', 'CDM/RM/CM', 'LM/ST/RM', 'LM/CAM/RM', 'LW/LM/CAM', 'LW/LM/RW', 'CDM/CAM/CM', 'LM/CDM/CAM', 'LM/CDM/CAM/CM', 'LM/CDM/RM/CM', 'LM/CAM/RM/CM', 'LM/RW/CAM/RM']
midfcsv = csv[csv.PreferredPositions.isin(midf)]
print(midfcsv.shape)

(9588, 60)


In [15]:
numeric = midfcsv._get_numeric_data()

In [16]:
#Choose number of clusters
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
result_k = kmeans.fit_predict(numeric)

# See what players are in each cluster
cluster_dict_k = defaultdict(list)
for cluster_label, ind in zip(result_k, range(len(result_k))):
    cluster_dict_k[cluster_label].append([midfcsv.iloc[csv.axes[0][ind]]['PlayerID'], midfcsv.iloc[csv.axes[0][ind]]['UpdateDate'], midfcsv.iloc[csv.axes[0][ind]]['OverallScore'], midfcsv.iloc[csv.axes[0][ind]]['PreferredPositions']])

cluster_dict_k[0][0:10]

[['Emerson Hyndman 04/09/1996', 'Dec_12_2016', 65, 'CM'],
 ['Sam Field 05/08/1998', 'Dec_12_2016', 64, 'CDM/CM'],
 ['Oviemuno Ejaria 11/18/1997', 'Dec_12_2016', 66, 'CAM/CM'],
 ['Conor McAleny 08/12/1992', 'Dec_12_2016', 65, 'LM/ST/RM'],
 ['Harry Winks 02/02/1996', 'Dec_12_2016', 66, 'CAM/CM'],
 ['Lynden Gooch 12/24/1995', 'Dec_12_2016', 66, 'CAM/CM'],
 ['Jay Fulton 04/04/1994', 'Dec_12_2016', 66, 'CM'],
 ['Brahim Díaz 03/03/1999', 'Dec_12_2016', 64, 'CAM'],
 ['Jake Hesketh 03/27/1996', 'Dec_12_2016', 63, 'CAM'],
 ['Zachary Elbouzedi 04/05/1998', 'Dec_12_2016', 62, 'CAM']]

In [17]:
for i in range(k):
    score = []
    
    for item in cluster_dict_k[i]:
        score.append(item[2])
    
    minScore = min(score)
    maxScore = max(score)
    
    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(maxScore))
    print('The minimum OverallScore is ' + str(minScore))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

Result for cluster 0:
The maximum OverallScore is 67
The minimum OverallScore is 60
The most common OverallScore is 62

Result for cluster 1:
The maximum OverallScore is 80
The minimum OverallScore is 75
The most common OverallScore is 78

Result for cluster 2:
The maximum OverallScore is 74
The minimum OverallScore is 67
The most common OverallScore is 72

Result for cluster 3:
The maximum OverallScore is 89
The minimum OverallScore is 81
The most common OverallScore is 81

Result for cluster 4:
The maximum OverallScore is 59
The minimum OverallScore is 48
The most common OverallScore is 58



In [18]:
# Investigate the most influential features for each cluster
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

#
features = numeric.axes[1]

for i in range(k):
    print("Cluster {}:".format(i))
    for ind in desc_order_centroids[i, 1:4]:
        print(' {}'.format(features[ind]))
    print()

Cluster 0:
 Balance
 ShortPass
 BallControl

Cluster 1:
 BallControl
 ShortPass
 Stamina

Cluster 2:
 Stamina
 ShortPass
 BallControl

Cluster 3:
 BallControl
 ShortPass
 Reactions

Cluster 4:
 Acceleration
 ShortPass
 Balance



# Part4: Seperate Offender from the dataset and use Kmeans++ to cluster

In [19]:
offend = ['ST', 'LS', 'RS', 'LW', 'RW', 'LM/ST', 'RW/RM', ]
offendcsv = csv[csv.PreferredPositions.isin(offend)]
print(offendcsv.shape)

(4538, 60)


In [20]:
numeric = offendcsv._get_numeric_data().drop('OverallScore', axis=1)

In [21]:
#Choose number of clusters
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
result_k = kmeans.fit_predict(numeric)

# See what players are in each cluster
cluster_dict_k = defaultdict(list)
for cluster_label, ind in zip(result_k, range(len(result_k))):
    cluster_dict_k[cluster_label].append([offendcsv.iloc[csv.axes[0][ind]]['PlayerID'], offendcsv.iloc[csv.axes[0][ind]]['UpdateDate'], offendcsv.iloc[csv.axes[0][ind]]['OverallScore'], offendcsv.iloc[csv.axes[0][ind]]['PreferredPositions']])

cluster_dict_k[0][0:10]

[['Lewis Grabban 01/12/1988', 'Dec_12_2016', 70, 'ST'],
 ['Lys Mousset 02/08/1996', 'Dec_12_2016', 68, 'ST'],
 ['Dominic Solanke 09/14/1997', 'Dec_12_2016', 69, 'ST'],
 ['Ashley Fletcher 10/02/1995', 'Dec_12_2016', 67, 'ST'],
 ['James Wilson 12/01/1995', 'Dec_12_2016', 69, 'ST'],
 ['Ryan Seager 02/05/1996', 'Dec_12_2016', 63, 'ST'],
 ['Joel Asoro 04/27/1999', 'Dec_12_2016', 63, 'ST'],
 ['Chuba Akpom 10/09/1995', 'Dec_12_2016', 64, 'ST'],
 ['Jonathan Benteke 04/28/1995', 'Dec_12_2016', 62, 'ST'],
 ['Toni Martínez 06/30/1997', 'Dec_12_2016', 62, 'ST']]

In [22]:
for i in range(k):
    score = []
    
    for item in cluster_dict_k[i]:
        score.append(item[2])
    
    minScore = min(score)
    maxScore = max(score)
    
    print('Result for cluster ' + str(i) + ':')
    print('The maximum OverallScore is ' + str(maxScore))
    print('The minimum OverallScore is ' + str(minScore))
    print('The most common OverallScore is ' + str(most_common(score)))
    print('')

Result for cluster 0:
The maximum OverallScore is 70
The minimum OverallScore is 51
The most common OverallScore is 59

Result for cluster 1:
The maximum OverallScore is 89
The minimum OverallScore is 72
The most common OverallScore is 81

Result for cluster 2:
The maximum OverallScore is 90
The minimum OverallScore is 67
The most common OverallScore is 74

Result for cluster 3:
The maximum OverallScore is 67
The minimum OverallScore is 53
The most common OverallScore is 62

Result for cluster 4:
The maximum OverallScore is 84
The minimum OverallScore is 67
The most common OverallScore is 76



In [23]:
# Investigate the most influential features for each cluster
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

#
features = numeric.axes[1]

for i in range(k):
    print("Cluster {}:".format(i))
    for ind in desc_order_centroids[i, 1:4]:
        print(' {}'.format(features[ind]))
    print()

Cluster 0:
 SprintSpeed
 IsHomeGrown
 Finishing

Cluster 1:
 Finishing
 GoodAtAttack
 BallControl

Cluster 2:
 AttPosition
 ShotPower
 Strength

Cluster 3:
 Acceleration
 SprintSpeed
 Balance

Cluster 4:
 IsPhysicalAnomaly
 Finishing
 AttPosition

