# Clustering FIFAIndex players, separated by position

In [7]:
import os
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import metrics
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA

def most_common(lst):
    """Return the value that occurs most often in ``lst``.

    Parameters
    ----------
    lst : sequence of hashable values
        The values to tally (e.g. the OverallScore values of one cluster).

    Returns
    -------
    The most frequent element. When several elements share the highest
    count, the one that appears first in ``lst`` is returned, so the result
    is deterministic.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    if len(lst) == 0:
        raise ValueError('most_common() arg is an empty sequence')
    # Counter tallies in a single pass; calling lst.count once per distinct
    # value rescans the whole list every time.
    return Counter(lst).most_common(1)[0][0]

In [2]:
# Load the featured player table; the first CSV column becomes the index.
# NOTE(review): the name `csv` shadows the standard-library module of the
# same name; it is kept because the cells below refer to it.
csv = pd.read_table(
    'FIFIndex_BPL_players_featured.csv',
    sep=',',
    index_col=0,
)
print(csv.shape)
csv.head(10)

(34922, 60)


Unnamed: 0,PlayerID,Name,UpdateDate,Country,IsHomeGrown,OverallScore,PotentialScore,PotentialGrowth(%),Height,Weight,...,FKAcc,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes,GoodAtAttack,GoodBallSense
0,Mesut Özil 10/15/1988,Mesut Özil,Dec_12_2016,Germany,0.0,89,0.864865,0.0,0.5,0.431818,...,0.853659,0.658824,0.816092,0.058824,0.057471,0.144444,0.104651,0.146067,0.0,0.0
1,Harry Kane 07/28/1993,Harry Kane,Dec_12_2016,England,1.0,84,0.891892,0.152174,0.625,0.659091,...,0.695122,0.835294,0.735632,0.152941,0.08046,0.1,0.116279,0.11236,0.0,0.0
2,Coutinho 06/12/1992,Coutinho,Dec_12_2016,Brazil,0.0,85,0.891892,0.130435,0.2,0.25,...,0.853659,0.623529,0.793103,0.152941,0.126437,0.066667,0.093023,0.05618,0.0,0.0
3,Sergio Agüero 06/02/1988,Sergio Agüero,Dec_12_2016,Argentina,0.0,89,0.864865,0.0,0.225,0.386364,...,0.768293,0.811765,0.908046,0.117647,0.137931,0.155556,0.05814,0.146067,1.0,0.0
4,Nemanja Matić 08/01/1988,Nemanja Matić,Dec_12_2016,Serbia,0.0,84,0.72973,0.0,0.775,0.613636,...,0.707317,0.682353,0.781609,0.152941,0.068966,0.155556,0.127907,0.089888,0.0,1.0
5,Vincent Kompany 04/10/1986,Vincent Kompany,Dec_12_2016,Belgium,0.0,85,0.756757,0.0,0.725,0.636364,...,0.52439,0.611765,0.45977,0.082353,0.103448,0.088889,0.046512,0.05618,0.0,0.0
6,Cesc Fàbregas 05/04/1987,Cesc Fàbregas,Dec_12_2016,Spain,0.0,86,0.783784,0.0,0.3,0.386364,...,0.878049,0.811765,0.862069,0.164706,0.057471,0.1,0.081395,0.157303,0.0,0.0
7,Riyad Mahrez 02/21/1991,Riyad Mahrez,Dec_12_2016,Algeria,0.0,84,0.810811,0.086957,0.4,0.113636,...,0.768293,0.717647,0.747126,0.117647,0.16092,0.088889,0.139535,0.05618,0.0,0.0
8,Granit Xhaka 09/27/1992,Granit Xhaka,Dec_12_2016,Switzerland,0.0,84,0.810811,0.086957,0.55,0.568182,...,0.743902,0.564706,0.505747,0.070588,0.068966,0.077778,0.116279,0.134831,0.0,0.0
9,Claudio Bravo 04/13/1983,Claudio Bravo,Dec_12_2016,Chile,0.0,85,0.756757,0.0,0.525,0.522727,...,0.621951,0.141176,0.057471,0.905882,0.942529,0.933333,1.0,0.94382,0.0,0.0


# Part 1: Separate goalkeepers from the dataset and cluster them with k-means++

In [14]:
# Keep only the rows whose PreferredPositions is exactly 'GK'.
gkcsv = csv[csv['PreferredPositions'] == 'GK']
# Feature matrix for k-means: the numeric (and boolean) columns of the subset.
# select_dtypes is the public API; the private _get_numeric_data() helper may
# change without notice.
# NOTE(review): in the preview above OverallScore is on its raw scale (84-89)
# while the other numeric columns lie in 0-1, so it may dominate the
# Euclidean distances - consider scaling or dropping it before clustering.
numeric = gkcsv.select_dtypes(include=['number', 'bool'])
print(gkcsv.shape)

(4088, 60)


In [15]:
# Choose number of clusters (k is reused by the cells for the other positions)
k = 5
# random_state pins the k-means++ seeding so that Restart & Run All
# reproduces the same cluster numbering from run to run.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=0)
result_k = kmeans.fit_predict(numeric)

# Labels are positional: result_k[j] belongs to row j of `numeric`, which must
# be the numeric view of gkcsv. Fail loudly if `numeric` is stale (it is
# reassigned for every position group in this notebook).
assert len(result_k) == len(gkcsv), '`numeric` was not built from gkcsv'

# See what players are in each cluster.
# Rows are paired with their labels by position, walking gkcsv's own columns.
# Looking rows up through the index labels of the full frame is only correct
# while those labels happen to be 0..n-1, and costs four row lookups per player.
cluster_dict_k = defaultdict(list)
for cluster_label, player_id, update_date, overall, positions in zip(
        result_k,
        gkcsv['PlayerID'],
        gkcsv['UpdateDate'],
        gkcsv['OverallScore'],
        gkcsv['PreferredPositions']):
    cluster_dict_k[cluster_label].append([player_id, update_date, overall, positions])

cluster_dict_k[0][0:10]

[['Adrián 01/31/1987', 'Dec_12_2016', 80, 'GK'],
 ['Michel Vorm 10/20/1983', 'Dec_12_2016', 80, 'GK'],
 ['Łukasz Fabiański 04/18/1985', 'Dec_12_2016', 80, 'GK'],
 ['Ben Foster 04/03/1983', 'Dec_12_2016', 80, 'GK'],
 ['Fraser Forster 03/17/1988', 'Dec_12_2016', 79, 'GK'],
 ['Sergio Romero 02/22/1987', 'Dec_12_2016', 79, 'GK'],
 ['David Ospina 08/31/1988', 'Dec_12_2016', 79, 'GK'],
 ['Willy Caballero 09/28/1981', 'Dec_12_2016', 78, 'GK'],
 ['Simon Mignolet 03/06/1988', 'Dec_12_2016', 78, 'GK'],
 ['Tom Heaton 04/15/1986', 'Dec_12_2016', 78, 'GK']]

In [16]:
# Report the OverallScore spread inside each goalkeeper cluster.
for cluster_id in range(k):
    # Every cluster entry is [PlayerID, UpdateDate, OverallScore, PreferredPositions].
    scores = [member[2] for member in cluster_dict_k[cluster_id]]
    lowest, highest = min(scores), max(scores)

    print('Result for cluster {}:'.format(cluster_id))
    print('The maximum OverallScore is {}'.format(highest))
    print('The minimum OverallScore is {}'.format(lowest))
    print('The most common OverallScore is {}'.format(most_common(scores)))
    print('')

Result for cluster 0:
The maximum OverallScore is 80
The minimum OverallScore is 72
The most common OverallScore is 73

Result for cluster 1:
The maximum OverallScore is 63
The minimum OverallScore is 56
The most common OverallScore is 60

Result for cluster 2:
The maximum OverallScore is 90
The minimum OverallScore is 81
The most common OverallScore is 82

Result for cluster 3:
The maximum OverallScore is 71
The minimum OverallScore is 64
The most common OverallScore is 68

Result for cluster 4:
The maximum OverallScore is 55
The minimum OverallScore is 48
The most common OverallScore is 54



# Part 2: Separate defenders from the dataset and cluster them with k-means++

In [17]:
# Position strings treated as defenders. isin() matches whole strings only,
# so a multi-position value counts only if that exact combination is listed.
# NOTE(review): 'RB' used to be listed twice; the redundant entry is removed.
# 'LCB' has no 'RCB' counterpart - confirm whether one of the two 'RB'
# entries was meant to be 'RCB'.
defend = ['CB', 'RB', 'LCB', 'LB', 'LB/CB/RB']
defendcsv = csv[csv['PreferredPositions'].isin(defend)]
print(defendcsv.shape)

(7812, 60)


In [22]:
# Feature matrix for k-means: numeric (and boolean) columns of the defender
# subset. select_dtypes is the public API; the private _get_numeric_data()
# helper may change without notice.
numeric = defendcsv.select_dtypes(include=['number', 'bool'])

In [25]:
# Reuses k (the number of clusters) chosen in the goalkeeper section.
# random_state pins the k-means++ seeding so that Restart & Run All
# reproduces the same cluster numbering from run to run.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=0)
result_k = kmeans.fit_predict(numeric)

# Labels are positional: result_k[j] belongs to row j of `numeric`, which must
# be the numeric view of defendcsv. Fail loudly if `numeric` is stale (it is
# reassigned for every position group in this notebook).
assert len(result_k) == len(defendcsv), '`numeric` was not built from defendcsv'

# See what players are in each cluster.
# Rows are paired with their labels by position, walking defendcsv's own
# columns. Looking rows up through the index labels of the full frame is only
# correct while those labels happen to be 0..n-1, and costs four row lookups
# per player.
cluster_dict_k = defaultdict(list)
for cluster_label, player_id, update_date, overall, positions in zip(
        result_k,
        defendcsv['PlayerID'],
        defendcsv['UpdateDate'],
        defendcsv['OverallScore'],
        defendcsv['PreferredPositions']):
    cluster_dict_k[cluster_label].append([player_id, update_date, overall, positions])

cluster_dict_k[0][0:10]

[['Héctor Bellerín 03/19/1995', 'Dec_12_2016', 80, 'RB'],
 ['Virgil van Dijk 07/08/1991', 'Dec_12_2016', 80, 'CB'],
 ['Federico Fernández 02/22/1989', 'Dec_12_2016', 79, 'CB'],
 ['Gabriel 11/22/1990', 'Dec_12_2016', 79, 'CB'],
 ['Gaël Clichy 07/26/1985', 'Dec_12_2016', 80, 'LB'],
 ['Kevin Wimmer 11/15/1992', 'Dec_12_2016', 79, 'CB'],
 ['Danny Rose 07/02/1990', 'Dec_12_2016', 80, 'LB'],
 ['Angelo Ogbonna 05/23/1988', 'Dec_12_2016', 80, 'CB'],
 ['Phil Jones 02/21/1992', 'Dec_12_2016', 79, 'CB'],
 ['Winston Reid 07/03/1988', 'Dec_12_2016', 80, 'CB']]

In [26]:
# Report the OverallScore spread inside each defender cluster.
for cluster_id in range(k):
    # Every cluster entry is [PlayerID, UpdateDate, OverallScore, PreferredPositions].
    scores = [member[2] for member in cluster_dict_k[cluster_id]]
    lowest, highest = min(scores), max(scores)

    print('Result for cluster {}:'.format(cluster_id))
    print('The maximum OverallScore is {}'.format(highest))
    print('The minimum OverallScore is {}'.format(lowest))
    print('The most common OverallScore is {}'.format(most_common(scores)))
    print('')

Result for cluster 0:
The maximum OverallScore is 80
The minimum OverallScore is 75
The most common OverallScore is 76

Result for cluster 1:
The maximum OverallScore is 60
The minimum OverallScore is 48
The most common OverallScore is 58

Result for cluster 2:
The maximum OverallScore is 74
The minimum OverallScore is 68
The most common OverallScore is 73

Result for cluster 3:
The maximum OverallScore is 86
The minimum OverallScore is 80
The most common OverallScore is 83

Result for cluster 4:
The maximum OverallScore is 67
The minimum OverallScore is 61
The most common OverallScore is 63



# Part 3: Separate midfielders from the dataset and cluster them with k-means++

In [19]:
# Position strings treated as midfielders (exact, whole-string matches).
# NOTE(review): 'LW' and 'RM' are also in the attacker list used in Part 4,
# so those players land in both groups - confirm that this is intended.
midf = [
    'LW', 'LDM', 'LAM', 'LCM', 'CAM', 'CDM', 'CM', 'RM',
    'LW/CAM', 'LW/LM/CAM', 'CDM/CM', 'CDM/CAM/CM',
]
is_midfielder = csv['PreferredPositions'].isin(midf)
midfcsv = csv[is_midfielder]
print(midfcsv.shape)

(6332, 60)


In [27]:
# Feature matrix for k-means: numeric (and boolean) columns of the midfield
# subset. select_dtypes is the public API; the private _get_numeric_data()
# helper may change without notice.
numeric = midfcsv.select_dtypes(include=['number', 'bool'])

In [28]:
# Reuses k (the number of clusters) chosen in the goalkeeper section.
# random_state pins the k-means++ seeding so that Restart & Run All
# reproduces the same cluster numbering from run to run.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=0)
result_k = kmeans.fit_predict(numeric)

# Labels are positional: result_k[j] belongs to row j of `numeric`, which must
# be the numeric view of midfcsv. Fail loudly if `numeric` is stale (it is
# reassigned for every position group in this notebook).
assert len(result_k) == len(midfcsv), '`numeric` was not built from midfcsv'

# See what players are in each cluster.
# Rows are paired with their labels by position, walking midfcsv's own
# columns. Looking rows up through the index labels of the full frame is only
# correct while those labels happen to be 0..n-1, and costs four row lookups
# per player.
cluster_dict_k = defaultdict(list)
for cluster_label, player_id, update_date, overall, positions in zip(
        result_k,
        midfcsv['PlayerID'],
        midfcsv['UpdateDate'],
        midfcsv['OverallScore'],
        midfcsv['PreferredPositions']):
    cluster_dict_k[cluster_label].append([player_id, update_date, overall, positions])

cluster_dict_k[0][0:10]

[['Luke Dreher 11/27/1998', 'Dec_12_2016', 59, 'CM'],
 ['Sergio Molina 02/18/1996', 'Dec_12_2016', 59, 'CDM'],
 ['Robbie Leitch 04/01/1998', 'Dec_12_2016', 58, 'CDM/CM'],
 ['Tom Davies 06/30/1998', 'Dec_12_2016', 58, 'CM'],
 ['Kyle Scott 12/22/1997', 'Dec_12_2016', 58, 'CM'],
 ['Ryan Blair 02/23/1996', 'Dec_12_2016', 58, 'CM'],
 ['George Honeyman 09/08/1994', 'Dec_12_2016', 59, 'CAM'],
 ['Anton Walkes 02/08/1997', 'Dec_12_2016', 56, 'CDM'],
 ['Harvey Barnes 12/09/1997', 'Dec_12_2016', 56, 'CAM'],
 ['Thomas Dyson 10/14/1997', 'Dec_12_2016', 57, 'CM']]

In [29]:
# Report the OverallScore spread inside each midfield cluster.
for cluster_id in range(k):
    # Every cluster entry is [PlayerID, UpdateDate, OverallScore, PreferredPositions].
    scores = [member[2] for member in cluster_dict_k[cluster_id]]
    lowest, highest = min(scores), max(scores)

    print('Result for cluster {}:'.format(cluster_id))
    print('The maximum OverallScore is {}'.format(highest))
    print('The minimum OverallScore is {}'.format(lowest))
    print('The most common OverallScore is {}'.format(most_common(scores)))
    print('')

Result for cluster 0:
The maximum OverallScore is 59
The minimum OverallScore is 48
The most common OverallScore is 58

Result for cluster 1:
The maximum OverallScore is 80
The minimum OverallScore is 75
The most common OverallScore is 78

Result for cluster 2:
The maximum OverallScore is 74
The minimum OverallScore is 68
The most common OverallScore is 71

Result for cluster 3:
The maximum OverallScore is 89
The minimum OverallScore is 81
The most common OverallScore is 82

Result for cluster 4:
The maximum OverallScore is 67
The minimum OverallScore is 60
The most common OverallScore is 62



# Part 4: Separate attackers from the dataset and cluster them with k-means++

In [20]:
# Position strings treated as attackers (exact, whole-string matches).
# NOTE(review): 'LW' and 'RM' are also in the midfield list used in Part 3,
# so those players land in both groups - confirm that this is intended.
offend = [
    'ST', 'LS', 'RS', 'LW', 'RW', 'LM', 'RM',
    'LM/ST', 'RW/RM',
]
is_attacker = csv['PreferredPositions'].isin(offend)
offendcsv = csv[is_attacker]
print(offendcsv.shape)

(5635, 60)


In [30]:
# Feature matrix for k-means: numeric (and boolean) columns of the attacker
# subset. select_dtypes is the public API; the private _get_numeric_data()
# helper may change without notice.
numeric = offendcsv.select_dtypes(include=['number', 'bool'])

In [32]:
# Reuses k (the number of clusters) chosen in the goalkeeper section.
# random_state pins the k-means++ seeding so that Restart & Run All
# reproduces the same cluster numbering from run to run.
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=0)
result_k = kmeans.fit_predict(numeric)

# Labels are positional: result_k[j] belongs to row j of `numeric`, which must
# be the numeric view of offendcsv. Fail loudly if `numeric` is stale (it is
# reassigned for every position group in this notebook).
assert len(result_k) == len(offendcsv), '`numeric` was not built from offendcsv'

# See what players are in each cluster.
# Rows are paired with their labels by position, walking offendcsv's own
# columns. Looking rows up through the index labels of the full frame is only
# correct while those labels happen to be 0..n-1, and costs four row lookups
# per player.
cluster_dict_k = defaultdict(list)
for cluster_label, player_id, update_date, overall, positions in zip(
        result_k,
        offendcsv['PlayerID'],
        offendcsv['UpdateDate'],
        offendcsv['OverallScore'],
        offendcsv['PreferredPositions']):
    cluster_dict_k[cluster_label].append([player_id, update_date, overall, positions])

cluster_dict_k[0][0:10]

[['Andre Gray 06/26/1991', 'Dec_12_2016', 75, 'ST'],
 ['Viktor Fischer 06/09/1994', 'Dec_12_2016', 75, 'LW'],
 ['Oumar Niasse 04/18/1990', 'Dec_12_2016', 75, 'ST'],
 ['Jordan Rhodes 02/05/1990', 'Dec_12_2016', 74, 'ST'],
 ['Victor Anichebe 04/23/1988', 'Dec_12_2016', 74, 'ST'],
 ['Connor Wickham 03/31/1993', 'Dec_12_2016', 74, 'ST'],
 ['Leonardo Ulloa 07/26/1986', 'Dec_12_2016', 74, 'ST'],
 ['Peter Crouch 01/30/1981', 'Dec_12_2016', 75, 'ST'],
 ['Benik Afobe 02/12/1993', 'Dec_12_2016', 74, 'ST'],
 ['Joshua King 01/15/1992', 'Dec_12_2016', 73, 'LM/ST']]

In [33]:
# Report the OverallScore spread inside each attacker cluster.
for cluster_id in range(k):
    # Every cluster entry is [PlayerID, UpdateDate, OverallScore, PreferredPositions].
    scores = [member[2] for member in cluster_dict_k[cluster_id]]
    lowest, highest = min(scores), max(scores)

    print('Result for cluster {}:'.format(cluster_id))
    print('The maximum OverallScore is {}'.format(highest))
    print('The minimum OverallScore is {}'.format(lowest))
    print('The most common OverallScore is {}'.format(most_common(scores)))
    print('')

Result for cluster 0:
The maximum OverallScore is 75
The minimum OverallScore is 70
The most common OverallScore is 74

Result for cluster 1:
The maximum OverallScore is 61
The minimum OverallScore is 50
The most common OverallScore is 60

Result for cluster 2:
The maximum OverallScore is 90
The minimum OverallScore is 81
The most common OverallScore is 81

Result for cluster 3:
The maximum OverallScore is 69
The minimum OverallScore is 62
The most common OverallScore is 63

Result for cluster 4:
The maximum OverallScore is 80
The minimum OverallScore is 76
The most common OverallScore is 76

