In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics

In [2]:
csv = pd.read_table('FIFIndex_BPL_players_featured.csv', sep=',', index_col = 0)
print(csv.shape)
csv.head(10)

(34922, 60)


Unnamed: 0,PlayerID,Name,UpdateDate,Country,IsHomeGrown,OverallScore,PotentialScore,PotentialGrowth(%),Height,Weight,...,FKAcc,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes,GoodAtAttack,GoodBallSense
0,Mesut Özil 10/15/1988,Mesut Özil,Dec_12_2016,Germany,0.0,0.97619,0.864865,0.0,0.5,0.431818,...,0.853659,0.658824,0.816092,0.058824,0.057471,0.144444,0.104651,0.146067,0.0,0.0
1,Harry Kane 07/28/1993,Harry Kane,Dec_12_2016,England,1.0,0.857143,0.891892,0.152174,0.625,0.659091,...,0.695122,0.835294,0.735632,0.152941,0.08046,0.1,0.116279,0.11236,0.0,0.0
2,Coutinho 06/12/1992,Coutinho,Dec_12_2016,Brazil,0.0,0.880952,0.891892,0.130435,0.2,0.25,...,0.853659,0.623529,0.793103,0.152941,0.126437,0.066667,0.093023,0.05618,0.0,0.0
3,Sergio Agüero 06/02/1988,Sergio Agüero,Dec_12_2016,Argentina,0.0,0.97619,0.864865,0.0,0.225,0.386364,...,0.768293,0.811765,0.908046,0.117647,0.137931,0.155556,0.05814,0.146067,1.0,0.0
4,Nemanja Matić 08/01/1988,Nemanja Matić,Dec_12_2016,Serbia,0.0,0.857143,0.72973,0.0,0.775,0.613636,...,0.707317,0.682353,0.781609,0.152941,0.068966,0.155556,0.127907,0.089888,0.0,1.0
5,Vincent Kompany 04/10/1986,Vincent Kompany,Dec_12_2016,Belgium,0.0,0.880952,0.756757,0.0,0.725,0.636364,...,0.52439,0.611765,0.45977,0.082353,0.103448,0.088889,0.046512,0.05618,0.0,0.0
6,Cesc Fàbregas 05/04/1987,Cesc Fàbregas,Dec_12_2016,Spain,0.0,0.904762,0.783784,0.0,0.3,0.386364,...,0.878049,0.811765,0.862069,0.164706,0.057471,0.1,0.081395,0.157303,0.0,0.0
7,Riyad Mahrez 02/21/1991,Riyad Mahrez,Dec_12_2016,Algeria,0.0,0.857143,0.810811,0.086957,0.4,0.113636,...,0.768293,0.717647,0.747126,0.117647,0.16092,0.088889,0.139535,0.05618,0.0,0.0
8,Granit Xhaka 09/27/1992,Granit Xhaka,Dec_12_2016,Switzerland,0.0,0.857143,0.810811,0.086957,0.55,0.568182,...,0.743902,0.564706,0.505747,0.070588,0.068966,0.077778,0.116279,0.134831,0.0,0.0
9,Claudio Bravo 04/13/1983,Claudio Bravo,Dec_12_2016,Chile,0.0,0.880952,0.756757,0.0,0.525,0.522727,...,0.621951,0.141176,0.057471,0.905882,0.942529,0.933333,1.0,0.94382,0.0,0.0


In [3]:
numerics = csv._get_numeric_data()

# Many of the variables are correlated. It may be useful to perform PCA
pca = PCA(n_components=5)
pca.fit(numerics.values)
print(pca.explained_variance_ratio_)
print()
print('The first 5 principal components explain:')
print(sum(pca.explained_variance_ratio_), 'of total variance')

[ 0.3808201   0.12626641  0.11501451  0.08629519  0.05377722]

The first 5 principal components explain:
0.7621734355 of total variance


In [None]:
# K-means++
# Determine number of clusters using Silhouette Score
def kmeans_silhouette_eval(X, max_clusters):
    s = np.zeros(max_clusters + 1)
    s[0] = 0
    s[1] = 0
    for k in range(2, max_clusters + 1):
        kmeans = KMeans(init='k-means++', n_clusters=k)
        labels = kmeans.fit_predict(X)
        s[k] = metrics.silhouette_score(X, labels, metric='euclidean')
    plt.plot(range(2, len(s)), s[2:])
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    
# Determine number of clusters using error function
def kmeans_error_eval(X, max_clusters):
    error = np.zeros(max_clusters + 1)
    error[0] = 0
    error[1] = 0
    for k in range(2, max_clusters + 1):
        kmeans = KMeans(init='k-means++', n_clusters=k)
        kmeans.fit_predict(X)
        error[k] = kmeans.inertia_
    plt.plot(range(2, len(error)), error[2:])
    plt.xlabel('Number of clusters')
    plt.ylabel('Error')

plt.figure(figsize=(8, 5))
kmeans_silhouette_eval(numerics, 15)
plt.figure(figsize=(8, 5))
kmeans_error_eval(numerics, 15)
plt.show()

In [11]:
from collections import defaultdict

kmeans = KMeans(n_clusters=6).fit(numerics)

# See what players are in each cluster
cluster_dict = defaultdict(list)
for cluster_label, ind in zip(kmeans.labels_, range(len(kmeans.labels_))):
    cluster_dict[cluster_label].append(csv.iloc[csv.axes[0][ind]]['PlayerID'])

cluster_dict[0]

['Nemanja Matić 08/01/1988',
 'Henrikh Mkhitaryan 01/21/1989',
 'Paul Pogba 03/15/1993',
 'İlkay Gündoğan 10/24/1990',
 'Santi Cazorla 12/13/1984',
 'Nicolás Otamendi 02/12/1988',
 'Jan Vertonghen 04/24/1987',
 'Ander Herrera 08/14/1989',
 'Nolito 10/15/1986',
 'Wayne Rooney 10/24/1985',
 'Aaron Ramsey 12/26/1990',
 'Leighton Baines 12/11/1984',
 'Shkodran Mustafi 04/17/1992',
 'Yaya Touré 05/13/1983',
 'Gary Cahill 12/19/1985',
 'David Luiz 04/22/1987',
 'Adam Lallana 05/10/1988',
 'Séamus Coleman 10/11/1988',
 'Bastian Schweinsteiger 08/01/1984',
 'Moussa Dembélé 07/16/1987',
 'Xherdan Shaqiri 10/10/1991',
 'Morgan Schneiderlin 11/08/1989',
 "N'Golo Kanté 03/29/1991",
 'Dele Alli 04/11/1996',
 'Jordan Henderson 06/17/1990',
 'Antonio Valencia 08/04/1985',
 'James McCarthy 11/12/1990',
 'Fernandinho 05/04/1985',
 'Yohan Cabaye 01/14/1986',
 'Emre Can 01/12/1994',
 'Danny Drinkwater 03/05/1990',
 'Michael Carrick 07/28/1981',
 'Branislav Ivanović 02/22/1984',
 'Claudio Yacob 07/18/1987

In [12]:
# Investigate the most influential features for each cluster
desc_order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
features = csv.axes[1]

for i in range(6):
    print("Cluster {}:".format(i))
    for ind in desc_order_centroids[i, :]:
        print(' {}'.format(features[ind]))
    print()

Cluster 0:
 Finishing
 SlideTackle
 BallControl
 Team
 BirthDate
 SwitchedTeams
 NumberOfPositisions
 PreferredPositions
 Contract
 Composure
 Dribbling
 DefWorkRate
 Age
 ShortPass
 AttWorkRate
 Year
 Name
 TeamPosition
 StandTackle
 YearsLeftInContract
 Crossing
 LongPass
 Vision
 AttPosition
 Interceptions
 Marking
 Strength
 UpdateDate
 Acceleration
 Aggression
 Stamina
 Reactions
 PotentialGrowth(%)
 IsHomeGrown
 OverallScore
 PotentialScore
 IsPhysicalAnomaly
 PreferredFoot
 Height
 PlayerID
 ShotPower
 Balance
 Jumping
 Agilityd
 SprintSpeed
 Heading
 Country
 Weight

Cluster 1:
 DefWorkRate
 PreferredPositions
 AttWorkRate
 SwitchedTeams
 SlideTackle
 NumberOfPositisions
 BallControl
 BirthDate
 StandTackle
 Team
 Vision
 Contract
 Dribbling
 Name
 Age
 Composure
 YearsLeftInContract
 Marking
 Interceptions
 Reactions
 Year
 AttPosition
 ShortPass
 TeamPosition
 UpdateDate
 LongPass
 Aggression
 IsHomeGrown
 Crossing
 Stamina
 Acceleration
 Strength
 OverallScore
 PotentialGrow