In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
from models.gm_feature_imp import GMInterp

In [3]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import calinski_harabasz_score
from models.transfersdataset import TransfersDataset
from models.xgboost import Xgboost

In [4]:
# Build the transfers dataset from the prepared CSV, resolved relative to the
# current working directory, then run its three preprocessing calls in their
# original order: encode_last_positions -> encode -> drop.
td = TransfersDataset(Path.cwd() / '../prepped/final.csv')
td.encode_last_positions()
td.encode()
td.drop()

# Default-configured Gaussian mixture; its construction is independent of `td`.
gm = GaussianMixture()

  data = pd.read_csv(filepath_or_buffer=path, index_col=False).drop_duplicates().reset_index(drop=True)


In [5]:
# Wrap the prepared table in the project's Xgboost helper with `fee` as target.
# The feature list is the fixed transfer/ELO columns followed by every column
# whose name contains 'pos', then every column whose name contains '_-'.
m = Xgboost(
    target='fee',
    features=[
        'age', 'season', 'window', 'loan',
        'club_from_elo', 'club_to_elo', 'league_from_elo', 'league_to_elo',
        *td.data.columns[td.data.columns.str.contains('pos')].tolist(),
        *td.data.columns[td.data.columns.str.contains('_-')].tolist(),
    ],
    data=td.data,
)

# Split first, then scale, exactly as in the original call order.
m.train_test_split()
m.scale()

In [6]:
# Display the dataset's table as held on `td` after the encode/drop calls above
# (pandas elides the middle rows and columns of large frames in the rendering).
td.data

Unnamed: 0,name,age,season,country_from,league_from,club_from,country_to,league_to,club_to,window,...,posAMF,posLAMF,posLWB,posRWF,posRCB,posGK,posRWB,posLDMF,posDMF,posCB
0,E. Hazard,28,0.0,England,Premier League,Chelsea,Spain,LaLiga,Real Madrid,1,...,0,0,0,0,0,0,0,0,0,0
1,A. Griezmann,28,0.0,Spain,LaLiga,Atlético Madrid,Spain,LaLiga,Barcelona,1,...,0,0,0,0,0,0,0,0,0,0
2,Philippe Coutinho,27,0.0,Spain,LaLiga,Barcelona,Germany,Bundesliga,Bayern München,1,...,0,0,0,0,0,0,0,0,0,0
3,H. Maguire,26,0.0,England,Premier League,Leicester City,England,Premier League,Manchester United,1,...,0,0,0,0,0,0,0,0,0,0
4,N. Pépé,24,0.0,France,Ligue 1,Lille,England,Premier League,Arsenal,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,N. Pierozzi,22,4.0,Italy,Serie A,Fiorentina,Italy,Serie A,Salernitana,0,...,0,0,0,0,0,0,0,0,0,0
2460,A. Ibrahimović,30,4.0,Italy,Serie A,Udinese,Italy,Serie A,Frosinone,0,...,0,0,0,0,0,0,0,0,0,0
2461,M. Milovanović,30,4.0,Spain,LaLiga,Getafe,Spain,LaLiga,Almería,0,...,0,0,0,0,0,0,0,0,0,0
2462,J. Justvan,25,4.0,Germany,Bundesliga,Hoffenheim,Germany,Bundesliga,Darmstadt 98,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Model selection: for each candidate component count, fit a diagonal-covariance
# Gaussian mixture on the feature matrix and record BIC, AIC and the
# Calinski-Harabasz score of the resulting hard assignments.
score_rows = []
for i in range(2, 11):
    gmm = GaussianMixture(n_components=i, covariance_type='diag', random_state=0)
    # fit_predict fits once and returns the hard labels; the earlier
    # fit() + fit_predict() + predict() sequence fitted the same seeded model
    # twice and then recomputed the same labels a third time.
    y = gmm.fit_predict(m.X)
    score_rows.append([
        i,
        gmm.bic(m.X),
        gmm.aic(m.X),
        calinski_harabasz_score(m.X, y),
    ])

# Keep the list and the DataFrame under different names so `scores` always
# refers to the table.
scores = pd.DataFrame(
    score_rows,
    columns=['n_clusters', 'bic', 'aic', 'calinski_harabasz_score'],
)

# BIC/AIC share the primary y-axis; Calinski-Harabasz goes on the secondary
# axis because it is plotted on its own scale.
fig = make_subplots(specs=[[{"secondary_y": True}]])
for metric in scores.columns[1:]:
    fig.add_trace(
        go.Scatter(x=scores.n_clusters, y=scores[metric], name=metric),
        secondary_y=(metric == 'calinski_harabasz_score'),
    )
fig.update_xaxes(title_text='n_clusters')
fig.update_yaxes(title_text='bic / aic', secondary_y=False)
fig.update_yaxes(title_text='calinski_harabasz_score', secondary_y=True)
fig.update_layout(height = 500, width = 1000, title = 'Elbow method')
fig.show()

In [8]:
# Final clustering model used for the per-cluster inspection below.
#
# The component count used to be read from `i`, the loop variable left over
# from the model-selection loop (10 once `range(2, 11)` has finished). It is
# pinned here explicitly so this cell no longer depends on loop leftovers or
# on cell execution order; the value is unchanged.
# NOTE(review): the GMInterp cell further down uses n_components=5 - confirm
# which component count is actually intended and align the two.
N_COMPONENTS = 10

gmm = GaussianMixture(n_components=N_COMPONENTS, covariance_type='diag', random_state=0)
# fit_predict fits the mixture and returns the hard cluster assignment per row.
labels = gmm.fit_predict(m.X)

In [9]:
# Descriptive (non-feature) columns identifying a transfer.
player_info_cols = ['name', 'age', 'season', 'country_from', 'league_from', 'club_from',
                            'country_to', 'league_to', 'club_to', 'window', 'fee', 'loan']

# Attach the cluster labels to a *copy* of the model's table. The previous
# `preds = m.data; preds['label'] = labels` aliased the frame, so it silently
# added a `label` column to `m.data` itself and made the cell non-idempotent.
# `assign` returns a new DataFrame and leaves `m.data` untouched.
preds = m.data.assign(label=labels)

In [10]:
# Rows whose assigned cluster label is 0.
preds.loc[preds['label'] == 0]

Unnamed: 0,name,age,season,country_from,league_from,club_from,country_to,league_to,club_to,window,...,posLAMF,posLWB,posRWF,posRCB,posGK,posRWB,posLDMF,posDMF,posCB,label
1519,R. Kolo Muani,23,3.0,France,Ligue 1,Nantes,Germany,Bundesliga,Eintracht Frankfurt,1,...,0,0,0,0,0,0,0,0,0,0
1522,Emerson,28,3.0,England,Premier League,Chelsea,England,Premier League,West Ham United,1,...,0,0,0,0,0,0,0,0,0,0
1593,B. Leno,30,3.0,England,Premier League,Arsenal,England,Premier League,Fulham,1,...,0,0,0,0,0,0,0,0,0,0
1617,Munir El Haddadi,26,3.0,Spain,LaLiga,Sevilla,Spain,LaLiga,Getafe,1,...,1,0,0,0,0,0,0,0,0,0
1634,T. Strakosha,27,3.0,Italy,Serie A,Lazio,England,Premier League,Brentford,1,...,0,0,0,0,0,0,0,0,0,0
1705,Sergio Asenjo,33,3.0,Spain,LaLiga,Villarreal,Spain,LaLiga,Real Valladolid,1,...,0,0,0,0,0,0,0,0,0,0
1748,W. Falcone,27,3.0,Italy,Serie A,Sampdoria,Italy,Serie A,Lecce,1,...,0,0,0,0,0,0,0,0,0,0
1784,F. Forster,34,3.0,England,Premier League,Southampton,England,Premier League,Tottenham Hotspur,1,...,0,0,0,0,0,0,0,0,0,0
1861,Y. Sommer,34,3.0,Germany,Bundesliga,Borussia M'gladbach,Germany,Bundesliga,Bayern München,0,...,0,0,0,0,0,0,0,0,0,0
1931,D. Selke,27,3.0,Germany,Bundesliga,Hertha BSC,Germany,Bundesliga,Köln,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Summary statistics of the numeric columns for the rows labelled 0.
preds.loc[preds['label'] == 0].describe()

Unnamed: 0,age,season,window,fee,loan,club_from_elo,club_to_elo,league_from_elo,league_to_elo,accbpassespct_-5,...,posLAMF,posLWB,posRWF,posRCB,posGK,posRWB,posLDMF,posDMF,posCB,label
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,...,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,28.0,3.444444,0.888889,2377778.0,0.166667,1736.833333,1714.055556,1643.812963,1658.622222,20.337222,...,0.111111,0.0,0.0,0.055556,0.0,0.0,0.055556,0.055556,0.0,0.0
std,3.343123,0.51131,0.323381,4860350.0,0.383482,102.513844,109.193553,39.501692,37.25704,39.314125,...,0.323381,0.0,0.0,0.235702,0.0,0.0,0.235702,0.235702,0.0,0.0
min,23.0,3.0,0.0,0.0,0.0,1581.0,1527.0,1571.266667,1589.566667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,26.0,3.0,1.0,0.0,0.0,1641.25,1638.5,1619.033333,1624.175,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,3.0,1.0,0.0,0.0,1727.0,1696.5,1653.333333,1653.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,30.0,4.0,1.0,1350000.0,0.0,1817.25,1780.25,1670.808333,1696.008333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,34.0,4.0,1.0,15400000.0,1.0,1906.0,1958.0,1702.466667,1721.066667,100.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [12]:
# Fit the project's GMInterp wrapper on the same feature matrix, passing the
# feature names in column order and selecting the 'wcss_min' importance method.
# NOTE(review): the recorded output of this cell is
# "TypeError: super(type, obj): obj must be an instance or subtype of type",
# so the cell does not currently run. The failure is raised inside project code
# (models/gm_feature_imp) that is not visible here - possibly a super() call
# naming the wrong class, or a class reloaded after import; confirm there.
# NOTE(review): n_components=5 here differs from the component count used for
# the GaussianMixture fitted earlier in this notebook - confirm which is intended.
kms = GMInterp(
	n_components=5,
	ordered_feature_names=m.X.columns.tolist(), 
	feature_importance_method='wcss_min', # or 'unsup2sup'
).fit(m.X)

# Per the original author's note, feature_importances_ is keyed by cluster
# label, so [0] selects cluster 0; slicing that entry with [:10] would keep its
# first 10 (most important) features. TODO confirm against GMInterp.
kms.feature_importances_[0]

TypeError: super(type, obj): obj must be an instance or subtype of type