In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
from models.gm_feature_imp import GMInterp

In [3]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import calinski_harabasz_score
from models.transfersdataset import TransfersDataset
from models.xgboost import Xgboost

In [4]:
# Load the prepared transfers table and run the dataset's preprocessing steps.
# TransfersDataset is project code; what each step does is defined there.
gm = GaussianMixture()  # NOTE(review): not referenced again in the visible cells (later cells build `gmm`)
td = TransfersDataset(Path.cwd()/'../prepped/final.csv')  # resolved relative to the current working directory
td.encode_last_positions()
td.encode()
td.drop()

  data = pd.read_csv(filepath_or_buffer=path, index_col=False).drop_duplicates().reset_index(drop=True)


In [5]:
# Model wrapper with the transfer fee as target. The feature list is the fixed
# transfer/club/league columns, followed by every column whose name matches
# 'pos' and then every column whose name matches '_-'.
# NOTE(review): a column matching both patterns would be listed twice — confirm none does.
m = Xgboost(
    target='fee',
    features=[
        'age', 'season', 'window', 'loan',
        'club_from_elo', 'club_to_elo', 'league_from_elo', 'league_to_elo',
        *td.data.columns[td.data.columns.str.contains('pos')],
        *td.data.columns[td.data.columns.str.contains('_-')],
    ],
    data=td.data,
)
m.train_test_split()
m.scale()

In [6]:
# Inspect the preprocessed frame (the rendered output is truncated by pandas).
td.data

Unnamed: 0,name,age,season,country_from,league_from,club_from,country_to,league_to,club_to,window,...,posAMF,posLAMF,posLWB,posRWF,posRCB,posGK,posRWB,posLDMF,posDMF,posCB
0,E. Hazard,28,0.0,England,Premier League,Chelsea,Spain,LaLiga,Real Madrid,1,...,0,0,0,0,0,0,0,0,0,0
1,A. Griezmann,28,0.0,Spain,LaLiga,Atlético Madrid,Spain,LaLiga,Barcelona,1,...,0,0,0,0,0,0,0,0,0,0
2,Philippe Coutinho,27,0.0,Spain,LaLiga,Barcelona,Germany,Bundesliga,Bayern München,1,...,0,0,0,0,0,0,0,0,0,0
3,H. Maguire,26,0.0,England,Premier League,Leicester City,England,Premier League,Manchester United,1,...,0,0,0,0,0,0,0,0,0,0
4,N. Pépé,24,0.0,France,Ligue 1,Lille,England,Premier League,Arsenal,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2459,N. Pierozzi,22,4.0,Italy,Serie A,Fiorentina,Italy,Serie A,Salernitana,0,...,0,0,0,0,0,0,0,0,0,0
2460,A. Ibrahimović,30,4.0,Italy,Serie A,Udinese,Italy,Serie A,Frosinone,0,...,0,0,0,0,0,0,0,0,0,0
2461,M. Milovanović,30,4.0,Spain,LaLiga,Getafe,Spain,LaLiga,Almería,0,...,0,0,0,0,0,0,0,0,0,0
2462,J. Justvan,25,4.0,Germany,Bundesliga,Hoffenheim,Germany,Bundesliga,Darmstadt 98,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Model selection for the Gaussian mixture: for each candidate number of
# components (2..10) fit one diagonal-covariance GMM on m.X and record its
# BIC, AIC and Calinski-Harabasz score, then plot the three curves.
score_rows = []
for i in range(2, 11):
    gmm = GaussianMixture(n_components=i, covariance_type='diag', random_state=0)
    # A single fit_predict both fits the model and returns the hard labels;
    # the previous fit() + fit_predict() + predict() sequence fitted the same
    # seeded model twice and recomputed identical labels a second time.
    y = gmm.fit_predict(m.X)
    score_rows.append({
        'n_clusters': i,
        'bic': gmm.bic(m.X),
        'aic': gmm.aic(m.X),
        'calinski_harabasz_score': calinski_harabasz_score(m.X, y),
    })

scores = pd.DataFrame(score_rows, columns=['n_clusters', 'bic', 'aic', 'calinski_harabasz_score'])

# BIC/AIC share the primary y-axis; Calinski-Harabasz goes on the secondary axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])
for metric in scores.columns[1:]:
    if metric == 'calinski_harabasz_score':
        fig.add_trace(go.Scatter(x=scores.n_clusters, y=scores[metric], name=metric), secondary_y=True)
    else:
        fig.add_trace(go.Scatter(x=scores.n_clusters, y=scores[metric], name=metric))
fig.update_layout(height = 500, width = 1000, title = 'Elbow method', xaxis = dict(title_text='Number of clusters'), yaxis = dict(title_text='Scores'))
fig.show()

In [8]:
# Final clustering: four diagonal-covariance components with a fixed seed.
# fit() returns the fitted estimator, so the hard labels come from one chain.
gmm = GaussianMixture(covariance_type='diag', n_components=4, random_state=0)
labels = gmm.fit(m.X).predict(m.X)

In [9]:
player_info_cols = ['name', 'age', 'season', 'country_from', 'league_from', 'club_from',
                            'country_to', 'league_to', 'club_to', 'window', 'fee', 'loan']
# assign() returns a new frame carrying the cluster labels. The previous
# `preds = m.data; preds['label'] = ...` only aliased the model's frame, so it
# silently added a 'label' column to m.data itself and made the cell
# non-idempotent with respect to the model object.
preds = m.data.assign(label=labels)

In [10]:
# Rows assigned to cluster 0.
preds.loc[preds['label'] == 0]

Unnamed: 0,name,age,season,country_from,league_from,club_from,country_to,league_to,club_to,window,...,posLAMF,posLWB,posRWF,posRCB,posGK,posRWB,posLDMF,posDMF,posCB,label
0,E. Hazard,28,0.0,England,Premier League,Chelsea,Spain,LaLiga,Real Madrid,1,...,0,0,0,0,0,0,0,0,0,0
1,A. Griezmann,28,0.0,Spain,LaLiga,Atlético Madrid,Spain,LaLiga,Barcelona,1,...,0,0,0,0,0,0,0,0,0,0
2,Philippe Coutinho,27,0.0,Spain,LaLiga,Barcelona,Germany,Bundesliga,Bayern München,1,...,0,0,0,0,0,0,0,0,0,0
3,H. Maguire,26,0.0,England,Premier League,Leicester City,England,Premier League,Manchester United,1,...,0,0,0,0,0,0,0,0,0,0
4,N. Pépé,24,0.0,France,Ligue 1,Lille,England,Premier League,Arsenal,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399,D. Spence,23,4.0,England,Premier League,Tottenham Hotspur,Italy,Serie A,Genoa,0,...,0,0,0,0,0,0,0,0,0,0
2430,A. Virginius,21,4.0,France,Ligue 1,Lille,France,Ligue 1,Clermont,0,...,1,0,0,0,0,0,0,0,0,0
2432,Junior Messias,32,4.0,Italy,Serie A,Milan,Italy,Serie A,Genoa,0,...,0,0,0,0,0,1,0,0,0,0
2451,K. Bonifazi,27,4.0,Italy,Serie A,Bologna,Italy,Serie A,Frosinone,0,...,0,0,0,1,0,0,0,0,0,0


In [11]:
# Summary statistics of the numeric columns for the rows in cluster 0.
preds.loc[preds['label'] == 0].describe()

Unnamed: 0,age,season,window,fee,loan,club_from_elo,club_to_elo,league_from_elo,league_to_elo,accbpassespct_-5,...,posLAMF,posLWB,posRWF,posRCB,posGK,posRWB,posLDMF,posDMF,posCB,label
count,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,...,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0
mean,25.024862,1.574586,0.766575,8979913.0,0.461326,1719.05663,1692.933702,1620.964042,1629.363306,0.0,...,0.059392,0.073204,0.059392,0.060773,0.0,0.064917,0.0,0.0,0.0,0.0
std,3.692082,1.447549,0.423303,16845710.0,0.498847,123.577307,125.347356,54.79246,55.44182,0.0,...,0.236521,0.260652,0.236521,0.23908,0.0,0.24655,0.0,0.0,0.0,0.0
min,17.0,0.0,0.0,0.0,0.0,1432.0,1432.0,1540.966667,1540.966667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,0.0,1.0,0.0,0.0,1622.75,1597.0,1571.266667,1581.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,1.0,1.0,1000000.0,0.0,1718.0,1683.0,1618.066667,1621.933333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,28.0,3.0,1.0,11025000.0,1.0,1811.0,1772.5,1672.966667,1676.633333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,38.0,4.0,1.0,120000000.0,1.0,2077.0,2077.0,1721.066667,1721.066667,0.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [19]:
def pipeline(cluster, n_components=4):
    """Cluster the transfers with a Gaussian mixture, then train and report an
    XGBoost model of 'marketval_0' on the rows of a single cluster.

    The dataset is reloaded from '../prepped/final.csv' (relative to the
    current working directory) on every call, so the function does not depend
    on notebook-level variables.

    Parameters
    ----------
    cluster : int
        Label of the mixture component whose rows are kept for training.
    n_components : int, default 4
        Number of mixture components; the default reproduces the previously
        hard-coded value.

    Raises
    ------
    ValueError
        If no row is assigned to ``cluster`` (for example when ``cluster`` is
        not below ``n_components``).

    Returns
    -------
    None
        Results are shown through ``display`` and the model's reporting methods.

    NOTE(review): unlike the notebook-level clustering, this reads ``X`` before
    ``train_test_split()``/``scale()`` are called and does not call
    ``encode_last_positions()`` — confirm that difference is intended.
    """
    td = TransfersDataset(Path.cwd()/'../prepped/final.csv')
    td.encode()
    td.drop()

    target = 'marketval_0'
    features = ['age', 'season', 'window', 'loan', 'club_from_elo', 'club_to_elo', 'league_from_elo', 'league_to_elo'] + \
               td.data.columns[td.data.columns.str.contains('_-')].tolist()

    # First wrapper over the whole dataset: only used to obtain the feature
    # matrix for clustering and the frame the labels are aligned with.
    # Each wrapper gets its own copy of the feature list.
    full = Xgboost(target=target, features=list(features), data=td.data)
    gmm = GaussianMixture(n_components=n_components, covariance_type='diag', random_state=0)
    labels = gmm.fit(full.X).predict(full.X)

    in_cluster = labels == cluster
    if not in_cluster.any():
        raise ValueError(f'no rows were assigned to cluster {cluster!r} (n_components={n_components})')

    # Second wrapper restricted to the selected cluster; this is the model that is trained.
    m = Xgboost(target=target, features=list(features), data=full.data.iloc[in_cluster])
    display(m.data)
    m.train_test_split()
    m.scale()
    # m.tune_hp()
    m.train()
    m.inverse_scale()
    m.mae()
    m.mape()
    # display(m.feature_importance())
    m.plot_predictions()
    display(m.top_n_predictions(20))

In [20]:
# Train and report the market-value model on the rows of cluster 0.
pipeline(0)

MAE = 13930088.0
MAPE = 1.456923927572203


Unnamed: 0,name,age,season,country_from,league_from,club_from,country_to,league_to,club_to,window,fee,loan,marketval_0,marketval_0_pred,error
1584,A. Maitland-Niles,25,3.0,England,Premier League,Arsenal,England,Premier League,Southampton,1,0.0,1,8000000.0,8920206.0,920206.0
2100,N. Keïta,28,4.0,England,Premier League,Liverpool,Germany,Bundesliga,Werder Bremen,1,0.0,0,6000000.0,4735010.5,1264989.5
1178,M. Niang,26,2.0,France,Ligue 1,Rennes,France,Ligue 1,Bordeaux,1,0.0,0,4000000.0,2412326.75,1587673.25
2248,R. Gagliardini,29,4.0,Italy,Serie A,Inter Milan,Italy,Serie A,Monza,1,0.0,0,4000000.0,2412326.75,1587673.25
2094,L. Paredes,29,4.0,France,Ligue 1,Paris Saint-Germain,Italy,Serie A,Roma,1,2500000.0,0,8000000.0,11242891.0,3242891.0
1688,M. Gregoritsch,28,3.0,Germany,Bundesliga,Augsburg,Germany,Bundesliga,Freiburg,1,0.0,0,8000000.0,4628002.5,3371997.5
2120,Y. Mina,28,4.0,England,Premier League,Everton,Italy,Serie A,Fiorentina,1,0.0,0,5000000.0,9532366.0,4532366.0
2081,S. Posch,26,4.0,Germany,Bundesliga,Hoffenheim,Italy,Serie A,Bologna,1,5000000.0,0,14000000.0,19446384.0,5446384.0
1963,O. Dembélé,26,4.0,Spain,LaLiga,Barcelona,France,Ligue 1,Paris Saint-Germain,1,50000000.0,0,60000000.0,40593052.0,19406948.0
2372,T. Werner,27,4.0,Germany,Bundesliga,RB Leipzig,England,Premier League,Tottenham Hotspur,0,0.0,1,17000000.0,-7345511.5,24345511.5


In [21]:
# Train and report the market-value model on the rows of cluster 1.
pipeline(1)

MAE = 14365472.9
MAPE = 2.9074862043888885


Unnamed: 0,name,age,season,country_from,league_from,club_from,country_to,league_to,club_to,window,fee,loan,marketval_0,marketval_0_pred,error
1643,B. Drągowski,24,3.0,Italy,Serie A,Fiorentina,Italy,Serie A,Spezia,1,2450000.0,0,4500000.0,4174025.0,325975.0
2187,Luís Maximiano,24,4.0,Italy,Serie A,Lazio,Spain,LaLiga,Almería,1,500000.0,1,5000000.0,6357638.0,1357638.0
1718,I. Provedel,28,3.0,Italy,Serie A,Spezia,Italy,Serie A,Lazio,1,2550000.0,0,12000000.0,9874366.0,2125634.0
1931,D. Selke,27,3.0,Germany,Bundesliga,Hertha BSC,Germany,Bundesliga,Köln,0,0.0,0,2500000.0,6357638.0,3857638.0
1700,J. Kenny,25,3.0,England,Premier League,Everton,Germany,Bundesliga,Hertha BSC,1,0.0,0,3000000.0,7178582.0,4178582.0
2052,G. Vicario,26,4.0,Italy,Serie A,Empoli,England,Premier League,Tottenham Hotspur,1,18500000.0,0,30000000.0,17064316.0,12935684.0
695,A. Schwolow,28,1.0,Germany,Bundesliga,Freiburg,Germany,Bundesliga,Hertha BSC,1,7000000.0,0,5000000.0,19674244.0,14674244.0
1705,Sergio Asenjo,33,3.0,Spain,LaLiga,Villarreal,Spain,LaLiga,Real Valladolid,1,0.0,0,1000000.0,22026152.0,21026152.0
967,G. Donnarumma,22,2.0,Italy,Serie A,Milan,France,Ligue 1,Paris Saint-Germain,1,0.0,0,50000000.0,74798464.0,24798464.0
1519,R. Kolo Muani,23,3.0,France,Ligue 1,Nantes,Germany,Bundesliga,Eintracht Frankfurt,1,0.0,0,80000000.0,21625282.0,58374718.0


In [22]:
# Train and report the market-value model on the rows of cluster 3.
pipeline(3)

MAE = 5450879.373737373
MAPE = 0.7265743373571732


Unnamed: 0,name,age,season,country_from,league_from,club_from,country_to,league_to,club_to,window,fee,loan,marketval_0,marketval_0_pred,error
1013,Trincão,21,2.0,Spain,LaLiga,Barcelona,England,Premier League,Wolverhampton Wanderers,1,6000000.0,1,20000000.0,19973526.0,26474.0
155,Rubén Peña,27,0.0,Spain,LaLiga,Eibar,Spain,LaLiga,Villarreal,1,8000000.0,0,6500000.0,6547337.0,47337.0
916,D. Kohr,26,1.0,Germany,Bundesliga,Eintracht Frankfurt,Germany,Bundesliga,Mainz 05,0,0.0,1,6000000.0,5943553.0,56447.0
2123,L. Hall,18,4.0,England,Premier League,Chelsea,England,Premier League,Newcastle United,1,0.0,1,18000000.0,17935336.0,64664.0
1638,R. Cabella,32,3.0,France,Ligue 1,Montpellier,France,Ligue 1,Lille,1,0.0,0,5000000.0,4841965.0,158035.0
1866,B. Dieng,22,3.0,France,Ligue 1,Olympique Marseille,France,Ligue 1,Lorient,0,7000000.0,0,8000000.0,8224356.0,224356.0
897,A. Duncan,27,1.0,Italy,Serie A,Fiorentina,Italy,Serie A,Cagliari,0,1000000.0,1,10000000.0,9775347.0,224653.0
2209,Samú Costa,22,4.0,Spain,LaLiga,Almería,Spain,LaLiga,Mallorca,1,3000000.0,0,8000000.0,8254164.5,254164.5
2308,Guevara,26,4.0,Spain,LaLiga,Real Sociedad,Spain,LaLiga,Deportivo Alavés,1,1800000.0,0,5000000.0,4707866.0,292134.0
202,M. Wolf,24,0.0,Germany,Bundesliga,Borussia Dortmund,Germany,Bundesliga,Hertha BSC,1,2000000.0,1,6500000.0,6808485.0,308485.0


In [24]:
# NOTE(review): this repeats an earlier pipeline(3) cell and its recorded
# metrics are identical; cluster 2 is never evaluated in the visible cells —
# confirm whether pipeline(2) was intended here.
pipeline(3)

MAE = 5450879.373737373
MAPE = 0.7265743373571732


Unnamed: 0,name,age,season,country_from,league_from,club_from,country_to,league_to,club_to,window,fee,loan,marketval_0,marketval_0_pred,error
1013,Trincão,21,2.0,Spain,LaLiga,Barcelona,England,Premier League,Wolverhampton Wanderers,1,6000000.0,1,20000000.0,19973526.0,26474.0
155,Rubén Peña,27,0.0,Spain,LaLiga,Eibar,Spain,LaLiga,Villarreal,1,8000000.0,0,6500000.0,6547337.0,47337.0
916,D. Kohr,26,1.0,Germany,Bundesliga,Eintracht Frankfurt,Germany,Bundesliga,Mainz 05,0,0.0,1,6000000.0,5943553.0,56447.0
2123,L. Hall,18,4.0,England,Premier League,Chelsea,England,Premier League,Newcastle United,1,0.0,1,18000000.0,17935336.0,64664.0
1638,R. Cabella,32,3.0,France,Ligue 1,Montpellier,France,Ligue 1,Lille,1,0.0,0,5000000.0,4841965.0,158035.0
1866,B. Dieng,22,3.0,France,Ligue 1,Olympique Marseille,France,Ligue 1,Lorient,0,7000000.0,0,8000000.0,8224356.0,224356.0
897,A. Duncan,27,1.0,Italy,Serie A,Fiorentina,Italy,Serie A,Cagliari,0,1000000.0,1,10000000.0,9775347.0,224653.0
2209,Samú Costa,22,4.0,Spain,LaLiga,Almería,Spain,LaLiga,Mallorca,1,3000000.0,0,8000000.0,8254164.5,254164.5
2308,Guevara,26,4.0,Spain,LaLiga,Real Sociedad,Spain,LaLiga,Deportivo Alavés,1,1800000.0,0,5000000.0,4707866.0,292134.0
202,M. Wolf,24,0.0,Germany,Bundesliga,Borussia Dortmund,Germany,Bundesliga,Hertha BSC,1,2000000.0,1,6500000.0,6808485.0,308485.0


In [None]:
# Fit the project's GMInterp model on m.X, passing the column names so the
# importances can be reported per feature.
# NOTE(review): this uses 5 components while the other cells cluster with 4 — confirm.
# NOTE(review): the recorded output of this cell is a TypeError raised by super();
# the cause is not visible in this notebook (GMInterp is defined in models.gm_feature_imp).
kms = GMInterp(
	n_components=5,
	ordered_feature_names=m.X.columns.tolist(), 
	feature_importance_method='wcss_min', # or 'unsup2sup'
).fit(m.X)

# Per the original note, feature_importances_ is keyed by cluster label: [0] selects
# cluster 0, and slicing that entry with [:10] would keep its first 10 features.
kms.feature_importances_[0]

TypeError: super(type, obj): obj must be an instance or subtype of type