## Model imports

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import pickle
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

## Load the cleaned_players.csv into a dataframe

In [2]:
df_cleaned_players = pd.read_csv('cleaned_players.csv')
df_cleaned_players

Unnamed: 0,overall,potential,value_eur,age,height_cm,weight_kg,weak_foot,skill_moves,international_reputation,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed,attacker,midfielder,defender,goalkeeper,club_starter,club_substitute,club_reserve,nation_starter,nation_substitute,nation_reserve,preferred_foot_Left,preferred_foot_Right,attacking_work_rate,defending_work_rate,body_type_Unique,body_type_Lean,body_type_Stocky,body_type_Normal,nation_rank
0,90,92,78000000,34,169,69,4,4,5,82,89,88,90,31,63,80,90,66,89,85,91,89,89,88,91,88,76,88,90,92,83,66,71,65,89,44,38,91,90,71,91,17,31,22,2,11,11,11,6,0,1,0,0,0,1,0,0,1,0,0,1,0,2,1,1,0,0,0,194
1,90,90,119500000,32,183,80,4,4,5,77,89,74,85,40,81,67,90,88,80,85,82,76,85,67,87,76,76,76,90,80,87,84,75,84,85,78,47,91,78,88,87,30,40,18,10,2,11,6,6,0,1,0,0,0,1,0,0,1,0,0,0,1,3,2,1,0,0,0,174
2,90,90,45000000,36,186,80,4,5,5,82,89,79,85,31,72,85,90,88,76,85,86,80,80,76,87,84,84,84,90,72,91,91,75,76,89,61,25,91,73,84,91,21,31,22,6,11,11,11,10,0,1,0,0,0,1,0,0,1,0,0,0,1,3,1,1,0,0,0,194
3,90,90,129000000,29,174,66,5,5,5,87,79,83,90,36,63,80,81,62,85,85,91,85,85,80,91,92,88,92,87,80,80,63,80,49,80,61,34,81,86,88,91,30,31,26,6,6,11,11,10,0,1,1,0,0,1,0,0,0,0,1,0,1,3,2,1,0,0,0,194
4,90,90,125500000,30,179,69,5,4,4,72,84,88,85,63,76,89,81,53,89,81,86,80,80,88,87,72,72,76,90,76,87,59,88,72,89,73,64,86,90,80,87,66,62,52,10,11,2,6,10,0,0,1,0,0,1,0,0,1,0,0,0,1,3,3,1,0,0,0,194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14842,47,51,70000,22,179,63,3,2,1,53,32,41,47,40,45,45,29,44,46,29,40,32,35,46,47,55,56,53,52,69,42,59,50,45,26,48,38,34,39,41,33,35,40,44,2,6,2,11,10,0,0,0,1,0,0,1,0,0,0,1,0,1,2,2,0,1,0,0,133
14843,47,58,110000,19,174,69,3,2,1,58,37,46,42,40,49,50,29,44,50,29,40,50,31,46,38,59,56,60,45,65,46,45,63,45,35,48,38,49,48,41,45,35,40,44,10,11,2,6,6,0,0,1,0,0,0,0,1,0,0,1,0,1,2,2,0,1,0,0,154
14844,47,53,100000,21,176,69,3,2,1,58,32,41,47,40,49,36,29,40,46,33,45,36,35,46,47,59,56,57,45,57,50,55,54,49,31,48,34,44,44,37,33,35,40,44,6,2,6,6,2,0,0,1,0,0,0,0,1,0,0,1,0,1,2,2,0,1,0,0,154
14845,47,58,110000,19,171,63,3,2,1,67,42,32,47,13,40,28,48,35,37,33,40,32,31,30,43,67,64,68,45,72,46,48,46,38,40,31,12,44,39,45,45,8,13,9,6,6,6,11,10,0,1,0,0,0,0,0,1,0,0,1,0,1,2,2,0,1,0,0,154


## Define the dictionaries for saving the best models

In [3]:
df_dict = {
    'top_players': df_cleaned_players[df_cleaned_players['overall'] >= 70],
    'others': df_cleaned_players[df_cleaned_players['overall'] < 70]
}
best_models_dict = {
    'top_players': {
        'best_model': None,
        'best_model_train_score': -1,
        'best_model_test_score': -1,
        'best_model_random_state': -1
    },
    'others': {
        'best_model': None,
        'best_model_train_score': -1,
        'best_model_test_score': -1,
        'best_model_random_state': -1
    }
}

## Running the model

### The target column has exponential distribution, thus it's wrapped with np.log1p(y)

In [4]:
def get_target_column_random_state(description, y):
    return (np.log1p(y), 897) if description == 'top_players' else (y, 386)

### Models list

In [5]:
def get_models():
    return [DecisionTreeRegressor(), RandomForestRegressor(), XGBRegressor(), GradientBoostingRegressor()]

### Get the mape, r_squared and adjusted_r_squared

In [6]:
def get_metrics(description, model, X_train, X_test, y_test):
    adjusted_y_test = np.expm1(y_test) if description == 'top_players' else y_test
    
    pred = model.predict(X_test)
    adjusted_pred = np.expm1(pred) if description == 'top_players' else pred
    
    mape = mean_absolute_percentage_error(adjusted_y_test, adjusted_pred)
    r_squared = r2_score(adjusted_y_test, adjusted_pred)
    num_predictors = X_train.shape[1]
    sample_size = X_train.shape[0]
    adjusted_r_squared = 1 - (1 - r_squared) * ((sample_size - 1) / (sample_size - num_predictors - 1))
    
    return mape, r_squared, adjusted_r_squared

### Run the models and return the best one according to its test score

In [7]:
def train_and_eval(description, X_train, X_test, y_train, y_test, verbose=None):

    best_model = None
    best_model_test_score = -1

    models = get_models()

    for model in models:

        model.fit(X_train, y_train)

        model_test_score = model.score(X_test, y_test)

        if (model_test_score > best_model_test_score):
            best_model_test_score = model_test_score
            best_model = model

        if verbose:
            model_name = model.__class__.__name__
            print(f"{model_name} test score {model_test_score}")
            mape, r_squared, adjusted_r_squared = get_metrics(description, model, X_train, X_test, y_test)
            print("MAPE:", mape)
            print("R-squared:", r_squared)
            print("Adjusted R-squared:", adjusted_r_squared)
            print("\n")

    return best_model

### Run the models and set the models_dict according to the best_model

In [8]:
def run_models(description, df, verbose, models_dict):
    
    train_df = df.drop(columns=['value_eur'],axis=1)

    target_column, random_state = get_target_column_random_state(description, df['value_eur'])

    X_train, X_test, y_train, y_test = train_test_split(train_df, target_column, test_size=0.25, random_state=random_state)

    print(f"Running models for {description}:\n")
    best_model = train_and_eval(description, X_train, X_test, y_train, y_test, verbose)
    best_model_test_score = best_model.score(X_test, y_test)
    
    if (best_model_test_score > models_dict[description]['best_model_test_score']):
        models_dict[description]['best_model'] = best_model
        models_dict[description]['best_model_train_score'] = best_model.score(X_train, y_train)
        models_dict[description]['best_model_test_score'] = best_model_test_score
        
        models_dict[description]['best_model_random_state'] = random_state

        mape, r_squared, adjusted_r_squared = get_metrics(description, best_model, X_train, X_test, y_test)
        models_dict[description]['best_model_mape'] = mape
        models_dict[description]['best_model_r_squared'] = r_squared
        models_dict[description]['best_model_adjusted_r_squared'] = adjusted_r_squared


### Run the models and print the best results

In [9]:
verbose = 1
for description, df in df_dict.items():
    run_models(description, df, verbose, best_models_dict)

for description in df_dict.keys():
    print(f"Results for {description}:\n")
    for key, value in best_models_dict[description].items():
        print(f"{key}: {value}")
    print("\n")

Running models for top_players:

DecisionTreeRegressor test score 0.963695158037191
MAPE: 0.16066803574525831
R-squared: 0.933160394175121
Adjusted R-squared: 0.9318666016342466


RandomForestRegressor test score 0.9841300478834366
MAPE: 0.11637637432668972
R-squared: 0.963668815091552
Adjusted R-squared: 0.9629655641454163


XGBRegressor test score 0.9850348591424374
MAPE: 0.11028238833167063
R-squared: 0.9673221194693439
Adjusted R-squared: 0.9666895843494792


GradientBoostingRegressor test score 0.9868964931682317
MAPE: 0.10575688230013064
R-squared: 0.9830485487002537
Adjusted R-squared: 0.9827204249631678


Running models for others:

DecisionTreeRegressor test score 0.9557122603890027
MAPE: 0.10925173443609539
R-squared: 0.9557122603890027
Adjusted R-squared: 0.9553098059865508


RandomForestRegressor test score 0.9790824814533132
MAPE: 0.08474450698921668
R-squared: 0.9790824814533132
Adjusted R-squared: 0.9788923984303044


XGBRegressor test score 0.9806545746497044
MAPE: 0.08

### Results for the top_players

In [10]:
top_30_players = df_dict['top_players']
top_30_players_input = top_30_players.drop(columns=['value_eur'])
top_30_players_target = list(top_30_players['value_eur'])
top_30_players_pred = np.expm1(best_models_dict['top_players']['best_model'].predict(top_30_players_input))
diff = (np.abs(top_30_players_pred - top_30_players_target) / np.abs(top_30_players_target))
print(f"top_30_players mape: {diff.mean()}")

top_30_players mape: 0.10020816249463896


### Results for the others

In [11]:
other_players = df_dict['others']
other_players_input = other_players.drop(columns=['value_eur'])
other_players_target = list(other_players['value_eur'])
other_players_pred = best_models_dict['others']['best_model'].predict(other_players_input)
diff = (np.abs(other_players_pred - other_players_target) / np.abs(other_players_target))
print(f"others mape: {diff.mean()}")

others mape: 0.05425649673134111


## Check the prediction on top class players

In [12]:
cristiano_target = 45_000_000
cristiano_input = np.array([[90,90,36,186,80,4,5,5,82,89,79,85,31,72,85,90,88,76,85,86,80,80,76,87,84,84,84,90,72,91,91,75,76,89,61,25,91,73,84,91,21,31,22,6,11,11,11,10,0,1,0,0,0,1,0,0,1,0,0,0,1,3,1,1,0,0,0,194]])
# cristiano_input = np.array([[90,90,36]])
cristiano_pred = np.expm1(best_models_dict['top_players']['best_model'].predict(cristiano_input))
print(cristiano_pred)

[42521546.7885464]




In [13]:
messi_value_eur = 78_000_000
messi_input = np.array([[90,92,34,169,69,4,4,5,82,89,88,90,31,63,80,90,66,89,85,91,89,89,88,91,88,76,88,90,92,83,66,71,65,89,44,38,91,90,71,91,17,31,22,2,11,11,11,6,0,1,0,0,0,1,0,0,1,0,0,1,0,2,1,1,0,0,0,194]])
# messi_input = np.array([[90,92,34]])
messi_pred = np.expm1(best_models_dict['top_players']['best_model'].predict(messi_input))
print(messi_pred)

[68590186.80751763]




In [14]:
lewandowski_target = 119_500_000
lewandowski_input = np.array([[90,90,32,183,80,4,4,5,77,89,74,85,40,81,67,90,88,80,85,82,76,85,67,87,76,76,76,90,80,87,84,75,84,85,78,47,91,78,88,87,30,40,18,10,2,11,6,6,0,1,0,0,0,1,0,0,1,0,0,0,1,3,2,1,0,0,0,174]])
# lewandowski_input = np.array([[90,90,32]])
lewandowski_pred = np.expm1(best_models_dict['top_players']['best_model'].predict(lewandowski_input))
print(lewandowski_pred)

[1.08409629e+08]




In [15]:
neymar_target = 129_000_000
neymar_input = np.array([[90,90,29,174,66,5,5,5,87,79,83,90,36,63,80,81,62,85,85,91,85,85,80,91,92,88,92,87,80,80,63,80,49,80,61,34,81,86,88,91,30,31,26,6,6,11,11,10,0,1,1,0,0,1,0,0,0,0,1,0,1,3,2,1,0,0,0,194]])
# neymar_input = np.array([[90,90,29]])
neymar_pred = np.expm1(best_models_dict['top_players']['best_model'].predict(neymar_input))
print(neymar_pred)

[1.38359586e+08]




## Checking the feature_importances of each model

In [16]:
best_model_top_players = best_models_dict['top_players']['best_model']

best_model_top_players_feature_importance = best_model_top_players.feature_importances_

sorted_features_top_players = sorted(zip(df_dict['top_players'].drop(columns=["value_eur"]).columns, best_model_top_players_feature_importance), key=lambda x: x[1], reverse=True)

print("Feature Importances of top_players in Descending Order:")
for feature, importance in sorted_features_top_players:
    print(f"{feature}: {importance}")

Feature Importances of top_players in Descending Order:
overall: 0.42590431227597386
potential: 0.4206023826101452
age: 0.11462965444292457
mentality_positioning: 0.00719249735577264
movement_reactions: 0.005873702194618074
dribbling: 0.005030706453969526
skill_ball_control: 0.004633722515101113
pace: 0.0025302244545085794
skill_dribbling: 0.0022554506670345847
goalkeeping_speed: 0.0021683630055460133
attacking_finishing: 0.0018237061886246862
attacking_volleys: 0.0014759064170459335
attacking_heading_accuracy: 0.0008847763316939738
mentality_penalties: 0.0008459465841595996
power_long_shots: 0.0006685754038905307
shooting: 0.0006126865329218288
passing: 0.0005137283225322688
defending_standing_tackle: 0.0004005966293361628
attacking_short_passing: 0.0003713623407732047
attacking_crossing: 0.00015683275033122367
international_reputation: 0.00015600352977492802
defending: 0.00012853169400918045
movement_acceleration: 0.00012044887836851076
defender: 0.00010352947439358226
movement_sprin

In [17]:
best_model_others = best_models_dict['others']['best_model']

best_model_others_feature_importance = best_model_others.feature_importances_

sorted_features_others = sorted(zip(df_dict['others'].drop(columns=["value_eur"]).columns, best_model_others_feature_importance), key=lambda x: x[1], reverse=True)

print("Feature Importances of others in Descending Order:")
for feature, importance in sorted_features_others:
    print(f"{feature}: {importance}")

Feature Importances of others in Descending Order:
overall: 0.507155179977417
potential: 0.30852845311164856
age: 0.04607260599732399
skill_ball_control: 0.013173433020710945
skill_dribbling: 0.008688435889780521
mentality_positioning: 0.0077757034450769424
attacking_finishing: 0.007321315351873636
shooting: 0.006868491880595684
dribbling: 0.006096911150962114
passing: 0.0053107556886971
defender: 0.005016586743295193
pace: 0.00405777245759964
movement_reactions: 0.002603758592158556
skill_long_passing: 0.0021795444190502167
defending_marking_awareness: 0.0021481686271727085
skill_fk_accuracy: 0.0020291712135076523
movement_sprint_speed: 0.002006850205361843
attacking_volleys: 0.001980155473574996
defending_sliding_tackle: 0.0019397324649617076
defending_standing_tackle: 0.0018865688471123576
attacking_crossing: 0.0018332517938688397
attacking_short_passing: 0.001803031424060464
power_shot_power: 0.0017953751375898719
mentality_penalties: 0.0017645362531766295
attacking_heading_accurac

## Running the model on the top 3 features according to feature_importances

In [18]:
df_cleaned_players_top_3_features = df_cleaned_players[['overall', 'potential', 'age', 'value_eur']]

In [19]:
df_dict_top_3_features = {
    'top_players': df_cleaned_players_top_3_features[df_cleaned_players_top_3_features['overall'] >= 70],
    'others': df_cleaned_players_top_3_features[df_cleaned_players_top_3_features['overall'] < 70]
}
best_models_dict_top_3_features = {
    'top_players': {
        'best_model': None,
        'best_model_train_score': -1,
        'best_model_test_score': -1,
        'best_model_random_state': -1
    },
    'others': {
        'best_model': None,
        'best_model_train_score': -1,
        'best_model_test_score': -1,
        'best_model_random_state': -1
    }
}

In [20]:
verbose = 1
for description, df in df_dict_top_3_features.items():
    run_models(description, df, verbose, best_models_dict_top_3_features)

for description in df_dict_top_3_features.keys():
    print(f"Results for {description}:\n")
    for key, value in best_models_dict_top_3_features[description].items():
        print(f"{key}: {value}")
    print("\n")

Running models for top_players:

DecisionTreeRegressor test score 0.9619418741025618
MAPE: 0.1657702628597792
R-squared: 0.944480621994912
Adjusted R-squared: 0.9444340713705366


RandomForestRegressor test score 0.9627422244050933
MAPE: 0.16526302872580012
R-squared: 0.9559030241515465
Adjusted R-squared: 0.9558660507229424


XGBRegressor test score 0.9629163961430198
MAPE: 0.1630087943129863
R-squared: 0.971341177085279
Adjusted R-squared: 0.9713171478877541


GradientBoostingRegressor test score 0.9647169381640579
MAPE: 0.16252915400737084
R-squared: 0.9639510108307229
Adjusted R-squared: 0.9639207852948067


Running models for others:

DecisionTreeRegressor test score 0.9704100884693632
MAPE: 0.11941310931626523
R-squared: 0.9704100884693632
Adjusted R-squared: 0.9703983277732063


RandomForestRegressor test score 0.9707507597249323
MAPE: 0.11879734057596918
R-squared: 0.9707507597249323
Adjusted R-squared: 0.9707391344307054


XGBRegressor test score 0.9706871984224955
MAPE: 0.118

## Check the prediction on top class players

In [21]:
cristiano_target = 45_000_000
cristiano_input = np.array([[90,90,36]])
cristiano_pred = np.expm1(best_models_dict_top_3_features['top_players']['best_model'].predict(cristiano_input))
print(cristiano_pred)

[41004810.75530496]




In [22]:
messi_value_eur = 78_000_000
messi_input = np.array([[90,92,34]])
messi_pred = np.expm1(best_models_dict_top_3_features['top_players']['best_model'].predict(messi_input))
print(messi_pred)

[73022494.3106977]




In [23]:
lewandowski_target = 119_500_000
lewandowski_input = np.array([[90,90,32]])
lewandowski_pred = np.expm1(best_models_dict_top_3_features['top_players']['best_model'].predict(lewandowski_input))
print(lewandowski_pred)

[90356578.26658726]




In [24]:
neymar_target = 129_000_000
neymar_input = np.array([[90,90,29]])
neymar_pred = np.expm1(best_models_dict_top_3_features['top_players']['best_model'].predict(neymar_input))
print(neymar_pred)

[1.05523758e+08]




## The best models for the top_3_features gave worse results than the original models

### Save the best_models into .pkl files

In [25]:
pickle.dump(best_models_dict['top_players']['best_model'], open('best_model_top_players.pkl', 'wb'))
pickle.dump(best_models_dict['others']['best_model'], open('best_model_others.pkl', 'wb'))