In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_validate, cross_val_score, RandomizedSearchCV, train_test_split
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [2]:
# Read the modelling dataset from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='Final_scaled_data_modeling.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,Overall,Potential,Wage,International Reputation,Skill Moves,Value,Release Clause,Club_Club Class C,Club_Club Class B,Contract Expire Year
0,1.0,0.978723,1.0,1.0,0.96875,0.932484,0.992985,0.0,1.0,0.375
1,1.0,0.978723,0.716312,1.0,0.96875,0.649759,0.557187,0.0,0.0,0.5
2,0.958333,0.957447,0.512411,1.0,0.9375,1.0,1.0,0.0,1.0,0.5
3,0.9375,0.957447,0.45922,0.75,0.328125,0.607562,0.607606,0.0,1.0,0.25
4,0.9375,0.93617,0.62766,0.75,1.0,0.860748,0.861018,0.0,1.0,0.625


In [4]:
# Split the frame into the regression target ('Value') and its predictors.
#
# 'Unnamed: 0' is a plain row counter (0, 1, 2, ... in the preview), not a
# player attribute. The previewed rows look ordered by rating, so leaving the
# counter in as a feature would let the model key on row position instead of
# real attributes. It is therefore dropped together with the target.
# errors='ignore' keeps this cell working if the CSV is ever re-exported
# without that column; a missing 'Value' still fails on the lookup below.
target = data['Value']
inputs = data.drop(columns=['Value', 'Unnamed: 0'], errors='ignore')

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    inputs,
    target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Shuffled, seeded 5-fold splitter used for the cross-validation of the candidates.
n_split = 5
k_fold = KFold(
    n_splits=n_split,
    shuffle=True,
    random_state=42,
)

In [7]:
# Candidate regressors to compare, as (label, estimator) pairs.
# Every estimator gets a fixed random_state so that re-running the notebook
# reproduces the same cross-validation scores; without it the random forest,
# which draws bootstrap samples at random, gives different numbers on each run.
models = [
    ('GB', GradientBoostingRegressor(random_state=42)),
    ('RF', RandomForestRegressor(random_state=42)),
    ('LGBM', LGBMRegressor(random_state=42)),
    ('XGB', XGBRegressor(random_state=42)),
]

In [8]:
# Cross-validate every candidate on the training split and report its mean RMSE.
for name, model in models:
    fold_scores = cross_val_score(
        model,
        x_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=k_fold,
        n_jobs=-1,
    )
    # The scorer yields negated RMSE values, so flip the sign before printing.
    rmse = -np.mean(fold_scores)
    print(f'{name} cross validation RMSE score is : {rmse}')

GB cross validation RMSE score is : 0.005927024110644286
RF cross validation RMSE score is : 0.005696295276711295
LGBM cross validation RMSE score is : 0.006500210044075083
XGB cross validation RMSE score is : 0.005660890651391809


XGB and RF perform best among all the models. Since the RMSE is already very low, I won't perform hyperparameter tuning, because it may overfit the model.

In [13]:
# XGBoost is the model selected for the final fit.
# NOTE(review): the cross-validation earlier in this notebook scored XGBRegressor
# without a learning_rate override, so this 0.5 setting was not the one that
# was validated -- confirm the override is intentional.
xgb = XGBRegressor(
    learning_rate=0.5,
)

In [14]:
# Train the selected model on the training split. As the last expression of
# the cell, the call's return value is displayed (the fitted estimator's repr).
xgb.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.5, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [15]:
# Predict the target for the held-out test rows.
y_pred = xgb.predict(x_test)

In [16]:
# Root-mean-squared error of the predictions on the held-out test set.
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

0.004976787472753321

Our cross-validated RMSE on the training set and our test RMSE are almost the same, so our model doesn't appear to overfit.

In [17]:
# Save the fitted model to disk.
# The context manager guarantees the file is flushed and closed as soon as the
# dump finishes (or fails), instead of leaving an open handle behind.
with open('fifa_2019_players_value_model.pkl', 'wb') as model_file:
    pickle.dump(xgb, model_file)