### Import data

In [47]:
import pandas as pd
import numpy as np

df = pd.read_csv('data.csv')

### Drop unnecessary columns and rows having missing / null values

In [48]:
columns_to_drop = [0, 1, 3, 4, 5, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 87]
df.drop(labels=df.columns[columns_to_drop], axis=1, inplace=True)
df.dropna(axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

### Remove currency symbols in Wage and Value columns, turn them into numeric type, and change the column names

In [49]:
def clean_value(v):
    v = (v[1:-1].strip())
    if v == '':
        return 0
    return float(v)
df.Wage = df.Wage.apply(clean_value)
df.Value = df.Value.apply(clean_value)
df.rename(columns={'Wage': 'Wage_in_K', 'Value': 'Value_in_M'}, inplace=True)

### Remove plus signs in columns[11:37], add the two numbers, and covert the columns to numeric type

In [50]:
def clean_value(v):
    v = v.split('+')
    v = list(map(int, v))
    return sum(v)
for i in df.columns[11:37]:
    df[i] = df[i].apply(clean_value)

### Retrieve categorical columns

In [51]:
num_cols = df._get_numeric_data().columns
cat_cols = list(set(df.columns) - set(num_cols))

### Create dictionary of ordinal to integer mapping to Work Rate column

In [52]:
from sklearn.preprocessing import LabelEncoder

work_rate_dict = {'Low/ Low':2, 
            'Low/ Medium':3, 
            'Low/ High':4, 
            'Medium/ Low':3, 
            'Medium/ Medium':4, 
            'Medium/ High':5,
            'High/ Low':4,
            'High/ Medium':5,
            'High/ High':6}
df['Work Rate'] = df['Work Rate'].map(work_rate_dict)

### Create two dummies columns for Preferred Foot column

In [53]:
foot_dummies = pd.get_dummies(df['Preferred Foot'], prefix='Preferred Foot')
df = pd.concat([df, foot_dummies], axis=1)
df.drop(labels='Preferred Foot', axis=1, inplace=True)

In [54]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='Value_in_M')
y = df.Value_in_M
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Use GridSearch to look for the best parameters for XGBRegressor

In [55]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

gbm = xgb.XGBRegressor()
reg_cv = GridSearchCV(gbm, {"colsample_bytree":[1.0],"min_child_weight":[1.0,1.2]
                            ,'max_depth': [3,4,6], 'n_estimators': [500,1000]}, verbose=1)
reg_cv.fit(X_train,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  8.3min finished


GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
      

### Print the best parameters out and apply it

In [57]:
print(reg_cv.best_params_)
gbm = xgb.XGBRegressor(**reg_cv.best_params_)
gbm.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1.0, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

### Use the model to predict the market value of soccer players

In [58]:
predictions = gbm.predict(X_test)

### The accuracy of the predicted value with the market value

In [59]:
gbm.score(X_test,y_test)

0.9037250247532549