### Import data

In [1]:
import pandas as pd
import numpy as np

# Load the raw dataset; 'data.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv('data.csv')

### Drop unnecessary columns and rows having missing / null values

In [2]:
# Positional indices (in the raw frame's column order) of the columns to discard.
columns_to_drop = [0, 1, 3, 4, 5, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 87]

# Remove those columns, then every row that still contains a missing value,
# and finally renumber the index from 0.
df = (
    df.drop(columns=df.columns[columns_to_drop])
      .dropna(axis=0)
      .reset_index(drop=True)
)

### Remove currency symbols in Wage and Value columns, turn them into numeric type, and change the column names

In [3]:
def clean_value(v, unit=None):
    """Parse a currency string such as '€110.5M' or '€565K' into a float.

    The first character is treated as the currency symbol and discarded. A
    trailing character is treated as a magnitude suffix (and removed) only
    when it is not a digit, so a suffix-less amount like '€50' is parsed as
    50.0 instead of losing its last digit.

    Parameters
    ----------
    v : str
        Currency string: one symbol character, a number, and an optional
        one-character suffix.
    unit : {None, '', 'K', 'M', 'B'}, optional
        ``None`` (default) returns the bare number and ignores the suffix,
        e.g. '€565K' -> 565.0. Otherwise the amount is rescaled from its own
        suffix to ``unit``, e.g. ``clean_value('€600K', unit='M')`` -> 0.6.

    Returns
    -------
    float
        The parsed amount; 0.0 when nothing numeric is left after stripping.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed, or if ``unit`` is given and
        either ``unit`` or the string's suffix is not a known magnitude.
    """
    scale = {'': 1.0, 'K': 1e3, 'M': 1e6, 'B': 1e9}

    amount = v[1:].strip()  # drop the leading currency symbol
    suffix = ''
    # Strip the last character only when it is not part of the number.
    if amount and amount[-1] not in '0123456789':
        suffix = amount[-1].upper()
        amount = amount[:-1].strip()
    if amount == '':
        return 0.0  # always a float, so the resulting column has one type

    number = float(amount)
    if unit is None:
        return number

    target = unit.upper()
    if suffix not in scale or target not in scale:
        raise ValueError(
            'cannot convert %r to unit %r: unknown magnitude' % (v, unit)
        )
    return number * scale[suffix] / scale[target]
# Turn the Wage and Value strings into numbers with clean_value, then give
# both columns names that carry their unit.
# NOTE(review): clean_value is called without any rescaling by the K/M suffix,
# so these names are only accurate if every Wage is quoted in K and every
# Value in M -- confirm against the data.
df['Wage'] = df['Wage'].map(clean_value)
df['Value'] = df['Value'].map(clean_value)
df = df.rename(columns={'Wage': 'Wage_in_K', 'Value': 'Value_in_M'})

### Remove plus signs in columns[11:37], add the two numbers, and convert the columns to numeric type

In [4]:
def clean_value(v):
    """Add up the integers in a '+'-separated string, e.g. '88+2' -> 90.

    Note: this reuses the name of the currency parser defined in an earlier
    cell and replaces it from this point on.
    """
    return sum(int(part) for part in v.split('+'))
# Collapse every 'a+b' string in the columns at positions 11..36 into the
# integer a + b, one column at a time.
for column_name in df.columns[11:37]:
    df[column_name] = df[column_name].map(clean_value)

### Retrieve categorical columns

In [5]:
# Numeric (and boolean) columns, selected through the public dtype API instead
# of the private DataFrame._get_numeric_data helper.
num_cols = df.select_dtypes(include=['number', 'bool']).columns

# Every remaining column is treated as categorical. The comprehension keeps
# the frame's own column order, whereas a set difference yields an arbitrary
# order that can change between runs.
cat_cols = [col for col in df.columns if col not in num_cols]

### Create a dictionary mapping each ordinal Work Rate value to an integer, and apply it to the Work Rate column

In [6]:
from sklearn.preprocessing import LabelEncoder

# Every work-rate label has the form '<level>/ <level>'. Its integer code is
# the sum of the two level scores (Low=1, Medium=2, High=3), giving 2..6.
level_score = {'Low': 1, 'Medium': 2, 'High': 3}
work_rate_dict = {
    first + '/ ' + second: level_score[first] + level_score[second]
    for first in level_score
    for second in level_score
}

# Labels missing from the dictionary become NaN under Series.map.
df['Work Rate'] = df['Work Rate'].map(work_rate_dict)

### Create two dummy columns for the Preferred Foot column

In [7]:
# One-hot encode Preferred Foot; each new column is named
# 'Preferred Foot_<value>'.
foot_dummies = pd.get_dummies(df['Preferred Foot'], prefix='Preferred Foot')

# Swap the original text column for its indicator columns, which are appended
# at the end of the frame (rows are aligned on the shared index).
df = df.drop(columns='Preferred Foot').join(foot_dummies)

In [8]:
from sklearn.model_selection import train_test_split

# Target: the Value_in_M column. Features: every other column in the frame.
y = df['Value_in_M']
X = df.drop(columns=['Value_in_M'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

### Use GridSearch to look for the best parameters for XGBRegressor

In [9]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values; the grid search tries every combination.
param_grid = {
    "colsample_bytree": [1.0],
    "min_child_weight": [1.0, 1.2],
    "max_depth": [3, 4, 6],
    "n_estimators": [500, 1000],
}

gbm = XGBRegressor()
reg_cv = GridSearchCV(estimator=gbm, param_grid=param_grid, verbose=1)
reg_cv.fit(X_train, y_train)

NameError: name 'xgb' is not defined

### Print the best parameters and refit the model with them

In [None]:
# Report the best hyper-parameter combination found by the grid search.
print(reg_cv.best_params_)

# Build a fresh regressor with those parameters and fit it on the training
# split. The class is imported by name (`from xgboost import XGBRegressor`);
# the notebook defines no `xgb` alias, so it must not be prefixed with `xgb.`.
gbm = XGBRegressor(**reg_cv.best_params_)
gbm.fit(X_train, y_train)

### Use the model to predict the market value of soccer players

In [None]:
# Predict the target (Value_in_M) for the held-out test features.
predictions = gbm.predict(X_test)

### Evaluate the predictions against the actual market values (R² score on the test set)

In [None]:
# Score the model on the held-out split. For scikit-learn style regressors,
# `score` reports the coefficient of determination (R^2), not a
# classification accuracy.
gbm.score(X_test,y_test)