## Import Modules

In [17]:
# import modules
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import pickle

## Loading Data

In [29]:
# read data
training_data = pd.read_csv('./players_21.csv')
new_testing_data = pd.read_csv('./players_22.csv')

  new_testing_data = pd.read_csv('./players_22.csv')


## Data Preprocessing

### Remove columns with na values that exceed 30%

In [19]:
# Calculate the percentage of na values in each column
na_percentages = training_data.isna().sum() / len(training_data) * 100

# Select the columns where the percentage of na values exceeds 30%
cols_to_drop = na_percentages[na_percentages > 30].index

# Drop the selected columns
training_data.drop(cols_to_drop, axis=1, inplace=True)
new_testing_data.drop(cols_to_drop, axis=1, inplace=True)

### Remove columns where the values do not obviously contribute a player's overall rating

In [20]:
# urls do not help to predict a player's rating
# remove columns that have 'url' in their name
cols_to_drop = [col for col in training_data.columns if 'url' in col]
training_data.drop(cols_to_drop, axis=1, inplace=True)
new_testing_data.drop(cols_to_drop, axis=1, inplace=True)

In [21]:
# columns that obviously do not contribute to a player's rating
cols_to_drop = [
    "age",
    "sofifa_id",
    "short_name",
    "long_name",
    "real_face",
]

# drop the columns
training_data.drop(cols_to_drop, axis=1, inplace=True)
new_testing_data.drop(cols_to_drop, axis=1, inplace=True)

In [22]:
# columns where input would not be available at the time of prediction
cols_to_drop = [
    "gk", "rb", "rcb", "cb", "lcb", "lb", "rwb", "rdm", "cdm", "ldm", "lwb", "rm", "rcm", "cm", "lcm", "lm", "ram", "cam", "lam", "rw", "rf", "cf", "lf", "lw", "rs", "st", "ls", "club_joined", "club_contract_valid_until"
]

# drop the columns
training_data.drop(cols_to_drop, axis=1, inplace=True)
new_testing_data.drop(cols_to_drop, axis=1, inplace=True)

In [23]:
# remove club, national and league info. They do not explicitly determine a player's rating
cols_to_drop = [
    "club_name", "league_name", "league_level", "club_jersey_number", "nationality_id", "nationality_name", "value_eur", "release_clause_eur", "club_team_id"
]

# drop the columns
training_data.drop(cols_to_drop, axis=1, inplace=True)
new_testing_data.drop(cols_to_drop, axis=1, inplace=True)

#### Encoding data

In [11]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   player_positions             18944 non-null  object 
 1   overall                      18944 non-null  int64  
 2   potential                    18944 non-null  int64  
 3   wage_eur                     18719 non-null  float64
 4   dob                          18944 non-null  object 
 5   height_cm                    18944 non-null  int64  
 6   weight_kg                    18944 non-null  int64  
 7   club_position                18719 non-null  object 
 8   preferred_foot               18944 non-null  object 
 9   weak_foot                    18944 non-null  int64  
 10  skill_moves                  18944 non-null  int64  
 11  international_reputation     18944 non-null  int64  
 12  work_rate                    18944 non-null  object 
 13  body_type       

In [24]:
# use pd.factorize to convert categorical columns to numerical
# check if dtype is object

# get categorical columns
cat_cols = [col for col in training_data.columns if training_data[col].dtype == 'object']

# factorize the categorical columns
for col in cat_cols:
    training_data[col], c1 = pd.factorize(training_data[col])
    new_testing_data[col], c2 = pd.factorize(new_testing_data[col])

In [27]:
s = '1990-10-03'
a = pd.factorize(s)

In [33]:
len(training_data['dob'].unique())

6236

#### Imputing Data

In [14]:
imputer = SimpleImputer(strategy='most_frequent')
training_data = pd.DataFrame(imputer.fit_transform(training_data), columns=training_data.columns)
new_testing_data = pd.DataFrame(imputer.transform(new_testing_data), columns=new_testing_data.columns)

In [15]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   player_positions             18944 non-null  float64
 1   overall                      18944 non-null  float64
 2   potential                    18944 non-null  float64
 3   wage_eur                     18944 non-null  float64
 4   dob                          18944 non-null  float64
 5   height_cm                    18944 non-null  float64
 6   weight_kg                    18944 non-null  float64
 7   club_position                18944 non-null  float64
 8   preferred_foot               18944 non-null  float64
 9   weak_foot                    18944 non-null  float64
 10  skill_moves                  18944 non-null  float64
 11  international_reputation     18944 non-null  float64
 12  work_rate                    18944 non-null  float64
 13  body_type       

### Setup training and testing data

In [16]:
trainX = training_data.drop('overall', axis=1)
trainY = training_data['overall']
new_testX = new_testing_data.drop('overall', axis=1)
new_testY = new_testing_data['overall']

#### Scaling the independent variables

In [17]:
scaler = StandardScaler()
trainX = pd.DataFrame(scaler.fit_transform(trainX), columns=trainX.columns)
new_testX = pd.DataFrame(scaler.transform(new_testX), columns=new_testX.columns)

In [18]:
trainX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 53 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   player_positions             18944 non-null  float64
 1   potential                    18944 non-null  float64
 2   wage_eur                     18944 non-null  float64
 3   dob                          18944 non-null  float64
 4   height_cm                    18944 non-null  float64
 5   weight_kg                    18944 non-null  float64
 6   club_position                18944 non-null  float64
 7   preferred_foot               18944 non-null  float64
 8   weak_foot                    18944 non-null  float64
 9   skill_moves                  18944 non-null  float64
 10  international_reputation     18944 non-null  float64
 11  work_rate                    18944 non-null  float64
 12  body_type                    18944 non-null  float64
 13  pace            

#### Create feature subsets that better correlate with the overall rating

In [19]:
# create feature subsets which show better correlation with the overall rating

# create a list of all the columns with a correlation greater than 0.5
feature_cols = list(trainX.corrwith(trainY)[abs(trainX.corrwith(trainY)) > 0.5].index)

print(feature_cols)
print(len(feature_cols))

['potential', 'wage_eur', 'dob', 'passing', 'dribbling', 'attacking_short_passing', 'movement_reactions', 'power_shot_power', 'mentality_vision', 'mentality_composure']
10


In [20]:
# set trainX and testX to the new feature subset
trainX = trainX[feature_cols]
new_testX = new_testX[feature_cols]

In [21]:
trainX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18944 entries, 0 to 18943
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   potential                18944 non-null  float64
 1   wage_eur                 18944 non-null  float64
 2   dob                      18944 non-null  float64
 3   passing                  18944 non-null  float64
 4   dribbling                18944 non-null  float64
 5   attacking_short_passing  18944 non-null  float64
 6   movement_reactions       18944 non-null  float64
 7   power_shot_power         18944 non-null  float64
 8   mentality_vision         18944 non-null  float64
 9   mentality_composure      18944 non-null  float64
dtypes: float64(10)
memory usage: 1.4 MB


## Training & Evaluating Models

In [22]:
X = trainX
y = trainY

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Cross validation with RandomForest

In [23]:
# do cross validation training with either RandomForest, XGBoost, Gradient Boost Regressors that can predict a player rating.

# RandomForestRegressor cross validation training
cv = KFold(n_splits=3)

# parameters for the RandomForestRegressor
PARAMETERS = {
    "max_depth": [12,35, 40],
    "n_estimators": [100, 500, 1000]

}

rf = RandomForestRegressor()
model_rf = GridSearchCV(rf, cv=cv, param_grid=PARAMETERS, scoring="neg_mean_squared_error")
model_rf.fit(X_train, y_train)
model_rf.best_params_

{'max_depth': 40, 'n_estimators': 1000}

In [41]:

y_pred = model_rf.predict(X_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)

# Calculate Mean Absolute Percentage Error (MAPE)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R2) Score: {r2}")
print(f"Mean Absolute Percentage Error: {mape}")

Mean Absolute Error: 0.43383161784111907
Mean Squared Error: 0.40552474689891804
R-squared (R2) Score: 0.9913965212807883
Mean Absolute Percentage Error: 0.0069205798726614046


#### Fine Tuning

In [33]:
model_rf = RandomForestRegressor(max_depth=40, n_estimators=1000)
model_rf.fit(trainX, trainY)

### Cross validation with XGBoost

In [26]:
cv = KFold(n_splits=3)

# parameters for the XGBRegressor
PARAMETERS = {
    "max_depth": [12,35, 40],
    "learning_rate":[0.3, 0.1, 0.03],
    "n_estimators": [100, 500, 1000]
}

model_xgb = XGBRegressor()
model_xgb_gs = GridSearchCV(model_xgb, cv=cv, param_grid=PARAMETERS, scoring="neg_mean_absolute_error")
model_xgb_gs.fit(X_train, y_train)
model_xgb_gs.best_params_

{'learning_rate': 0.03, 'max_depth': 12, 'n_estimators': 500}

In [42]:
y_pred = model_xgb_gs.predict(X_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)

# Calculate Mean Absolute Percentage Error (MAPE)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R2) Score: {r2}")
print(f"Mean Absolute Percentage Error: {mape}")

Mean Absolute Error: 1.1517305804544844
Mean Squared Error: 2.8159586901132974
R-squared (R2) Score: 0.9402575530844043
Mean Absolute Percentage Error: 0.01838431908371624


#### Fine Tuning

In [40]:
model_xgb = XGBRegressor(learning_rate=0.03, max_depth=12, n_estimators=500)
model_xgb.fit(trainX, trainY)

Cross validation with AdaBoost

In [28]:
cv = KFold(n_splits=4)

PARAMETERS ={
    "random_state":[12,25, 36, 48],
    # "min_child_weight":[1,5,15],
    "learning_rate":[0.003, 0.1, 0.03],
    "n_estimators":[100,500,1000]
}

ada = AdaBoostRegressor()
model_ada = GridSearchCV(ada,param_grid=PARAMETERS,cv=cv,scoring="neg_mean_absolute_error")
model_ada.fit(X_train, y_train)
model_ada.best_params_

{'learning_rate': 0.03, 'n_estimators': 500, 'random_state': 12}

In [43]:
y_pred = model_ada.predict(X_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared (R2) score
r2 = r2_score(y_test, y_pred)

# Calculate Mean Absolute Percentage Error (MAPE)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R2) Score: {r2}")
print(f"Mean Absolute Percentage Error: {mape}")

Mean Absolute Error: 1.9649319921852926
Mean Squared Error: 6.264610087853353
R-squared (R2) Score: 0.8670921072334956
Mean Absolute Percentage Error: 0.030629600797093353


#### Fine Tuning

In [44]:
model_ada = AdaBoostRegressor(random_state=12, learning_rate=0.03, n_estimators=500)
model_ada.fit(X_train, y_train)

## Testing with new dataset

Random Forest

In [45]:
y_pred = model_rf.predict(new_testX)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(new_testY, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(new_testY, y_pred)

# Calculate R-squared (R2) score
r2 = r2_score(new_testY, y_pred)

# Calculate Mean Absolute Percentage Error (MAPE)
mape = mean_absolute_percentage_error(new_testY, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R2) Score: {r2}")
print(f"Mean Absolute Percentage Error: {mape}")

Mean Absolute Error: 1.1732340038463538
Mean Squared Error: 2.8538638849732316
R-squared (R2) Score: 0.9397093832592885
Mean Absolute Percentage Error: 0.01878310210830471


XGBoost

In [48]:
y_pred = model_xgb.predict(new_testX)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(new_testY, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(new_testY, y_pred)

# Calculate R-squared (R2) score
r2 = r2_score(new_testY, y_pred)

# Calculate Mean Absolute Percentage Error (MAPE)
mape = mean_absolute_percentage_error(new_testY, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R2) Score: {r2}")
print(f"Mean Absolute Percentage Error: {mape}")

Mean Absolute Error: 1.1147786553904002
Mean Squared Error: 2.6202423374166304
R-squared (R2) Score: 0.9446448629295948
Mean Absolute Percentage Error: 0.017844059405153778


AdaBoost

In [47]:
y_pred = model_ada.predict(new_testX)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(new_testY, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(new_testY, y_pred)

# Calculate R-squared (R2) score
r2 = r2_score(new_testY, y_pred)

# Calculate Mean Absolute Percentage Error (MAPE)
mape = mean_absolute_percentage_error(new_testY, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R2) Score: {r2}")
print(f"Mean Absolute Percentage Error: {mape}")

Mean Absolute Error: 1.9861459022186907
Mean Squared Error: 6.314396268398642
R-squared (R2) Score: 0.8666023115637933
Mean Absolute Percentage Error: 0.03091573797364007


## Save model

In [49]:
# save the best model
pickle.dump(model_xgb, open('../models/model_xgb.pkl', 'wb'))

Best model is XGBoost.