In [1]:
from common_functions import *
import pandas as pd
PATH = "datasets/Fifa"

fifa = load_housing_data(PATH, "fifa_data.csv")

In [2]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

fifa_preferred_foot = fifa['Preferred Foot'].astype('str')
fifa_preferred_foot_encoded = encoder.fit_transform(fifa_preferred_foot)
fifa['Preferred Foot'] = fifa_preferred_foot_encoded

fifa_club_encoded = encoder.fit_transform(fifa['Club'].astype('str'))
fifa['Club'] = fifa_club_encoded

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
# Drop fifa objects
class FeatureDropper(BaseEstimator, TransformerMixin):
    def __init__(self, features_names):
        self._features_names = features_names

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.drop(self._features_names, axis=1)
        return X

In [4]:
class WealthToInt(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self._feature_names = feature_names
        self._type = type 

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None ):
        for column in self._feature_names:
            X[column] = X[column].str.replace('€', '')
            X[column] = X[column].str.replace('K', '')
            X[column] = X[column].str.replace('M', '000')
            X[column] = pd.to_numeric(X[column])
        return X

In [5]:
class WeightToInt(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X['Weight'] = X['Weight'].str.replace('lbs', '')
        X['Weight'] = pd.to_numeric(X['Weight'])
        return X    
    

In [6]:
# class ObjectToInt(BaseEstimator, TransformerMixin):
#     def __init__(self, features):
#         self._features = features
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X, y=None):
#         for column in self._features:
#             if not isinstance(X[column], int):
#                 post_series = X[column].str.split("+", expand=True)
#                 X[column] = post_series[0].astype('int') + post_series[1].astype('int')
#             else:
#                 X[column] = X[column]
#         return X

In [7]:
# Pipeline
from sklearn.pipeline import Pipeline


positions = ["CAM", "CB", "CDM", "CF", "CM", "LAM", "LB", "LCB", "LCM", "LDM", "LF", "LM", "LS", "LW", "LWB", 
                  "RAM", "RB", "RCB", "RCM", "RDM", "RF", "RWB", "ST", "RM", "RS", "RW"]
dropped_columns  =["Unnamed: 0", "Photo", "ID", "Loaned From", "Contract Valid Until", "Loaned From", 
                               "Club Logo", "Joined","Flag", "Height", "Name", "Nationality", "Position", "Body Type",
                               "Real Face", "Work Rate"]
wealth_columns = ['Wage', 'Release Clause', 'Value']

preprocessing_pipeline = Pipeline([
    ('preprocessing_dropping', FeatureDropper(dropped_columns)),
    ('preprocessing_wealth', WealthToInt(wealth_columns)),
    ('preprocessing_weight', WeightToInt()),
    # ("preprocessing_position", ObjectToInt(positions))
])
fifa_dataset = preprocessing_pipeline.fit_transform(fifa)

In [8]:
fifa_dataset.dropna(axis=0, inplace=True)

In [9]:
def transformObjectIntoInt(column):
    if not isinstance(column, int):
        post_series = column.str.split("+", expand=True)
        column = post_series[0].astype('int') + post_series[1].astype('int')
    return column

array_position = ["CAM", "CB", "CDM", "CF", "CM", "LAM", "LB", "LCB", "LCM", "LDM", "LF", "LM", "LS", "LW", "LWB", 
                  "RAM", "RB", "RCB", "RCM", "RDM", "RF", "RWB", "ST", "RM", "RS", "RW"]
for position in array_position:
    fifa_dataset[position] = fifa_dataset[position].transform(transformObjectIntoInt)

In [10]:
# Create dataset
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(fifa_dataset, test_size=0.2, random_state=42)

In [11]:
# Create copy of traning set 
fifa = train_set.copy()

In [12]:
# Separate features of dataset and the labels
fifa_dataset_prepared = fifa.drop("Potential", axis=1)
fifa_labels = fifa["Potential"].copy()

X_test = test_set.drop("Potential", axis=1)
y_test = test_set["Potential"].copy()

In [13]:
# # Verify the error percentage between the train set and the overall
# compare_props = pd.DataFrame({
#     "Overall": fifa["Overall"].value_counts()/len(fifa),
#     "Train_set": train_set["Overall"].value_counts()/len(train_set),
# }).sort_index()
# compare_props["%error"] = 100 * compare_props["Train_set"] / compare_props["Overall"] - 100
# # compare_props

In [14]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

In [15]:
# from sklearn.linear_model import LinearRegression
# 
# lin_reg = LinearRegression()
# lin_reg.fit(fifa_dataset_prepared, fifa_labels)

In [16]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.metrics import mean_squared_error
# import numpy as np
# 
# tree_reg = DecisionTreeRegressor()
# tree_reg.fit(fifa_dataset_prepared, fifa_labels)
# 
# fifa_predictions = tree_reg.predict(fifa_dataset_prepared)
# tree_mse = mean_squared_error(fifa_labels, fifa_predictions)
# tree_rmse = np.sqrt(tree_mse)
# tree_rmse

In [17]:
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(tree_reg, fifa_dataset_prepared, fifa_labels, scoring="neg_mean_squared_error", cv=10)
# rmse_scores = np.sqrt(-scores)
#     
# display_scores(rmse_scores)

In [18]:
# lin_scores = cross_val_score(lin_reg, fifa_dataset_prepared, fifa_labels, scoring="neg_mean_squared_error", cv=10)
# lin_rmse_scores = np.sqrt(-lin_scores)
# display_scores(lin_rmse_scores)

In [19]:
# RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

forest_reg = RandomForestRegressor()
forest_reg.fit(fifa_dataset_prepared, fifa_labels)
fifa_predictions = forest_reg.predict(fifa_dataset_prepared)
# forest_mse = mean_squared_error(fifa_labels, fifa_predictions)

forest_scores = cross_val_score(forest_reg, fifa_dataset_prepared, fifa_labels, scoring="neg_mean_squared_error", cv=5)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores:  [1.21512909 1.23708314 1.22665669 1.20434491 1.21954993]
Mean:  1.220552753693714
Standard deviation:  0.010987516429950722


In [20]:
# from sklearn.model_selection import GridSearchCV
# param_grid = [
#     {'n_estimators' : [30, 100, 300, 1000], 'max_features': [8, 10, 12, 14]},
#     {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
# ]
# forest_reg = RandomForestRegressor()
# grid_search = GridSearchCV(forest_reg, param_grid, cv = 5, scoring="neg_mean_squared_error")
# grid_search.fit(fifa_dataset_prepared, fifa_labels)

In [21]:
# grid_search.best_params_

In [26]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
# Randomized search
distributions = {
    'n_estimators': randint(4, 50),
    'max_features': randint(4, 16),
}

forest_reg = RandomForestRegressor()
rd_search = RandomizedSearchCV(forest_reg, distributions, n_iter=100, cv=5)
model = rd_search.fit(fifa_dataset_prepared, fifa_labels)
print(model.best_params_)



{'max_features': 15, 'n_estimators': 46}


In [28]:
forest_reg_parameterised = RandomForestRegressor(**model.best_params_)
best_model = forest_reg_parameterised.fit(fifa_dataset_prepared, fifa_labels)
some_data_test_set = X_test.iloc[:5]
some_labels = y_test.iloc[:5]
print("Predictions:\t", np.around(best_model.predict(some_data_test_set)))
print("Labels:\t", list(some_labels))


Predictions:	 [75. 81. 72. 69. 80.]
Labels:	 [73, 82, 71, 69, 80]


In [32]:
# Test the models
forest_scores = cross_val_score(forest_reg_parameterised, X_test, y_test, scoring="neg_mean_squared_error", cv=5)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores:  [1.82508767 1.92193035 1.7226857  1.74312226 1.56198378]
Mean:  1.7549619548033562
Standard deviation:  0.119347033178391


In [33]:
forest_scores = cross_val_score(forest_reg, X_test, y_test, scoring="neg_mean_squared_error", cv=5)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores:  [1.42352189 1.38379508 1.27584229 1.35491009 1.26613143]
Mean:  1.340840156465478
Standard deviation:  0.061131781750532625


In [None]:
# import numpy as np
# some_data_test_set = X_test.iloc[:5]
# some_labels = y_test.iloc[:5]
# print("Predictions:\t", np.around(lin_reg.predict(some_data_test_set)))
# print("Labels:\t", list(some_labels))



In [None]:
# from sklearn.metrics import mean_squared_error
# fifa_predictions = lin_reg.predict(fifa_dataset_prepared)
# lin_mse = mean_squared_error(fifa_labels, fifa_predictions)
# lin_rmse = np.sqrt(lin_mse)
# lin_rmse
# 
# error_percentage = (lin_rmse/fifa_predictions.mean())*100
# print("Error percentage is like ",error_percentage,"%")
