In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline 

In [2]:
def _parse_weight(x):
    """Turn a weight string such as '159lbs' into an integer (the 'lbs' suffix is stripped)."""
    return int(x.replace('lbs', ''))


def _parse_height(x):
    """Turn a feet'inches" string into decimal feet, rounded to 2 places."""
    feet, inches = x.replace('"', '').split("'")
    return round(int(feet) + int(inches) / 12, 2)


def _parse_amount(x):
    """Turn an amount such as '€1.5M', '€500K' or '372' into a plain number.

    Non-string input (e.g. the NaN pandas uses for an empty cell, or a value
    that is already numeric) is handed to ``pd.to_numeric`` instead of failing
    on ``.replace``. Anything that cannot be parsed becomes NaN.
    """
    if not isinstance(x, str):
        return pd.to_numeric(x, errors='coerce')
    x = x.replace('€', '')
    if 'K' in x:
        scalar = 1000
        x = x.replace('K', '')
    elif 'M' in x:
        scalar = 1000000
        x = x.replace('M', '')
    else:
        scalar = 1
    return pd.to_numeric(x, errors='coerce') * scalar


def _parse_progression(x):
    """Return the integer that follows the '+' in a value such as '65+2'."""
    _, bonus = x.split('+')
    return int(bonus)


def clean(data):
    """Load a FIFA-21 style CSV and return a fully numeric feature table.

    Steps, in order:
      1. read the CSV and drop duplicated rows;
      2. convert text columns (weight, height, money amounts, position
         progressions, star ratings) to numbers;
      3. drop numeric columns that were judged highly correlated (> 85%);
      4. fill remaining numeric NaNs with the (truncated) column mean;
      5. one-hot encode the remaining object columns (first level dropped).

    Parameters
    ----------
    data : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Numeric columns followed by the one-hot encoded categorical columns,
        indexed like the de-duplicated input rows.

    Raises
    ------
    KeyError
        If an expected column is missing from the CSV.
    ValueError, AttributeError
        If a weight / height / progression cell does not have the expected
        text format.

    Notes
    -----
    NOTE(review): the encoder is fitted on whatever file is passed in, so two
    different files can yield different dummy columns -- confirm the train and
    validation outputs have identical columns before modelling.
    """
    # Kept function-local, as in the original cell, so this block does not
    # depend on any other cell importing scikit-learn.
    from sklearn.preprocessing import OneHotEncoder

    data = pd.read_csv(data)
    data = data.drop_duplicates()  # removing duplicates

    # Conversion of weight to integer and of height to decimal feet.
    data['Weight'] = data['Weight'].map(_parse_weight)
    data['Height'] = data['Height'].map(_parse_height)

    # Conversion of amounts (€, with K/M suffixes) to numbers.
    for col in ('Value', 'Wage', 'Release Clause', 'Hits'):
        data[col] = data[col].map(_parse_amount)

    # Getting the progress (the part after '+') from the main positions.
    for col in ('GK', 'CB', 'LB', 'RB', 'CM', 'CAM', 'CDM', 'ST', 'RW', 'LW'):
        data[col] = data[col].map(_parse_progression)

    # Clean the symbol '★' in those columns which have it.
    for col in ('W/F', 'SM', 'IR'):
        data[col] = data[col].str.rstrip('★').astype('int')

    data_num = data.select_dtypes(include=np.number)

    # Columns dropped because of high correlation (> 85%) with other columns
    # of the same category. Dropping them before the imputation below gives
    # the same result as dropping afterwards, since imputation is per column.
    correlated = [
        # Attacking
        'Crossing', 'Finishing', 'Short Passing', 'Volleys',
        # Skill
        'Dribbling', 'Curve', 'FK Accuracy', 'Long Passing', 'Ball Control',
        # Movement
        'Acceleration', 'Sprint Speed', 'Agility', 'Balance',
        # Strength
        'Strength',
        # Defending
        'Marking', 'Standing Tackle', 'Sliding Tackle',
        # Goalkeeping
        'GK Diving', 'GK Handling', 'GK Kicking', 'GK Positioning', 'GK Reflexes',
        # Remaining categories with corr > 85%
        'Release Clause', 'Wage', 'Total Stats', 'Skill', 'Mentality',
        'Long Shots', 'Positioning', 'Defending', 'Interceptions',
    ]
    data_num = data_num.drop(correlated, axis=1)

    # Replace the NaN with the column mean (truncated to an integer, as before).
    for col in data_num:
        mean = int(data_num[col].mean())
        data_num[col] = data_num[col].fillna(mean)

    data_cat = data.select_dtypes(include=object)
    data_cat = data_cat.drop(
        ['Name', 'Nationality', 'Club', 'Position', 'Team & Contract',
         'Joined', 'Loan Date End', 'Contract',
         # Positions that were not used in the progression step above.
         'LS', 'RS', 'LF', 'CF', 'RF', 'LAM', 'RAM', 'LM',
         'LCM', 'RCM', 'RM', 'LWB', 'LDM', 'RDM', 'RWB', 'LCB', 'RCB'],
        axis=1,
    )

    data_cat['A/W'] = data_cat['A/W'].fillna('Medium')
    data_cat['D/W'] = data_cat['D/W'].fillna('Medium')

    encoder = OneHotEncoder(drop='first').fit(data_cat)
    # get_feature_names was removed in scikit-learn 1.2; prefer the new name
    # and fall back only on old installations.
    if hasattr(encoder, 'get_feature_names_out'):
        cols = encoder.get_feature_names_out(data_cat.columns)
    else:
        cols = encoder.get_feature_names(input_features=data_cat.columns)

    # Reuse data_cat's index: after drop_duplicates() the index can have gaps,
    # and a fresh RangeIndex here would misalign rows in the concat below and
    # introduce NaNs.
    encoded = pd.DataFrame(
        encoder.transform(data_cat).toarray(),
        columns=cols,
        index=data_cat.index,
    )

    return pd.concat([data_num, encoded], axis=1)

In [3]:
# Build the training table from the raw training CSV.
train = clean('fifa21_train.csv')

In [4]:
# Build the hold-out table from the raw validation CSV.
# NOTE(review): clean() re-fits its encoder on this file, so the dummy columns
# may not match those of `train` -- confirm the columns are identical.
test = clean('fifa21_validate.csv')

In [5]:
#from sklearn.model_selection import train_test_split

In [6]:
# Separate the target column ('OVA') from the predictors for both tables.
X_train, y_train = train.drop(columns=['OVA']), train['OVA']
X_test, y_test = test.drop(columns=['OVA']), test['OVA']

In [7]:
from sklearn import linear_model

# Ordinary least-squares regression of the target on all predictors.
lm = linear_model.LinearRegression()
# Left as a bare final expression so the fitted estimator is the cell output.
lm.fit(X=X_train, y=y_train)

LinearRegression()

In [9]:
# Evaluate the fitted model on both tables: R2 score, MSE, RMSE and MAE.
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Predictions for each table.
train_pred, test_pred = lm.predict(X_train), lm.predict(X_test)

# Compute every metric first; the report is printed below in a fixed order.
r2t, r2s = r2_score(y_train, train_pred), r2_score(y_test, test_pred)
# The squared error is symmetric in its two arguments, so passing the
# predictions first does not change the value.
train_mse, test_mse = (
    mean_squared_error(train_pred, y_train),
    mean_squared_error(test_pred, y_test),
)
train_rmse, test_rmse = math.sqrt(train_mse), math.sqrt(test_mse)
train_mae, test_mae = (
    mean_absolute_error(y_train, train_pred),
    mean_absolute_error(y_test, test_pred),
)

# Report: train first, then test, for each metric.
print("R2 for train:", r2t)
print("\nR2 for score:", r2s)
print ('\nTrain MSE: {}'.format(train_mse))
print ('\nTest MSE: {}'.format(test_mse))
print("\nTrain RMSE:", train_rmse)
print("\nTest RMSE:", test_rmse)
print("\nTrain MAE:", train_mae)
print("\nTest MAE:", test_mae)

R2 for train: 0.9158402350097816

R2 for score: 0.9120520683308362

Train MSE: 3.978775061287654

Test MSE: 4.024627607644616

Train RMSE: 1.994686707552756

Test RMSE: 2.006147454113136

Train MAE: 1.5396434268499537

Test MAE: 1.5603349234193786
