In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor

pd.set_option("display.max_columns",500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.simplefilter("ignore")

# FIFA 2019 Data

In [4]:
# we will read the file "players_19.csv" as a Pandas dataframe
fifa19 = pd.read_csv(r'C:\Users\vivek\Desktop\FIFA\players_19.csv\players_19.csv')

#we will take a look at the first three entries
fifa19.head(3)

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club_name,league_name,league_rank,overall,potential,value_eur,wage_eur,player_positions,preferred_foot,international_reputation,weak_foot,skill_moves,work_rate,body_type,real_face,release_clause_eur,player_tags,team_position,team_jersey_number,loaned_from,joined,contract_valid_until,nation_position,nation_jersey_number,pace,shooting,passing,dribbling,defending,physic,gk_diving,gk_handling,gk_kicking,gk_reflexes,gk_speed,gk_positioning,player_traits,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,33,1985-02-05,187,83,Portugal,Juventus,Italian Serie A,1.0,94,94,77000000,400000,"ST, LW",Right,5,4,5,High/Low,C. Ronaldo,Yes,127100000.0,"#Speedster, #Dribbler, #Distance Shooter, #Acr...",ST,7.0,,2018-07-10,2022.0,LS,7.0,90.0,93.0,81.0,89.0,35.0,79.0,,,,,,,"Power Free-Kick, Diver, Flair, Long Shot Taker...",84,94,89,81,87,88,81,76,77,94,89,91,87,96,70,95,95,88,79,93,63,29,95,82,85,95,28,31,23,7,11,15,14,11,92+2,92+2,92+2,90+0,91+0,91+0,91+0,90+0,88+4,88+4,88+4,88+4,81+4,81+4,81+4,88+4,65+4,61+4,61+4,61+4,65+4,61+4,54+4,54+4,54+4,61+4
1,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,31,1987-06-24,170,72,Argentina,FC Barcelona,Spain Primera Division,1.0,94,94,110500000,575000,"CF, RW, ST",Left,5,4,4,Medium/Medium,Messi,Yes,226500000.0,"#Dribbler, #Distance Shooter, #FK Specialist, ...",RW,10.0,,2004-07-01,2021.0,CF,10.0,88.0,91.0,88.0,96.0,32.0,61.0,,,,,,,"Finesse Shot, Long Shot Taker (AI), Speed Drib...",77,95,70,90,86,97,93,94,87,96,91,86,91,95,95,85,68,72,59,94,48,22,94,94,75,96,33,28,26,6,11,15,14,8,89+5,89+5,89+5,92+2,93+1,93+1,93+1,92+2,93+1,93+1,93+1,91+3,85+7,85+7,85+7,91+3,63+7,61+7,61+7,61+7,63+7,59+7,48+7,48+7,48+7,59+7
2,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar,Neymar da Silva Santos Júnior,26,1992-02-05,175,68,Brazil,Paris Saint-Germain,French Ligue 1,1.0,92,93,118500000,300000,LW,Right,5,5,5,High/Medium,Neymar,Yes,228100000.0,"#Speedster, #Dribbler, #FK Specialist, #Acroba...",LW,10.0,,2017-08-03,2022.0,LW,10.0,92.0,84.0,83.0,95.0,32.0,59.0,,,,,,,"Diver, Selfish, Flair, Speed Dribbler (AI), Te...",79,87,62,84,84,96,88,87,78,95,94,90,96,94,84,80,61,81,49,82,56,36,89,87,81,94,27,24,33,9,9,15,15,11,84+7,84+7,84+7,90+3,90+3,90+3,90+3,90+3,90+3,90+3,90+3,89+4,81+7,81+7,81+7,89+4,66+7,61+7,61+7,61+7,66+7,61+7,47+7,47+7,47+7,61+7


# Data cleaning, encoding, imputing

In [5]:
#we will clean, encode the data by using a function named data_cleaning
def data_cleaning(df):
    df = df.drop(['potential','nation_jersey_number','team_jersey_number','preferred_foot','contract_valid_until'],axis=1)
    df['player_positions'] = df['player_positions'].str.split(',',expand=True)[0]
    df['nation_position'] = df['nation_position'].str.split(',',expand=True)[0]
    df['team_position'] = df['team_position'].str.split(',',expand=True)[0]
    df['work_rate'] = df['work_rate'].str.split('/',expand=True)[0]

    label_encoder = LabelEncoder()
    df['player_positions'] = label_encoder.fit_transform(df['player_positions'])
    df['nation_position'] = label_encoder.fit_transform(df['nation_position'])
    df['team_position'] = label_encoder.fit_transform(df['team_position'])
    df['work_rate'] = label_encoder.fit_transform(df['work_rate'])

    col1 = ['ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram',
        'lm','lcm','cm','rcm','rm','lwb','ldm','cdm','rdm','rwb','lb','lcb','cb','rcb','rb']

    for i in col1:
        if df[i].dtype=='O':
            df[i] = df[i].str.replace('+','!')
            df[i] = df[i].str.replace('-','!')
            df[i] = df[i].str.split('!',expand=True)[0].fillna('0').astype('int64')

    numeric_columns = []
    string_columns = []

    for col in df.columns:
        if df[col].dtype != 'object':
            numeric_columns.append(col)
        else:
            string_columns.append(col)

    print(numeric_columns)
    print(string_columns)

    imputer = KNNImputer()
    impute_arr = imputer.fit_transform(df[numeric_columns])

    return pd.DataFrame(impute_arr,columns=df[numeric_columns].columns)
    

# Clean FIFA 2019 data

In [6]:
# we will look at the cleaned data using the created function
clean_fifa19 = data_cleaning(fifa19.drop('league_rank',axis=1))

#take a quick look at the data 
clean_fifa19.head()

['sofifa_id', 'age', 'height_cm', 'weight_kg', 'overall', 'value_eur', 'wage_eur', 'player_positions', 'international_reputation', 'weak_foot', 'skill_moves', 'work_rate', 'release_clause_eur', 'team_position', 'nation_position', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking', 'defending_standing_tackle'

Unnamed: 0,sofifa_id,age,height_cm,weight_kg,overall,value_eur,wage_eur,player_positions,international_reputation,weak_foot,skill_moves,work_rate,release_clause_eur,team_position,nation_position,pace,shooting,passing,dribbling,defending,physic,gk_diving,gk_handling,gk_kicking,gk_reflexes,gk_speed,gk_positioning,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,20801.0,33.0,187.0,83.0,94.0,77000000.0,400000.0,14.0,5.0,4.0,5.0,0.0,127100000.0,27.0,13.0,90.0,93.0,81.0,89.0,35.0,79.0,87.0,86.6,83.4,89.6,53.4,86.0,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,92.0,92.0,92.0,90.0,91.0,91.0,91.0,90.0,88.0,88.0,88.0,88.0,81.0,81.0,81.0,88.0,65.0,61.0,61.0,61.0,65.0,61.0,54.0,54.0,54.0,61.0
1,158023.0,31.0,170.0,72.0,94.0,110500000.0,575000.0,3.0,5.0,4.0,4.0,2.0,226500000.0,25.0,3.0,88.0,91.0,88.0,96.0,32.0,61.0,83.2,80.4,77.6,85.8,47.8,81.6,77.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,89.0,89.0,89.0,92.0,93.0,93.0,93.0,92.0,93.0,93.0,93.0,91.0,85.0,85.0,85.0,91.0,63.0,61.0,61.0,61.0,63.0,59.0,48.0,48.0,48.0,59.0
2,190871.0,26.0,175.0,68.0,92.0,118500000.0,300000.0,8.0,5.0,5.0,5.0,0.0,228100000.0,14.0,14.0,92.0,84.0,83.0,95.0,32.0,59.0,84.0,84.6,76.8,86.6,50.8,84.0,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,84.0,84.0,84.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,89.0,81.0,81.0,81.0,89.0,66.0,61.0,61.0,61.0,66.0,61.0,47.0,47.0,47.0,61.0
3,193080.0,27.0,193.0,76.0,91.0,72000000.0,250000.0,5.0,4.0,3.0,1.0,2.0,138600000.0,5.0,5.0,77.8,81.4,86.4,87.4,57.4,71.4,90.0,85.0,87.0,94.0,58.0,88.0,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,33.0,33.0,33.0,37.0,38.0,38.0,38.0,37.0,43.0,43.0,43.0,40.0,45.0,45.0,45.0,40.0,36.0,41.0,41.0,41.0,36.0,35.0,34.0,34.0,34.0,35.0
4,192985.0,27.0,181.0,70.0,91.0,102000000.0,350000.0,0.0,4.0,5.0,4.0,0.0,196400000.0,19.0,19.0,77.0,86.0,92.0,87.0,60.0,78.0,85.4,86.4,80.8,88.4,51.2,85.6,93.0,83.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,60.0,87.0,94.0,79.0,88.0,67.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,83.0,83.0,83.0,87.0,87.0,87.0,87.0,87.0,89.0,89.0,89.0,88.0,88.0,88.0,88.0,88.0,77.0,77.0,77.0,77.0,77.0,73.0,67.0,67.0,67.0,73.0


# X : Predictors
# y : Response variable

In [7]:
#we will assign the X and y column values as Predictors and Response variables as below
X = clean_fifa19.drop(['sofifa_id','overall'],axis=1)
y = clean_fifa19['overall']

# Scaling

In [8]:
#we will scale the data using the Standardscalar function
scaler = StandardScaler()
scale_X = scaler.fit_transform(X)

# Spliting data into train and test

In [9]:
#Split the dataset into train and validation sets with 80% training set 
X_train,X_test,y_train,y_test = train_test_split(scale_X,y,train_size=0.8,random_state=42)

# Linear Regression

In [10]:
# Initialize a linear regression model
lreg = LinearRegression()

#fit the linear regression model on train data
lreg.fit(X_train,y_train)

#Use the trained model to predict on the test data
y_test_pred = lreg.predict(X_test)

#calculating the mse on the test data
lreg_test_mse = mean_squared_error(y_test,y_test_pred)
print(f'The MSE of linear regression model on testidation set is {lreg_test_mse}')

The MSE of linear regression model on testidation set is 1.5128827500455941


# Cross Validation Linear Regression

In [11]:
#applying cross validation on the Linear regression
cv = cross_validate(LinearRegression(),scale_X,y,scoring='neg_mean_squared_error',cv=5,return_train_score=True)

In [12]:
#we will take a quick look at the cross validated data
cv

{'fit_time': array([0.05810857, 0.04914761, 0.04598427, 0.04807949, 0.04693294]),
 'score_time': array([0.00210571, 0.00100017, 0.00099993, 0.        , 0.00099993]),
 'test_score': array([-35.91073932,  -1.49649673,  -1.26168114,  -1.45859407,
         -3.17630725]),
 'train_score': array([-1.36936258, -1.56701783, -1.611567  , -1.57309438, -1.38445446])}

# MSE after Cross Validation (Linear Regression)

In [13]:
#checking the new mse calculated after cross validation
-np.mean(cv['test_score'])

8.660763702767492

# LASSO

In [14]:
#Now  we will implement the lasso regularisation
for i in [0.001,0.01,0.1,10,100,1000]:
  lasso = Lasso(alpha = i)
  cv = cross_validate(lasso,scale_X,y,scoring='neg_mean_squared_error',cv=5,return_train_score=True) 
  m = -np.mean(cv['test_score'])
  print(f'The MSE of Lasso regression(alpha : {i} ) model on test data set is {m}')

The MSE of Lasso regression(alpha : 0.001 ) model on test data set is 7.946731441317074
The MSE of Lasso regression(alpha : 0.01 ) model on test data set is 3.0461877042304795
The MSE of Lasso regression(alpha : 0.1 ) model on test data set is 2.5047412670345253
The MSE of Lasso regression(alpha : 10 ) model on test data set is 73.88032442145493
The MSE of Lasso regression(alpha : 100 ) model on test data set is 73.88032442145493
The MSE of Lasso regression(alpha : 1000 ) model on test data set is 73.88032442145493


In [15]:
#find mse of lasso regression model by choosing best alpha
best_alpha = 0.1
lasso = Lasso(alpha = best_alpha)
lasso.fit(X_train,y_train)
lasso_pred = lasso.predict(X_test)
lasso_test_mse = mean_squared_error(y_test,lasso_pred)
print(f'The MSE of lasso regression model on testidation set is {lasso_test_mse}')

The MSE of lasso regression model on testidation set is 1.711574191454219


# DecisionTreeRegressor

In [16]:
#Now we will use DecisionTreeRegressor to build model in form of tree structure
tree1 = DecisionTreeRegressor()
tree1.fit(X_train,y_train)
y_test_tree_pred = tree1.predict(X_test)
tree_test_mse = mean_squared_error(y_test,y_test_tree_pred)
print(f'The MSE of DecisionTreeRegressor model on test data set is {tree_test_mse}')

The MSE of DecisionTreeRegressor model on test data set is 1.0514238319048936


In [17]:
#we will cross validate the DecisionTreeRegressor model 
tree_cv = cross_validate(DecisionTreeRegressor(),scale_X,y,scoring='neg_mean_squared_error',cv=5,return_train_score=True)

In [18]:
#we will take a quick look at the cross validated data
tree_cv

{'fit_time': array([0.46616173, 0.51165533, 0.48469138, 0.5490129 , 0.46785021]),
 'score_time': array([0.0010047 , 0.00103331, 0.00182843, 0.00101042, 0.002249  ]),
 'test_score': array([-32.32374896,  -3.95189383,  -4.09427703,  -6.33619021,
        -30.66712745]),
 'train_score': array([-0., -0., -0., -0., -0.])}

# MSE after Cross Validation (DecisionTreeRegression )

In [19]:
#we will calculate the new mse after cross validation
-np.mean(tree_cv['test_score'])

15.474647497926458

# Bagging

In [20]:
# we will apply bagging on the DecisionTreeRegressor classifier used earlier
bag = BaggingRegressor(DecisionTreeRegressor())
# we will fit the  on train data
bag.fit(X_train,y_train)

#we will predict it on test data
bag_pred = bag.predict(X_test)

#we will find mse on the test data on X and predicted data on y
bag_test_mse = mean_squared_error(y_test,bag_pred)
print(f'The MSE of Bagging model on test data set is {bag_test_mse}')

The MSE of Bagging model on test data set is 0.5191816422449543


In [21]:
#we will cross validate the data in bagging
bag_cv = cross_validate(bag,scale_X,y,scoring='neg_mean_squared_error',cv=5,return_train_score=True)

In [22]:
#we will take a quick look at the  cross validated data
bag_cv

{'fit_time': array([3.01178336, 3.18561316, 3.17364049, 3.15124679, 3.04384518]),
 'score_time': array([0.01914334, 0.02019262, 0.02335548, 0.02111411, 0.02173567]),
 'test_score': array([-32.57219795,  -2.44725187,  -1.89123307,  -2.39759746,
        -30.81516727]),
 'train_score': array([-0.10302253, -0.11090614, -0.09980716, -0.10424316, -0.07418786])}

In [23]:
#we will calculate the new mse after cross validation is done
-np.mean(bag_cv['test_score'])

14.02468952170307

# RandomForestRegressor

In [24]:
#we will now use RandomForestRegressor
forest1 = RandomForestRegressor()
#we will now fit it on train data
forest1.fit(X_train,y_train)
#we will now predict it on test data 
y_test_forest_pred = forest1.predict(X_test)
#we will find mse on the test data
forest_test_mse = mean_squared_error(y_test,y_test_forest_pred)
print(f'The MSE of RandomForestRegressor model on test data set is {forest_test_mse}')

The MSE of RandomForestRegressor model on test data set is 0.42828448990876405


In [25]:
#we will cross validate the data 
#forest_cv = cross_validate(RandomForestRegressor(),scale_X,y,scoring='neg_mean_squared_error',return_train_score=True)

In [26]:
#we will take a quick look at the data after cross validation
 #forest_cv

# MSE after Cross Validation (RandomForestRegressor )

In [27]:
#we will now find the new mse after cross validation
#-np.mean(forest_cv['test_score'])

# Boosting

In [28]:
#we will now apply boosting using the GradientBoostingRegressor function
boost = GradientBoostingRegressor(n_estimators=1000,max_depth=1,learning_rate=0.01)
#we will fit the data on trained data
boost.fit(X_train,y_train)
#we will predict the results on test data
y_test_boost_pred = boost.predict(X_test)
#we will find mse on the test data
boost_test_mse = mean_squared_error(y_test,y_test_boost_pred)
print(f'The MSE of Boosting model on test data set is {boost_test_mse}')

The MSE of Boosting model on test data set is 1.8150124999726331


# FIFA 2020 Data

In [29]:
#we will read the file "players_20.csv" as a panda dataframe
fifa20 = pd.read_csv(r'C:\Users\vivek\Desktop\FIFA\players_20.csv\players_20.csv')
#we will take a quick look at the data 
fifa20.head(2)

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,overall,potential,value_eur,wage_eur,player_positions,preferred_foot,international_reputation,weak_foot,skill_moves,work_rate,body_type,real_face,release_clause_eur,player_tags,team_position,team_jersey_number,loaned_from,joined,contract_valid_until,nation_position,nation_jersey_number,pace,shooting,passing,dribbling,defending,physic,gk_diving,gk_handling,gk_kicking,gk_reflexes,gk_speed,gk_positioning,player_traits,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona,94,94,95500000,565000,"RW, CF, ST",Left,5,4,4,Medium/Low,Messi,Yes,195800000.0,"#Dribbler, #Distance Shooter, #Crosser, #FK Sp...",RW,10.0,,2004-07-01,2021.0,,,87.0,92.0,92.0,96.0,39.0,66.0,,,,,,,"Beat Offside Trap, Argues with Officials, Earl...",88,95,70,92,88,97,93,94,92,96,91,84,93,95,95,86,68,75,68,94,48,40,94,94,75,96,33,37,26,6,11,15,14,8,89+2,89+2,89+2,93+2,93+2,93+2,93+2,93+2,93+2,93+2,93+2,92+2,87+2,87+2,87+2,92+2,68+2,66+2,66+2,66+2,68+2,63+2,52+2,52+2,52+2,63+2
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus,93,93,58500000,405000,"ST, LW",Right,5,4,5,High/Low,C. Ronaldo,Yes,96500000.0,"#Speedster, #Dribbler, #Distance Shooter, #Acr...",LW,7.0,,2018-07-10,2022.0,LS,7.0,90.0,93.0,82.0,89.0,35.0,78.0,,,,,,,"Long Throw-in, Selfish, Argues with Officials,...",84,94,89,83,87,89,81,76,77,92,89,91,87,96,71,95,95,85,78,93,63,29,95,82,85,95,28,32,24,7,11,15,14,11,91+3,91+3,91+3,89+3,90+3,90+3,90+3,89+3,88+3,88+3,88+3,88+3,81+3,81+3,81+3,88+3,65+3,61+3,61+3,61+3,65+3,61+3,53+3,53+3,53+3,61+3


In [30]:
#we will clean the data using the data_cleaning function created earlier
clean_fifa20 = data_cleaning(fifa20)
clean_fifa20.head()

['sofifa_id', 'age', 'height_cm', 'weight_kg', 'overall', 'value_eur', 'wage_eur', 'player_positions', 'international_reputation', 'weak_foot', 'skill_moves', 'work_rate', 'release_clause_eur', 'team_position', 'nation_position', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed', 'movement_agility', 'movement_reactions', 'movement_balance', 'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 'mentality_positioning', 'mentality_vision', 'mentality_penalties', 'mentality_composure', 'defending_marking', 'defending_standing_tackle'

Unnamed: 0,sofifa_id,age,height_cm,weight_kg,overall,value_eur,wage_eur,player_positions,international_reputation,weak_foot,skill_moves,work_rate,release_clause_eur,team_position,nation_position,pace,shooting,passing,dribbling,defending,physic,gk_diving,gk_handling,gk_kicking,gk_reflexes,gk_speed,gk_positioning,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,158023.0,32.0,170.0,72.0,94.0,95500000.0,565000.0,12.0,5.0,4.0,4.0,2.0,195800000.0,25.0,26.0,87.0,92.0,92.0,96.0,39.0,66.0,84.0,81.0,78.4,86.0,49.6,81.0,88.0,95.0,70.0,92.0,88.0,97.0,93.0,94.0,92.0,96.0,91.0,84.0,93.0,95.0,95.0,86.0,68.0,75.0,68.0,94.0,48.0,40.0,94.0,94.0,75.0,96.0,33.0,37.0,26.0,6.0,11.0,15.0,14.0,8.0,89.0,89.0,89.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,92.0,87.0,87.0,87.0,92.0,68.0,66.0,66.0,66.0,68.0,63.0,52.0,52.0,52.0,63.0
1,20801.0,34.0,187.0,83.0,93.0,58500000.0,405000.0,14.0,5.0,4.0,5.0,0.0,96500000.0,14.0,12.0,90.0,93.0,82.0,89.0,35.0,78.0,87.2,83.8,81.0,89.2,54.2,85.2,84.0,94.0,89.0,83.0,87.0,89.0,81.0,76.0,77.0,92.0,89.0,91.0,87.0,96.0,71.0,95.0,95.0,85.0,78.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,32.0,24.0,7.0,11.0,15.0,14.0,11.0,91.0,91.0,91.0,89.0,90.0,90.0,90.0,89.0,88.0,88.0,88.0,88.0,81.0,81.0,81.0,88.0,65.0,61.0,61.0,61.0,65.0,61.0,53.0,53.0,53.0,61.0
2,190871.0,27.0,175.0,68.0,92.0,105500000.0,290000.0,8.0,5.0,5.0,5.0,0.0,195200000.0,0.0,13.0,91.0,85.0,87.0,95.0,32.0,58.0,84.0,81.0,78.4,86.0,49.6,81.0,87.0,87.0,62.0,87.0,87.0,96.0,88.0,87.0,81.0,95.0,94.0,89.0,96.0,92.0,84.0,80.0,61.0,81.0,49.0,84.0,51.0,36.0,87.0,90.0,90.0,94.0,27.0,26.0,29.0,9.0,9.0,15.0,15.0,11.0,84.0,84.0,84.0,90.0,89.0,89.0,89.0,90.0,90.0,90.0,90.0,89.0,82.0,82.0,82.0,89.0,66.0,61.0,61.0,61.0,66.0,61.0,46.0,46.0,46.0,61.0
3,200389.0,26.0,188.0,87.0,91.0,77500000.0,125000.0,5.0,3.0,3.0,1.0,2.0,164700000.0,5.0,5.0,78.0,80.8,81.6,82.4,61.8,81.4,87.0,92.0,78.0,89.0,52.0,90.0,13.0,11.0,15.0,43.0,13.0,12.0,13.0,14.0,40.0,30.0,43.0,60.0,67.0,88.0,49.0,59.0,78.0,41.0,78.0,12.0,34.0,19.0,11.0,65.0,11.0,68.0,27.0,12.0,18.0,87.0,92.0,78.0,90.0,89.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,183277.0,28.0,175.0,74.0,91.0,90000000.0,470000.0,8.0,4.0,4.0,4.0,0.0,184500000.0,14.0,10.0,91.0,83.0,86.0,94.0,35.0,66.0,84.0,81.0,78.4,86.0,49.6,81.0,81.0,84.0,61.0,89.0,83.0,95.0,83.0,79.0,83.0,94.0,94.0,88.0,95.0,90.0,94.0,82.0,56.0,84.0,63.0,80.0,54.0,41.0,87.0,89.0,88.0,91.0,34.0,27.0,22.0,11.0,12.0,6.0,8.0,8.0,83.0,83.0,83.0,89.0,88.0,88.0,88.0,89.0,89.0,89.0,89.0,89.0,83.0,83.0,83.0,89.0,66.0,63.0,63.0,63.0,66.0,61.0,49.0,49.0,49.0,61.0


In [31]:
#we will remove the column 'sofifa_id' from the data frame
X = clean_fifa20.drop(['sofifa_id','overall'],axis=1)

In [32]:
#we will take a quick look at the data X
X.head(3)

Unnamed: 0,age,height_cm,weight_kg,value_eur,wage_eur,player_positions,international_reputation,weak_foot,skill_moves,work_rate,release_clause_eur,team_position,nation_position,pace,shooting,passing,dribbling,defending,physic,gk_diving,gk_handling,gk_kicking,gk_reflexes,gk_speed,gk_positioning,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,32.0,170.0,72.0,95500000.0,565000.0,12.0,5.0,4.0,4.0,2.0,195800000.0,25.0,26.0,87.0,92.0,92.0,96.0,39.0,66.0,84.0,81.0,78.4,86.0,49.6,81.0,88.0,95.0,70.0,92.0,88.0,97.0,93.0,94.0,92.0,96.0,91.0,84.0,93.0,95.0,95.0,86.0,68.0,75.0,68.0,94.0,48.0,40.0,94.0,94.0,75.0,96.0,33.0,37.0,26.0,6.0,11.0,15.0,14.0,8.0,89.0,89.0,89.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,93.0,92.0,87.0,87.0,87.0,92.0,68.0,66.0,66.0,66.0,68.0,63.0,52.0,52.0,52.0,63.0
1,34.0,187.0,83.0,58500000.0,405000.0,14.0,5.0,4.0,5.0,0.0,96500000.0,14.0,12.0,90.0,93.0,82.0,89.0,35.0,78.0,87.2,83.8,81.0,89.2,54.2,85.2,84.0,94.0,89.0,83.0,87.0,89.0,81.0,76.0,77.0,92.0,89.0,91.0,87.0,96.0,71.0,95.0,95.0,85.0,78.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,32.0,24.0,7.0,11.0,15.0,14.0,11.0,91.0,91.0,91.0,89.0,90.0,90.0,90.0,89.0,88.0,88.0,88.0,88.0,81.0,81.0,81.0,88.0,65.0,61.0,61.0,61.0,65.0,61.0,53.0,53.0,53.0,61.0
2,27.0,175.0,68.0,105500000.0,290000.0,8.0,5.0,5.0,5.0,0.0,195200000.0,0.0,13.0,91.0,85.0,87.0,95.0,32.0,58.0,84.0,81.0,78.4,86.0,49.6,81.0,87.0,87.0,62.0,87.0,87.0,96.0,88.0,87.0,81.0,95.0,94.0,89.0,96.0,92.0,84.0,80.0,61.0,81.0,49.0,84.0,51.0,36.0,87.0,90.0,90.0,94.0,27.0,26.0,29.0,9.0,9.0,15.0,15.0,11.0,84.0,84.0,84.0,90.0,89.0,89.0,89.0,90.0,90.0,90.0,90.0,89.0,82.0,82.0,82.0,89.0,66.0,61.0,61.0,61.0,66.0,61.0,46.0,46.0,46.0,61.0


In [33]:
#we will standardize the data using StandardScaler() function
scaler = StandardScaler()
scale_X = scaler.fit_transform(X)

# Here we use RendomForest model to predict.

In [34]:
#we will predict the data on  scale_x
y_pred = forest1.predict(scale_X)

In [35]:
#we will find mse on the predicted values of y
mean_squared_error(clean_fifa20['overall'],y_pred)

1.9217446055367107

In [36]:
fifa20_prediction = pd.DataFrame({'Name':fifa20['short_name'],'Overall':y_pred})

In [37]:
#we will take a look at the data
fifa20_prediction

Unnamed: 0,Name,Overall
0,L. Messi,90.23
1,Cristiano Ronaldo,89.38
2,Neymar Jr,89.54
3,J. Oblak,89.36
4,E. Hazard,89.80
...,...,...
18273,Shao Shuai,50.08
18274,Xiao Mingjie,50.38
18275,Zhang Wei,50.01
18276,Wang Haijian,49.91


In [38]:
#we will sort the data according to the name of the players
fifa20_prediction.sort_values(by='Overall', ascending=False)

Unnamed: 0,Name,Overall
0,L. Messi,90.23
4,E. Hazard,89.80
2,Neymar Jr,89.54
5,K. De Bruyne,89.47
22,A. Griezmann,89.42
...,...,...
18209,J. Wright,49.23
18176,O. Battersby,49.16
18239,V. Storsve,49.13
18251,G. Sykes-Kenworthy,49.10


In [40]:
pd.DataFrame({'sofifaid':clean_fifa20['sofifa_id'],'Overall':y_pred}).to_csv('Part_A.csv')

# Best player according to prediction :
# Name : L. Messi
# Overall : 90.23