In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame as df
%matplotlib inline

In [2]:
import warnings
warnings.simplefilter(action='ignore')

In [3]:
## Import up sound alert dependencies
from IPython.display import Audio, display

def allDone():
  display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))


In [5]:
%time data = pd.read_csv("../../data/raw/fifa.csv",  delimiter=',', index_col=0)

CPU times: user 1.03 s, sys: 60.3 ms, total: 1.09 s
Wall time: 1.09 s


In [8]:
numcols = ['Overall', 'Crossing','Finishing',  'ShortPassing',  'Dribbling','LongPassing', 'BallControl', 'Acceleration','SprintSpeed', 'Agility',  'Stamina','Volleys','FKAccuracy','Reactions','Balance','ShotPower','Strength','LongShots','Aggression','Interceptions']
catcols = ['Preferred Foot','Position','Body Type','Nationality','Weak Foot']

In [16]:
player_df = data[numcols+catcols]
traindf = pd.concat([player_df[numcols], pd.get_dummies(player_df[catcols])],axis=1)
features = traindf.columns
traindf = traindf.dropna()
traindf = pd.DataFrame(traindf,columns=features)

In [36]:
y = traindf['Overall']
X = traindf.copy()
del X['Overall']

In [37]:
len(X.columns)

223

In [38]:
feature_name = list(data.columns)
num_feats = 30

In [39]:
def pearson_cor_selector(X, y, num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    cor_feature = X.iloc[:, np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature
cor_support, cor_feature = pearson_cor_selector(X, y, num_feats)
print(str(len(cor_feature)), ' selected features')
print(cor_feature)

30  selected features
['Position_RCB', 'Nationality_Japan', 'Nationality_Portugal', 'Balance', 'Position_CM', 'Nationality_Saudi Arabia', 'Nationality_Republic of Ireland', 'Nationality_Spain', 'Nationality_England', 'Nationality_China PR', 'Nationality_Brazil', 'Acceleration', 'SprintSpeed', 'Weak Foot', 'Agility', 'Interceptions', 'Finishing', 'Strength', 'Stamina', 'Dribbling', 'Volleys', 'Crossing', 'Aggression', 'FKAccuracy', 'LongShots', 'ShotPower', 'BallControl', 'LongPassing', 'ShortPassing', 'Reactions']


In [40]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

print(chi_feature)

30 selected features
['Crossing', 'ShortPassing', 'Dribbling', 'LongPassing', 'BallControl', 'Volleys', 'FKAccuracy', 'Reactions', 'ShotPower', 'LongShots', 'Aggression', 'Interceptions', 'Position_CM', 'Position_GK', 'Position_LF', 'Position_RF', 'Body Type_C. Ronaldo', 'Body Type_Courtois', 'Body Type_Messi', 'Body Type_Neymar', 'Body Type_PLAYER_BODY_TYPE_25', 'Body Type_Shaqiri', 'Nationality_Argentina', 'Nationality_Brazil', 'Nationality_China PR', 'Nationality_England', 'Nationality_Portugal', 'Nationality_Republic of Ireland', 'Nationality_Saudi Arabia', 'Nationality_Spain']


In [41]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')
print(rfe_feature)

Fitting estimator with 223 features.
Fitting estimator with 213 features.
Fitting estimator with 203 features.
Fitting estimator with 193 features.
Fitting estimator with 183 features.
Fitting estimator with 173 features.
Fitting estimator with 163 features.
Fitting estimator with 153 features.
Fitting estimator with 143 features.
Fitting estimator with 133 features.
Fitting estimator with 123 features.
Fitting estimator with 113 features.
Fitting estimator with 103 features.
Fitting estimator with 93 features.
Fitting estimator with 83 features.
Fitting estimator with 73 features.
Fitting estimator with 63 features.
Fitting estimator with 53 features.
Fitting estimator with 43 features.
Fitting estimator with 33 features.
30 selected features
['Crossing', 'ShortPassing', 'Dribbling', 'LongPassing', 'BallControl', 'Agility', 'Stamina', 'Volleys', 'Reactions', 'Balance', 'ShotPower', 'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Preferred Foot_Left', 'Preferred Foot_Right', '

In [42]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')
print(embeded_lr_feature)

30 selected features
['Crossing', 'ShortPassing', 'Dribbling', 'LongPassing', 'BallControl', 'Agility', 'Stamina', 'Volleys', 'Reactions', 'Balance', 'ShotPower', 'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Position_CB', 'Position_CDM', 'Position_CM', 'Position_GK', 'Position_LCB', 'Position_RCB', 'Nationality_Brazil', 'Nationality_China PR', 'Nationality_Denmark', 'Nationality_England', 'Nationality_Korea Republic', 'Nationality_Poland', 'Nationality_Republic of Ireland', 'Nationality_Saudi Arabia', 'Nationality_Spain']


In [43]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
print(embeded_rf_feature)

27 selected features
['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina', 'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower', 'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Weak Foot', 'Preferred Foot_Left', 'Preferred Foot_Right', 'Position_CM', 'Body Type_Lean', 'Body Type_Normal', 'Nationality_England', 'Nationality_Germany']


In [44]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
embeded_lgb_selector.fit(X, y)
embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')
print(embeded_lgb_feature)

19 selected features
['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina', 'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower', 'Strength', 'LongShots', 'Aggression', 'Interceptions']


In [45]:
pd.set_option('display.max_rows', None)
# put all selection together
d = {'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support,
     'Logistics':embeded_lr_support,'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support}
feature_selection_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in d.items() ]))
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feats)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,LightGBM,Total
1,Work Rate,True,True,True,True,True,True,6
2,Value,True,True,True,True,True,True,6
3,Special,True,True,True,True,True,True,6
4,Skill Moves,True,True,True,True,True,True,6
5,Photo,True,True,True,True,True,True,6
6,Nationality,True,True,True,True,True,True,6
7,International Reputation,True,True,True,True,True,True,6
8,ID,True,True,True,True,True,True,6
9,Flag,True,True,True,True,True,True,6
10,Body Type,True,True,True,True,True,True,6
