## Tarea Feature Engineering

- Ejercicio 1
Coge un conjunto de datos de tema deportivo que te guste y normaliza los atributos categóricos en dummy. Estandariza los atributos numéricos con StandardScaler.

In [125]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

In [126]:
medal_type = pd.CategoricalDtype(categories = ['None','Bronze','Silver','Gold'], ordered=True)

df = pd.read_csv('../Entrega11/athlete_events.csv', 
                 index_col='ID',
                 dtype={'Medal':medal_type,
                        'Sex': 'category',
                        'Season': 'category'})
df.head()

Unnamed: 0_level_0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


Scaling is a much needed tool to prepare data for a Machine Learning model, before performing the scaling we will set the target and training columns.

In [127]:
df_new = df[['Age','Sex','Height','Weight','NOC', 'Sport', 'Medal']].copy()

In [128]:
#Target Column
target = ['Medal']
print(f'The target column is:{target}')

#Numerical columns
numerical_col = list(df_new.select_dtypes('number').columns)
print(f'the numerical column is: {numerical_col}')

#Categorical columns
categorical_cols= list(set(df_new.columns)-set(numerical_col)-set(target))
print(f'The categorical columns are: {categorical_cols}')

The target column is:['Medal']
the numerical column is: ['Age', 'Height', 'Weight']
The categorical columns are: ['Sport', 'Sex', 'NOC']


In [129]:
#Standarization of numerical columns
"""1. SimpleImputer to fill missing value with the mean of that column.
   2. StandardScaler to scale values from o to 1.
"""
numerical = Pipeline([('imputer', SimpleImputer(strategy = 'median')),
                             ('scaler', StandardScaler())])

In [140]:
#Standarization of categorical columns
"""
OneHotEncoder to spit to many numerical columns for model training. 
(handle_unknown=’ignore’ is specified to prevent error when found an unseen category in the test set)"""

categorical = Pipeline([('one-hot', OneHotEncoder(sparse= False))])

In [141]:
#Apply the pipeline by transforming the columns 
col_trans = ColumnTransformer(transformers=[('num_pipeline',numerical,numerical_col),
                                            ('cat_pipeline',categorical,categorical_cols)])

X = col_trans.fit_transform(df_new);



In [142]:
categorical_new_columns = col_trans.named_transformers_['cat_pipeline']['one-hot'].get_feature_names(categorical_cols).tolist()
col = numerical_col + categorical_new_columns


In [143]:
df_new

Unnamed: 0_level_0,Age,Sex,Height,Weight,NOC,Sport,Medal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,24.0,M,180.0,80.0,CHN,Basketball,
2,23.0,M,170.0,60.0,CHN,Judo,
3,24.0,M,,,DEN,Football,
4,34.0,M,,,DEN,Tug-Of-War,Gold
5,21.0,F,185.0,82.0,NED,Speed Skating,
...,...,...,...,...,...,...,...
135569,29.0,M,179.0,89.0,POL,Luge,
135570,27.0,M,176.0,59.0,POL,Ski Jumping,
135570,27.0,M,176.0,59.0,POL,Ski Jumping,
135571,30.0,M,185.0,96.0,POL,Bobsleigh,


In [144]:
X.shape

(271116, 301)

In [146]:
df_X = pd.DataFrame(X, columns=col)
df_X.head()

Unnamed: 0,Age,Height,Weight,Sport_Aeronautics,Sport_Alpine Skiing,Sport_Alpinism,Sport_Archery,Sport_Art Competitions,Sport_Athletics,Sport_Badminton,...,NOC_VIE,NOC_VIN,NOC_VNM,NOC_WIF,NOC_YAR,NOC_YEM,NOC_YMD,NOC_YUG,NOC_ZAM,NOC_ZIM
0,-0.238971,0.51042,0.752137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.39802,-0.567265,-0.837921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.238971,-0.028423,-0.042892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.351524,-0.028423,-0.042892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.716119,1.049262,0.911143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Ejercicio 2
Continúa con el conjunto de datos de tema deportivo que te guste y aplica el análisis de componentes principales.



In [149]:
pca = PCA(n_components=3)
X_d = pca.fit_transform(X[:,0:3])

In [152]:
pca_columns = [f'PCA_{x}' for x in range(1, 3+1)]
pca_columns

['PCA_1', 'PCA_2', 'PCA_3']

In [154]:
df_pca = pd.DataFrame(X_d, columns= pca_columns)
df_pca

Unnamed: 0,PCA_1,PCA_2,PCA_3
0,0.819222,-0.420686,0.187732
1,-1.057796,-0.178749,-0.177305
2,-0.101798,-0.222239,0.001287
3,0.247977,1.327330,-0.077250
4,1.194270,-1.007350,-0.054073
...,...,...,...
271111,1.416336,0.288388,0.732415
271112,-0.529351,0.327506,-0.719178
271113,-0.529351,0.327506,-0.719178
271114,2.280888,0.253117,0.665913


In [156]:
df_pca['Medal'] = df_new[target].to_numpy()
df_pca.head(5)

Unnamed: 0,PCA_1,PCA_2,PCA_3,Medal
0,0.819222,-0.420686,0.187732,
1,-1.057796,-0.178749,-0.177305,
2,-0.101798,-0.222239,0.001287,
3,0.247977,1.32733,-0.07725,Gold
4,1.19427,-1.00735,-0.054073,
