# Tasca Feature Engineering

**Descripció**
Aprèn a gestionar paràmetres amb Python.

In [21]:
import pandas as pd
import numpy as np
import random

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.decomposition import PCA

### Dades escollides

Agafem les dades [mlbBat10.txt](../data-sources/mlbBat10.txt). La descripció dels camps la trobem [aquí](https://www.openintro.org/data/index.php?data=mlbbat10).

**Format**

A data frame with 1199 observations on the following 19 variables.

|camp|descripció|
|-|-|
|**name**|Player name|
|**team**|Team abbreviation|
|**position**|Player position|
|**G** game|Number of games|
|**AB** at_bat|Number of at bats|
|**R** run|Number of runs|
|**H** hit|Number of hits|
|**2B** double|Number of doubles|
|**3B** triple|Number of triples|
|**HR** home_run|Number of home runs|
|**RBI** rbi|Number of runs batted in|
|**TB** total_base|Total bases, computed as `3*HR + 2*3B + 1*2B + H`|
|**BB** walk|Number of walks|
|**SO** strike_out|Number of strikeouts|
|**SB** stolen_base|Number of stolen bases|
|**CS** caught_stealing|Number of times caught stealing|
|**OBP** obp|On base percentage|
|**SLG** slg|Slugging percentage (total_base / at_bat)|
|**AVG** bat_avg|Batting average|

In [2]:
# Read the tab-separated batting file, storing position and team as
# categorical columns.
dtypes = dict(position='category', team='category')
mlb = pd.read_csv(
    '../data-sources/mlbBat10.txt',
    sep='\t',
    dtype=dtypes,
)


## Nivell 1
### Exercici 1
Agafa un conjunt de dades de tema esportiu que t'agradi i normalitza els atributs categòrics en dummy. Normalitza els atributs numèrics amb StandardScaler.

In [3]:
# Preview the first rows of the freshly loaded frame.
mlb.head()

Unnamed: 0,name,team,position,G,AB,R,H,2B,3B,HR,RBI,TB,BB,SO,SB,CS,OBP,SLG,AVG
0,I Suzuki,SEA,OF,162,680,74,214,30,3,6,43,268,45,86,42,9,0.359,0.394,0.315
1,D Jeter,NYY,SS,157,663,111,179,30,3,10,67,245,63,106,18,5,0.34,0.37,0.27
2,M Young,TEX,3B,157,656,99,186,36,3,21,91,291,50,115,4,2,0.33,0.444,0.284
3,J Pierre,CWS,OF,160,651,96,179,18,3,1,47,206,45,47,68,18,0.341,0.316,0.275
4,R Weeks,MIL,2B,160,651,112,175,32,4,29,83,302,76,184,11,4,0.366,0.464,0.269


In [4]:
# List every column with its dtype and non-null count.
mlb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1199 entries, 0 to 1198
Data columns (total 19 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   name      1199 non-null   object  
 1   team      1199 non-null   category
 2   position  1199 non-null   category
 3   G         1199 non-null   int64   
 4   AB        1199 non-null   int64   
 5   R         1199 non-null   int64   
 6   H         1199 non-null   int64   
 7   2B        1199 non-null   int64   
 8   3B        1199 non-null   int64   
 9   HR        1199 non-null   int64   
 10  RBI       1199 non-null   int64   
 11  TB        1199 non-null   int64   
 12  BB        1199 non-null   int64   
 13  SO        1199 non-null   int64   
 14  SB        1199 non-null   int64   
 15  CS        1199 non-null   int64   
 16  OBP       1199 non-null   float64 
 17  SLG       1199 non-null   float64 
 18  AVG       1199 non-null   float64 
dtypes: category(2), float64(3), int64(13), object(1)

In [5]:
# Categorical columns are named explicitly; numeric ones are selected by dtype
# (int64 / float64), in the frame's column order.
cols_cat = ['team', 'position']
cols_num = mlb.select_dtypes(include=['int64', 'float64']).columns.tolist()


#### Dummy

In [6]:
# Build one indicator block per categorical column (prefixed with the column
# name) and append them all to the frame in a single concat. The source
# columns stay in place.
mlb = pd.concat(
    [mlb, *(pd.get_dummies(mlb[name], prefix=name) for name in cols_cat)],
    axis=1,
)

In [7]:
# Check that the team_* / position_* indicator columns were appended.
mlb.head()

Unnamed: 0,name,team,position,G,AB,R,H,2B,3B,HR,...,team_WSH,position_-,position_1B,position_2B,position_3B,position_C,position_DH,position_OF,position_P,position_SS
0,I Suzuki,SEA,OF,162,680,74,214,30,3,6,...,0,0,0,0,0,0,0,1,0,0
1,D Jeter,NYY,SS,157,663,111,179,30,3,10,...,0,0,0,0,0,0,0,0,0,1
2,M Young,TEX,3B,157,656,99,186,36,3,21,...,0,0,0,0,1,0,0,0,0,0
3,J Pierre,CWS,OF,160,651,96,179,18,3,1,...,0,0,0,0,0,0,0,1,0,0
4,R Weeks,MIL,2B,160,651,112,175,32,4,29,...,0,0,0,1,0,0,0,0,0,0


#### StandardScaler

In [11]:
# Summary statistics rounded to three decimals.
# NOTE(review): this cell's execution count (In [11]) is later than the scaling
# cells further down (In [9], In [10]), and the saved output already shows
# mean 0 / std 1 for the numeric columns, i.e. it describes the standardized
# frame rather than the raw values.
mlb.describe().round(3)

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,TB,BB,...,team_WSH,position_-,position_1B,position_2B,position_3B,position_C,position_DH,position_OF,position_P,position_SS
count,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0,...,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0,1199.0
mean,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,...,0.038,0.007,0.058,0.06,0.059,0.094,0.021,0.188,0.454,0.059
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.19,0.081,0.233,0.238,0.236,0.292,0.143,0.391,0.498,0.236
min,-1.0,-0.733,-0.657,-0.684,-0.655,-0.451,-0.522,-0.636,-0.664,-0.635,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.859,-0.728,-0.657,-0.684,-0.655,-0.451,-0.522,-0.636,-0.664,-0.635,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.394,-0.537,-0.546,-0.568,-0.562,-0.451,-0.522,-0.56,-0.568,-0.538,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.676,0.463,0.323,0.395,0.363,0.173,0.021,0.303,0.322,0.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,2.251,2.883,3.596,3.44,3.879,8.292,6.806,4.097,3.531,4.865,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Standardizer (zero mean, unit variance) used for the numeric columns.
sc = StandardScaler()

In [10]:
# Standardize all numeric columns in a single call. StandardScaler works
# column-wise, so the resulting values match a per-column fit, but `sc` now
# retains the mean/scale of every numeric column (usable afterwards with
# sc.inverse_transform or sc.transform on new rows) instead of only the
# statistics of the last column that was fitted.
mlb[cols_num] = sc.fit_transform(mlb[cols_num])

#### Eliminem columnes 

In [15]:
# Remove the raw categorical columns (already dummy-encoded) together with the
# player name, in one in-place drop.
mlb.drop(columns=[*cols_cat, 'name'], inplace=True)

In [16]:
# Inspect the resulting feature matrix: scaled numeric columns plus the
# dummy indicator columns.
mlb.head()

Unnamed: 0,G,AB,R,H,2B,3B,HR,RBI,TB,BB,...,team_WSH,position_-,position_1B,position_2B,position_3B,position_C,position_DH,position_OF,position_P,position_SS
0,2.250748,2.882719,2.079459,3.440347,2.120792,1.422407,0.292109,0.979482,2.538682,1.53606,...,0,0,0,0,0,0,0,1,0,0
1,2.149786,2.792317,3.447803,2.765802,2.120792,1.422407,0.834903,1.880873,2.263835,2.404417,...,0,0,0,0,0,0,0,0,0,1
2,2.149786,2.755093,3.004016,2.900711,2.675914,1.422407,2.327588,2.782264,2.813528,1.77727,...,0,0,0,0,1,0,0,0,0,0
3,2.210363,2.728504,2.893069,2.765802,1.010548,1.422407,-0.386385,1.129714,1.797791,1.53606,...,0,0,0,0,0,0,0,1,0,0
4,2.210363,2.728504,3.484785,2.688711,2.305833,2.046891,3.413177,2.4818,2.944977,3.031563,...,0,0,0,1,0,0,0,0,0,0


In [23]:
# Sanity check on one scaled column. pandas' .std() uses the sample estimator
# (ddof=1) while StandardScaler divides by the population standard deviation
# (ddof=0), which is why the value is marginally above 1.
mlb['G'].std()

1.0004172752111498

## Nivell 2
### Exercici 2
Continua amb el conjunt de dades de tema esportiu que t'agradi i aplica l'anàlisi de components principals.

In [20]:
# Project the feature matrix onto its first 10 principal components.
# random_state pins the result in case scikit-learn's automatic solver choice
# falls on the randomized SVD, so a re-run reproduces the same components.
pca = PCA(n_components=10, random_state=0)

# Keep the projected data instead of discarding it, then display its shape.
mlb_pca = pca.fit_transform(mlb)
mlb_pca.shape

## Nivell 3
### Exercici 3

Continua amb el conjunt de dades de tema esportiu que t'agradi i normalitza les dades tenint en compte els outliers.

In [None]:

# Scaler that, by default, centers on the median and scales by the
# interquartile range, making it less sensitive to outliers than StandardScaler.
# NOTE(review): `re` is also the name of the standard-library regex module; a
# later `import re` would silently replace this scaler (and vice versa).
# NOTE(review): `mlb` has already been standardized by the cells above, so
# robust scaling should presumably start from the raw values - confirm.
# TODO: the scaler is instantiated but not fitted or applied in the visible cells.
re = RobustScaler()