# Métodos de Pré-Processamento

- Binarização
- Escalar
- Normalização
- Remoção da Média
- Label Encoding
- One-Hot Encoding

#### 1. Binarização

In [2]:
from sklearn import preprocessing 
from IPython.display import display
import numpy as np 
import pandas as pd

# Small 3x3 demo matrix (rows = samples, columns = features) reused by the
# binarization, scaling and normalization sections.
data = np.array(
    [
        [2.2, 5.9, -1.8],
        [5.4, -3.2, -5.1],
        [-1.9, 4.2, 3.2],
    ]
)

In [4]:
# Build the binarizer first, then apply it: entries above the 1.5 threshold
# become 1 and the rest become 0.
binarizer = preprocessing.Binarizer(threshold=1.5)
bin_data = binarizer.transform(data)

In [5]:
# Inspect the binarized matrix produced with threshold=1.5.
bin_data

array([[1., 1., 0.],
       [1., 0., 0.],
       [0., 1., 1.]])

#### 2. Remoção da Média

In [6]:
# Per-column mean of the raw matrix (axis=0 reduces over the rows).
data.mean(axis=0)

array([ 1.9       ,  2.3       , -1.23333333])

In [7]:
# Per-column standard deviation of the raw matrix, before any scaling.
data.std(axis=0)

array([2.98775278, 3.95052739, 3.41207008])

In [8]:
# Standardize the columns of `data`; the next two cells check the resulting
# per-column mean and standard deviation.
scaled_data = preprocessing.scale(data)

In [9]:
# Column means after scaling: expected to be 0 up to floating-point error.
scaled_data.mean(axis=0)

array([0.00000000e+00, 0.00000000e+00, 7.40148683e-17])

In [10]:
# Column standard deviations after scaling: expected to be 1.
scaled_data.std(axis=0)

array([1., 1., 1.])

#### 3. Escalar

- StandardScaler => features com média = 0 e variância = 1
- MinMaxScaler => features em um range entre 0 e 1
- Normalizer => escala cada vetor de features (linha) para comprimento euclidiano = 1

In [11]:
# Show the raw matrix again as the reference for min-max scaling.
data

array([[ 2.2,  5.9, -1.8],
       [ 5.4, -3.2, -5.1],
       [-1.9,  4.2,  3.2]])

In [13]:
# Fit the scaler on `data` (fit returns the scaler itself), then rescale each
# column into the [0, 1] range using the fitted per-column min and max.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(data)
data_min_max = min_max_scaler.transform(data)

In [14]:
# Each column now spans [0, 1]: its minimum maps to 0 and its maximum to 1.
data_min_max

array([[0.56164384, 1.        , 0.39759036],
       [1.        , 0.        , 0.        ],
       [0.        , 0.81318681, 1.        ]])

#### 4. Normalização

Ato de trazer os valores de cada vetor de features para uma escala comum

- **L1** - Least Absolute Deviations - soma dos valores absolutos (em cada linha) = 1. É menos sensível a outliers
- **L2** - Least Squares - soma dos quadrados (em cada linha) = 1. Leva os outliers mais em consideração, pois valores grandes pesam mais ao serem elevados ao quadrado

In [15]:
# Raw matrix again, for comparison with the normalized versions below.
data

array([[ 2.2,  5.9, -1.8],
       [ 5.4, -3.2, -5.1],
       [-1.9,  4.2,  3.2]])

In [16]:
# L1 normalization: rescale each row so that its absolute values sum to 1.
data_l1 = preprocessing.normalize(data, norm='l1')

In [17]:
# L2 normalization: rescale each row so that its squared values sum to 1
# (i.e. every row has unit Euclidean length).
data_l2 = preprocessing.normalize(data, norm='l2')

In [18]:
# L1-normalized rows; signs are preserved, magnitudes in each row sum to 1.
data_l1

array([[ 0.22222222,  0.5959596 , -0.18181818],
       [ 0.39416058, -0.23357664, -0.37226277],
       [-0.20430108,  0.4516129 ,  0.34408602]])

In [19]:
# L2-normalized rows; signs are preserved, squares in each row sum to 1.
data_l2

array([[ 0.3359268 ,  0.90089461, -0.2748492 ],
       [ 0.6676851 , -0.39566524, -0.63059148],
       [-0.33858465,  0.74845029,  0.57024784]])

#### 5. Label Encoding

- setosa => 0 
- versicolor => 1
- virginica => 2

In [22]:
# Class names used to fit the label encoder.
labels = ['setosa', 'versicolor', 'virginica']

In [23]:
# Encoder that maps each distinct string label to an integer code.
encoder = preprocessing.LabelEncoder()

In [24]:
# Learn the label -> integer mapping from the known class names.
encoder.fit(labels)

LabelEncoder()

In [26]:
# classes_ holds the fitted labels; each label's position is its integer code.
for code, label in enumerate(encoder.classes_):
    print(label, code)

setosa 0
versicolor 1
virginica 2


In [27]:
# New labels (with repeats, in arbitrary order) to encode with the fitted encoder.
more_labels = ['versicolor', 'versicolor', 'virginica', 'setosa', 'versicolor']

In [28]:
# Translate the string labels into the integer codes learned during fit().
more_labels_encoded = encoder.transform(more_labels)

In [29]:
# Original string labels, shown for side-by-side comparison with their codes.
more_labels

['versicolor', 'versicolor', 'virginica', 'setosa', 'versicolor']

In [31]:
# Integer codes, in the same order as `more_labels`.
more_labels_encoded

array([1, 1, 2, 0, 1])

#### 6. One-Hot Encoding

- Utilizado em variáveis categóricas
- Substitui uma variável categórica (feature) por uma ou mais novas features, cada uma com valor 0 ou 1
- Aumenta o volume de dados (cria uma nova coluna para cada categoria)
- Aumenta a eficiência do processo

In [3]:
# Location and column names of the UCI Adult census file. The names are passed
# explicitly because the file is read with header=None.
ADULT_DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
ADULT_COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# NOTE: this rebinds `data`, which held the small NumPy demo matrix in the
# earlier sections of the notebook.
data = pd.read_csv(ADULT_DATA_URL, header=None, index_col=False, names=ADULT_COLUMNS)

In [41]:
# Keep only the columns used in this example (the last one, 'income', is the target).
selected_columns = [
    'age', 'workclass', 'education', 'gender',
    'hours-per-week', 'occupation', 'income',
]
data = data[selected_columns]

In [42]:
# Rich display of the reduced frame (pandas truncates the middle rows).
display(data)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K
...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,Female,38,Tech-support,<=50K
32557,40,Private,HS-grad,Male,40,Machine-op-inspct,>50K
32558,58,Private,HS-grad,Female,40,Adm-clerical,<=50K
32559,22,Private,HS-grad,Male,20,Adm-clerical,<=50K


In [43]:
# Preview the first 10 rows of the reduced frame.
data.head(10)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K
5,37,Private,Masters,Female,40,Exec-managerial,<=50K
6,49,Private,9th,Female,16,Other-service,<=50K
7,52,Self-emp-not-inc,HS-grad,Male,45,Exec-managerial,>50K
8,31,Private,Masters,Female,50,Prof-specialty,>50K
9,42,Private,Bachelors,Male,40,Exec-managerial,>50K


In [44]:
# Column names before one-hot encoding.
list(data.columns)

['age',
 'workclass',
 'education',
 'gender',
 'hours-per-week',
 'occupation',
 'income']

In [45]:
# One-hot encode the string-valued columns. The numeric columns
# ('age', 'hours-per-week') are carried over unchanged.
data_dummies = pd.get_dummies(data)

In [47]:
# Column names after encoding: one '<column>_<category>' column per category.
# NOTE(review): the category part carries a leading space (e.g. 'workclass_ Private'),
# presumably because the raw CSV has a space after each comma. Consider
# skipinitialspace=True in read_csv, together with updating the column labels
# that later cells look up by name.
list(data_dummies.columns)

['age',
 'hours-per-week',
 'workclass_ ?',
 'workclass_ Federal-gov',
 'workclass_ Local-gov',
 'workclass_ Never-worked',
 'workclass_ Private',
 'workclass_ Self-emp-inc',
 'workclass_ Self-emp-not-inc',
 'workclass_ State-gov',
 'workclass_ Without-pay',
 'education_ 10th',
 'education_ 11th',
 'education_ 12th',
 'education_ 1st-4th',
 'education_ 5th-6th',
 'education_ 7th-8th',
 'education_ 9th',
 'education_ Assoc-acdm',
 'education_ Assoc-voc',
 'education_ Bachelors',
 'education_ Doctorate',
 'education_ HS-grad',
 'education_ Masters',
 'education_ Preschool',
 'education_ Prof-school',
 'education_ Some-college',
 'gender_ Female',
 'gender_ Male',
 'occupation_ ?',
 'occupation_ Adm-clerical',
 'occupation_ Armed-Forces',
 'occupation_ Craft-repair',
 'occupation_ Exec-managerial',
 'occupation_ Farming-fishing',
 'occupation_ Handlers-cleaners',
 'occupation_ Machine-op-inspct',
 'occupation_ Other-service',
 'occupation_ Priv-house-serv',
 'occupation_ Prof-specialty',


In [51]:
# Keep every column except the one-hot encoded target ('income_*').
# Selecting by prefix instead of a label slice ending at the last occupation
# column avoids depending on column order and on which category sorts last.
is_target_column = data_dummies.columns.str.startswith('income_')
features = data_dummies.loc[:, ~is_target_column]

In [52]:
# Convert to plain NumPy arrays for scikit-learn: X is the feature matrix and
# y is the indicator column for the '>50K' income class.
X = features.to_numpy()
y = data_dummies['income_ >50K'].to_numpy()

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [54]:
# Hold out part of the data for evaluation; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [60]:
# NOTE(review): max_iter is presumably raised so the solver converges on the
# unscaled 'age' / 'hours-per-week' features; scaling them first (see the
# scaling section of this notebook) would likely make this unnecessary — confirm.
logreg = LogisticRegression(max_iter=1500)

In [61]:
# Train on the training split only; the test split stays unseen until scoring.
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
# Mean accuracy of the fitted classifier on the held-out test split.
logreg.score(X_test, y_test)

0.8088686893502027