## Setup

In [1]:
import pandas as pd
import scipy as scp
import numpy as np

#### Vamos trabalhar com o Dataset de Diabetes de Índios Pina

Colunas:

* 0. Número de vezes grávidas
* 1. Concentração de glicose a 2h de um teste de tolerância a glicose.
* 2. Pressão sanguínea diastólica (mm Hg).
* 3. Mensuração da espessura da pregas cutâneas Tricipital (mm).
* 4. Nível sérico de Insulina após 2h (mu U/ml).
* 5. Body mass index (peso em kg/(altura em m)^2).
* 6. Função Pedigree Diabetes.
* 7. Idade (anos).
* 8. Variável Classe (0 ou 1).

In [2]:
dataset_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataset = pd.read_csv('../dados/pima-indians-diabetes.data.txt', 
                      names= dataset_names)


In [3]:
dataset.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
raw_input = dataset.loc[:, dataset.columns != "class"]
raw_input

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [5]:
output = dataset.loc[:,"class"]
output

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: class, Length: 768, dtype: int64

### MinMax Scaling

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_input = pd.DataFrame(scaler.fit_transform(raw_input), 
                              columns=raw_input.columns)

In [7]:
rescaled_input

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,0.352941,0.743719,0.590164,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.000000,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...,...
763,0.588235,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.117647,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.294118,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.058824,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


In [8]:
rescaled_input.describe()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.22618,0.60751,0.566438,0.207439,0.094326,0.47679,0.168179,0.204015
std,0.19821,0.160666,0.158654,0.161134,0.136222,0.117499,0.141473,0.196004
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.058824,0.497487,0.508197,0.0,0.0,0.406855,0.070773,0.05
50%,0.176471,0.58794,0.590164,0.232323,0.036052,0.4769,0.125747,0.133333
75%,0.352941,0.704774,0.655738,0.323232,0.150414,0.545455,0.234095,0.333333
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Estandardização

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standardized_input = pd.DataFrame(scaler.fit_transform(raw_input), 
                              columns=raw_input.columns).round(3)

In [10]:
standardized_input

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,0.640,0.848,0.150,0.907,-0.693,0.204,0.468,1.426
1,-0.845,-1.123,-0.161,0.531,-0.693,-0.684,-0.365,-0.191
2,1.234,1.944,-0.264,-1.288,-0.693,-1.103,0.604,-0.106
3,-0.845,-0.998,-0.161,0.155,0.123,-0.494,-0.921,-1.042
4,-1.142,0.504,-1.505,0.907,0.766,1.410,5.485,-0.020
...,...,...,...,...,...,...,...,...
763,1.828,-0.623,0.356,1.723,0.870,0.115,-0.909,2.532
764,-0.548,0.035,0.046,0.405,-0.693,0.610,-0.398,-0.531
765,0.343,0.003,0.150,0.155,0.280,-0.735,-0.685,-0.276
766,-0.845,0.160,-0.471,-1.288,-0.693,-0.240,-0.371,1.171


In [11]:
standardized_input.describe()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,-2e-05,3e-06,-4.6e-05,8.1e-05,-5.5e-05,-1.3e-05,3e-05,4.6e-05
std,1.000765,1.000674,1.000727,1.00057,1.000693,1.00062,1.000651,1.000611
min,-1.142,-3.784,-3.573,-1.288,-0.693,-4.06,-1.19,-1.042
25%,-0.845,-0.685,-0.367,-1.288,-0.693,-0.596,-0.68875,-0.786
50%,-0.251,-0.122,0.15,0.155,-0.428,0.001,-0.3005,-0.361
75%,0.64,0.60575,0.563,0.719,0.41225,0.585,0.46575,0.66
max,3.907,2.444,2.735,4.922,6.653,4.456,5.884,4.064


### Normalização

In [12]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
normalized_input = pd.DataFrame(scaler.fit_transform(raw_input), 
                              columns=raw_input.columns).round(3)

In [13]:
normalized_input

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,0.034,0.828,0.403,0.196,0.000,0.188,0.004,0.280
1,0.008,0.716,0.556,0.244,0.000,0.224,0.003,0.261
2,0.040,0.924,0.323,0.000,0.000,0.118,0.003,0.162
3,0.007,0.588,0.436,0.152,0.622,0.186,0.001,0.139
4,0.000,0.596,0.174,0.152,0.731,0.188,0.010,0.144
...,...,...,...,...,...,...,...,...
763,0.042,0.427,0.322,0.203,0.762,0.139,0.001,0.267
764,0.013,0.812,0.466,0.180,0.000,0.245,0.002,0.180
765,0.027,0.651,0.388,0.124,0.603,0.141,0.001,0.161
766,0.007,0.838,0.399,0.000,0.000,0.200,0.002,0.313


In [14]:
normalized_input.describe()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.02265,0.682902,0.402806,0.112076,0.318923,0.186874,0.002707,0.195434
std,0.020965,0.161162,0.153434,0.092533,0.338569,0.063407,0.001919,0.080953
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032
25%,0.006,0.58775,0.31775,0.0,0.0,0.146,0.001,0.14
50%,0.017,0.7045,0.4305,0.1145,0.2495,0.186,0.002,0.181
75%,0.033,0.8015,0.511,0.18125,0.633,0.227,0.004,0.23925
max,0.117,0.974,0.848,0.42,0.97,0.401,0.013,0.617


### Binarização

In [15]:
from sklearn.preprocessing import Binarizer
scaler = Binarizer(threshold=0.0)
binarized_input = pd.DataFrame(scaler.fit_transform(raw_input), 
                              columns=raw_input.columns).round(3)

In [16]:
binarized_input

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
2,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
763,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
764,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
765,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
766,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0


In [17]:
normalized_input.describe()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.02265,0.682902,0.402806,0.112076,0.318923,0.186874,0.002707,0.195434
std,0.020965,0.161162,0.153434,0.092533,0.338569,0.063407,0.001919,0.080953
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032
25%,0.006,0.58775,0.31775,0.0,0.0,0.146,0.001,0.14
50%,0.017,0.7045,0.4305,0.1145,0.2495,0.186,0.002,0.181
75%,0.033,0.8015,0.511,0.18125,0.633,0.227,0.004,0.23925
max,0.117,0.974,0.848,0.42,0.97,0.401,0.013,0.617
