In [3]:
import pandas as pd
import scipy
import numpy
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from scipy.stats import boxcox
from sklearn.preprocessing import LabelEncoder
from numpy import loadtxt
from urllib.request import urlopen

In [4]:
# Load the data: nine comma-separated numeric columns (eight inputs plus 'class')
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Parse inside a context manager so the HTTP response is closed as soon as
# the payload has been read, instead of staying open for the kernel's lifetime.
with urlopen(url) as raw_data:
    raw_values = loadtxt(raw_data, delimiter=",")

# Keep the raw ndarray and the labelled frame under separate names so
# `dataframe` always refers to a pandas DataFrame.
dataframe = pd.DataFrame(raw_values, columns=names)
array = dataframe.values

# Split into input features (first eight columns) and output (last column)
X = array[:, 0:8]
Y = array[:, 8]

# Preview the first rows. This must be the final expression of the cell:
# a bare expression in the middle of a cell is evaluated and discarded.
dataframe.head()

In [5]:
# Scale: map every feature column onto the [0, 1] interval
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
rescaledX = scaler.transform(X)

# Results: first five rows, three decimal places
numpy.set_printoptions(precision=3)
print(rescaledX[:5])

[[ 0.353  0.744  0.59   0.354  0.     0.501  0.234  0.483]
 [ 0.059  0.427  0.541  0.293  0.     0.396  0.117  0.167]
 [ 0.471  0.92   0.525  0.     0.     0.347  0.254  0.183]
 [ 0.059  0.447  0.541  0.232  0.111  0.419  0.038  0.   ]
 [ 0.     0.688  0.328  0.354  0.199  0.642  0.944  0.2  ]]


In [6]:
# Standardize: each feature column ends up with mean 0 and standard deviation 1
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)

# Results: first five rows, three decimal places
numpy.set_printoptions(precision=3)
print(rescaledX[:5])

[[ 0.64   0.848  0.15   0.907 -0.693  0.204  0.468  1.426]
 [-0.845 -1.123 -0.161  0.531 -0.693 -0.684 -0.365 -0.191]
 [ 1.234  1.944 -0.264 -1.288 -0.693 -1.103  0.604 -0.106]
 [-0.845 -0.998 -0.161  0.155  0.123 -0.494 -0.921 -1.042]
 [-1.142  0.504 -1.505  0.907  0.766  1.41   5.485 -0.02 ]]


In [7]:
# Normalize: rescale each row (sample) to unit length
scaler = Normalizer()
normalizedX = scaler.fit_transform(X)

# Results: first five rows, three decimal places
numpy.set_printoptions(precision=3)
print(normalizedX[:5])


[[ 0.034  0.828  0.403  0.196  0.     0.188  0.004  0.28 ]
 [ 0.008  0.716  0.556  0.244  0.     0.224  0.003  0.261]
 [ 0.04   0.924  0.323  0.     0.     0.118  0.003  0.162]
 [ 0.007  0.588  0.436  0.152  0.622  0.186  0.001  0.139]
 [ 0.     0.596  0.174  0.152  0.731  0.188  0.01   0.144]]


In [8]:
# Binarize: values above the 0.0 threshold become 1, everything else becomes 0
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binaryX = binarizer.transform(X)

# Results: first five rows
numpy.set_printoptions(precision=3)
print(binaryX[:5])

[[ 1.  1.  1.  1.  0.  1.  1.  1.]
 [ 1.  1.  1.  1.  0.  1.  1.  1.]
 [ 1.  1.  1.  0.  0.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.  1.  1.  1.]
 [ 0.  1.  1.  1.  1.  1.  1.  1.]]


In [9]:
# Box-Cox transform of the third feature column.
# The column is shifted by 1 before the call (Box-Cox requires strictly
# positive input); boxcox returns the transformed data together with the
# fitted lambda when no lambda is supplied.
shifted_column = 1 + X[:, 2]
X_boxcox, boxcox_lambda = boxcox(shifted_column)
print(X_boxcox[:35])

[ 612.796  533.835  508.436  533.835  242.169  640.021  344.14     0.
  586.02   967.864  904.506  640.021  724.34   459.052  612.796    0.
  782.715  640.021  154.314  586.02   842.78   782.715  873.438  724.34
  935.983  586.02   667.689  533.835  753.314  904.506  653.8    667.689
  435.08   904.506  695.797]


In [10]:
# Factor: turn string labels into integer codes
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
dataset = pd.read_csv(url, header=None)

# Column 60 holds the labels that get encoded
array = dataset.values
y = array[:, 60]

# Learn the label vocabulary, then show the classes it discovered
encoder = LabelEncoder().fit(y)
print(encoder.classes_)

# Replace every label with its integer code
encoded_y = encoder.transform(y)
print(encoded_y)

['M' 'R']
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
