In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset from CSV into a DataFrame.
names = ['preg', 'gluc', 'blood', 'skin', 'ins', 'bmi', 'ped', 'age', 'class']
filename = 'diabetes.csv'

# skiprows=1 drops the first line of the file and header=None stops pandas
# from taking column labels out of the data, so `names` supplies them.
# NOTE(review): presumably the skipped line is the file's own header row;
# confirm against the CSV.
df = pd.read_csv(
    filename,
    skiprows=1,
    header=None,
    names=names,
)

# Preview the first five rows.
df.head()

Unnamed: 0,preg,gluc,blood,skin,ins,bmi,ped,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Convert the DataFrame to a NumPy array. DataFrame.to_numpy() is the
# accessor the pandas documentation recommends over the older `.values`
# attribute; it yields the same ndarray here.
array = df.to_numpy()

# Confirm the container type, then show the contents (NumPy abbreviates
# large arrays with "..." when printing).
print(type(array))
print(array)

<class 'numpy.ndarray'>
[[  6.    148.     72.    ...   0.627  50.      1.   ]
 [  1.     85.     66.    ...   0.351  31.      0.   ]
 [  8.    183.     64.    ...   0.672  32.      1.   ]
 ...
 [  5.    121.     72.    ...   0.245  30.      0.   ]
 [  1.    126.     60.    ...   0.349  47.      1.   ]
 [  1.     93.     70.    ...   0.315  23.      0.   ]]


In [8]:
# Split the array column-wise into input and output: columns 0-7 become the
# input matrix X and column 8 becomes the output vector y.
X, y = array[:, :8], array[:, 8]

# Report both shapes, one per line.
print(X.shape, y.shape, sep='\n')

(768, 8)
(768,)


# 1. Rescale Data

In [11]:
# Rescale the data: learn each column's minimum and maximum from X, then map
# every column onto the interval given by feature_range.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
rescaledX = scaler.transform(X)

# Set NumPy's print options so floats are shown with 3 digits of precision.
np.set_printoptions(precision=3)

# Print the first 5 rows of every column of X after rescaling.
print(rescaledX[:5])

[[0.353 0.744 0.59  0.354 0.    0.501 0.234 0.483]
 [0.059 0.427 0.541 0.293 0.    0.396 0.117 0.167]
 [0.471 0.92  0.525 0.    0.    0.347 0.254 0.183]
 [0.059 0.447 0.541 0.232 0.111 0.419 0.038 0.   ]
 [0.    0.688 0.328 0.354 0.199 0.642 0.944 0.2  ]]


# 2. Standardize Data

In [12]:
# Standardize the data: centre each column of X on zero mean and scale it to
# unit variance. StandardScaler is imported in the notebook's import cell
# together with the other dependencies, so a fresh "Restart & Run All"
# resolves every import up front instead of midway through the notebook.
scaler2 = StandardScaler()
rescaledX2 = scaler2.fit_transform(X)  # fit on X and transform it in one call

# Set NumPy's print options so floats are shown with 3 digits of precision.
np.set_printoptions(precision=3)

# Print the first 5 rows of every column of X after standardizing.
print(rescaledX2[0:5, :])

[[ 0.64   0.848  0.15   0.907 -0.693  0.204  0.468  1.426]
 [-0.845 -1.123 -0.161  0.531 -0.693 -0.684 -0.365 -0.191]
 [ 1.234  1.944 -0.264 -1.288 -0.693 -1.103  0.604 -0.106]
 [-0.845 -0.998 -0.161  0.155  0.123 -0.494 -0.921 -1.042]
 [-1.142  0.504 -1.505  0.907  0.766  1.41   5.485 -0.02 ]]


# 3. Normalize Data

In [13]:
# Normalize the data: rescale each row (sample) of X to unit norm, using
# Normalizer's default norm. Normalizer is imported in the notebook's import
# cell together with the other dependencies rather than midway through the
# notebook.
scaler3 = Normalizer()
normalizedX = scaler3.fit_transform(X)  # fit on X and transform it in one call

# Set NumPy's print options so floats are shown with 3 digits of precision.
np.set_printoptions(precision=3)

# Print the first 5 rows of every column of X after normalizing.
print(normalizedX[0:5, :])

[[0.034 0.828 0.403 0.196 0.    0.188 0.004 0.28 ]
 [0.008 0.716 0.556 0.244 0.    0.224 0.003 0.261]
 [0.04  0.924 0.323 0.    0.    0.118 0.003 0.162]
 [0.007 0.588 0.436 0.152 0.622 0.186 0.001 0.139]
 [0.    0.596 0.174 0.152 0.731 0.188 0.01  0.144]]


# 4. Binarize Data

In [14]:
# Binarize the data: values strictly greater than the threshold (0.0) become
# 1 and all other values become 0. Binarizer is imported in the notebook's
# import cell together with the other dependencies rather than midway
# through the notebook.
binarizer = Binarizer(threshold=0.0)
binaryX = binarizer.fit_transform(X)  # fit on X and transform it in one call

# Set NumPy's print options so floats are shown with 3 digits of precision.
np.set_printoptions(precision=3)

# Print the first 5 rows of every column of X after binarizing.
print(binaryX[0:5, :])

[[1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 1. 1. 1. 1. 1. 1. 1.]]
