# Data Preprocessing dengan scikit-learn

## Sample Data

In [None]:
import numpy as np
from sklearn import preprocessing

In [None]:
# Toy dataset reused by every section below: 4 rows x 3 columns of signed
# floats, so thresholding, scaling and normalisation all have visible effects.
sample_data = np.array(
    [
        [2.1, -1.9, 5.5],
        [-1.5, 2.4, 3.5],
        [0.5, -7.9, 5.6],
        [5.9, 2.3, -5.8],
    ]
)

print(sample_data)

[[ 2.1 -1.9  5.5]
 [-1.5  2.4  3.5]
 [ 0.5 -7.9  5.6]
 [ 5.9  2.3 -5.8]]


In [None]:
# Report the array's dimensions as a (rows, columns) tuple.
print(f"Ukuran: {sample_data.shape}")

Ukuran: (4, 3)


## Binarisation

- Digunakan untuk mengubah nilai numerik (integer / float) menjadi nilai biner (0 atau 1).
- Nilai yang lebih besar dari threshold menjadi 1; nilai yang lebih kecil atau sama dengan threshold menjadi 0.

In [None]:
# Re-display the untransformed matrix so it can be compared with the binarised result.
sample_data

array([[ 2.1, -1.9,  5.5],
       [-1.5,  2.4,  3.5],
       [ 0.5, -7.9,  5.6],
       [ 5.9,  2.3, -5.8]])

In [None]:
# Map values > 0.5 to 1 and values <= 0.5 to 0.
# The threshold is fixed at construction; transform() is called directly,
# with no preceding fit() call.
preprocessor = preprocessing.Binarizer(threshold=0.5)
binarised_data = preprocessor.transform(sample_data) # transform() applies the threshold to every element
# Bare expression on the last line so the notebook displays the result.
binarised_data

array([[1., 0., 1.],
       [0., 1., 1.],
       [0., 0., 1.],
       [1., 1., 0.]])

## Scaling

- Digunakan untuk mengubah skala nilai setiap feature (kolom) ke dalam range yang ditentukan, misalnya 0 sampai 1.

In [None]:
# Re-display the untransformed matrix so it can be compared with the scaled result.
sample_data

array([[ 2.1, -1.9,  5.5],
       [-1.5,  2.4,  3.5],
       [ 0.5, -7.9,  5.6],
       [ 5.9,  2.3, -5.8]])

In [None]:
# Two-step usage: fit() learns each column's min and max from the data, then
# transform() rescales every column linearly into the [0, 1] range.
# fit() returns the fitted scaler itself, so it can be chained onto the constructor.
preprocessor = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(sample_data)
scaled_data = preprocessor.transform(sample_data)
# Bare expression on the last line so the notebook displays the result.
scaled_data

array([[0.48648649, 0.58252427, 0.99122807],
       [0.        , 1.        , 0.81578947],
       [0.27027027, 0.        , 1.        ],
       [1.        , 0.99029126, 0.        ]])

In [None]:
# One-step alternative: fit_transform() fits the scaler and transforms the
# data in a single call; the displayed output is identical to calling fit()
# followed by transform().
scaled_data = preprocessor.fit_transform(sample_data) # fit and transform in one call
scaled_data

array([[0.48648649, 0.58252427, 0.99122807],
       [0.        , 1.        , 0.81578947],
       [0.27027027, 0.        , 1.        ],
       [1.        , 0.99029126, 0.        ]])

## L1 Normalisation: Least Absolute Deviations

- Setiap baris diskalakan sehingga jumlah nilai absolut elemen-elemennya sama dengan 1.

In [None]:
# Re-display the untransformed matrix so it can be compared with the normalised results.
sample_data

array([[ 2.1, -1.9,  5.5],
       [-1.5,  2.4,  3.5],
       [ 0.5, -7.9,  5.6],
       [ 5.9,  2.3, -5.8]])

In [None]:
# L1 normalisation: each row is divided by the sum of the absolute values of
# its elements, so the absolute values in every output row add up to 1
# (e.g. the first row is divided by 2.1 + 1.9 + 5.5 = 9.5).
l1_normalised_data = preprocessing.normalize(sample_data, norm="l1")
l1_normalised_data

array([[ 0.22105263, -0.2       ,  0.57894737],
       [-0.2027027 ,  0.32432432,  0.47297297],
       [ 0.03571429, -0.56428571,  0.4       ],
       [ 0.42142857,  0.16428571, -0.41428571]])

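## L2 Normalisation: Least Squares

- Setiap baris diskalakan sehingga jumlah kuadrat elemen-elemennya sama dengan 1 (panjang Euclidean = 1).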

In [None]:
# L2 normalisation: each row is divided by its Euclidean length (the square
# root of the sum of its squared elements), so every output row has unit length.
l2_normalised_data = preprocessing.normalize(sample_data, norm="l2")
l2_normalised_data

array([[ 0.33946114, -0.30713151,  0.88906489],
       [-0.33325106,  0.53320169,  0.7775858 ],
       [ 0.05156558, -0.81473612,  0.57753446],
       [ 0.68706914,  0.26784051, -0.6754239 ]])