# Data Pipeline and Processing
- Data pipeline: 
    - Aliran data, dari sumber, ke proses, hingga output 
    - Di gambarkan dengan diagram alir
    - Proses: tempat terjadi transformasi data
        - transformasi data: perubahan isi data
    - Data source: asal raw data
    - Input: data dalam format yang sudah siap diolah
    - Connector: menghubungkan data source ke proses lain, yang fungsinya hanya membaca dan mengubah format data.
    - Data stories: laporan, log book data pipeline    

## Data preprocessing
- Tujuan pipeline dibuat: memproses data
- 

In [4]:
# Data preprocessing

from sklearn import datasets
import pandas as pd

def create_df():
    iris = datasets.load_iris()
    cols = ['sepa1_length','sepa1_width', 'peta1_length', 'peta1_width']
    df = pd.DataFrame(iris.data)
    df.columns = cols
    df['target'] = iris.target
    return df

df = create_df()
df.head()

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [11]:
# 1. Split Train dan Test

from sklearn.model_selection import train_test_split

def split_df(df, test_size=0.2):
    train, test = train_test_split(df, test_size=test_size)
    return train, test

train, test = split_df(df, test_size=0.1)
# Yang penting: data train dan data test karakternya serupa

In [10]:
len(df), len(train), len(test)

(150, 135, 15)

In [13]:
iris = datasets.load_iris()
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [18]:
# 2. Missing value
def create_df_with_none():
    cols = ['sepa1_length','sepa1_width', 'peta1_length', 'peta1_width']
    data = [[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, None, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, None, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, None, 0.2],
       [4.4, 2.9, None, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.2],
       [5. , 3.2, 1.2, 0.2],
       [5.5, 3.5, 1.3, 0.2],
       [4.9, 3.6, 1.4, 0.1],
       [4.4, 3. , 1.3, 0.2],
       [5.1, 3.4, 1.5, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [7. , 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4. , 1.3],
       [6.5, 2.8, 4.6, 1.5],
       [5.7, 2.8, 4.5, 1.3],
       [6.3, 3.3, 4.7, 1.6],
       [4.9, 2.4, 3.3, 1. ],
       [6.6, 2.9, 4.6, 1.3],
       [5.2, 2.7, 3.9, 1.4],
       [5. , 2. , 3.5, 1. ],
       [5.9, 3. , 4.2, 1.5],
       [6. , 2.2, 4. , 1. ],
       [6.1, 2.9, 4.7, 1.4],
       [5.6, 2.9, 3.6, 1.3],
       [6.7, 3.1, 4.4, 1.4],
       [5.6, 3. , 4.5, 1.5],
       [5.8, 2.7, 4.1, 1. ],
       [6.2, 2.2, 4.5, 1.5],
       [5.6, 2.5, 3.9, 1.1],
       [5.9, 3.2, 4.8, 1.8],
       [6.1, 2.8, 4. , 1.3],
       [6.3, 2.5, 4.9, 1.5],
       [6.1, 2.8, 4.7, 1.2],
       [6.4, 2.9, 4.3, 1.3],
       [6.6, 3. , 4.4, 1.4],
       [6.8, 2.8, 4.8, 1.4],
       [6.7, 3. , 5. , 1.7],
       [6. , 2.9, 4.5, 1.5],
       [5.7, 2.6, 3.5, 1. ],
       [5.5, 2.4, 3.8, 1.1],
       [5.5, 2.4, 3.7, 1. ],
       [5.8, 2.7, 3.9, 1.2],
       [6. , 2.7, 5.1, 1.6],
       [5.4, 3. , 4.5, 1.5],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [6.3, 2.3, 4.4, 1.3],
       [5.6, 3. , 4.1, 1.3],
       [5.5, 2.5, 4. , 1.3],
       [5.5, 2.6, 4.4, 1.2],
       [6.1, 3. , 4.6, 1.4],
       [5.8, 2.6, 4. , 1.2],
       [5. , 2.3, 3.3, 1. ],
       [5.6, 2.7, 4.2, 1.3],
       [5.7, 3. , 4.2, 1.2],
       [5.7, 2.9, 4.2, 1.3],
       [6.2, 2.9, 4.3, 1.3],
       [5.1, 2.5, 3. , 1.1],
       [5.7, 2.8, 4.1, 1.3],
       [6.3, 3.3, 6. , 2.5],
       [5.8, 2.7, 5.1, 1.9],
       [7.1, 3. , 5.9, 2.1],
       [6.3, 2.9, 5.6, 1.8],
       [6.5, 3. , 5.8, 2.2],
       [7.6, 3. , 6.6, 2.1],
       [4.9, 2.5, 4.5, 1.7],
       [7.3, 2.9, 6.3, 1.8],
       [6.7, 2.5, 5.8, 1.8],
       [7.2, 3.6, 6.1, 2.5],
       [6.5, 3.2, 5.1, 2. ],
       [6.4, 2.7, 5.3, 1.9],
       [6.8, 3. , 5.5, 2.1],
       [5.7, 2.5, 5. , 2. ],
       [5.8, 2.8, 5.1, 2.4],
       [6.4, 3.2, 5.3, 2.3],
       [6.5, 3. , 5.5, 1.8],
       [7.7, 3.8, 6.7, 2.2],
       [7.7, 2.6, 6.9, 2.3],
       [6. , 2.2, 5. , 1.5],
       [6.9, 3.2, 5.7, 2.3],
       [5.6, 2.8, 4.9, 2. ],
       [7.7, 2.8, 6.7, 2. ],
       [6.3, 2.7, 4.9, 1.8],
       [6.7, 3.3, 5.7, 2.1],
       [7.2, 3.2, 6. , 1.8],
       [6.2, 2.8, 4.8, 1.8],
       [6.1, 3. , 4.9, 1.8],
       [6.4, 2.8, 5.6, 2.1],
       [7.2, 3. , 5.8, 1.6],
       [7.4, 2.8, 6.1, 1.9],
       [7.9, 3.8, 6.4, 2. ],
       [6.4, 2.8, 5.6, 2.2],
       [6.3, 2.8, 5.1, 1.5],
       [6.1, 2.6, 5.6, 1.4],
       [7.7, 3. , 6.1, 2.3],
       [6.3, 3.4, 5.6, 2.4],
       [6.4, 3.1, 5.5, 1.8],
       [6. , 3. , 4.8, 1.8],
       [6.9, 3.1, 5.4, 2.1],
       [6.7, 3.1, 5.6, 2.4],
       [6.9, 3.1, 5.1, 2.3],
       [5.8, 2.7, 5.1, 1.9],
       [6.8, 3.2, 5.9, 2.3],
       [6.7, 3.3, 5.7, 2.5],
       [6.7, 3. , 5.2, 2.3],
       [6.3, 2.5, 5. , 1.9],
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]]
    df = pd.DataFrame(data)
    df.columns = cols
    
    return df
    
df = create_df_with_none()
df.head(10)

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,,0.2
8,4.4,2.9,,0.2
9,4.9,3.1,1.5,0.1


In [19]:
df.isna().sum()

sepa1_length    0
sepa1_width     1
peta1_length    3
peta1_width     0
dtype: int64

In [14]:
# Kalau NaN cukup banyak --> data NaN diganti (mean, mode, meadian: bisa pilih apa saja asal karakter datanya tidak berubah)
# Kalau NaN sedikit --> dibuang saja

In [26]:
from sklearn.impute import SimpleImputer
import numpy as np

def imputer_df(df):
    cols = ['sepa1_length','sepa1_width', 'peta1_length', 'peta1_width']
    imputer = SimpleImputer(fill_value=np.nan, strategy='mean')
    new_df = pd.DataFrame(imputer.fit_transform(df), columns=cols)
    return new_df
    
new_df = imputer_df(df)
new_df

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width
0,5.1,3.500000,1.4,0.2
1,4.9,3.000000,1.4,0.2
2,4.7,3.056376,1.3,0.2
3,4.6,3.100000,1.5,0.2
4,5.0,3.600000,1.4,0.2
...,...,...,...,...
145,6.7,3.000000,5.2,2.3
146,6.3,2.500000,5.0,1.9
147,6.5,3.000000,5.2,2.0
148,6.2,3.400000,5.4,2.3


In [27]:
new_df.isna().sum()

sepa1_length    0
sepa1_width     0
peta1_length    0
peta1_width     0
dtype: int64

In [None]:
#2. Dropna

In [30]:
# 3. Categorical value into --> integer

def get_df_with_string():
    iris = datasets.load_iris()
    cols = ['sepa1_length','sepa1_width', 'peta1_length', 'peta1_width']

    target_dict = {
        0:'setosa',
        1:'versicolor',
        2:'virginica'
        
    }
    target = iris.target
    new_target = []
    for t in target:
        new_target.append(target_dict[t])
    print("target_name: ", set(new_target))
    df = pd.DataFrame(iris.data)
    df.columns = cols
    df['target'] = new_target
    return df

df = get_df_with_string()
df.head()

target_name:  {'versicolor', 'setosa', 'virginica'}


Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [31]:
# Label Encoder
## Memberi label kolom target dengan label pada target_encoded
from sklearn.preprocessing import LabelEncoder

def encode_col(df, col, new_col_suf='_encoded'):
    l1 = LabelEncoder()
    l1.fit(df[col])
    df[col + new_col_suf] = l1.transform(df[col])
    return df, l1

df2, l1 = encode_col(df,'target')
df2.head()

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target,target_encoded
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [32]:
set(list(df2['target_encoded']))

{0, 1, 2}

In [34]:
def inv_encoded_col(df, l1,col, new_col_suf='_inv'):
    df[col + new_col_suf] = l1.inverse_transform(df[col])
    return df

df_inv = inv_encoded_col(df2,l1, 'target_encoded')
df_inv.head(19)

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target,target_encoded,target_encoded_inv
0,5.1,3.5,1.4,0.2,setosa,0,setosa
1,4.9,3.0,1.4,0.2,setosa,0,setosa
2,4.7,3.2,1.3,0.2,setosa,0,setosa
3,4.6,3.1,1.5,0.2,setosa,0,setosa
4,5.0,3.6,1.4,0.2,setosa,0,setosa
5,5.4,3.9,1.7,0.4,setosa,0,setosa
6,4.6,3.4,1.4,0.3,setosa,0,setosa
7,5.0,3.4,1.5,0.2,setosa,0,setosa
8,4.4,2.9,1.4,0.2,setosa,0,setosa
9,4.9,3.1,1.5,0.1,setosa,0,setosa


In [35]:
df_inv.tail(19)

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target,target_encoded,target_encoded_inv
131,7.9,3.8,6.4,2.0,virginica,2,virginica
132,6.4,2.8,5.6,2.2,virginica,2,virginica
133,6.3,2.8,5.1,1.5,virginica,2,virginica
134,6.1,2.6,5.6,1.4,virginica,2,virginica
135,7.7,3.0,6.1,2.3,virginica,2,virginica
136,6.3,3.4,5.6,2.4,virginica,2,virginica
137,6.4,3.1,5.5,1.8,virginica,2,virginica
138,6.0,3.0,4.8,1.8,virginica,2,virginica
139,6.9,3.1,5.4,2.1,virginica,2,virginica
140,6.7,3.1,5.6,2.4,virginica,2,virginica


In [36]:
target = list(df_inv['target'])
target_encoded_inv = list(df_inv['target_encoded_inv'])

for t in range(len(target)):
    if target[t] != target_encoded_inv[t]:
        print("Wrong in :", target[t], t)

In [41]:
# One hot encoder?
## Data pada kolom target diubah menjadi vektor (1, 0, 0)
##['setosa', 'versicolor', 'virginica']
##[x , x, x]

dfz = df[[col for col in list(df.columns) if col != 'target_encoded_inv']]
dfz.head()

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target,target_encoded
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [38]:
# dfz pandas
dfz2 = pd.get_dummies(data=dfz)
dfz2.head()

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target_encoded,target_setosa,target_versicolor,target_virginica
0,5.1,3.5,1.4,0.2,0,1,0,0
1,4.9,3.0,1.4,0.2,0,1,0,0
2,4.7,3.2,1.3,0.2,0,1,0,0
3,4.6,3.1,1.5,0.2,0,1,0,0
4,5.0,3.6,1.4,0.2,0,1,0,0


In [39]:
dfz2.tail()

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target_encoded,target_setosa,target_versicolor,target_virginica
145,6.7,3.0,5.2,2.3,2,0,0,1
146,6.3,2.5,5.0,1.9,2,0,0,1
147,6.5,3.0,5.2,2.0,2,0,0,1
148,6.2,3.4,5.4,2.3,2,0,0,1
149,5.9,3.0,5.1,1.8,2,0,0,1


In [52]:
from sklearn.preprocessing import LabelBinarizer

def binarize_df(df, col):
    job_encoder = LabelBinarizer() #Fungsi fitting
    job_encoder.fit(df[col])
    transformed = job_encoder.transform(df[col])
    ohe_df = pd.DataFrame(transformed)

    data = pd.concat([df, ohe_df], axis=1)
    return data, job_encoder

data, job_encoder = binarize_df(dfz, 'target')
data.head()

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target,target_encoded,0,1,2
0,5.1,3.5,1.4,0.2,setosa,0,1,0,0
1,4.9,3.0,1.4,0.2,setosa,0,1,0,0
2,4.7,3.2,1.3,0.2,setosa,0,1,0,0
3,4.6,3.1,1.5,0.2,setosa,0,1,0,0
4,5.0,3.6,1.4,0.2,setosa,0,1,0,0


In [59]:
#test = LabelBinarizer()
help LabelBinarizer()

SyntaxError: invalid syntax (<ipython-input-59-dfa8fe0dc63c>, line 2)

In [None]:
# Semua processor karakternya sama seperti ML
# Karakter ML:
## 1. fit(training --> data training), membuat model ML
## 2. transform --> predict
## 3. fit_transform --> membuat model + predict
## 4. inverse_transform --> hasil prediksi atau inverse predict

In [60]:
def inv_binarizer(df, job_encoder, cols, suf='inv_binarizer'):
    df[suf] = job_encoder.inverse_transform(df[cols].to_numpy())
    return df

data2 = inv_binarizer(data, job_encoder, [0,1,2])
data2

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target,target_encoded,0,1,2,inv_binarizer
0,5.1,3.5,1.4,0.2,setosa,0,1,0,0,setosa
1,4.9,3.0,1.4,0.2,setosa,0,1,0,0,setosa
2,4.7,3.2,1.3,0.2,setosa,0,1,0,0,setosa
3,4.6,3.1,1.5,0.2,setosa,0,1,0,0,setosa
4,5.0,3.6,1.4,0.2,setosa,0,1,0,0,setosa
...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,2,0,0,1,virginica
146,6.3,2.5,5.0,1.9,virginica,2,0,0,1,virginica
147,6.5,3.0,5.2,2.0,virginica,2,0,0,1,virginica
148,6.2,3.4,5.4,2.3,virginica,2,0,0,1,virginica


In [61]:
# Normalizing data sets

data2.var(ddof=0)

  data2.var(ddof=0)


sepa1_length      0.681122
sepa1_width       0.188713
peta1_length      3.095503
peta1_width       0.577133
target_encoded    0.666667
0                 0.222222
1                 0.222222
2                 0.222222
dtype: float64

In [None]:
from sklearn.preprocessing import StandardScaler



def scaled_df(df):
    ss = StandardScaler()
    old_col = [c for c in list(df.columns) if c not in ['target', 'inv_binarizer']]
    new_col = [str(c) + '_n' for c in list(df.columns) if str(c) not in ['target', 'inv_binarizer']]
    df[new_col] = ss.fit_transform(df[old_col])
#     print("1", old_col)
#     print("2", new_col)
    return df, ss

dfz, ss = scaled_df(data2)
dfz

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target,target_encoded,0,1,2,inv_binarizer,sepa1_length_n,sepa1_width_n,peta1_length_n,peta1_width_n,target_encoded_n,0_n,1_n,2_n
0,5.1,3.5,1.4,0.2,setosa,0,1,0,0,setosa,-0.900681,1.019004,-1.340227,-1.315444,-1.224745,1.414214,-0.707107,-0.707107
1,4.9,3.0,1.4,0.2,setosa,0,1,0,0,setosa,-1.143017,-0.131979,-1.340227,-1.315444,-1.224745,1.414214,-0.707107,-0.707107
2,4.7,3.2,1.3,0.2,setosa,0,1,0,0,setosa,-1.385353,0.328414,-1.397064,-1.315444,-1.224745,1.414214,-0.707107,-0.707107
3,4.6,3.1,1.5,0.2,setosa,0,1,0,0,setosa,-1.506521,0.098217,-1.283389,-1.315444,-1.224745,1.414214,-0.707107,-0.707107
4,5.0,3.6,1.4,0.2,setosa,0,1,0,0,setosa,-1.021849,1.249201,-1.340227,-1.315444,-1.224745,1.414214,-0.707107,-0.707107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,2,0,0,1,virginica,1.038005,-0.131979,0.819596,1.448832,1.224745,-0.707107,-0.707107,1.414214
146,6.3,2.5,5.0,1.9,virginica,2,0,0,1,virginica,0.553333,-1.282963,0.705921,0.922303,1.224745,-0.707107,-0.707107,1.414214
147,6.5,3.0,5.2,2.0,virginica,2,0,0,1,virginica,0.795669,-0.131979,0.819596,1.053935,1.224745,-0.707107,-0.707107,1.414214
148,6.2,3.4,5.4,2.3,virginica,2,0,0,1,virginica,0.432165,0.788808,0.933271,1.448832,1.224745,-0.707107,-0.707107,1.414214


In [68]:
dfz.var(ddof=0)

  dfz.var(ddof=0)


sepa1_length        0.681122
sepa1_width         0.188713
peta1_length        3.095503
peta1_width         0.577133
target_encoded      0.666667
0                   0.222222
1                   0.222222
2                   0.222222
sepa1_length_n      1.000000
sepa1_width_n       1.000000
peta1_length_n      1.000000
peta1_width_n       1.000000
target_encoded_n    1.000000
0_n                 1.000000
1_n                 1.000000
2_n                 1.000000
dtype: float64

In [69]:
def inv_scaled_df(df, ss):
    old_col = [c for c in list(df.columns) if str(c).endswith('_n')]
    new_col = [str(c) + '_inv' for c in list(df.columns) if str(c).endswith('_n')]
    df[new_col] = ss.inverse_transform(df[old_col])
    return df
df_inv = inv_scaled_df(dfz, ss)
df_inv
    

Unnamed: 0,sepa1_length,sepa1_width,peta1_length,peta1_width,target,target_encoded,0,1,2,inv_binarizer,...,1_n,2_n,sepa1_length_n_inv,sepa1_width_n_inv,peta1_length_n_inv,peta1_width_n_inv,target_encoded_n_inv,0_n_inv,1_n_inv,2_n_inv
0,5.1,3.5,1.4,0.2,setosa,0,1,0,0,setosa,...,-0.707107,-0.707107,5.1,3.5,1.4,0.2,1.110223e-16,1.0,0.0,0.0
1,4.9,3.0,1.4,0.2,setosa,0,1,0,0,setosa,...,-0.707107,-0.707107,4.9,3.0,1.4,0.2,1.110223e-16,1.0,0.0,0.0
2,4.7,3.2,1.3,0.2,setosa,0,1,0,0,setosa,...,-0.707107,-0.707107,4.7,3.2,1.3,0.2,1.110223e-16,1.0,0.0,0.0
3,4.6,3.1,1.5,0.2,setosa,0,1,0,0,setosa,...,-0.707107,-0.707107,4.6,3.1,1.5,0.2,1.110223e-16,1.0,0.0,0.0
4,5.0,3.6,1.4,0.2,setosa,0,1,0,0,setosa,...,-0.707107,-0.707107,5.0,3.6,1.4,0.2,1.110223e-16,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,2,0,0,1,virginica,...,-0.707107,1.414214,6.7,3.0,5.2,2.3,2.000000e+00,0.0,0.0,1.0
146,6.3,2.5,5.0,1.9,virginica,2,0,0,1,virginica,...,-0.707107,1.414214,6.3,2.5,5.0,1.9,2.000000e+00,0.0,0.0,1.0
147,6.5,3.0,5.2,2.0,virginica,2,0,0,1,virginica,...,-0.707107,1.414214,6.5,3.0,5.2,2.0,2.000000e+00,0.0,0.0,1.0
148,6.2,3.4,5.4,2.3,virginica,2,0,0,1,virginica,...,-0.707107,1.414214,6.2,3.4,5.4,2.3,2.000000e+00,0.0,0.0,1.0


In [70]:
df_inv.var(ddof=0)

  df_inv.var(ddof=0)


sepa1_length            0.681122
sepa1_width             0.188713
peta1_length            3.095503
peta1_width             0.577133
target_encoded          0.666667
0                       0.222222
1                       0.222222
2                       0.222222
sepa1_length_n          1.000000
sepa1_width_n           1.000000
peta1_length_n          1.000000
peta1_width_n           1.000000
target_encoded_n        1.000000
0_n                     1.000000
1_n                     1.000000
2_n                     1.000000
sepa1_length_n_inv      0.681122
sepa1_width_n_inv       0.188713
peta1_length_n_inv      3.095503
peta1_width_n_inv       0.577133
target_encoded_n_inv    0.666667
0_n_inv                 0.222222
1_n_inv                 0.222222
2_n_inv                 0.222222
dtype: float64

In [71]:
# Pipeline sklearn
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe = Pipeline(
    [('scaler', StandardScaler()),
     ('svc', SVC())])
# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set
pipe.fit(X_train, y_train)


Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])
pipe.score(X_test, y_test)

0.88

## Explore:
- sklearn.preprocessing?
- Pipeline sklearn?
- pipeline di dask dan pyspark?
- Referensi: Data pipeline and data preprocessing