In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import normalize

In [None]:
# pipeline
def pipeline(data):
    """Clean a raw car-listing frame and return a scaled, encoded feature frame.

    The steps run in this order:

    1. Rename the camelCase columns listed in ``colmap`` to snake_case.
    2. Parse ``ad_created``, ``date_crawled`` and ``last_seen`` as datetimes.
    3. Strip ``$``, ``,``, ``km`` and any decimal part from the ``price`` and
       ``odometer`` strings and convert both columns to float.
    4. Cast ``postal_code`` and ``registration_month`` to strings so they are
       handled as categories rather than numbers.
    5. Drop numeric columns with at most one distinct value, and categorical
       columns whose value distribution is skewed (see
       ``is_uninformative_category``).
    6. Keep only rows whose price lies in ``[500, 40000]``.
    7. Fill missing values: median for numeric columns, most frequent value
       for every other column.
    8. Min-max scale the numeric columns, one-hot encode the categorical
       columns, and concatenate them after the remaining (e.g. datetime)
       columns.

    Args:
        data: DataFrame holding at least the three date columns, ``price``,
            ``odometer``, the postal code and the registration month, under
            either their original or their renamed labels. ``price`` and
            ``odometer`` must be string columns. The input is not modified.

    Returns:
        A new DataFrame with the untouched non-numeric/non-categorical
        columns first, then the scaled numeric columns, then the one-hot
        encoded categorical columns.
    """
    def is_uninformative_category(series):
        """Return True when a categorical column should be dropped.

        A column is dropped when any category's share deviates by more than
        10% (relative) from a uniform split, when the most frequent value
        covers more than 99% of the non-missing rows, or when the column has
        no non-missing values at all.
        """
        freq = series.value_counts(normalize=True)
        if freq.empty:
            # Nothing but missing values: no information, and the uniform
            # share below would divide by zero.
            return True
        uniform_share = 1 / len(freq)
        deviates = ((freq - uniform_share).abs() / uniform_share > 0.1).any()
        return bool(deviates or freq.max() > 0.99)

    def min_max_scaler(series):
        """Scale a numeric series linearly into the range [0, 1]."""
        low = series.min()
        span = series.max() - low
        if span == 0:
            # Constant column: avoid 0/0 -> NaN and map every value to 0.
            return series - low
        return (series - low) / span

    colmap = {
        "dateCreated": "ad_created",
        "dateCrawled": "date_crawled",
        "fuelType": "fuel_type",
        "lastSeen": "last_seen",
        "monthOfRegistration": "registration_month",
        "notRepairedDamage": "unrepaired_damage",
        "nrOfPictures": "num_of_pictures",
        "offerType": "offer_type",
        "postalCode": "postal_code",
        "powerPS": "power_ps",
        "vehicleType": "vehicle_type",
        "yearOfRegistration": "registration_year",
    }

    # step 1: renaming (returns a new frame, so the caller's data is untouched)
    data = data.rename(columns=colmap)

    # step 2: to datetime
    for col in ("ad_created", "date_crawled", "last_seen"):
        data[col] = pd.to_datetime(data[col])

    # step 3: fix numeric cols
    # regex=True must be explicit: newer pandas treats the pattern literally
    # by default, which would leave "$" and "," in place and break the cast.
    numeric_noise = r'\$|,|km|\.\d+'
    for col in ("price", "odometer"):
        data[col] = data[col].str.replace(numeric_noise, '', regex=True).astype(float)

    # step 3.5: change categorical int to str
    data['postal_code'] = data['postal_code'].astype('str')
    data['registration_month'] = data['registration_month'].astype('str')

    # step 4: drop bad columns
    # drop non-informative numeric columns (at most one distinct value)
    numeric_unique = data.select_dtypes('number').nunique()
    constant_numeric_cols = list(numeric_unique[numeric_unique <= 1].index)
    # drop skewed / near-constant categorical columns
    categorical = data.select_dtypes(['object', 'category'])
    skewed_categorical_cols = [
        col for col in categorical.columns
        if is_uninformative_category(categorical[col])
    ]
    cleaned = data.drop(columns=constant_numeric_cols + skewed_categorical_cols)

    # step 5: remove outliers
    # .copy() so the imputation below writes to an independent frame rather
    # than to a slice of `cleaned`.
    filtered = cleaned[cleaned['price'].between(500, 40000)].copy()

    # step 6: fill missing value
    numeric_cols = set(filtered.select_dtypes('number').columns)
    # NOTE(review): the first column is deliberately left out, as in the
    # original code; confirm whether it should be imputed as well.
    for col in filtered.columns[1:]:
        column = filtered[col]
        if col in numeric_cols:
            fill_value = column.median()
        else:
            modes = column.mode()
            if modes.empty:
                continue
            # mode() returns a Series; passing it to fillna directly would
            # align on index and fill almost nothing, so take the scalar.
            fill_value = modes.iloc[0]
        if pd.isna(fill_value):
            continue
        filtered[col] = column.fillna(fill_value)

    # step 7: normalize and encode
    scaled_numeric = filtered.select_dtypes('number').apply(min_max_scaler)
    categorical_part = filtered.select_dtypes(['object', 'category'])
    if categorical_part.shape[1]:
        encoded = pd.get_dummies(categorical_part)
    else:
        # get_dummies has nothing to encode; keep the empty frame as is.
        encoded = categorical_part
    remaining = filtered.select_dtypes(exclude=['number', 'object', 'category'])
    return pd.concat([remaining, scaled_numeric, encoded], axis=1)
        