# In [4]:
import pandas as pd
import numpy as np
import pickle

def handlingMissingValue(data):
    """Flag and impute missing values in the customer DataFrame.

    The frame is modified in place and also returned.

    Steps:
      1. For every non-object column except 'Churn' that contains at least
         one missing value, add a ``<column>_na`` column holding 1 where the
         value was missing and 0 elsewhere.
      2. Fill missing 'Tenure', 'WarehouseToHome' and 'DaySinceLastOrder'
         values with the median of the row's 'MaritalStatus' group. If that
         group has no median (every value in the group is missing, or the
         row has no 'MaritalStatus'), fall back to the column's overall
         median so the gap does not survive into later integer casts.

    NOTE(review): indicator columns are only created for columns that have
    missing values in *this* frame, so the resulting column set depends on
    the data passed in -- confirm it matches what the model was trained on.

    Args:
        data: DataFrame containing at least 'MaritalStatus', 'Tenure',
            'WarehouseToHome' and 'DaySinceLastOrder'.

    Returns:
        The same DataFrame object, with indicators added and gaps filled.
    """
    # Snapshot the column names: indicator columns are appended while looping.
    for var in list(data.columns):
        if var == 'Churn' or data[var].dtype == 'O':
            continue
        is_missing = data[var].isnull()
        if is_missing.any():
            # add binary missing indicator
            data[var + '_na'] = np.where(is_missing, 1, 0)

    for var in ('Tenure', 'WarehouseToHome', 'DaySinceLastOrder'):
        group_median = data.groupby(['MaritalStatus'])[var].transform('median')
        overall_median = data[var].median()
        # Group median first; overall median only where no group median exists.
        data[var] = data[var].fillna(group_median).fillna(overall_median)
    return data

def handlingOutliers(data, outlier_num=('WarehouseToHome',), maximum=36.5):
    """Drop rows whose value in any listed column exceeds ``maximum``.

    Rows are removed from ``data`` in place and the same object is returned.
    Rows with a missing value in a checked column are kept, because a NaN
    comparison is never greater than the threshold.

    Every listed column is checked; a column without outliers no longer
    stops the remaining columns from being processed.

    NOTE(review): rows are removed by index label, so this assumes the index
    labels are unique -- confirm for frames not read straight from CSV.

    Args:
        data: DataFrame containing the columns named in ``outlier_num``.
        outlier_num: Column names to check. Defaults to 'WarehouseToHome'.
        maximum: Largest value that is still kept. Defaults to 36.5.

    Returns:
        The same DataFrame object with the outlier rows removed.
    """
    for var in outlier_num:
        outlier_index = data.index[data[var] > maximum]
        # Dropping an empty selection is a no-op, so no length check is needed.
        data.drop(index=outlier_index, inplace=True)
    return data


# Function to limit the range of Tenure
def tenure3years(data):
    """Return only the rows whose 'Tenure' is at most 36.

    Rows with a missing 'Tenure' are dropped as well, because a NaN
    comparison evaluates to False. The limit of 36 presumably means months
    (three years) -- confirm against the dataset description.

    Args:
        data: DataFrame containing a 'Tenure' column.

    Returns:
        A new, independent DataFrame with the matching rows; ``data`` itself
        is left unchanged.
    """
    within_limit = data['Tenure'] <= 36
    # Explicit copy so callers can add or overwrite columns on the result
    # without pandas raising SettingWithCopyWarning.
    return data[within_limit].copy()

def binningData(data):
    """Add binned versions of tenure, cashback and registered-device counts.

    Rows are first restricted with ``tenure3years``; three categorical
    columns are then added:

      * 'TenureGroup': 'Tenure' cut at 12 / 24 / 36 and labelled
        '1st year', '2nd year', '3rd year'.
      * 'CashbackAmountGroup': 'CashbackAmount' cut into steps of 50 up to
        350 (the first bin starts at -1 so that 0 is included).
      * 'NumberOfDRGroup': 'NumberOfDeviceRegistered' cut into (0, 3] and
        (3, 6].

    ``pd.cut`` uses right-closed intervals by default and yields NaN for
    values outside the outermost edges.

    Args:
        data: DataFrame containing 'Tenure', 'CashbackAmount' and
            'NumberOfDeviceRegistered'.

    Returns:
        A new DataFrame with the filtered rows and the three group columns;
        the input frame is not modified.
    """
    # Copy the filtered rows: assigning columns onto a boolean-mask slice
    # triggers pandas' SettingWithCopyWarning.
    data = tenure3years(data).copy()
    labels_tenure = ['1st year', '2nd year', '3rd year']
    data['TenureGroup'] = pd.cut(data['Tenure'], bins=[-1, 12, 24, 36], labels=labels_tenure)
    data['CashbackAmountGroup'] = pd.cut(data['CashbackAmount'], bins=[-1, 50, 100, 150, 200, 250, 300, 350])
    data['NumberOfDRGroup'] = pd.cut(data['NumberOfDeviceRegistered'], bins=[0, 3, 6])
    return data

def changeDataType(data):
    """Cast count-like columns to int64 and the binned columns to strings.

    The frame is modified in place and also returned. Float values are
    truncated toward zero by the integer cast. The cast is vectorised, so it
    also produces int64 columns for an empty frame.

    Args:
        data: DataFrame containing 'Tenure', 'WarehouseToHome',
            'NumberOfDeviceRegistered', 'DaySinceLastOrder', 'TenureGroup',
            'CashbackAmountGroup' and 'NumberOfDRGroup'.

    Returns:
        The same DataFrame object with the converted columns.

    Raises:
        ValueError: If one of the integer columns still contains NaN.
    """
    integer_columns = ('Tenure', 'WarehouseToHome', 'NumberOfDeviceRegistered', 'DaySinceLastOrder')
    string_columns = ('TenureGroup', 'CashbackAmountGroup', 'NumberOfDRGroup')
    for column in integer_columns:
        data[column] = data[column].astype(np.int64)
    for column in string_columns:
        data[column] = data[column].astype(str)
    return data

# Map non-standard category labels onto their standard equivalents
def changeValue(data):
    """Normalise labels in 'PreferedOrderCat' and 'MaritalStatus'.

    'Mobile' becomes 'Mobile Phone' and 'Grocery' becomes 'Others';
    'Divorced' and 'Single' are merged into 'Single/Divorced'. All other
    values are left as they are. The frame is modified in place and also
    returned.
    """
    replacements = {
        'PreferedOrderCat': {'Mobile': 'Mobile Phone', 'Grocery': 'Others'},
        'MaritalStatus': {'Divorced': 'Single/Divorced', 'Single': 'Single/Divorced'},
    }
    for column, mapping in replacements.items():
        data[column] = data[column].replace(mapping)
    return data

# Put object-typed columns first, followed by every other column
def order_column(data):
    """Return ``data`` with object-dtype columns moved ahead of the rest.

    The relative order inside each of the two groups is preserved. The
    input frame is not modified; a re-ordered selection is returned.
    """
    object_columns = []
    other_columns = []
    for name in data.columns:
        if data[name].dtypes == 'O':
            object_columns.append(name)
        else:
            other_columns.append(name)
    return data[object_columns + other_columns]

def preprocessing(data):
    """Run the full preprocessing pipeline on a raw customer DataFrame.

    Steps, in order: missing-value handling, outlier removal, tenure
    filtering plus binning, dtype conversion, label normalisation and
    column re-ordering.

    Args:
        data: Raw DataFrame with the columns the individual steps expect.

    Returns:
        A new, fully preprocessed DataFrame. The input frame is left
        untouched.
    """
    # The first steps add columns and drop rows in place; work on a copy so
    # the caller's frame is not left half-processed.
    data = data.copy()
    steps = (
        handlingMissingValue,
        handlingOutliers,
        binningData,  # applies the tenure3years filter itself before binning
        changeDataType,
        changeValue,
        order_column,
    )
    for step in steps:
        data = step(data)
    return data


# In [6]:
# Load the test dataset and run it through the preprocessing pipeline
df_test = preprocessing(pd.read_csv('........'))

# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
filename = 'xgb_clf.sav'
# Context manager guarantees the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
y_pred = loaded_model.predict(df_test)