In [69]:
import pandas as pd
import numpy as np
import copy
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import util
from util import load_config, pickle_dump, pickle_load
from sklearn.preprocessing import StandardScaler

In [2]:
def load_dataset(config: dict):
    x_train = pickle_load(config["train_set_path"][0])
    y_train = pickle_load(config["train_set_path"][1])

    x_valid = pickle_load(config["valid_set_path"][0])
    y_valid = pickle_load(config["valid_set_path"][1])

    x_test = pickle_load(config["test_set_path"][0])
    y_test = pickle_load(config["test_set_path"][1])

    # reset index
    x_train = x_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    x_valid = x_valid.reset_index(drop=True)
    y_valid = y_valid.reset_index(drop=True)

    x_test = x_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    
    return x_train, y_train, x_valid, y_valid, x_test, y_test

In [3]:
config = util.load_config()

In [4]:
x_train, y_train, x_valid, y_valid, x_test, y_test = load_dataset(config)

In [5]:
x_train.head(2)

Unnamed: 0,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,preferred_offer_types,medium_of_operation,...,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,18,M,BO1RD7I,Town,Gold Membership,2015-01-14,Yes,CID49290,Gift Vouchers/Coupons,Both,...,10,599.41,49202.84,Error,738.46,No,Yes,No,Not Applicable,Too many ads
1,39,M,9XX2D8Y,,Silver Membership,2016-09-17,Yes,CID62128,Without Offers,Desktop,...,22,95.62,11001.35,15.0,249.450376,No,Yes,Yes,Solved in Follow-up,Poor Website


In [6]:
x_train_numerical = x_train[config["numerical_column"]]
x_train_numerical

Unnamed: 0,age,days_since_last_login,points_in_wallet
0,18,10,738.460000
1,39,22,249.450376
2,34,12,568.140000
3,56,18,716.410000
4,23,10,671.240000
...,...,...,...
25889,58,6,407.179492
25890,56,3,776.420000
25891,20,10,
25892,57,16,693.920000


In [7]:
x_train_numerical.isnull().any()

age                      False
days_since_last_login    False
points_in_wallet          True
dtype: bool

In [8]:
x_valid_numerical = x_valid[config["numerical_column"]]
x_valid_numerical

Unnamed: 0,age,days_since_last_login,points_in_wallet
0,40,-999,548.870000
1,49,17,773.760000
2,46,12,353.290489
3,40,-999,797.180000
4,48,14,758.740000
...,...,...,...
5544,22,14,
5545,56,7,288.001842
5546,26,14,445.142384
5547,41,23,246.045536


In [9]:
x_valid_numerical.isnull().any()

age                      False
days_since_last_login    False
points_in_wallet          True
dtype: bool

In [10]:
x_test_numerical = x_test[config["numerical_column"]]
x_test_numerical

Unnamed: 0,age,days_since_last_login,points_in_wallet
0,41,14,793.811069
1,46,6,768.130000
2,51,19,774.780000
3,40,16,504.670000
4,18,9,755.690000
...,...,...,...
5544,15,23,
5545,64,13,605.170000
5546,26,16,663.500000
5547,29,12,568.320000


In [11]:
x_test_numerical.isnull().any()

age                      False
days_since_last_login    False
points_in_wallet          True
dtype: bool

In [12]:
x_train_categorical = x_train[config["categorical_column"]]
x_train_categorical

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,M,Town,Gold Membership,Yes,Gift Vouchers/Coupons,Both,Mobile_Data,No,Yes,No,Not Applicable,Too many ads
1,M,,Silver Membership,Yes,Without Offers,Desktop,Mobile_Data,No,Yes,Yes,Solved in Follow-up,Poor Website
2,F,City,No Membership,No,Without Offers,Smartphone,Mobile_Data,No,Yes,Yes,Unsolved,Poor Customer Service
3,M,Village,Basic Membership,No,Credit/Debit Card Offers,Desktop,Wi-Fi,Yes,No,Yes,No Information Available,Too many ads
4,F,Village,Basic Membership,No,Without Offers,Desktop,Fiber_Optic,Yes,No,No,Not Applicable,Poor Customer Service
...,...,...,...,...,...,...,...,...,...,...,...,...
25889,F,Town,Premium Membership,Yes,Without Offers,Smartphone,Wi-Fi,Yes,No,No,Not Applicable,Too many ads
25890,F,Town,Premium Membership,Yes,Gift Vouchers/Coupons,?,Fiber_Optic,No,Yes,Yes,Unsolved,User Friendly Website
25891,F,,Silver Membership,Yes,Without Offers,Desktop,Fiber_Optic,Yes,No,No,Not Applicable,Poor Product Quality
25892,M,Town,Basic Membership,Yes,Credit/Debit Card Offers,Smartphone,Fiber_Optic,Yes,No,Yes,No Information Available,No reason specified


In [13]:
x_train_categorical.isnull().any()

gender                          False
region_category                  True
membership_category             False
joined_through_referral         False
preferred_offer_types            True
medium_of_operation             False
internet_option                 False
used_special_discount           False
offer_application_preference    False
past_complaint                  False
complaint_status                False
feedback                        False
dtype: bool

In [14]:
x_valid_categorical = x_valid[config["categorical_column"]]
x_valid_categorical

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,M,,Basic Membership,No,Credit/Debit Card Offers,Smartphone,Fiber_Optic,Yes,No,No,Not Applicable,Too many ads
1,M,,Gold Membership,Yes,Gift Vouchers/Coupons,?,Mobile_Data,Yes,No,No,Not Applicable,User Friendly Website
2,F,,Basic Membership,?,Gift Vouchers/Coupons,Desktop,Fiber_Optic,Yes,No,No,Not Applicable,Poor Product Quality
3,M,City,Premium Membership,Yes,Credit/Debit Card Offers,Smartphone,Fiber_Optic,No,Yes,No,Not Applicable,Poor Customer Service
4,M,,Platinum Membership,Yes,Gift Vouchers/Coupons,Desktop,Mobile_Data,No,Yes,No,Not Applicable,No reason specified
...,...,...,...,...,...,...,...,...,...,...,...,...
5544,M,City,Gold Membership,?,Credit/Debit Card Offers,?,Wi-Fi,Yes,No,No,Not Applicable,Poor Website
5545,M,City,Basic Membership,No,Credit/Debit Card Offers,Desktop,Wi-Fi,No,Yes,No,Not Applicable,Poor Product Quality
5546,M,City,Premium Membership,No,Gift Vouchers/Coupons,Desktop,Wi-Fi,Yes,No,No,Not Applicable,User Friendly Website
5547,M,,Gold Membership,No,Gift Vouchers/Coupons,Both,Mobile_Data,Yes,Yes,Yes,No Information Available,Poor Product Quality


In [15]:
x_test_categorical = x_test[config["categorical_column"]]
x_test_categorical

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,F,City,Platinum Membership,No,Without Offers,Desktop,Fiber_Optic,Yes,No,Yes,Solved,Too many ads
1,M,Town,Platinum Membership,Yes,Gift Vouchers/Coupons,?,Fiber_Optic,Yes,No,Yes,Unsolved,Products always in Stock
2,F,City,Silver Membership,No,Without Offers,Smartphone,Mobile_Data,Yes,No,No,Not Applicable,User Friendly Website
3,M,City,Basic Membership,No,Credit/Debit Card Offers,?,Fiber_Optic,No,Yes,Yes,Solved in Follow-up,Too many ads
4,F,Village,Silver Membership,Yes,Without Offers,Both,Mobile_Data,No,Yes,Yes,Unsolved,User Friendly Website
...,...,...,...,...,...,...,...,...,...,...,...,...
5544,M,City,Premium Membership,No,Credit/Debit Card Offers,Smartphone,Mobile_Data,No,Yes,No,Not Applicable,Reasonable Price
5545,F,Town,Basic Membership,Yes,Gift Vouchers/Coupons,Smartphone,Mobile_Data,No,Yes,Yes,Solved,Poor Product Quality
5546,M,Town,Basic Membership,No,Credit/Debit Card Offers,Smartphone,Fiber_Optic,No,Yes,Yes,Solved,Too many ads
5547,F,Town,Basic Membership,Yes,Gift Vouchers/Coupons,Desktop,Mobile_Data,Yes,No,No,Not Applicable,No reason specified


In [16]:
x_train_numerical[x_train_numerical["days_since_last_login"] < 0]["days_since_last_login"].count()

1404

In [17]:
x_train_numerical["days_since_last_login"] = x_train_numerical["days_since_last_login"].replace(-999, np.nan)
x_train_numerical[x_train_numerical["days_since_last_login"] < 0]["days_since_last_login"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train_numerical["days_since_last_login"] = x_train_numerical["days_since_last_login"].replace(-999, np.nan)


0

In [18]:
x_valid_numerical[x_valid_numerical["days_since_last_login"] < 0]["days_since_last_login"].count()

305

In [19]:
x_valid_numerical["days_since_last_login"] = x_valid_numerical["days_since_last_login"].replace(-999, np.nan)
x_valid_numerical[x_valid_numerical["days_since_last_login"] < 0]["days_since_last_login"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_valid_numerical["days_since_last_login"] = x_valid_numerical["days_since_last_login"].replace(-999, np.nan)


0

In [20]:
x_test_numerical[x_test_numerical["days_since_last_login"] < 0]["days_since_last_login"].count()

290

In [21]:
x_test_numerical["days_since_last_login"] = x_test_numerical["days_since_last_login"].replace(-999, np.nan)
x_test_numerical[x_test_numerical["days_since_last_login"] < 0]["days_since_last_login"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test_numerical["days_since_last_login"] = x_test_numerical["days_since_last_login"].replace(-999, np.nan)


0

In [22]:
x_train_numerical["points_in_wallet"] = x_train_numerical["points_in_wallet"].where(lambda x: x >=0, np.nan)
x_train_numerical[x_train_numerical["points_in_wallet"] < 0]["points_in_wallet"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train_numerical["points_in_wallet"] = x_train_numerical["points_in_wallet"].where(lambda x: x >=0, np.nan)


0

In [23]:
x_valid_numerical["points_in_wallet"] = x_valid_numerical["points_in_wallet"].where(lambda x: x >=0, np.nan)
x_valid_numerical[x_valid_numerical["points_in_wallet"] < 0]["points_in_wallet"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_valid_numerical["points_in_wallet"] = x_valid_numerical["points_in_wallet"].where(lambda x: x >=0, np.nan)


0

In [24]:
x_test_numerical["points_in_wallet"] = x_test_numerical["points_in_wallet"].where(lambda x: x >=0, np.nan)
x_test_numerical[x_test_numerical["points_in_wallet"] < 0]["points_in_wallet"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test_numerical["points_in_wallet"] = x_test_numerical["points_in_wallet"].where(lambda x: x >=0, np.nan)


0

In [25]:
x_train_categorical["gender"] = x_train_categorical["gender"].replace('Unknown', np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train_categorical["gender"] = x_train_categorical["gender"].replace('Unknown', np.nan)


In [26]:
x_valid_categorical["gender"] = x_valid_categorical["gender"].replace('Unknown', np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_valid_categorical["gender"] = x_valid_categorical["gender"].replace('Unknown', np.nan)


In [27]:
x_test_categorical["gender"] = x_test_categorical["gender"].replace('Unknown', np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test_categorical["gender"] = x_test_categorical["gender"].replace('Unknown', np.nan)


In [28]:
x_train_categorical["joined_through_referral"] = x_train_categorical["joined_through_referral"].replace('?', np.nan)
x_train_categorical[x_train_categorical["joined_through_referral"] == "?"]["joined_through_referral"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train_categorical["joined_through_referral"] = x_train_categorical["joined_through_referral"].replace('?', np.nan)


0

In [29]:
x_valid_categorical["joined_through_referral"] = x_valid_categorical["joined_through_referral"].replace('?', np.nan)
x_valid_categorical[x_valid_categorical["joined_through_referral"] == "?"]["joined_through_referral"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_valid_categorical["joined_through_referral"] = x_valid_categorical["joined_through_referral"].replace('?', np.nan)


0

In [30]:
x_test_categorical["joined_through_referral"] = x_test_categorical["joined_through_referral"].replace('?', np.nan)
x_test_categorical[x_test_categorical["joined_through_referral"] == "?"]["joined_through_referral"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test_categorical["joined_through_referral"] = x_test_categorical["joined_through_referral"].replace('?', np.nan)


0

In [31]:
x_train_categorical["medium_of_operation"] = x_train_categorical["medium_of_operation"].replace('?', np.nan)
x_train_categorical[x_train_categorical["medium_of_operation"] == "?"]["medium_of_operation"].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train_categorical["medium_of_operation"] = x_train_categorical["medium_of_operation"].replace('?', np.nan)


0

In [32]:
x_valid_categorical["medium_of_operation"] = x_valid_categorical["medium_of_operation"].replace('?', np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_valid_categorical["medium_of_operation"] = x_valid_categorical["medium_of_operation"].replace('?', np.nan)


In [33]:
x_test_categorical["medium_of_operation"] = x_test_categorical["medium_of_operation"].replace('?', np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test_categorical["medium_of_operation"] = x_test_categorical["medium_of_operation"].replace('?', np.nan)


In [34]:
x_train_numerical.shape, x_valid_numerical.shape, x_test_numerical.shape

((25894, 3), (5549, 3), (5549, 3))

In [35]:
def numericalImputation(data, numerical_column):
    #Filter numeric data
    numerical_data = data[numerical_column]
    
    #Buat imputer
    imputer_numerical = SimpleImputer(missing_values=np.nan,
                                     strategy="median")
    imputer_numerical.fit(numerical_data)
    
    #Transform
    imputed_data = imputer_numerical.transform(numerical_data)
    numerical_data_imputed = pd.DataFrame(imputed_data)
    
    numerical_data_imputed.columns = numerical_column
    numerical_data_imputed.index = numerical_data.index
    
    return numerical_data_imputed, imputer_numerical

In [36]:
x_train_numerical, imputer_numerical = numericalImputation(data = x_train_numerical, numerical_column = config["numerical_column"])
x_valid_numerical, imputer_valid_numerical = numericalImputation(data = x_valid_numerical, numerical_column = config["numerical_column"])
x_test_numerical, imputer_test_numerical = numericalImputation(data = x_test_numerical, numerical_column = config["numerical_column"])

In [37]:
x_train_numerical.isnull().sum()

age                      0
days_since_last_login    0
points_in_wallet         0
dtype: int64

In [38]:
def categoricalImputation(data, categorical_column):
    """
    Function for imputation categorical data
    :param data: <pandas dataframe> sample input data
    :param categorical_column: <list> list column categorical data
    :return categorical_data: <pandas datafarame> categorical data
    """
    #Selection data
    categorical_data = data[categorical_column]
    
    #Imputation
    categorical_data = categorical_data.dropna(subset=["gender"])
    categorical_data = categorical_data.fillna(value="KOSONG")
    
    return categorical_data

In [39]:
x_train_categorical = categoricalImputation(data = x_train_categorical, categorical_column=config["categorical_column"])
x_valid_categorical = categoricalImputation(data = x_valid_categorical, categorical_column=config["categorical_column"])
x_test_categorical = categoricalImputation(data = x_test_categorical, categorical_column=config["categorical_column"])

In [40]:
x_test_categorical.isnull().sum()

gender                          0
region_category                 0
membership_category             0
joined_through_referral         0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
dtype: int64

In [41]:
x_train_categorical.head()

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,M,Town,Gold Membership,Yes,Gift Vouchers/Coupons,Both,Mobile_Data,No,Yes,No,Not Applicable,Too many ads
1,M,KOSONG,Silver Membership,Yes,Without Offers,Desktop,Mobile_Data,No,Yes,Yes,Solved in Follow-up,Poor Website
2,F,City,No Membership,No,Without Offers,Smartphone,Mobile_Data,No,Yes,Yes,Unsolved,Poor Customer Service
3,M,Village,Basic Membership,No,Credit/Debit Card Offers,Desktop,Wi-Fi,Yes,No,Yes,No Information Available,Too many ads
4,F,Village,Basic Membership,No,Without Offers,Desktop,Fiber_Optic,Yes,No,No,Not Applicable,Poor Customer Service


In [42]:
def extractCategorical(data, categorical_column):
    """
    Fungsi untuk ekstrak data kategorikal dengan One Hot Encoding
    :param data: <pandas dataframe> data sample
    :param categorical_column: <list> list kolom kategorik
    :return categorical_ohe: <pandas dataframe> data sample dengan ohe
    """
    data_categorical = categoricalImputation(data=data,
                                            categorical_column=categorical_column)
    categorical_ohe = pd.get_dummies(data_categorical)
    return categorical_ohe

In [43]:
ohe_categorical = pickle_dump(_, config["ohe_categorical_path"])

In [44]:
ohe_categorical

['../models/ohe_categorical.pkl']

In [45]:
ohe_categorical = pickle_load(config["ohe_categorical_path"])

In [46]:
ohe_categorical

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,M,Town,Gold Membership,Yes,Gift Vouchers/Coupons,Both,Mobile_Data,No,Yes,No,Not Applicable,Too many ads
1,M,KOSONG,Silver Membership,Yes,Without Offers,Desktop,Mobile_Data,No,Yes,Yes,Solved in Follow-up,Poor Website
2,F,City,No Membership,No,Without Offers,Smartphone,Mobile_Data,No,Yes,Yes,Unsolved,Poor Customer Service
3,M,Village,Basic Membership,No,Credit/Debit Card Offers,Desktop,Wi-Fi,Yes,No,Yes,No Information Available,Too many ads
4,F,Village,Basic Membership,No,Without Offers,Desktop,Fiber_Optic,Yes,No,No,Not Applicable,Poor Customer Service


In [47]:
x_train_categorical_ohe = extractCategorical(data=x_train_categorical, categorical_column=config["categorical_column"])
x_valid_categorical_ohe = extractCategorical(data=x_valid_categorical, categorical_column=config["categorical_column"])
x_test_categorical_ohe = extractCategorical(data=x_test_categorical, categorical_column=config["categorical_column"])

In [48]:
x_train_categorical_ohe.head() 

Unnamed: 0,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,membership_category_Gold Membership,membership_category_No Membership,membership_category_Platinum Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,1,0,...,1,0,1,0,0,0,0,0,0,0
3,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [49]:
x_train_concat = pd.concat([x_train_numerical,
                            x_train_categorical_ohe],
                            axis=1)
x_train_concat = x_train_concat.dropna().reset_index(drop=True)
x_train_concat.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,18.0,10.0,738.46,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,39.0,22.0,249.450376,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,34.0,12.0,568.14,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,56.0,18.0,716.41,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,23.0,10.0,671.24,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
x_valid_concat = pd.concat([x_valid_numerical,
                            x_valid_categorical_ohe],
                             axis=1)
x_valid_concat = x_valid_concat.dropna().reset_index(drop=True)
x_valid_concat.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,40.0,13.0,548.87,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,49.0,17.0,773.76,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,46.0,12.0,353.290489,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.0,13.0,797.18,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,48.0,14.0,758.74,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
x_test_concat = pd.concat([x_test_numerical,
                            x_test_categorical_ohe],
                            axis=1)
x_test_concat = x_test_concat.dropna().reset_index(drop=True)
x_test_concat.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,41.0,14.0,793.811069,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,46.0,6.0,768.13,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,51.0,19.0,774.78,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,40.0,16.0,504.67,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,18.0,9.0,755.69,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [52]:
def std_scaler_fit(x_train: pd.DataFrame):
    std_scaler = StandardScaler()
    std_scaler.fit(x_train)
    return std_scaler

In [53]:
scaler = std_scaler_fit(x_train_concat)
scaler

StandardScaler()

In [54]:
pickle_dump(scaler, config["scaler_path"])

['../models/std_scaler.pkl']

In [55]:
def std_scaler_transform(features: pd.DataFrame, scaler: object) -> pd.DataFrame:

    '''
    this function transform features using standar scaler machine
    '''
    
    col_names = scaler.feature_names_in_

    feat = copy.deepcopy(features)

    scaled = scaler.transform(feat)

    scaled_df = pd.DataFrame(scaled, columns=col_names)

    return scaled_df

In [56]:
x_train_concat.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,18.0,10.0,738.46,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,39.0,22.0,249.450376,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,34.0,12.0,568.14,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,56.0,18.0,716.41,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,23.0,10.0,671.24,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
x_train_clean = std_scaler_transform(x_train_concat, scaler)
x_train_clean.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,-1.206558,-0.512906,0.268487,-0.997489,0.997489,-0.721973,-0.41431,1.270711,-0.383816,-0.509485,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,2.234875,-0.198511
1,0.116142,1.698067,-2.474832,-0.997489,0.997489,-0.721973,2.413653,-0.786961,-0.383816,-0.509485,...,-0.379359,-0.451308,-0.452177,-0.457319,2.202762,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511
2,-0.198787,-0.14441,-0.686999,1.002518,-1.002518,1.385093,-0.41431,-0.786961,-0.383816,-0.509485,...,2.636029,-0.451308,2.211524,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511
3,1.186899,0.961076,0.144788,-0.997489,0.997489,-0.721973,-0.41431,-0.786961,2.605415,1.962766,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,2.234875,-0.198511
4,-0.89163,-0.512906,-0.108614,1.002518,-1.002518,-0.721973,-0.41431,-0.786961,2.605415,1.962766,...,-0.379359,-0.451308,2.211524,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511


In [58]:
x_valid_concat.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,40.0,13.0,548.87,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,49.0,17.0,773.76,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,46.0,12.0,353.290489,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.0,13.0,797.18,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,48.0,14.0,758.74,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
x_valid_clean = std_scaler_transform(x_valid_concat, scaler)
x_valid_clean.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,0.179128,0.039838,-0.795103,-0.997489,0.997489,-0.721973,2.413653,-0.786961,-0.383816,1.962766,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,2.234875,-0.198511
1,0.745999,0.776828,0.466518,-0.997489,0.997489,-0.721973,2.413653,-0.786961,-0.383816,-0.509485,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,5.037512
2,0.557042,-0.14441,-1.892294,1.002518,-1.002518,-0.721973,2.413653,-0.786961,-0.383816,1.962766,...,-0.379359,-0.451308,-0.452177,2.18666,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511
3,0.179128,0.039838,0.597903,-0.997489,0.997489,1.385093,-0.41431,-0.786961,-0.383816,-0.509485,...,-0.379359,-0.451308,2.211524,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511
4,0.683013,0.224085,0.382257,-0.997489,0.997489,-0.721973,2.413653,-0.786961,-0.383816,-0.509485,...,-0.379359,2.215783,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511


In [60]:
x_test_clean = std_scaler_transform(x_test_concat, scaler)
x_test_clean.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,0.242113,0.224085,0.579004,1.002518,-1.002518,1.385093,-0.41431,-0.786961,-0.383816,-0.509485,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,2.234875,-0.198511
1,0.557042,-1.249897,0.434934,-0.997489,0.997489,-0.721973,-0.41431,1.270711,-0.383816,-0.509485,...,2.636029,-0.451308,-0.452177,-0.457319,-0.453975,5.069863,-0.195117,-0.199666,-0.447452,-0.198511
2,0.87197,1.145324,0.47224,1.002518,-1.002518,1.385093,-0.41431,-0.786961,-0.383816,-0.509485,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,5.037512
3,0.179128,0.592581,-1.043063,-0.997489,0.997489,1.385093,-0.41431,-0.786961,-0.383816,1.962766,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,2.234875,-0.198511
4,-1.206558,-0.697153,0.365146,1.002518,-1.002518,-0.721973,-0.41431,-0.786961,2.605415,-0.509485,...,2.636029,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,5.037512


In [61]:
x_train_clean.shape, y_train.shape, x_valid_clean.shape, y_valid.shape, x_test_clean.shape, y_test.shape

((25849, 49), (25894,), (5541, 49), (5549,), (5543, 49), (5549,))

In [62]:
y_train[x_train_clean.index]

0        0
1        1
2        1
3        1
4        1
        ..
25844    0
25845    0
25846    0
25847    1
25848    0
Name: churn_risk_score, Length: 25849, dtype: int64

In [63]:
y_train = y_train[x_train_clean.index]
y_valid = y_valid[x_valid_clean.index]
y_test = y_test[x_test_clean.index]

In [64]:
x_train_clean.shape, y_train.shape, x_valid_clean.shape, y_valid.shape, x_test_clean.shape, y_test.shape

((25849, 49), (25849,), (5541, 49), (5541,), (5543, 49), (5543,))

In [65]:
pickle_dump(x_train_clean, config["train_clean_set_path"][0])
pickle_dump(y_train, config["train_clean_set_path"][1])

pickle_dump(x_valid_clean, config["valid_clean_set_path"][0])
pickle_dump(y_valid, config["valid_clean_set_path"][1])

pickle_dump(x_test_clean, config["test_clean_set_path"][0])
pickle_dump(y_test, config["test_clean_set_path"][1])

['../data/processed/y_clean_test.pkl']

In [66]:
x_t = pickle_load(config["train_clean_set_path"][0])
y_t = pickle_load(config["train_clean_set_path"][1])

x_va = pickle_load(config["valid_clean_set_path"][0])
y_va = pickle_load(config["valid_clean_set_path"][1])

x_te = pickle_load(config["test_clean_set_path"][0])
y_te = pickle_load(config["test_clean_set_path"][1])

In [67]:
x_t.shape, y_t.shape, x_va.shape, y_va.shape, x_te.shape, y_te.shape

((25849, 49), (25849,), (5541, 49), (5541,), (5543, 49), (5543,))

In [68]:
x_t.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender_F,gender_M,region_category_City,region_category_KOSONG,region_category_Town,region_category_Village,membership_category_Basic Membership,...,complaint_status_Unsolved,feedback_No reason specified,feedback_Poor Customer Service,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website
0,-1.206558,-0.512906,0.268487,-0.997489,0.997489,-0.721973,-0.41431,1.270711,-0.383816,-0.509485,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,2.234875,-0.198511
1,0.116142,1.698067,-2.474832,-0.997489,0.997489,-0.721973,2.413653,-0.786961,-0.383816,-0.509485,...,-0.379359,-0.451308,-0.452177,-0.457319,2.202762,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511
2,-0.198787,-0.14441,-0.686999,1.002518,-1.002518,1.385093,-0.41431,-0.786961,-0.383816,-0.509485,...,2.636029,-0.451308,2.211524,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511
3,1.186899,0.961076,0.144788,-0.997489,0.997489,-0.721973,-0.41431,-0.786961,2.605415,1.962766,...,-0.379359,-0.451308,-0.452177,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,2.234875,-0.198511
4,-0.89163,-0.512906,-0.108614,1.002518,-1.002518,-0.721973,-0.41431,-0.786961,2.605415,1.962766,...,-0.379359,-0.451308,2.211524,-0.457319,-0.453975,-0.197244,-0.195117,-0.199666,-0.447452,-0.198511
