In [7]:
import pandas as pd
import copy
from sklearn.model_selection import train_test_split
import util
from util import load_config, pickle_dump, pickle_load

In [8]:
# Load the project configuration through util.load_config. Later cells read
# file paths, column lists and split sizes from this mapping.
config = load_config()

In [3]:
def read_raw_df(raw_data_path: str) -> pd.DataFrame:
    """Load the raw CSV and remove the columns configured to be dropped.

    The first CSV column is used as the index, and every column named in
    the notebook-level ``config["drop_columns"]`` is removed.

    Parameters
    ----------
    raw_data_path : str
        Location of the raw CSV file.

    Returns
    -------
    pd.DataFrame
        The loaded data without the configured drop columns.
    """
    raw = pd.read_csv(raw_data_path, index_col=0)
    unwanted = config["drop_columns"]
    # Return a new frame rather than mutating in place; the result is the same.
    return raw.drop(columns=unwanted)

In [11]:
# Peek at the configured predictor names; the slice starts at index 1, so the
# first predictor is not shown.
config["predictors"][1:15]

['days_since_last_login',
 'points_in_wallet',
 'gender',
 'region_category',
 'membership_category',
 'joined_through_referral',
 'preferred_offer_types',
 'medium_of_operation',
 'internet_option',
 'used_special_discount',
 'offer_application_preference',
 'past_complaint',
 'complaint_status',
 'feedback']

In [4]:
# Read the raw dataset from the configured path (configured columns already
# dropped by read_raw_df) and display it as the cell output.
df = read_raw_df(config["raw_dataset_path"])
df

Unnamed: 0,age,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,days_since_last_login,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,18,F,Village,Platinum Membership,No,Gift Vouchers/Coupons,?,Wi-Fi,17,781.750000,Yes,Yes,No,Not Applicable,Products always in Stock,0
1,32,F,City,Premium Membership,?,Gift Vouchers/Coupons,Desktop,Mobile_Data,16,,Yes,No,Yes,Solved,Quality Customer Care,0
2,44,F,Town,No Membership,Yes,Gift Vouchers/Coupons,Desktop,Wi-Fi,14,500.690000,No,Yes,Yes,Solved in Follow-up,Poor Website,1
3,37,M,City,No Membership,Yes,Gift Vouchers/Coupons,Desktop,Mobile_Data,11,567.660000,No,Yes,Yes,Unsolved,Poor Website,1
4,31,F,City,No Membership,No,Credit/Debit Card Offers,Smartphone,Mobile_Data,20,663.060000,No,Yes,Yes,Solved,Poor Website,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36987,46,F,,Basic Membership,No,Credit/Debit Card Offers,Desktop,Wi-Fi,2,639.510000,No,Yes,Yes,No Information Available,No reason specified,1
36988,29,F,Town,Basic Membership,No,Without Offers,Smartphone,Wi-Fi,13,527.990000,Yes,No,No,Not Applicable,Poor Customer Service,1
36989,23,F,,Basic Membership,Yes,Gift Vouchers/Coupons,Desktop,Wi-Fi,12,680.470000,No,Yes,Yes,Unsolved,Poor Website,1
36990,53,M,Village,Platinum Membership,No,Gift Vouchers/Coupons,Smartphone,Mobile_Data,15,197.264414,Yes,Yes,No,Not Applicable,No reason specified,0


In [6]:
# Count missing values per column to see which features need imputation.
df.isnull().sum()

age                                0
gender                             0
region_category                 5428
membership_category                0
joined_through_referral            0
preferred_offer_types            288
medium_of_operation                0
internet_option                    0
days_since_last_login              0
points_in_wallet                3443
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
churn_risk_score                   0
dtype: int64

In [5]:
# (rows, columns) of the frame after the configured columns were dropped.
df.shape

(36992, 16)

In [6]:
# Persist the column-dropped frame to config["raw_df_path"] using the
# project's pickle_dump helper.
pickle_dump(df, config["raw_df_path"])

['../data/processed/raw_df.pkl']

In [7]:
# Reload the frame that was just pickled and display it to check the
# round trip through disk.
df_after_drop = pickle_load(config["raw_df_path"])
df_after_drop

Unnamed: 0,age,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,days_since_last_login,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,18,F,Village,Platinum Membership,No,Gift Vouchers/Coupons,?,Wi-Fi,17,781.750000,Yes,Yes,No,Not Applicable,Products always in Stock,0
1,32,F,City,Premium Membership,?,Gift Vouchers/Coupons,Desktop,Mobile_Data,16,,Yes,No,Yes,Solved,Quality Customer Care,0
2,44,F,Town,No Membership,Yes,Gift Vouchers/Coupons,Desktop,Wi-Fi,14,500.690000,No,Yes,Yes,Solved in Follow-up,Poor Website,1
3,37,M,City,No Membership,Yes,Gift Vouchers/Coupons,Desktop,Mobile_Data,11,567.660000,No,Yes,Yes,Unsolved,Poor Website,1
4,31,F,City,No Membership,No,Credit/Debit Card Offers,Smartphone,Mobile_Data,20,663.060000,No,Yes,Yes,Solved,Poor Website,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36987,46,F,,Basic Membership,No,Credit/Debit Card Offers,Desktop,Wi-Fi,2,639.510000,No,Yes,Yes,No Information Available,No reason specified,1
36988,29,F,Town,Basic Membership,No,Without Offers,Smartphone,Wi-Fi,13,527.990000,Yes,No,No,Not Applicable,Poor Customer Service,1
36989,23,F,,Basic Membership,Yes,Gift Vouchers/Coupons,Desktop,Wi-Fi,12,680.470000,No,Yes,Yes,Unsolved,Poor Website,1
36990,53,M,Village,Platinum Membership,No,Gift Vouchers/Coupons,Smartphone,Mobile_Data,15,197.264414,Yes,Yes,No,Not Applicable,No reason specified,0


In [8]:
def split_data(dataframe: pd.DataFrame, random_state: int = 42) -> tuple:
    """Split a dataframe into stratified train, validation and test sets.

    The columns listed in ``config["predictors"]`` become the features and
    ``config["label"]`` selects the target. The data is split twice with
    ``train_test_split``; both splits are stratified on the target and use
    the same ``random_state``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Data containing the predictor and label columns named in the
        notebook-level ``config``. It is copied first, so the caller's
        object is never modified.
    random_state : int, default 42
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(x_train, x_valid, x_test, y_train, y_valid, y_test)`` in exactly
        that order.
    """
    # DataFrame.copy() is a deep copy by default, same effect as copy.deepcopy.
    data = dataframe.copy()

    features = data[config["predictors"]]
    target = data[config["label"]]

    # First cut: config["test_size"] of the rows are held out from training.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        features,
        target,
        test_size=config["test_size"],
        random_state=random_state,
        stratify=target,
    )

    # Second cut divides the hold-out into validation and test.
    # NOTE: despite its name, config["valid_size"] is passed as ``test_size``,
    # so it is the fraction of the hold-out that ends up in the *test* set;
    # the remainder becomes the validation set.
    x_valid, x_test, y_valid, y_test = train_test_split(
        x_holdout,
        y_holdout,
        test_size=config["valid_size"],
        random_state=random_state,
        stratify=y_holdout,
    )
    return x_train, x_valid, x_test, y_train, y_valid, y_test

In [9]:
# Produce the stratified train/validation/test splits from the in-memory df.
x_train, x_valid, x_test, y_train, y_valid, y_test = split_data(df)

In [10]:
# Sanity check: report the shape of every feature split ...
print(f"X_train shape: {x_train.shape}")
print(f"X_test shape: {x_test.shape}")
print(f"X_valid shape:{x_valid.shape}")

# ... and of every label split, which should have matching row counts.
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (25894, 15)
X_test shape: (5549, 15)
X_valid shape:(5549, 15)
y_train shape: (25894, 1)
y_test shape: (5549, 1)
y_valid shape: (5549, 1)


In [11]:
# Inspect the training labels; at this point the index still carries the
# shuffled row labels of the original frame.
y_train

Unnamed: 0,churn_risk_score
36181,0
31548,1
27354,1
25931,1
12225,1
...,...
14579,0
6498,0
21879,1
23008,1


In [12]:
# reset index
x_train = x_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

x_valid = x_valid.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)

x_test = x_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [13]:
# Preview the first rows of the training features after the index reset.
x_train.head()

Unnamed: 0,age,days_since_last_login,points_in_wallet,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,18,10,738.46,M,Town,Gold Membership,Yes,Gift Vouchers/Coupons,Both,Mobile_Data,No,Yes,No,Not Applicable,Too many ads
1,39,22,249.450376,M,,Silver Membership,Yes,Without Offers,Desktop,Mobile_Data,No,Yes,Yes,Solved in Follow-up,Poor Website
2,34,12,568.14,F,City,No Membership,No,Without Offers,Smartphone,Mobile_Data,No,Yes,Yes,Unsolved,Poor Customer Service
3,56,18,716.41,M,Village,Basic Membership,No,Credit/Debit Card Offers,Desktop,Wi-Fi,Yes,No,Yes,No Information Available,Too many ads
4,23,10,671.24,F,Village,Basic Membership,No,Without Offers,Desktop,Fiber_Optic,Yes,No,No,Not Applicable,Poor Customer Service


In [14]:
# Confirm that resetting the index left the shape of every split unchanged.
x_train.shape, x_valid.shape, x_test.shape, y_train.shape,\
    y_valid.shape, y_test.shape

((25894, 15), (5549, 15), (5549, 15), (25894, 1), (5549, 1), (5549, 1))

In [15]:
# Inspect the validation labels, now with the freshly reset index.
y_valid

Unnamed: 0,churn_risk_score
0,1
1,0
2,1
3,0
4,0
...,...
5544,0
5545,1
5546,0
5547,0


In [16]:
# Persist every split. Each "*_set_path" config entry is indexed so that
# position 0 receives the features and position 1 the labels.
pickle_dump(x_train, config["train_set_path"][0])
pickle_dump(y_train, config["train_set_path"][1])

pickle_dump(x_valid, config["valid_set_path"][0])
pickle_dump(y_valid, config["valid_set_path"][1])

pickle_dump(x_test, config["test_set_path"][0])
# Last expression of the cell: its return value is what the notebook displays.
pickle_dump(y_test, config["test_set_path"][1])

['../data/processed/y_test.pkl']

In [17]:
def load_dataset(config: dict):
    """Read the pickled train, validation and test splits back from disk.

    For each of ``config["train_set_path"]``, ``config["valid_set_path"]``
    and ``config["test_set_path"]``, the file at position 0 is loaded as the
    features and the file at position 1 as the labels.

    Parameters
    ----------
    config : dict
        Mapping that holds the three ``*_set_path`` entries.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``.
    """
    loaded = []
    for split_key in ("train_set_path", "valid_set_path", "test_set_path"):
        # Position 0 is the feature file, position 1 the label file.
        for position in (0, 1):
            loaded.append(pickle_load(config[split_key][position]))
    return tuple(loaded)

In [18]:
# Reload all six splits from disk, replacing the in-memory variables.
# Note the return order (x, y pairs per split) differs from split_data's.
x_train, y_train, x_valid, y_valid, x_test, y_test = load_dataset(config)