In [1]:
# Matrix operations
import pandas as pd
import numpy as np


# Preprocessing
import category_encoders as ce 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from corr_code import get_cat_correlated_cols,get_correlated_cols

# Metrics 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [119]:
# Read the training and test splits from the local dataset directory.
train, test = pd.read_csv("dataset/train.csv"), pd.read_csv("dataset/test.csv")

In [120]:
# Dimensions of the training frame as (rows, columns).
train.shape

(36992, 25)

In [121]:
# Dimensions of the test frame as (rows, columns).
test.shape

(19919, 24)

In [122]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,customer_id,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,...,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,fffe4300490044003600300030003800,Pattie Morrisey,18,F,XW0DQ7H,Village,Platinum Membership,2017-08-17,No,xxxxxxxx,...,300.63,53005.25,17.0,781.75,Yes,Yes,No,Not Applicable,Products always in Stock,2
1,fffe43004900440032003100300035003700,Traci Peery,32,F,5K0N3X1,City,Premium Membership,2017-08-28,?,CID21329,...,306.34,12838.38,10.0,,Yes,No,Yes,Solved,Quality Customer Care,1
2,fffe4300490044003100390032003600,Merideth Mcmeen,44,F,1F2TCL3,Town,No Membership,2016-11-11,Yes,CID12313,...,516.16,21027.0,22.0,500.69,No,Yes,Yes,Solved in Follow-up,Poor Website,5


In [123]:
# Count of missing values in each column of the training frame.
train.isna().sum()

customer_id                        0
Name                               0
age                                0
gender                             0
security_no                        0
region_category                 5428
membership_category                0
joining_date                       0
joined_through_referral            0
referral_id                        0
preferred_offer_types            288
medium_of_operation                0
internet_option                    0
last_visit_time                    0
days_since_last_login              0
avg_time_spent                     0
avg_transaction_value              0
avg_frequency_login_days           0
points_in_wallet                3443
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
churn_risk_score                   0
dtype: int64

In [124]:
# Count of missing values in each column of the test frame.
test.isna().sum()

customer_id                        0
Name                               0
age                                0
gender                             0
security_no                        0
region_category                 2948
membership_category                0
joining_date                       0
joined_through_referral            0
referral_id                        0
preferred_offer_types            159
medium_of_operation                0
internet_option                    0
last_visit_time                    0
days_since_last_login              0
avg_time_spent                     0
avg_transaction_value              0
avg_frequency_login_days           0
points_in_wallet                1963
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
dtype: int64

### Drop customer_id, Name and security_no


In [125]:
# Remove the three per-customer identifier columns from both frames.
train = train.drop(columns=['customer_id', 'Name', 'security_no'])
test = test.drop(columns=['customer_id', 'Name', 'security_no'])

### Fill region_category

In [126]:
# Impute missing region_category in both splits with the training-set mode.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the attribute-accessed column: that chained in-place form emits a
# FutureWarning in pandas 2.x and leaves the frame unchanged once
# Copy-on-Write is enabled.
region_mode = train['region_category'].mode()[0]
train['region_category'] = train['region_category'].fillna(region_mode)
test['region_category'] = test['region_category'].fillna(region_mode)

### joining_date

In [127]:
# Parse joining_date into datetimes in both frames so the .dt accessor
# can be used on it afterwards.
for frame in (train, test):
    frame['joining_date'] = pd.to_datetime(frame['joining_date'])

In [128]:
# Calendar features derived from joining_date for the training split.
train['jyear'] = train['joining_date'].dt.year
train['jweekday'] = train['joining_date'].dt.weekday
# Series.dt.week is deprecated and was removed in pandas 2.0;
# dt.isocalendar().week is its replacement. It yields a UInt32 column, so it
# is cast to int64 to keep the dtype that .dt.week produced.
train['jweek'] = train['joining_date'].dt.isocalendar().week.astype('int64')

  This is separate from the ipykernel package so we can avoid doing imports until


In [129]:
# Calendar features derived from joining_date for the test split.
test['jyear'] = test['joining_date'].dt.year
test['jweekday'] = test['joining_date'].dt.weekday
# Series.dt.week is deprecated and was removed in pandas 2.0;
# dt.isocalendar().week is its replacement. It yields a UInt32 column, so it
# is cast to int64 to keep the dtype that .dt.week produced.
test['jweek'] = test['joining_date'].dt.isocalendar().week.astype('int64')

  This is separate from the ipykernel package so we can avoid doing imports until


### joined_through_referral

In [130]:
# Rebuild joined_through_referral from referral_id for the training split:
# ids beginning with "xxx" become "No", every other id becomes "Yes".
train_placeholder_id = train['referral_id'].str.startswith("xxx")
train.loc[train_placeholder_id, 'joined_through_referral'] = "No"
train.loc[~train_placeholder_id, 'joined_through_referral'] = "Yes"

In [131]:
# Rebuild joined_through_referral from referral_id for the test split:
# ids beginning with "xxx" become "No", every other id becomes "Yes".
test_placeholder_id = test['referral_id'].str.startswith("xxx")
test.loc[test_placeholder_id, 'joined_through_referral'] = "No"
test.loc[~test_placeholder_id, 'joined_through_referral'] = "Yes"

In [132]:
# List the distinct values of joined_through_referral in the training frame.
train.joined_through_referral.unique()

array(['No', 'Yes'], dtype=object)

In [133]:
# Remove referral_id from both frames.
train = train.drop(columns=['referral_id'])
test = test.drop(columns=['referral_id'])

### preferred_offer_types

In [134]:
# Impute missing preferred_offer_types in both splits with the training-set
# mode. The filled column is assigned back instead of calling
# fillna(inplace=True) on the attribute-accessed column: that chained
# in-place form emits a FutureWarning in pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is enabled.
offer_mode = train['preferred_offer_types'].mode()[0]
train['preferred_offer_types'] = train['preferred_offer_types'].fillna(offer_mode)
test['preferred_offer_types'] = test['preferred_offer_types'].fillna(offer_mode)

### days_since_last_login

In [135]:
# Set days_since_last_login values below 1 to 0 in each split.
# Each frame is filtered with a mask built from that same frame. Previously
# `test` was indexed with the mask computed on `train`, so test rows were
# overwritten according to unrelated training rows sharing the same index
# label, and test rows that were themselves below 1 could be left untouched.
train.loc[train['days_since_last_login'] < 1, 'days_since_last_login'] = 0
test.loc[test['days_since_last_login'] < 1, 'days_since_last_login'] = 0


### points_in_wallet

In [136]:
# Fill missing points_in_wallet with 0 in both splits.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the attribute-accessed column: that chained in-place form emits a
# FutureWarning in pandas 2.x and leaves the frame unchanged once
# Copy-on-Write is enabled.
train['points_in_wallet'] = train['points_in_wallet'].fillna(0)
test['points_in_wallet'] = test['points_in_wallet'].fillna(0)

In [137]:
# Show the column labels currently present in the training frame.
train.columns

Index(['age', 'gender', 'region_category', 'membership_category',
       'joining_date', 'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score', 'jyear', 'jweekday', 'jweek'],
      dtype='object')

In [138]:
# Re-count missing values per training column after the imputation steps.
train.isna().sum()

age                             0
gender                          0
region_category                 0
membership_category             0
joining_date                    0
joined_through_referral         0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
last_visit_time                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
jyear                           0
jweekday                        0
jweek                           0
dtype: int64

In [139]:
# Replace avg_time_spent with its absolute value in both frames.
for frame in (train, test):
    frame['avg_time_spent'] = frame['avg_time_spent'].abs()

In [140]:
# train.avg_frequency_login_days = train.avg_frequency_login_days.astype('int').abs()
# test.avg_frequency_login_days = test.avg_frequency_login_days.astype('int').abs()

In [141]:
# Remove joining_date, last_visit_time and avg_frequency_login_days from
# both frames.
dropped_cols = ['joining_date', 'last_visit_time', 'avg_frequency_login_days']
train = train.drop(columns=dropped_cols)
test = test.drop(columns=dropped_cols)


In [142]:
# Preview the first three rows of the training frame after preprocessing.
train.head(3)

Unnamed: 0,age,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,days_since_last_login,avg_time_spent,...,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score,jyear,jweekday,jweek
0,18,F,Village,Platinum Membership,No,Gift Vouchers/Coupons,?,Wi-Fi,17,300.63,...,781.75,Yes,Yes,No,Not Applicable,Products always in Stock,2,2017,3,33
1,32,F,City,Premium Membership,Yes,Gift Vouchers/Coupons,Desktop,Mobile_Data,16,306.34,...,0.0,Yes,No,Yes,Solved,Quality Customer Care,1,2017,0,35
2,44,F,Town,No Membership,Yes,Gift Vouchers/Coupons,Desktop,Wi-Fi,14,516.16,...,500.69,No,Yes,Yes,Solved in Follow-up,Poor Website,5,2016,4,45


In [143]:
# Names of the columns handed to the model as categorical features.
cat_cols = [
    'gender',
    'region_category',
    'membership_category',
    'joined_through_referral',
    'preferred_offer_types',
    'medium_of_operation',
    'internet_option',
    'used_special_discount',
    'offer_application_preference',
    'past_complaint',
    'complaint_status',
    'feedback',
]

In [144]:
# Separate the target column (churn_risk_score) from the feature columns.
y = train['churn_risk_score']
X = train.drop(columns=['churn_risk_score'])

In [145]:
# Positional index within X of every categorical column, in cat_cols order.
cat_idx = [X.columns.get_loc(col) for col in cat_cols]

print(cat_idx)

[1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 16]


In [154]:
# CatBoost classifier configured with balanced automatic class weights; the
# categorical columns are identified by their positions in X.
model = CatBoostClassifier(
    iterations=1000,
    auto_class_weights='Balanced',
    verbose=50,
    cat_features=cat_idx,
)

In [155]:
# Fit on the full training frame; no rows are held out for validation here.
model.fit(X,y)

Learning rate set to 0.095125
0:	learn: 1.6060932	total: 843ms	remaining: 14m 2s
50:	learn: 0.7677677	total: 43.8s	remaining: 13m 34s
100:	learn: 0.7359496	total: 1m 21s	remaining: 12m 4s
150:	learn: 0.6958234	total: 2m 13s	remaining: 12m 29s
200:	learn: 0.6599830	total: 2m 56s	remaining: 11m 42s
250:	learn: 0.6259369	total: 3m 37s	remaining: 10m 50s
300:	learn: 0.5997197	total: 4m 17s	remaining: 9m 59s
350:	learn: 0.5742391	total: 4m 58s	remaining: 9m 11s
400:	learn: 0.5493495	total: 5m 41s	remaining: 8m 29s
450:	learn: 0.5279281	total: 6m 22s	remaining: 7m 45s
500:	learn: 0.5052557	total: 7m 4s	remaining: 7m 2s
550:	learn: 0.4858691	total: 7m 45s	remaining: 6m 19s
600:	learn: 0.4682792	total: 8m 29s	remaining: 5m 38s
650:	learn: 0.4513874	total: 9m 11s	remaining: 4m 55s
700:	learn: 0.4355141	total: 9m 54s	remaining: 4m 13s
750:	learn: 0.4215457	total: 10m 37s	remaining: 3m 31s
800:	learn: 0.4084702	total: 11m 18s	remaining: 2m 48s
850:	learn: 0.3952824	total: 12m	remaining: 2m 6s
900

<catboost.core.CatBoostClassifier at 0x24acceaa848>

In [156]:
# Select the test columns in the same order as the training features.
Xt = test.loc[:, X.columns]

In [157]:
# Reload the test file (the working `test` frame no longer has customer_id)
# and attach the model's predictions as churn_risk_score.
res = pd.read_csv("dataset/test.csv")
predicted_scores = model.predict(Xt)
res['churn_risk_score'] = predicted_scores

In [160]:
# Write customer_id and the predicted churn_risk_score to predict.csv,
# without the index column.
submission = res[['customer_id', 'churn_risk_score']]
submission.to_csv("predict.csv", index=False)

In [161]:
# Preview the first rows of the prediction frame.
res.head()

Unnamed: 0,customer_id,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,...,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,fffe43004900440031003700300030003400,Alethia Meints,50,F,OQJ1XAY,Village,Premium Membership,2015-11-02,No,xxxxxxxx,...,386.26,40721.44,7.0,733.83,Yes,No,No,Not Applicable,Poor Product Quality,3
1,fffe43004900440031003900370037003300,Ming Lopez,41,M,OUQRPKO,Village,Gold Membership,2016-03-01,No,xxxxxxxx,...,37.8,9644.4,9.0,726.0,Yes,No,No,Not Applicable,Poor Website,3
2,fffe43004900440034003800360037003000,Carina Flannigan,31,F,02J2RE7,Town,Silver Membership,2017-03-03,No,xxxxxxxx,...,215.36,3693.25,21.0,713.78,Yes,No,Yes,Solved in Follow-up,No reason specified,3
3,fffe43004900440036003200370033003400,Kyung Wanner,64,M,5YEQIF1,Town,Silver Membership,2017-08-18,Yes,CID8941,...,44.57,36809.56,11.0,744.97,Yes,No,Yes,No Information Available,Too many ads,3
4,fffe43004900440035003000370031003900,Enola Gatto,16,F,100RYB5,Town,No Membership,2015-05-05,Yes,CID5690,...,349.88,40675.86,8.0,299.048351,No,Yes,Yes,Solved in Follow-up,Poor Website,5
