### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as st
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

### Read dataset

In [2]:
df=pd.read_csv("dataset/train.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36992 entries, 0 to 36991
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   customer_id                   36992 non-null  object 
 1   Name                          36992 non-null  object 
 2   age                           36992 non-null  int64  
 3   gender                        36992 non-null  object 
 4   security_no                   36992 non-null  object 
 5   region_category               31564 non-null  object 
 6   membership_category           36992 non-null  object 
 7   joining_date                  36992 non-null  object 
 8   joined_through_referral       36992 non-null  object 
 9   referral_id                   36992 non-null  object 
 10  preferred_offer_types         36704 non-null  object 
 11  medium_of_operation           36992 non-null  object 
 12  internet_option               36992 non-null  object 
 13  l

In [4]:
df['customer_id'].duplicated().any()

False

In [5]:
df.shape

(36992, 25)

In [6]:
df["churn_risk_score"].value_counts()

 3    10424
 4    10185
 5     9827
 2     2741
 1     2652
-1     1163
Name: churn_risk_score, dtype: int64

In [7]:
df=df.set_index('customer_id')

In [8]:
df

Unnamed: 0_level_0,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,preferred_offer_types,...,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fffe4300490044003600300030003800,Pattie Morrisey,18,F,XW0DQ7H,Village,Platinum Membership,2017-08-17,No,xxxxxxxx,Gift Vouchers/Coupons,...,300.630000,53005.25,17.0,781.750000,Yes,Yes,No,Not Applicable,Products always in Stock,2
fffe43004900440032003100300035003700,Traci Peery,32,F,5K0N3X1,City,Premium Membership,2017-08-28,?,CID21329,Gift Vouchers/Coupons,...,306.340000,12838.38,10.0,,Yes,No,Yes,Solved,Quality Customer Care,1
fffe4300490044003100390032003600,Merideth Mcmeen,44,F,1F2TCL3,Town,No Membership,2016-11-11,Yes,CID12313,Gift Vouchers/Coupons,...,516.160000,21027.00,22.0,500.690000,No,Yes,Yes,Solved in Follow-up,Poor Website,5
fffe43004900440036003000330031003600,Eufemia Cardwell,37,M,VJGJ33N,City,No Membership,2016-10-29,Yes,CID3793,Gift Vouchers/Coupons,...,53.270000,25239.56,6.0,567.660000,No,Yes,Yes,Unsolved,Poor Website,5
fffe43004900440031003900350030003600,Meghan Kosak,31,F,SVZXCWB,City,No Membership,2017-09-12,No,xxxxxxxx,Credit/Debit Card Offers,...,113.130000,24483.66,16.0,663.060000,No,Yes,Yes,Solved,Poor Website,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffe43004900440035003500390036003100,Cuc Tarr,46,F,6F51HFO,,Basic Membership,2017-09-21,No,xxxxxxxx,Credit/Debit Card Offers,...,-650.682759,27277.68,6.0,639.510000,No,Yes,Yes,No Information Available,No reason specified,4
fffe43004900440033003500380036003600,Jenni Stronach,29,F,21KSM8Y,Town,Basic Membership,2016-06-27,No,xxxxxxxx,Without Offers,...,-638.123421,11069.71,28.0,527.990000,Yes,No,No,Not Applicable,Poor Customer Service,5
fffe4300490044003500330034003100,Luciana Kinch,23,F,XK1IM9H,,Basic Membership,2016-09-11,Yes,CID3838,Gift Vouchers/Coupons,...,154.940000,38127.56,Error,680.470000,No,Yes,Yes,Unsolved,Poor Website,4
fffe43004900440031003200390039003000,Tawana Ardoin,53,M,K6VTP1Z,Village,Platinum Membership,2017-06-15,No,xxxxxxxx,Gift Vouchers/Coupons,...,482.610000,2378.86,20.0,197.264414,Yes,Yes,No,Not Applicable,No reason specified,3


### Check Statistical info of data

In [9]:
df.describe()

Unnamed: 0,age,days_since_last_login,avg_time_spent,avg_transaction_value,points_in_wallet,churn_risk_score
count,36992.0,36992.0,36992.0,36992.0,33549.0,36992.0
mean,37.118161,-41.915576,243.472334,29271.194003,686.882199,3.463397
std,15.867412,228.8199,398.289149,19444.806226,194.063624,1.409661
min,10.0,-999.0,-2814.10911,800.46,-760.661236,-1.0
25%,23.0,8.0,60.1025,14177.54,616.15,3.0
50%,37.0,12.0,161.765,27554.485,697.62,4.0
75%,51.0,16.0,356.515,40855.11,763.95,5.0
max,64.0,26.0,3235.578521,99914.05,2069.069761,5.0


## Data Cleaning

### Check mising values

In [10]:
df.isna().sum()

Name                               0
age                                0
gender                             0
security_no                        0
region_category                 5428
membership_category                0
joining_date                       0
joined_through_referral            0
referral_id                        0
preferred_offer_types            288
medium_of_operation                0
internet_option                    0
last_visit_time                    0
days_since_last_login              0
avg_time_spent                     0
avg_transaction_value              0
avg_frequency_login_days           0
points_in_wallet                3443
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
churn_risk_score                   0
dtype: int64

In [11]:
df=df[~pd.isnull(df["region_category"])]

In [12]:
df['points_in_wallet']=df['points_in_wallet'].fillna(0)

In [13]:
df.drop("preferred_offer_types", axis=1, inplace=True)

In [14]:
df["feedback"].value_counts()

Poor Product Quality        5387
Poor Website                5367
No reason specified         5355
Too many ads                5351
Poor Customer Service       5350
User Friendly Website       1201
Reasonable Price            1193
Products always in Stock    1182
Quality Customer Care       1178
Name: feedback, dtype: int64

In [15]:
df.drop(["Name", "security_no", "region_category"], axis=1, inplace=True)

In [16]:
df.head()

Unnamed: 0_level_0,age,gender,membership_category,joining_date,joined_through_referral,referral_id,medium_of_operation,internet_option,last_visit_time,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
fffe4300490044003600300030003800,18,F,Platinum Membership,2017-08-17,No,xxxxxxxx,?,Wi-Fi,16:08:02,17,300.63,53005.25,17.0,781.75,Yes,Yes,No,Not Applicable,Products always in Stock,2
fffe43004900440032003100300035003700,32,F,Premium Membership,2017-08-28,?,CID21329,Desktop,Mobile_Data,12:38:13,16,306.34,12838.38,10.0,0.0,Yes,No,Yes,Solved,Quality Customer Care,1
fffe4300490044003100390032003600,44,F,No Membership,2016-11-11,Yes,CID12313,Desktop,Wi-Fi,22:53:21,14,516.16,21027.0,22.0,500.69,No,Yes,Yes,Solved in Follow-up,Poor Website,5
fffe43004900440036003000330031003600,37,M,No Membership,2016-10-29,Yes,CID3793,Desktop,Mobile_Data,15:57:50,11,53.27,25239.56,6.0,567.66,No,Yes,Yes,Unsolved,Poor Website,5
fffe43004900440031003900350030003600,31,F,No Membership,2017-09-12,No,xxxxxxxx,Smartphone,Mobile_Data,15:46:44,20,113.13,24483.66,16.0,663.06,No,Yes,Yes,Solved,Poor Website,5


In [17]:
df.drop(["joining_date", "joined_through_referral", "referral_id", "medium_of_operation", "internet_option"], axis=1, inplace=True)

In [18]:
df.columns

Index(['age', 'gender', 'membership_category', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

In [19]:
df=df[df["churn_risk_score"]!=-1]

In [20]:
df_categorical=df.select_dtypes(include=['object'])

df_categorical.apply(lambda x:x=='?', axis=0).sum()

gender                          0
membership_category             0
last_visit_time                 0
avg_frequency_login_days        0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
dtype: int64

In [21]:
df_categorical=df.select_dtypes(include=['object'])

df_categorical.apply(lambda x:x=='xxxxxxxx', axis=0).sum()

gender                          0
membership_category             0
last_visit_time                 0
avg_frequency_login_days        0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
dtype: int64

In [22]:
df.drop("last_visit_time", axis=1, inplace=True)

In [23]:
df.head()

Unnamed: 0_level_0,age,gender,membership_category,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
fffe4300490044003600300030003800,18,F,Platinum Membership,17,300.63,53005.25,17.0,781.75,Yes,Yes,No,Not Applicable,Products always in Stock,2
fffe43004900440032003100300035003700,32,F,Premium Membership,16,306.34,12838.38,10.0,0.0,Yes,No,Yes,Solved,Quality Customer Care,1
fffe4300490044003100390032003600,44,F,No Membership,14,516.16,21027.0,22.0,500.69,No,Yes,Yes,Solved in Follow-up,Poor Website,5
fffe43004900440036003000330031003600,37,M,No Membership,11,53.27,25239.56,6.0,567.66,No,Yes,Yes,Unsolved,Poor Website,5
fffe43004900440031003900350030003600,31,F,No Membership,20,113.13,24483.66,16.0,663.06,No,Yes,Yes,Solved,Poor Website,5


In [24]:
df_categorical=df.select_dtypes(include=['object'])
df_categorical.head()

Unnamed: 0_level_0,gender,membership_category,avg_frequency_login_days,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
fffe4300490044003600300030003800,F,Platinum Membership,17.0,Yes,Yes,No,Not Applicable,Products always in Stock
fffe43004900440032003100300035003700,F,Premium Membership,10.0,Yes,No,Yes,Solved,Quality Customer Care
fffe4300490044003100390032003600,F,No Membership,22.0,No,Yes,Yes,Solved in Follow-up,Poor Website
fffe43004900440036003000330031003600,M,No Membership,6.0,No,Yes,Yes,Unsolved,Poor Website
fffe43004900440031003900350030003600,F,No Membership,16.0,No,Yes,Yes,Solved,Poor Website


In [25]:
df_categorical=df.select_dtypes(include=['object'])

df_categorical.apply(lambda x:x=='Error', axis=0).sum()

gender                             0
membership_category                0
avg_frequency_login_days        2913
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
dtype: int64

In [26]:
df["avg_frequency_login_days"] = df["avg_frequency_login_days"].replace('Error', '0')

In [27]:
#df=df[df['avg_frequency_login_days']!='Error']

In [28]:
df.apply(lambda x:x=='Error', axis=0).sum()

age                             0
gender                          0
membership_category             0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
dtype: int64

In [29]:
df["avg_frequency_login_days"] = pd.to_numeric(df["avg_frequency_login_days"])

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30566 entries, fffe4300490044003600300030003800 to fffe43004900440033003600340034003200
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           30566 non-null  int64  
 1   gender                        30566 non-null  object 
 2   membership_category           30566 non-null  object 
 3   days_since_last_login         30566 non-null  int64  
 4   avg_time_spent                30566 non-null  float64
 5   avg_transaction_value         30566 non-null  float64
 6   avg_frequency_login_days      30566 non-null  float64
 7   points_in_wallet              30566 non-null  float64
 8   used_special_discount         30566 non-null  object 
 9   offer_application_preference  30566 non-null  object 
 10  past_complaint                30566 non-null  object 
 11  complaint_status              30566 non-null  object 
 12  fee

In [31]:
from sklearn import preprocessing
#encode categorical variables using label encoder
#select all categorical variables
df_categorical =df.select_dtypes(include=['object'])
df_categorical.head()

Unnamed: 0_level_0,gender,membership_category,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
fffe4300490044003600300030003800,F,Platinum Membership,Yes,Yes,No,Not Applicable,Products always in Stock
fffe43004900440032003100300035003700,F,Premium Membership,Yes,No,Yes,Solved,Quality Customer Care
fffe4300490044003100390032003600,F,No Membership,No,Yes,Yes,Solved in Follow-up,Poor Website
fffe43004900440036003000330031003600,M,No Membership,No,Yes,Yes,Unsolved,Poor Website
fffe43004900440031003900350030003600,F,No Membership,No,Yes,Yes,Solved,Poor Website


### Apply label encoding to the categorical features

In [32]:
le=preprocessing.LabelEncoder()
df_categorical=df_categorical.apply(le.fit_transform)
df_categorical.head()

Unnamed: 0_level_0,gender,membership_category,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
fffe4300490044003600300030003800,0,3,1,1,0,1,4
fffe43004900440032003100300035003700,0,4,1,0,1,2,5
fffe4300490044003100390032003600,0,2,0,1,1,3,3
fffe43004900440036003000330031003600,1,2,0,1,1,4,3
fffe43004900440031003900350030003600,0,2,0,1,1,2,3


In [33]:
cat_cols = df_categorical.columns

In [34]:
df[cat_cols] = df_categorical[cat_cols]

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30566 entries, fffe4300490044003600300030003800 to fffe43004900440033003600340034003200
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           30566 non-null  int64  
 1   gender                        30566 non-null  int32  
 2   membership_category           30566 non-null  int32  
 3   days_since_last_login         30566 non-null  int64  
 4   avg_time_spent                30566 non-null  float64
 5   avg_transaction_value         30566 non-null  float64
 6   avg_frequency_login_days      30566 non-null  float64
 7   points_in_wallet              30566 non-null  float64
 8   used_special_discount         30566 non-null  int32  
 9   offer_application_preference  30566 non-null  int32  
 10  past_complaint                30566 non-null  int32  
 11  complaint_status              30566 non-null  int32  
 12  fee

In [36]:
df.head()

Unnamed: 0_level_0,age,gender,membership_category,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
fffe4300490044003600300030003800,18,0,3,17,300.63,53005.25,17.0,781.75,1,1,0,1,4,2
fffe43004900440032003100300035003700,32,0,4,16,306.34,12838.38,10.0,0.0,1,0,1,2,5,1
fffe4300490044003100390032003600,44,0,2,14,516.16,21027.0,22.0,500.69,0,1,1,3,3,5
fffe43004900440036003000330031003600,37,1,2,11,53.27,25239.56,6.0,567.66,0,1,1,4,3,5
fffe43004900440031003900350030003600,31,0,2,20,113.13,24483.66,16.0,663.06,0,1,1,2,3,5


### Scaling data

In [37]:
from sklearn.preprocessing import QuantileTransformer

In [38]:
scaler=QuantileTransformer()

In [39]:
df[["avg_time_spent", "avg_transaction_value", "avg_frequency_login_days", "points_in_wallet"]] = scaler.fit_transform(df[["avg_time_spent", "avg_transaction_value", "avg_frequency_login_days", "points_in_wallet"]])

In [40]:
df.head()

Unnamed: 0_level_0,age,gender,membership_category,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
fffe4300490044003600300030003800,18,0,3,17,0.696276,0.925617,0.594595,0.829866,1,1,0,1,4,2
fffe43004900440032003100300035003700,32,0,4,16,0.702686,0.226184,0.341842,0.051051,1,0,1,2,5,1
fffe4300490044003100390032003600,44,0,2,14,0.843812,0.378357,0.757257,0.170557,0,1,1,3,3,5
fffe43004900440036003000330031003600,37,1,2,11,0.2305,0.456605,0.198699,0.24384,0,1,1,4,3,5
fffe43004900440031003900350030003600,31,0,2,20,0.39627,0.441241,0.558559,0.449464,0,1,1,2,3,5


In [41]:
df["churn_risk_score"]=df["churn_risk_score"].astype("category")

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30566 entries, fffe4300490044003600300030003800 to fffe43004900440033003600340034003200
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   age                           30566 non-null  int64   
 1   gender                        30566 non-null  int32   
 2   membership_category           30566 non-null  int32   
 3   days_since_last_login         30566 non-null  int64   
 4   avg_time_spent                30566 non-null  float64 
 5   avg_transaction_value         30566 non-null  float64 
 6   avg_frequency_login_days      30566 non-null  float64 
 7   points_in_wallet              30566 non-null  float64 
 8   used_special_discount         30566 non-null  int32   
 9   offer_application_preference  30566 non-null  int32   
 10  past_complaint                30566 non-null  int32   
 11  complaint_status              30566 non-null  int

### Data retained

In [43]:
(df.shape[0]/36992)*100

82.62867647058823

### Splitting the train data into train and test (70/30)

In [44]:
x=df.drop(["churn_risk_score"], axis=1)
y=df["churn_risk_score"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=100)
x_train.head()

Unnamed: 0_level_0,age,gender,membership_category,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
fffe4300490044003400380030003900,48,1,0,9,0.381275,0.022375,0.986764,0.580781,1,0,0,1,7
fffe43004900440033003700320038003200,34,1,0,10,0.336392,0.383582,0.198699,0.051051,0,1,0,1,2
fffe43004900440031003400320039003800,25,1,1,14,0.265093,0.564708,0.903904,0.381384,0,1,1,0,3
fffe43004900440034003800380033003800,54,0,2,16,0.299015,0.453864,0.005568,0.003109,0,1,0,1,0
fffe43004900440034003100360034003500,23,1,0,1,0.877213,0.135298,0.964464,0.602029,1,0,0,1,7


### XG Boost

In [45]:
import xgboost as xgb

In [46]:
xg_cl = xgb.XGBClassifier(objective='multi:softmax', n_estimators=200,seed=123,learning_rate=0.15,max_depth=5,colsample_bytree=1,subsample=1)

In [47]:
xg_cl.fit(x_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=123, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, seed=123, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

### Cross validation to check for biased data

In [48]:
from sklearn.model_selection import cross_val_score
cv_results = cross_val_score(xg_cl, x_train, y_train, cv=5)
cv_results



array([0.78621495, 0.77751811, 0.77120823, 0.7810236 , 0.77938771])

In [49]:
y_xg_pred = xg_cl.predict(x_test)

In [50]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [51]:
print(classification_report(y_test, y_xg_pred))

              precision    recall  f1-score   support

           1       0.75      0.82      0.78       683
           2       0.80      0.74      0.77       702
           3       0.91      0.93      0.92      2675
           4       0.73      0.58      0.64      2603
           5       0.72      0.85      0.78      2507

    accuracy                           0.78      9170
   macro avg       0.78      0.78      0.78      9170
weighted avg       0.78      0.78      0.78      9170



In [52]:
print(confusion_matrix(y_test,y_xg_pred))

[[ 557  126    0    0    0]
 [ 186  516    0    0    0]
 [   0    0 2478  197    0]
 [   0    0  256 1507  840]
 [   0    0    0  372 2135]]


In [53]:
print(accuracy_score(y_test, y_xg_pred))

0.7844056706652126


In [54]:
100 * metrics.f1_score(y_test, y_xg_pred, average="macro")

77.7676309343405

### Training the whole data

In [55]:
xg_cl.fit(x,y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=123, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, seed=123, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

### Predict test dataset

In [56]:
df1 = pd.read_csv("dataset/test.csv")

In [57]:
df1['customer_id'].duplicated().any()

False

In [58]:
df1.shape

(19919, 24)

In [59]:
df1=df1.set_index('customer_id')
df1.head()

Unnamed: 0_level_0,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,preferred_offer_types,...,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fffe43004900440031003700300030003400,Alethia Meints,50,F,OQJ1XAY,Village,Premium Membership,2015-11-02,No,xxxxxxxx,Without Offers,...,12,386.26,40721.44,7.0,733.83,Yes,No,No,Not Applicable,Poor Product Quality
fffe43004900440031003900370037003300,Ming Lopez,41,M,OUQRPKO,Village,Gold Membership,2016-03-01,No,xxxxxxxx,Without Offers,...,11,37.8,9644.4,9.0,726.0,Yes,No,No,Not Applicable,Poor Website
fffe43004900440034003800360037003000,Carina Flannigan,31,F,02J2RE7,Town,Silver Membership,2017-03-03,No,xxxxxxxx,Gift Vouchers/Coupons,...,18,215.36,3693.25,21.0,713.78,Yes,No,Yes,Solved in Follow-up,No reason specified
fffe43004900440036003200370033003400,Kyung Wanner,64,M,5YEQIF1,Town,Silver Membership,2017-08-18,Yes,CID8941,Credit/Debit Card Offers,...,-999,44.57,36809.56,11.0,744.97,Yes,No,Yes,No Information Available,Too many ads
fffe43004900440035003000370031003900,Enola Gatto,16,F,100RYB5,Town,No Membership,2015-05-05,Yes,CID5690,Without Offers,...,6,349.88,40675.86,8.0,299.048351,No,Yes,Yes,Solved in Follow-up,Poor Website


In [60]:
x_train.columns

Index(['age', 'gender', 'membership_category', 'days_since_last_login',
       'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days',
       'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback'],
      dtype='object')

In [61]:
df1 = df1[x_train.columns]

In [62]:
df1["avg_frequency_login_days"] = df1["avg_frequency_login_days"].replace('Error', '0')
df1["avg_frequency_login_days"] = pd.to_numeric(df1["avg_frequency_login_days"])

In [63]:
df1_categorical =df1.select_dtypes(include=['object'])
df1_categorical.head()

Unnamed: 0_level_0,gender,membership_category,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
fffe43004900440031003700300030003400,F,Premium Membership,Yes,No,No,Not Applicable,Poor Product Quality
fffe43004900440031003900370037003300,M,Gold Membership,Yes,No,No,Not Applicable,Poor Website
fffe43004900440034003800360037003000,F,Silver Membership,Yes,No,Yes,Solved in Follow-up,No reason specified
fffe43004900440036003200370033003400,M,Silver Membership,Yes,No,Yes,No Information Available,Too many ads
fffe43004900440035003000370031003900,F,No Membership,No,Yes,Yes,Solved in Follow-up,Poor Website


In [64]:
le1=preprocessing.LabelEncoder()
df1_categorical=df1_categorical.apply(le1.fit_transform)
df1_categorical.head()

Unnamed: 0_level_0,gender,membership_category,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
fffe43004900440031003700300030003400,0,4,1,0,0,1,2
fffe43004900440031003900370037003300,1,1,1,0,0,1,3
fffe43004900440034003800360037003000,0,5,1,0,1,3,0
fffe43004900440036003200370033003400,1,5,1,0,1,0,7
fffe43004900440035003000370031003900,0,2,0,1,1,3,3


In [65]:
cat_col = df1_categorical.columns

In [66]:
df1[cat_col] = df1_categorical[cat_col]

In [67]:
df1['points_in_wallet']=df1['points_in_wallet'].fillna(0)

In [68]:
df1[["avg_time_spent", "avg_transaction_value", "avg_frequency_login_days", "points_in_wallet"]] = scaler.fit_transform(df1[["avg_time_spent", "avg_transaction_value", "avg_frequency_login_days", "points_in_wallet"]])
df1.head()

Unnamed: 0_level_0,age,gender,membership_category,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
fffe43004900440031003700300030003400,50,0,4,12,0.774779,0.745914,0.238238,0.681363,1,0,0,1,2
fffe43004900440031003900370037003300,41,1,1,11,0.178946,0.16692,0.308809,0.659842,1,0,0,1,3
fffe43004900440034003800360037003000,31,0,5,18,0.591583,0.055063,0.731732,0.612692,1,0,1,3,0
fffe43004900440036003200370033003400,64,1,5,-999,0.20222,0.673339,0.37988,0.711745,1,0,1,0,7
fffe43004900440035003000370031003900,16,0,2,6,0.745899,0.744724,0.273774,0.137508,0,1,1,3,3


In [69]:
df1.shape

(19919, 13)

In [70]:
df1.isna().sum().any()

False

In [71]:
y_test_pred = xg_cl.predict(df1)
y_test_pred

array([3, 3, 4, ..., 5, 4, 3], dtype=int64)

In [72]:
my_df = {'customer_id' : df1.index, 'churn_risk_score':y_test_pred}
data = pd.DataFrame(my_df)
data.to_csv("churn_predictions.csv", header=True, index=False)