In [24]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
# Load the customer churn dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='customer_churn.csv')

# Print the column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3083 entries, 0 to 3082
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   3083 non-null   int64  
 1   Churn                        3083 non-null   int64  
 2   Tenure                       2930 non-null   float64
 3   PreferredLoginDevice         3083 non-null   object 
 4   CityTier                     3083 non-null   int64  
 5   WarehouseToHome              2929 non-null   float64
 6   PreferredPaymentMode         3083 non-null   object 
 7   Gender                       3083 non-null   object 
 8   HourSpendOnApp               2933 non-null   float64
 9   NumberOfDeviceRegistered     3083 non-null   int64  
 10  PreferedOrderCat             3083 non-null   object 
 11  SatisfactionScore            3083 non-null   int64  
 12  MaritalStatus                3083 non-null   object 
 13  NumberOfAddress   

In [3]:
# Per-column count of missing values (isna is the canonical name; isnull is its alias).
df.isna().sum()

CustomerID                       0
Churn                            0
Tenure                         153
PreferredLoginDevice             0
CityTier                         0
WarehouseToHome                154
PreferredPaymentMode             0
Gender                           0
HourSpendOnApp                 150
NumberOfDeviceRegistered         0
PreferedOrderCat                 0
SatisfactionScore                0
MaritalStatus                    0
NumberOfAddress                  0
Complain                         0
OrderAmountHikeFromlastYear    131
CouponUsed                     126
OrderCount                     128
DaySinceLastOrder              166
CashbackAmount                   0
dtype: int64

In [4]:
# Remove rows that are exact duplicates across every column, keeping the first
# occurrence of each. The frame is modified in place, so its original index
# labels are preserved (the index is not renumbered).
df.drop_duplicates(keep='first', inplace=True)

# Re-print the summary to see how many rows remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3078 entries, 0 to 3082
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   3078 non-null   int64  
 1   Churn                        3078 non-null   int64  
 2   Tenure                       2925 non-null   float64
 3   PreferredLoginDevice         3078 non-null   object 
 4   CityTier                     3078 non-null   int64  
 5   WarehouseToHome              2924 non-null   float64
 6   PreferredPaymentMode         3078 non-null   object 
 7   Gender                       3078 non-null   object 
 8   HourSpendOnApp               2928 non-null   float64
 9   NumberOfDeviceRegistered     3078 non-null   int64  
 10  PreferedOrderCat             3078 non-null   object 
 11  SatisfactionScore            3078 non-null   int64  
 12  MaritalStatus                3078 non-null   object 
 13  NumberOfAddress   

In [18]:
# Numeric columns that contain missing values; each gap is filled with that
# column's mean rounded to one decimal place.
# NOTE(review): the means are computed on the full frame, before the
# train/test split performed further down; confirm that this is acceptable.
columns_to_impute = (
    'Tenure',
    'WarehouseToHome',
    'HourSpendOnApp',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
)
for column in columns_to_impute:
    rounded_mean = round(df[column].mean(), 1)
    df[column] = df[column].fillna(rounded_mean)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3078 entries, 0 to 3082
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CustomerID                   3078 non-null   int64  
 1   Churn                        3078 non-null   int64  
 2   Tenure                       3078 non-null   float64
 3   PreferredLoginDevice         3078 non-null   object 
 4   CityTier                     3078 non-null   int64  
 5   WarehouseToHome              3078 non-null   float64
 6   PreferredPaymentMode         3078 non-null   object 
 7   Gender                       3078 non-null   object 
 8   HourSpendOnApp               3078 non-null   float64
 9   NumberOfDeviceRegistered     3078 non-null   int64  
 10  PreferedOrderCat             3078 non-null   object 
 11  SatisfactionScore            3078 non-null   int64  
 12  MaritalStatus                3078 non-null   object 
 13  NumberOfAddress   

In [20]:
# Work on an independent copy of the string-typed (object dtype) columns so the
# encoding steps that follow do not touch df until the codes are written back.
obj_df = df.select_dtypes(include='object').copy()

# Preview the first five rows.
obj_df.head(5)

Unnamed: 0,PreferredLoginDevice,PreferredPaymentMode,Gender,PreferedOrderCat,MaritalStatus
0,Computer,Debit Card,Male,Mobile Phone,Single
1,Mobile Phone,Debit Card,Male,Grocery,Married
2,Mobile Phone,Credit Card,Male,Laptop & Accessory,Single
3,Phone,E wallet,Male,Mobile,Married
4,Computer,Debit Card,Male,Laptop & Accessory,Divorced


In [21]:
# Convert each text column to pandas' categorical dtype in one astype call.
categorical_dtypes = {
    "PreferredLoginDevice": 'category',
    "PreferredPaymentMode": 'category',
    "Gender": 'category',
    "PreferedOrderCat": 'category',
    "MaritalStatus": 'category',
}
obj_df = obj_df.astype(categorical_dtypes)

# Confirm the resulting dtypes.
obj_df.dtypes

PreferredLoginDevice    category
PreferredPaymentMode    category
Gender                  category
PreferedOrderCat        category
MaritalStatus           category
dtype: object

In [22]:
# For every categorical column add a sibling "<column>_cat" column holding the
# integer code pandas assigned to each category.
# NOTE(review): in the displayed output "Mobile Phone" and "Phone"
# (PreferredLoginDevice) and "Mobile" and "Mobile Phone" (PreferedOrderCat)
# receive different codes; confirm that these really are distinct categories.
# NOTE(review): the codes are arbitrary integers for what look like unordered
# categories; verify that an ordinal encoding is intended for the model.
for source_column in (
    "PreferredLoginDevice",
    "PreferredPaymentMode",
    "Gender",
    "PreferedOrderCat",
    "MaritalStatus",
):
    obj_df[f"{source_column}_cat"] = obj_df[source_column].cat.codes

obj_df

Unnamed: 0,PreferredLoginDevice,PreferredPaymentMode,Gender,PreferedOrderCat,MaritalStatus,PreferredLoginDevice_cat,PreferredPaymentMode_cat,Gender_cat,PreferedOrderCat_cat,MaritalStatus_cat
0,Computer,Debit Card,Male,Mobile Phone,Single,0,4,1,4,2
1,Mobile Phone,Debit Card,Male,Grocery,Married,1,4,1,1,1
2,Mobile Phone,Credit Card,Male,Laptop & Accessory,Single,1,3,1,2,2
3,Phone,E wallet,Male,Mobile,Married,2,5,1,3,1
4,Computer,Debit Card,Male,Laptop & Accessory,Divorced,0,4,1,2,0
...,...,...,...,...,...,...,...,...,...,...
3078,Mobile Phone,Debit Card,Male,Grocery,Married,1,4,1,1,1
3079,Mobile Phone,Debit Card,Male,Fashion,Single,1,4,1,0,2
3080,Mobile Phone,Debit Card,Male,Mobile,Married,1,4,1,3,1
3081,Computer,Debit Card,Female,Mobile,Divorced,0,4,0,3,0


In [23]:
# Overwrite each text column in df with its integer-coded counterpart from
# obj_df. Assignment aligns on the index, which both frames share because
# obj_df was derived from df.
for encoded_column in (
    'PreferredLoginDevice',
    'PreferredPaymentMode',
    'Gender',
    'PreferedOrderCat',
    'MaritalStatus',
):
    df[encoded_column] = obj_df[encoded_column + '_cat']

df

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,54023,1,1.0,0,1,31.0,4,1,4.0,5,4,1,2,3,1,25.0,2.0,2.0,1.0,149
1,50266,0,16.0,1,1,18.0,4,1,2.0,4,1,4,1,2,0,11.0,0.0,3.0,9.0,243
2,54985,0,16.0,1,1,36.0,3,1,3.0,6,2,1,2,6,0,20.0,2.0,2.0,1.0,168
3,52029,1,0.0,2,3,15.8,5,1,2.0,4,3,4,1,2,1,18.0,0.0,1.0,0.0,124
4,50084,0,13.0,0,1,15.0,4,1,3.0,4,2,2,0,10,0,14.0,1.0,1.0,2.0,149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3078,51224,0,13.0,1,1,6.0,4,1,3.0,3,1,5,1,3,0,11.0,1.8,11.0,16.0,268
3079,51396,1,11.0,1,1,6.0,4,1,3.0,4,0,3,2,10,1,13.0,0.0,1.0,0.0,154
3080,52017,0,9.1,1,1,16.0,4,1,2.0,3,3,4,1,2,0,14.0,1.0,1.0,0.0,120
3081,50355,1,0.0,0,1,13.0,4,0,2.9,3,3,3,0,2,1,16.0,0.0,1.0,2.0,129


In [25]:
# Feature matrix: every column except the target, as a NumPy array.
# NOTE(review): only 'Churn' is removed here, so CustomerID remains in X as a
# feature; confirm whether an identifier column should be fed to the model.
X = df.drop(columns=['Churn']).to_numpy()

# Target vector: the Churn column.
y = df['Churn'].to_numpy()

# Show both shapes, one per line.
print(X.shape, y.shape, sep='\n')

(3078, 19)
(3078,)


In [26]:
# Hold out half of the rows for testing; the fixed random_state makes the
# shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=12,
)

In [28]:
# Import the estimator class itself. The previous `from sklearn import svm`
# only bound the sklearn.svm module, so the bare name `SVR` used below was
# undefined and the cell failed with a NameError.
from sklearn.svm import SVR

# Support vector regressor with a degree-2 polynomial kernel.
# The variable keeps the name `svm` so that any later cell referring to it
# continues to work.
# NOTE(review): SVR is a regression estimator, while Churn appears as 0/1 in the
# displayed data; confirm that a regressor (rather than a classifier such as
# SVC) is intended.
svm = SVR(kernel='poly', degree=2, gamma='auto')