In [273]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### Data Preparation

In [274]:
df = pd.read_excel("Customers.xlsx")
df.head(5)

Unnamed: 0,Customer ID,Gender,Age,Income,Transaction Count,Last Transaction Date,Customer Support Interactions,Order Fulfilment Rate,Churn
0,1,Male,35,75000,10,2022-12-31,2,80.0,0
1,2,Female,45,90000,20,2023-02-28,3,95.0,0
2,3,Male,28,40000,5,2022-11-15,1,60.0,1
3,4,Female,50,120000,15,2023-03-31,0,100.0,0
4,5,Male,60,80000,25,2023-01-31,5,75.0,1


#### Data Cleaning

In [275]:
df['Gender'].unique()

array(['Male', 'Female', ' Male   ', ' Female '], dtype=object)

In [276]:
df['Gender'] = df['Gender'].str.strip()


In [277]:
df['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [278]:
#### convert Gender categorical data in numerical data type
def gender(x):
    if x == 'Male':
        return 1
    else:
        return 0

In [279]:
df['gender'] = df['Gender'].apply(gender)
df.drop(['Gender'], axis = 1, inplace = True)

In [280]:
df.rename(columns = {'gender':'Gender'}, inplace = True)

#### Creating new columns Year and Month from Last Transaction Date

In [281]:
df['Year'] = df['Last Transaction Date'].dt.year
df['Month'] = df['Last Transaction Date'].dt.month
df.drop(['Last Transaction Date'],axis = 1, inplace = True)
df.head(5)

Unnamed: 0,Customer ID,Age,Income,Transaction Count,Customer Support Interactions,Order Fulfilment Rate,Churn,Gender,Year,Month
0,1,35,75000,10,2,80.0,0,1,2022,12
1,2,45,90000,20,3,95.0,0,0,2023,2
2,3,28,40000,5,1,60.0,1,1,2022,11
3,4,50,120000,15,0,100.0,0,0,2023,3
4,5,60,80000,25,5,75.0,1,1,2023,1


In [282]:
#create new column Quarter
labels = ['First Quarter','Second Quarter','Third Quarter','Forth Quarter']
df['Quarter'] = pd.cut(df['Month'], 4, labels = labels)
df.head(5)

Unnamed: 0,Customer ID,Age,Income,Transaction Count,Customer Support Interactions,Order Fulfilment Rate,Churn,Gender,Year,Month,Quarter
0,1,35,75000,10,2,80.0,0,1,2022,12,Forth Quarter
1,2,45,90000,20,3,95.0,0,0,2023,2,First Quarter
2,3,28,40000,5,1,60.0,1,1,2022,11,Forth Quarter
3,4,50,120000,15,0,100.0,0,0,2023,3,First Quarter
4,5,60,80000,25,5,75.0,1,1,2023,1,First Quarter


In [283]:
from sklearn.preprocessing import OneHotEncoder

In [284]:
enc_data = pd.DataFrame(OneHotEncoder().fit_transform(df[['Quarter','Year']]).toarray())


In [285]:
df = df.join(enc_data)
df.drop(['Month','Quarter'], axis = 1, inplace = True)


In [286]:
df.drop(['Customer ID'],axis = 1, inplace = True)
df['Age'] = df['Age'].astype('int')
df['Income'] = df['Income'].astype('float')
df['Transaction Count'] = df['Transaction Count'].astype('int')
df['Customer Support Interactions'] = df['Customer Support Interactions'].astype('int')
df['Order Fulfilment Rate'] = df['Order Fulfilment Rate'].astype('float')
df['Churn'] = df['Churn'].astype('int')
df.rename(columns = {0:'Quarter1',1:'Quarter2',2:'Quarter3',3:'Quarter4',4:'Year22',5:'Year23'}, inplace = True)
df['Quarter1'] = df['Quarter1'].astype('int')
df['Quarter2'] = df['Quarter2'].astype('int')
df['Quarter3'] = df['Quarter3'].astype('int')
df['Quarter4'] = df['Quarter4'].astype('int')
df.head(5)

Unnamed: 0,Age,Income,Transaction Count,Customer Support Interactions,Order Fulfilment Rate,Churn,Gender,Year,Quarter1,Quarter2,Quarter3,Quarter4,Year22,Year23
0,35,75000.0,10,2,80.0,0,1,2022,0,1,0,0,1.0,0.0
1,45,90000.0,20,3,95.0,0,0,2023,1,0,0,0,0.0,1.0
2,28,40000.0,5,1,60.0,1,1,2022,0,1,0,0,1.0,0.0
3,50,120000.0,15,0,100.0,0,0,2023,1,0,0,0,0.0,1.0
4,60,80000.0,25,5,75.0,1,1,2023,1,0,0,0,0.0,1.0


#### Split data into training and testing set

In [287]:
X = df.drop(['Churn'], axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [288]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

models = []
models.append(('LR', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('SVM', SVC()))

results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR: 0.800000 (0.114564)
CART: 0.700000 (0.127475)
RF: 0.737500 (0.117925)
SVM: 0.700000 (0.127475)


## Logistic Regression is giving highest accuracy

In [289]:
rf = LogisticRegression()
rf.fit(X_train, y_train)


In [290]:
y_pred = rf.predict(X_test)
for Y, y in zip(y_test, y_pred):
  print(Y, y)


0 0
1 0
1 1
1 1
0 0
0 0
1 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
1 1
0 1
1 1
0 0
0 0
0 0
0 1
1 1
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
1 0
1 1
1 1
0 0
