In [1]:
#Churn Prediction

In [2]:
#Problem Statement: Financial institutions, such as banks and insurance companies, face the constant challenge of customer churn, where customers stop using their services.
#Predicting customer churn allows these institutions to proactively engage at-risk customers with retention strategies.
#The goal is to build a model that can predict whether a customer is likely to leave the financial institution within a specific time frame, based on historical data.

In [3]:
#Import important libraries
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
#Importing Data
df= pd.read_csv('Churn_Modelling.csv')

In [5]:
#Lets check whats inside
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [7]:
#Lets check for missing values if any
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [8]:
#Lets check for duplicate values
df[df.duplicated()]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited


In [9]:
#We have two categorical columns i.e. geography and gender
#Lets convert the same to numerical columns using Label Encoder

In [10]:
le = LabelEncoder()
df['Geography'] = le.fit_transform(df['Geography'])
df['Gender'] = le.fit_transform(df['Gender'])

In [11]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,0,0,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,2,0,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,0,0,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,0,0,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,2,0,43,2,125510.82,1,1,1,79084.1,0


In [12]:
##Lets put independent features together
features= ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

In [13]:
X= df[features]
y= df['Exited']

In [14]:
##Splitting data for training and testing
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 42)

In [15]:
###Feature Scaling
scaler= StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)

In [16]:
X_train[:5] #sample check

array([[ 0.35649971, -0.9055496 ,  0.91324755, -0.6557859 ,  0.34567966,
        -1.21847056,  0.80843615,  0.64920267,  0.97481699,  1.36766974],
       [-0.20389777,  0.30164867,  0.91324755,  0.29493847, -0.3483691 ,
         0.69683765,  0.80843615,  0.64920267,  0.97481699,  1.6612541 ],
       [-0.96147213,  1.50884694,  0.91324755, -1.41636539, -0.69539349,
         0.61862909, -0.91668767,  0.64920267, -1.02583358, -0.25280688],
       [-0.94071667, -0.9055496 , -1.09499335, -1.13114808,  1.38675281,
         0.95321202, -0.91668767,  0.64920267, -1.02583358,  0.91539272],
       [-1.39733684, -0.9055496 ,  0.91324755,  1.62595257,  1.38675281,
         1.05744869, -0.91668767, -1.54035103, -1.02583358, -1.05960019]])

In [17]:
###Lets apply Random Forest Classifier
model= RandomForestClassifier()
model.fit(X_train, y_train)

In [18]:
#Lets predict for testing
y_pred= model.predict(X_test)

In [19]:
########Lets check the accuracy of our model
conf_matrix= confusion_matrix(y_test, y_pred)
class_rep= classification_report(y_test, y_pred)
accuracy= accuracy_score(y_test, y_pred)

In [20]:
#Lets display the metrics
print(conf_matrix)
print(class_rep)
print(accuracy)

[[1552   55]
 [ 212  181]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.77      0.46      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000

0.8665


In [21]:
#Applying Logistic Regression

from sklearn.linear_model import LogisticRegression
log_reg= LogisticRegression()
log_reg.fit(X_train, y_train)

In [22]:
y_pred_lg= log_reg.predict(X_test)

In [23]:
########Lets check the accuracy of our model
conf_matrix= confusion_matrix(y_test, y_pred_lg)
class_rep= classification_report(y_test, y_pred_lg)
accuracy= accuracy_score(y_test, y_pred_lg)

In [24]:
print(conf_matrix)
print(class_rep)
print(accuracy)

[[1559   48]
 [ 322   71]]
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1607
           1       0.60      0.18      0.28       393

    accuracy                           0.81      2000
   macro avg       0.71      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000

0.815


In [25]:
## Applying K Neighbors Classification

from sklearn.neighbors import KNeighborsClassifier
knn= KNeighborsClassifier()
knn.fit(X_train, y_train)

In [26]:
y_pred_knn= knn.predict(X_test)

In [27]:
########Lets check the accuracy of our model
conf_matrix= confusion_matrix(y_test, y_pred_knn)
class_rep= classification_report(y_test, y_pred_knn)
accuracy= accuracy_score(y_test, y_pred_knn)

In [28]:
print(conf_matrix)
print(class_rep)
print(accuracy)

[[1519   88]
 [ 241  152]]
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      1607
           1       0.63      0.39      0.48       393

    accuracy                           0.84      2000
   macro avg       0.75      0.67      0.69      2000
weighted avg       0.82      0.84      0.82      2000

0.8355


In [29]:
from sklearn.ensemble import GradientBoostingClassifier
gbm= GradientBoostingClassifier()
gbm.fit(X_train, y_train)

In [30]:
y_pred_gbm= gbm.predict(X_test)

In [31]:
########Lets check the accuracy of our model
conf_matrix= confusion_matrix(y_test, y_pred_gbm)
class_rep= classification_report(y_test, y_pred_gbm)
accuracy= accuracy_score(y_test, y_pred_gbm)

In [32]:
print(conf_matrix)
print(class_rep)
print(accuracy)

[[1547   60]
 [ 209  184]]
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.47      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.87      0.85      2000

0.8655
