In [39]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_theme(style='darkgrid')
import warnings
warnings.filterwarnings('ignore')

In [40]:
data = pd.read_csv('Bank-data.csv')
data.head()

Unnamed: 0,Index,interest_rate,credit,Gender,previous,duration,Churn
0,0,1.334,0,1,0,117,no
1,1,0.767,0,0,1,274,yes
2,2,4.858,0,1,0,167,no
3,3,4.12,0,0,0,686,yes
4,4,4.856,0,1,0,159,no


In [41]:
data.isnull().any()

Index            False
interest_rate    False
credit           False
Gender           False
previous         False
duration         False
Churn            False
dtype: bool

In [42]:
data.isnull().sum()

Index            0
interest_rate    0
credit           0
Gender           0
previous         0
duration         0
Churn            0
dtype: int64

In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 518 entries, 0 to 517
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Index          518 non-null    int64  
 1   interest_rate  518 non-null    float64
 2   credit         518 non-null    int64  
 3   Gender         518 non-null    int64  
 4   previous       518 non-null    int64  
 5   duration       518 non-null    int64  
 6   Churn          518 non-null    object 
dtypes: float64(1), int64(5), object(1)
memory usage: 28.5+ KB


In [44]:
data['Churn'] = data['Churn'].astype('category')
data['Churn'] = data['Churn'].cat.codes

In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 518 entries, 0 to 517
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Index          518 non-null    int64  
 1   interest_rate  518 non-null    float64
 2   credit         518 non-null    int64  
 3   Gender         518 non-null    int64  
 4   previous       518 non-null    int64  
 5   duration       518 non-null    int64  
 6   Churn          518 non-null    int8   
dtypes: float64(1), int64(5), int8(1)
memory usage: 24.9 KB


In [46]:
data['Churn'].value_counts()

0    259
1    259
Name: Churn, dtype: int64

In [47]:
# split the data into ind and dep variable
x = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [48]:
x.head()

Unnamed: 0,Index,interest_rate,credit,Gender,previous,duration
0,0,1.334,0,1,0,117
1,1,0.767,0,0,1,274
2,2,4.858,0,1,0,167
3,3,4.12,0,0,0,686
4,4,4.856,0,1,0,159


In [49]:
y.head()

0    0
1    1
2    0
3    1
4    0
Name: Churn, dtype: int8

In [50]:
# split the data into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=101)

# LogisticRegression Model

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
logit = LogisticRegression()
logit.fit(x_train, y_train)
y_pred = logit.predict(x_test)
print(accuracy_score(y_test, y_pred))

0.8538461538461538


# PCA - Principal Component Analysis

In [52]:
from sklearn.decomposition import PCA

In [53]:
pd.DataFrame(x_train).head(2)

Unnamed: 0,Index,interest_rate,credit,Gender,previous,duration
69,69,4.12,0,0,0,104
260,260,1.453,0,0,0,463


In [57]:
pca = PCA(n_components=None) # None - all component/variables/features
x_train_n = pca.fit_transform(x_train)
x_test_n = pca.fit_transform(x_test)

In [58]:
pd.DataFrame(x_train_n).head(2)

Unnamed: 0,0,1,2,3,4,5
0,-293.378624,175.880375,1.204506,-0.217953,-0.058617,-0.064395
1,74.599343,2.80612,-1.361333,-0.239816,-0.275379,-0.065405


In [59]:
# To check pca significant label in each slice
explained_variance = pca.explained_variance_ratio_

In [60]:
explained_variance

array([8.07144870e-01, 1.92823493e-01, 2.92532511e-05, 1.52261165e-06,
       6.96408766e-07, 1.64823601e-07])

In [24]:
# pca0 = 0.80714487 - 80.7%
# pca1 = 0.192823493 - 19.2% 
# pca2 = 0.00002925325
# pca3 = 0.00000152261
# pca4 = 6.96408766e-7
# pca5 = 1.64823601e-7
(2/6)*100

33.33333333333333

In [None]:
# PCA = 2 component are significant

In [61]:
pca_2 = PCA(n_components=2) # None - all component/variables/features
x_train_2 = pca_2.fit_transform(x_train)
x_test_2 = pca_2.fit_transform(x_test)

In [62]:
pd.DataFrame(x_train_2).head(2)

Unnamed: 0,0,1
0,-293.378624,175.880375
1,74.599343,2.80612


# LogisticRegression with PCA

In [63]:
logit1 = LogisticRegression()
logit1.fit(x_train_2, y_train)
y_pred_2 = logit1.predict(x_test_2)
print(accuracy_score(y_test, y_pred_2))


0.7384615384615385


In [68]:
# cross validation method
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(logit1, x_train_2, y_train, cv=10)
accuracy_test = cross_val_score(logit1, x_test_2, y_test, cv=10)
print(accuracy)
print(accuracy.mean())
print(accuracy.max())
print("*******************")
print(accuracy_test)
print(accuracy_test.mean())
print(accuracy_test.max())

[0.76923077 0.51282051 0.76923077 0.74358974 0.69230769 0.48717949
 0.64102564 0.82051282 0.68421053 0.65789474]
0.6778002699055331
0.8205128205128205
*******************
[0.76923077 0.76923077 0.61538462 0.76923077 0.46153846 0.76923077
 0.92307692 0.76923077 0.76923077 0.69230769]
0.7307692307692308
0.9230769230769231


# RandomForest Classifier

In [65]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(accuracy_score(y_test, y_pred))

0.823076923076923
