<a href="https://colab.research.google.com/github/tanya-dora/EDA_BankChurn/blob/main/Bank_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [18]:
dataset = pd.read_csv('/content/Bank_Churn.csv')
dataset.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Summary Statistics

In [3]:
dataset.shape

(10000, 13)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1015.8+ KB


info() tells me that the dataset has no null values

In [5]:
dataset.describe()

Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


# **Data Handling and Cleaning**

1. Checking for null or duplicate values

In [6]:
print(dataset.isnull().sum())
print(dataset.duplicated().sum())

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64
0


no null or duplicated values

2. Removing irrelevant columns

In [7]:
dataset.columns

Index(['CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age',
       'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited'],
      dtype='object')

In [8]:
dataset = dataset[['CreditScore', 'Geography', 'Gender', 'Age',
       'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
       'EstimatedSalary', 'Exited']]


3. Converting categorical to numerical - Gender, Geography

In [9]:
dataset.replace(['Male', 'Female'], [0,1], inplace = True)

  dataset.replace(['Male', 'Female'], [0,1], inplace = True)


In [10]:
dummies = pd.get_dummies(dataset['Geography'], drop_first = True)
dummies.head()

Unnamed: 0,Germany,Spain
0,False,False
1,False,True
2,False,False
3,False,False
4,False,True


In [11]:
merged_dataset = pd.concat([dataset, dummies], axis = 1)
merged_dataset.drop('Geography', axis = 1, inplace = True)

In [12]:
merged_dataset.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Germany,Spain
0,619,1,42,2,0.0,1,1,1,101348.88,1,False,False
1,608,1,41,1,83807.86,1,0,1,112542.58,0,False,True
2,502,1,42,8,159660.8,3,1,0,113931.57,1,False,False
3,699,1,39,1,0.0,2,0,0,93826.63,0,False,False
4,850,1,43,2,125510.82,1,1,1,79084.1,0,False,True


4. Scale Values

In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [14]:
continuous_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
df_scaled = scaler.fit_transform(merged_dataset[continuous_columns])

In [15]:

#df_scaled = pd.DataFrame(df_scaled) column names are not given
df_scaled = pd.DataFrame(df_scaled, columns = continuous_columns)


In [16]:
df_non_continuous = merged_dataset.drop(columns=continuous_columns)
dataset_scaled = pd.concat([df_scaled, df_non_continuous], axis = 1)

In [17]:
dataset_scaled.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,Gender,HasCrCard,IsActiveMember,Exited,Germany,Spain
0,0.538,0.324324,0.2,0.0,0.0,0.506735,1,1,1,1,False,False
1,0.516,0.310811,0.1,0.334031,0.0,0.562709,1,0,1,0,False,True
2,0.304,0.324324,0.8,0.636357,0.666667,0.569654,1,1,0,1,False,False
3,0.698,0.283784,0.1,0.0,0.333333,0.46912,1,0,0,0,False,False
4,1.0,0.337838,0.2,0.500246,0.0,0.3954,1,1,1,0,False,True


# Univariate Analysis