In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [44]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder

import scipy.special
import scipy.stats as stats
from scipy.stats import skew, kurtosis, shapiro

!pip install --pre --quiet pycaret
from pycaret.classification import *

import warnings
# Suppress every warning category for the rest of the session.
# Note that this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [5]:
# Path of the churn CSV, relative to the notebook's working directory.
data = '../input/bank-customer-churn-dataset/Bank Customer Churn Prediction.csv'

In [6]:
# Load the CSV at `data` into the DataFrame used by every cell below.
df = pd.read_csv(data)

In [7]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [8]:
# (number of rows, number of columns)
df.shape

In [9]:
# Summary statistics of the numeric columns.
# `describe` must be called: without the parentheses the cell only shows the
# bound-method repr instead of the statistics table.
df.describe()

In [10]:
# dtype of each column; non-numeric columns need encoding before modelling.
df.dtypes

In [11]:
# Mean of every numeric feature for each churn class, colour-graded per column.
# Only numeric columns are averaged: string columns cannot be averaged and make
# `groupby(...).mean()` raise a TypeError on pandas >= 2.0.
features = df.select_dtypes(include="number").columns.drop('churn')
subsets = ['credit_score']  # not referenced by this cell; kept in case a later cell reads it
df.groupby('churn')[features].mean().style.background_gradient(cmap="ocean")

In [16]:
# Histogram over age with balance on the y axis, coloured by churn,
# with a box plot drawn in the margin.
histogram_options = dict(
    x="age",
    y="balance",
    color="churn",
    marginal="box",
    hover_data=df.columns,
)
fig = px.histogram(df, **histogram_options)
fig.show()

In [20]:
# Share of churned vs. retained customers as a pie chart.
labels = ['Churned', 'Not Churned']
# Counts in the same order as `labels`: churn == 1 first, then churn == 0.
sizes = [df.churn[df['churn'] == flag].count() for flag in (1, 0)]

figure, axes = plt.subplots(figsize=(10, 8))
axes.pie(sizes, labels=labels, shadow=True, autopct='%1.2f%%')
axes.legend()
axes.set_title("Churned VS Not Churned", size=15)
plt.show()

In [22]:
# Percentage of customers who churned, per country.
# One loop replaces three copy-pasted stanzas; the printed lines are unchanged.
# Each entry pairs the value stored in `country` with the demonym used in the message.
COUNTRY_DEMONYMS = (("France", "French"), ("Germany", "German"), ("Spain", "Spanish"))

for country_name, demonym in COUNTRY_DEMONYMS:
    in_country = df.country == country_name
    churned_count = df.churn[in_country & (df.churn == 1)].count()
    customer_count = df.churn[in_country].count()
    print(f"Percent of {demonym} People Who Churned --->", churned_count * 100 / customer_count, "%")

In [23]:
px.histogram(df,x='country', color = 'churn',barmode = 'group')

In [24]:
# Print the relative frequency of every label (NaN included) for each
# categorical column, framed by separator lines.
categorical_columns = ["country", "gender", "products_number", "credit_card", "active_member", "churn"]
banner = "******************"

for col in df[categorical_columns].columns:
    print(banner)
    print(col)
    print(banner)

    label_shares = df[col].value_counts(dropna=False, normalize=True)
    print(label_shares)
    print("_____________________________________________________")

In [25]:
# Encode gender as 0 for "Female" and 1 for every other value.
# The dtype guard makes the cell safe to re-run: without it a second execution
# would compare the already-encoded integers with "Female" and turn every row into 1.
if not pd.api.types.is_numeric_dtype(df["gender"]):
    df["gender"] = df["gender"].ne("Female").astype(int)

In [26]:
from collections import Counter
def detect_outliers(df, features, n=2, iqr_factor=1.5):
    """Return index labels of rows that are IQR outliers in more than ``n`` features.

    A value is an outlier for a feature when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR`` of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``features``.
    features : iterable of str
        Numeric columns to inspect.
    n : int, default 2
        A row is returned only when it is an outlier in strictly more than
        ``n`` of the inspected features.
    iqr_factor : float, default 1.5
        Multiplier applied to the inter-quartile range to get the fence width.

    Returns
    -------
    list
        Index labels of the flagged rows, in order of first detection.
    """
    outlier_indices = []

    for c in features:
        # NaN-aware quartiles: plain np.percentile returns NaN as soon as the
        # column has a missing value, which silently disables detection for it.
        Q1, Q3 = np.nanpercentile(df[c], [25, 75])
        outlier_step = (Q3 - Q1) * iqr_factor
        is_outlier = (df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)
        outlier_indices.extend(df.index[is_outlier.to_numpy()])

    # Number of features in which each row was flagged.
    hits_per_row = Counter(outlier_indices)
    return [label for label, hits in hits_per_row.items() if hits > n]


In [27]:
# Display the rows flagged as outliers across the continuous features.
outlier_columns = ["credit_score", "balance", "age", "estimated_salary"]
outlier_labels = detect_outliers(df, outlier_columns)
df.loc[outlier_labels]


In [28]:
# Pairwise correlation heatmap of the numeric columns.
# Non-numeric columns are excluded explicitly: `DataFrame.corr()` raises on
# string columns from pandas 2.0 onwards.
plt.subplots(figsize=(20,10))
sns.heatmap(df.select_dtypes(include="number").corr(), annot = True, fmt = ".2f", cmap = "viridis");

In [29]:
# Correlation of every numeric feature with the target.
# Non-numeric columns are excluded explicitly because `DataFrame.corr()` raises
# on string columns from pandas 2.0 onwards.
corr = df.select_dtypes(include="number").corr()["churn"]
# Drop the target's self-correlation by label instead of assuming it is the last entry.
feature_corr = corr.drop("churn")
plt.figure(figsize = (15,6))
sns.barplot(x = feature_corr.index, y = feature_corr)
plt.title("Corelation of features to churn")
plt.xticks(rotation=90);

In [32]:
# Print the skewness of each continuous attribute.
# `.items()` is used because `DataFrame.iteritems()` was removed in pandas 2.0.
for (colName, colData) in df[["credit_score","age", "balance","estimated_salary"]].items():
    print("Feature name: {0}  ---->>  Skewness: {1}".format(colName.upper(), stats.skew(colData)))

In [33]:
# Bucket the continuous features into ordered letter grades ("A" = highest values).
df["Salary_Segment"] = pd.qcut(df["estimated_salary"], 5, labels=["E","D","C","B","A"])
# Quintiles, i.e. 0-20% --> E, 20-40% --> D, 40-60% --> C, 60-80% --> B, 80-100% --> A
df["Credit_Score_Segment"] = pd.qcut(df["credit_score"], 5, labels=["E","D","C","B","A"])
# 'Worst': "E", 'Bad': "D", 'Fair': "C", 'Good': "B", 'Excellent': "A"
# Balance quartiles are computed on positive balances only; the remaining rows
# are left as NaN by index alignment and receive the extra grade "E" below.
df["Balance_Segment"] = pd.qcut(df["balance"][df["balance"]>0], 4, labels=["D","C","B","A"])
df['Balance_Segment'] = df['Balance_Segment'].cat.add_categories('E')
# Assign the result back instead of `fillna(..., inplace=True)` on the column
# selection: the chained in-place form is deprecated and does not update `df`
# when pandas copy-on-write is active.
df['Balance_Segment'] = df['Balance_Segment'].fillna('E')

In [34]:
# Replace each categorical column with integer codes.
# The single encoder is re-fitted per column, so after the loop `le` holds the
# classes of the last column only.
le = LabelEncoder()
for column in ("country", "Salary_Segment", "Credit_Score_Segment", "Balance_Segment"):
    df[column] = le.fit_transform(df[column])

In [35]:
# Remove the raw continuous columns now that their segment features exist.
# Rebinding with errors="ignore" keeps the cell idempotent: the in-place drop
# raised KeyError when the cell was executed a second time.
df = df.drop(columns = ["estimated_salary","credit_score","balance"], errors = "ignore")

In [45]:
# Fixed seed so the train/test split and every model trained afterwards are
# reproducible across "Run All" executions.
SESSION_SEED = 42
# NOTE(review): PyCaret documents `preprocess=False` as applying no transformations
# other than the train/test split, so `remove_multicollinearity`, `feature_selection`
# and `fix_imbalance` are presumably ignored here. Confirm against the installed
# version and switch to `preprocess=True` if those steps are intended.
# NOTE(review): if the frame still carries a row-identifier column it is used as
# a feature; consider `ignore_features` (TODO confirm the column list).
s = setup(df, target = 'churn', train_size = .8, preprocess = False, transformation= False,  
          remove_multicollinearity = True, multicollinearity_threshold = 0.9, feature_selection = True,
          fix_imbalance = True, session_id = SESSION_SEED)

In [46]:
# Compare all available models, ranked by AUC; the top-ranked one is kept as `best_model`.
ranking_metric = 'AUC'
best_model = compare_models(sort=ranking_metric)

In [47]:
# Feature importance plot for the model selected by compare_models.
plot_model(best_model, plot = 'feature')

In [48]:
# Confusion matrix for the model selected by compare_models.
plot_model(best_model, plot = 'confusion_matrix')