<a href="https://colab.research.google.com/github/vishnuprasad2004/machine_learning_codsoft/blob/main/Bank_Customer_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'bank-customer-churn-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2008274%2F3322096%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240629%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240629T172832Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D97b19a22c22e92b7b938b4e8ba0ee2825fee8cd8b1e70c9fea7c34f15d6727724ddbdcd160b5901569ac830ae210c1917706773a0591d83aa1517b48d3cebde1c612b220c499cef68b6910cb6fd20f70a49e1c2204676dde7cce1ba454201eed64fd7eac4937065ad2208f8e39f5888ea6449625863a8a475ea96f835b542bb9e92dbd2b06751d6bf27f82ad5e227f4a2dcb5620c4cb2390b0fce1f1a17d9a425eeaf277922ae08192701b7838dad758b9d5b69f88ed216d3419621d772d4c24a4b4b65c0e61a1e0dd0804c23fcdac178dfb7a1b411ddeeea962893fe0606aa0a32ddb397234d57f38a499929729512ca4ba0359f369106df911d20945cf1fae'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading bank-customer-churn-prediction, 267794 bytes compressed
Downloaded and uncompressed: bank-customer-churn-prediction
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score
from sklearn.utils import resample

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv


# Importing the Dataset

In [3]:
df = pd.read_csv("/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv")
df.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


### Data Information

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [5]:
# Handling null values
print("Count of null values in the dataset: ",df.isna().sum().sum())

Count of null values in the dataset:  0


# Data Preprocessing

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

X = df[["CreditScore", "Age", "Balance", "EstimatedSalary", "Gender", "IsActiveMember","HasCrCard","NumOfProducts","Geography"]]
X.loc[:, 'Gender'] = X["Gender"].map({'Male': 0, 'Female': 1})
X.loc[:, "Geography"] = le.fit_transform(X["Geography"])
Y = df[["Exited"]]

X.head(10),Y.head(10)

(   CreditScore  Age    Balance  EstimatedSalary Gender  IsActiveMember  \
 0          619   42       0.00        101348.88      1               1   
 1          608   41   83807.86        112542.58      1               1   
 2          502   42  159660.80        113931.57      1               0   
 3          699   39       0.00         93826.63      1               0   
 4          850   43  125510.82         79084.10      1               1   
 5          645   44  113755.78        149756.71      0               0   
 6          822   50       0.00         10062.80      0               1   
 7          376   29  115046.74        119346.88      1               0   
 8          501   44  142051.07         74940.50      0               1   
 9          684   27  134603.88         71725.73      0               1   
 
    HasCrCard  NumOfProducts Geography  
 0          1              1         0  
 1          0              1         2  
 2          1              3         0  
 3       

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

print(len(X_train), len(X_test), len(Y_train), len(Y_test))

8000 2000 8000 2000


In [8]:
X_train

Unnamed: 0,CreditScore,Age,Balance,EstimatedSalary,Gender,IsActiveMember,HasCrCard,NumOfProducts,Geography
5291,659,32,155584.21,153662.88,1,1,0,1,0
8771,681,31,97338.19,48226.76,0,0,0,2,1
35,475,45,134264.04,27822.99,1,0,1,1,0
1255,624,46,0.00,62825.03,0,1,1,2,2
3266,537,30,103138.17,96555.42,0,1,1,1,2
...,...,...,...,...,...,...,...,...,...
6400,676,30,0.00,179066.58,1,0,0,2,2
9160,778,24,0.00,162809.20,0,1,1,2,0
9859,678,55,129646.91,184125.10,0,1,1,1,1
1688,601,41,0.00,160607.06,1,1,0,2,0


## Logistic Regression
Logistic regression is a statistical method for predicting binary classes. The outcome or target variable is binary, meaning it can take on two possible outcomes (0 or 1, true or false, etc.). It models the probability of the default class (usually 1) and uses a logistic function to produce outputs between 0 and 1.

<br><br><br>
$$\min_{w} \frac{1}{S}\sum_{i=1}^n s_i
\left(-y_i \log(\hat{p}(X_i)) - (1 - y_i) \log(1 - \hat{p}(X_i))\right)
+ \frac{r(w)}{S C}\,,$$

#### Advantages

* Simplicity: Easy to implement and interpret.
* Efficiency: Works well with small datasets and doesn't require high computational resources.
* Probabilistic Outputs: Provides probabilities for class membership.
<br>

In [9]:
model = LogisticRegression(penalty="l1", solver="liblinear")
model.fit(X_train, Y_train)

  y = column_or_1d(y, warn=True)


In [10]:
Y_pred = model.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print("Testing Accuracy Score:", accuracy)

Testing Accuracy Score: 0.817


## Random Forest Algorithm
Random Forest is an ensemble learning method used for classification and regression tasks. It builds multiple decision trees during training and merges them to improve the model's accuracy and control overfitting.

#### Advantages

* High Accuracy: Often provides high predictive accuracy due to the ensemble nature.
* Robustness: Less prone to overfitting compared to individual decision trees.
* Feature Importance: Offers insights into which features are most influential in predictions.

In [11]:
from sklearn.ensemble import RandomForestClassifier

rf_params = {'max_depth': 16,
             'min_samples_leaf': 1,
             'min_samples_split': 2,
             'n_estimators': 100,
             'random_state': 12345}

rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, Y_train)
Y_pred_rf = model.predict(X_test)
rf_accuracy = accuracy_score(Y_test, Y_pred_rf)
print("Training Accuracy :", rf_accuracy)

  rf_model.fit(X_train, Y_train)


Training Accuracy : 0.817


# Conclusion
#### In conclusion we find out that Logistic Regression (with L1 Regularization) and Random Forest Algorithm gives an accuracy of **81.75%**