In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix



In [2]:
# Load the dataset
df = pd.read_csv('Churn_Modelling.csv')



In [3]:
# Inspect the dataset
print(df.head())
print(df.info())



   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [4]:
# Drop irrelevant columns (if any)
# Assuming "RowNumber", "CustomerId", and "Surname" are irrelevant
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"])

# Check for missing values
print(df.isnull().sum())

# Encode categorical features
label_encoder = LabelEncoder()
df['Geography'] = label_encoder.fit_transform(df['Geography'])
df['Gender'] = label_encoder.fit_transform(df['Gender'])

# Feature and target split
X = df.drop(columns=["Exited"])  # Features
y = df["Exited"]  # Target (Churn)



CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [5]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)




In [6]:
# Model 1: Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)



In [7]:
# Model 2: Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)



In [8]:
# Model 3: Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_clf.fit(X_train, y_train)



In [9]:
# Predictions for all models
y_pred_log_reg = log_reg.predict(X_test)
y_pred_rf_clf = rf_clf.predict(X_test)
y_pred_gb_clf = gb_clf.predict(X_test)



In [10]:
# Evaluation function
def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    return accuracy, precision, recall, f1, roc_auc



In [11]:
# Evaluating Logistic Regression
acc_log_reg, prec_log_reg, rec_log_reg, f1_log_reg, roc_auc_log_reg = evaluate_model(y_test, y_pred_log_reg)
print(f"Logistic Regression: Accuracy={acc_log_reg}, Precision={prec_log_reg}, Recall={rec_log_reg}, F1-Score={f1_log_reg}, ROC-AUC={roc_auc_log_reg}")



Logistic Regression: Accuracy=0.815, Precision=0.5966386554621849, Recall=0.1806615776081425, F1-Score=0.27734375, ROC-AUC=0.5753961279453282


In [12]:
# Evaluating Random Forest
acc_rf, prec_rf, rec_rf, f1_rf, roc_auc_rf = evaluate_model(y_test, y_pred_rf_clf)
print(f"Random Forest: Accuracy={acc_rf}, Precision={prec_rf}, Recall={rec_rf}, F1-Score={f1_rf}, ROC-AUC={roc_auc_rf}")



Random Forest: Accuracy=0.8645, Precision=0.7479674796747967, Recall=0.4681933842239186, F1-Score=0.5758998435054773, ROC-AUC=0.7148060885027495


In [13]:
# Evaluating Gradient Boosting
acc_gb, prec_gb, rec_gb, f1_gb, roc_auc_gb = evaluate_model(y_test, y_pred_gb_clf)
print(f"Gradient Boosting: Accuracy={acc_gb}, Precision={prec_gb}, Recall={rec_gb}, F1-Score={f1_gb}, ROC-AUC={roc_auc_gb}")



Gradient Boosting: Accuracy=0.8655, Precision=0.7540983606557377, Recall=0.4681933842239186, F1-Score=0.5777080062794349, ROC-AUC=0.7154283660385305


In [14]:
# Confusion Matrix for Random Forest (or any other model)
conf_matrix = confusion_matrix(y_test, y_pred_rf_clf)
print(f"Confusion Matrix for Random Forest:\n{conf_matrix}")

Confusion Matrix for Random Forest:
[[1545   62]
 [ 209  184]]
