Data Preparation

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Load the dataset
df = pd.read_csv('Churn_Modelling.csv')

# Display basic information about the dataset
print(df.info())
print(df.describe())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB
None
         RowNumber    CustomerId   CreditScore           Age        Tenure  \
count  1000

Exploratory Data Analysis (EDA)

In [3]:
print(df['Exited'].value_counts(normalize=True))

Exited
0    0.7963
1    0.2037
Name: proportion, dtype: float64


Feature Engineering

In [4]:
# Dropping irrelevant columns
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Separate features and target variable
X = df.drop('Exited', axis=1)
y = df['Exited']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the preprocessor for handling numeric and categorical features
numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
categorical_features = ['Geography', 'Gender']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

Model Training

In [5]:
# Define the models
log_reg = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', LogisticRegression(max_iter=1000))])

random_forest = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))])

gradient_boosting = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42))])

# Train the models
log_reg.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
gradient_boosting.fit(X_train, y_train)

# Save the models
joblib.dump(log_reg, 'logistic_regression_model.pkl')
joblib.dump(random_forest, 'random_forest_model.pkl')
joblib.dump(gradient_boosting, 'gradient_boosting_model.pkl')

['gradient_boosting_model.pkl']

Model Evaluation

In [6]:
# Evaluate Logistic Regression
y_pred_log_reg = log_reg.predict(X_test)
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log_reg)}")
print(confusion_matrix(y_test, y_pred_log_reg))
print(classification_report(y_test, y_pred_log_reg))

# Evaluate Random Forest
y_pred_random_forest = random_forest.predict(X_test)
print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_random_forest)}")
print(confusion_matrix(y_test, y_pred_random_forest))
print(classification_report(y_test, y_pred_random_forest))

# Evaluate Gradient Boosting
y_pred_gradient_boosting = gradient_boosting.predict(X_test)
print("\nGradient Boosting Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gradient_boosting)}")
print(confusion_matrix(y_test, y_pred_gradient_boosting))
print(classification_report(y_test, y_pred_gradient_boosting))

Logistic Regression Results:
Accuracy: 0.7945
[[1538   55]
 [ 356   51]]
              precision    recall  f1-score   support

           0       0.81      0.97      0.88      1593
           1       0.48      0.13      0.20       407

    accuracy                           0.79      2000
   macro avg       0.65      0.55      0.54      2000
weighted avg       0.74      0.79      0.74      2000


Random Forest Results:
Accuracy: 0.8555
[[1533   60]
 [ 229  178]]
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1593
           1       0.75      0.44      0.55       407

    accuracy                           0.86      2000
   macro avg       0.81      0.70      0.73      2000
weighted avg       0.85      0.86      0.84      2000


Gradient Boosting Results:
Accuracy: 0.855
[[1530   63]
 [ 227  180]]
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1593
           1       0.74  