In [1]:
# Author: Aafra Md. Hanif Shaikh
# Task Name: CUSTOMER_CHURN_PREDICTION
# 3rd task in the list of tasks
# Task Category: Machine Learning
# Date Of Submission: 
# Linkedin Profile: 
# Github Repository: 

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [3]:
# Load the churn dataset from 'Churn_Modelling.csv' (path is relative to the working directory)
data = pd.read_csv('Churn_Modelling.csv')

In [4]:
# Data Preprocessing
# Single seed shared by the split and the stochastic models so a full re-run
# reproduces the same metrics.
RANDOM_STATE = 42

# 1. Handle missing values
# Rebind instead of mutating in place so re-running this cell is idempotent.
# NOTE(review): 0 is also written into string columns if they contain NaN, which
# would give the encoder mixed types — confirm the CSV has no missing categoricals.
data = data.fillna(0)

# 2. Feature selection and target variable
# Drop the target plus the identifier columns (RowNumber, CustomerId, Surname).
# The categorical columns Geography and Gender are kept and one-hot encoded below.
X = data.drop(columns=['Exited', 'RowNumber', 'CustomerId', 'Surname'])
y = data['Exited']  # Target variable ('Exited')

# 3. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
# Work on explicit copies: the column assignments below would otherwise write into
# frames derived from X and can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# 4. Standardize the numeric features
# The scaler is fitted on the training split only and then applied to the test split.
numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# 5. One-hot encoding for categorical columns
# drop='first' removes one level per feature; the encoder is fitted on the training split only.
categorical_columns = ['Geography', 'Gender']
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_train_encoded = pd.DataFrame(
    encoder.fit_transform(X_train[categorical_columns]),
    # Use the encoder's own feature names instead of positional integers,
    # so the trained models carry readable feature names.
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X_train.index,
)
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X_test.index,
)

# Replace the raw categorical columns with their encoded counterparts.
# All resulting column names are already strings, so no conversion step is needed.
X_train = pd.concat([X_train.drop(columns=categorical_columns), X_train_encoded], axis=1)
X_test = pd.concat([X_test.drop(columns=categorical_columns), X_test_encoded], axis=1)

# Persist the fitted preprocessors next to the models: the saved models expect
# scaled / encoded inputs and cannot be applied to raw data without them.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(encoder, 'onehot_encoder.pkl')

# Model Selection and Training
# You can choose between Logistic Regression, Random Forest, or Gradient Boosting

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

# Save the Logistic Regression model to a file
joblib.dump(lr_model, 'logistic_regression_model.pkl')

# Random Forest (seeded so the reported metrics are reproducible)
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Save the Random Forest model to a file
joblib.dump(rf_model, 'random_forest_model.pkl')

# Gradient Boosting (seeded so the reported metrics are reproducible)
gb_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
gb_model.fit(X_train, y_train)
gb_predictions = gb_model.predict(X_test)

# Save the Gradient Boosting model to a file
joblib.dump(gb_model, 'gradient_boosting_model.pkl')

['gradient_boosting_model.pkl']

In [5]:
# Model Evaluation
def evaluate_model(model, model_name, X_eval=None, y_eval=None):
    """Print accuracy, confusion matrix and classification report for a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    model_name : str
        Label used in the printed headings.
    X_eval, y_eval : optional
        Features and true labels to score against. When both are omitted the
        notebook-level ``X_test`` / ``y_test`` split is used, which matches the
        original behaviour. They must be supplied together.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is provided.

    Returns
    -------
    None
        Results are printed only, so a trailing call produces no cell output.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")
    if X_eval is None:
        # Fall back to the held-out split created during preprocessing.
        X_eval, y_eval = X_test, y_test

    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    print(f"Accuracy for {model_name}: {accuracy}")
    print(f"Confusion Matrix for {model_name}:\n{confusion_matrix(y_eval, predictions)}")
    print(f"Classification Report for {model_name}:\n{classification_report(y_eval, predictions)}")

# Report metrics for each trained classifier, in the order they were fitted:
# Logistic Regression, then Random Forest, then Gradient Boosting.
for fitted_model, label in (
    (lr_model, 'Logistic Regression'),
    (rf_model, 'Random Forest'),
    (gb_model, 'Gradient Boosting'),
):
    evaluate_model(fitted_model, label)

Accuracy for Logistic Regression: 0.811
Confusion Matrix for Logistic Regression:
[[1543   64]
 [ 314   79]]
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.55      0.20      0.29       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000

Accuracy for Random Forest: 0.8695
Confusion Matrix for Random Forest:
[[1547   60]
 [ 201  192]]
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1607
           1       0.76      0.49      0.60       393

    accuracy                           0.87      2000
   macro avg       0.82      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000

Accuracy for Gradient Boosting: 0.8675
Confusion Ma