In [1]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
import plotly.graph_objects as pgo


In [2]:
!pip install --upgrade scikit-learn
!pip install --upgrade imbalanced-learn



In [3]:
# read the data file
data = pd.read_excel("/Users/snigdhanerges/Downloads/MSBA/Fall 2023/Q2/CIS 508/Project/Churn_Modelling.xlsx")
data.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [4]:
# Specify a list of categorical columns to be converted
categorical_columns = ['Geography']

# Convert categorical columns to numerical using one-hot encoding
data = pd.get_dummies(data, columns=categorical_columns)

# Fill missing values with mode (most frequent value) for each column
data = data.fillna(data.mode().iloc[0])

# Now, your categorical columns are converted to numerical columns
# and missing values are filled with the mode.
data.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,1,15634602,Hargrave,619,Female,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,2,15647311,Hill,608,Female,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,3,15619304,Onio,502,Female,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,4,15701354,Boni,699,Female,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,5,15737888,Mitchell,850,Female,43,2,125510.82,1,1,1,79084.1,0,0,0,1
5,6,15574012,Chu,645,Male,44,8,113755.78,2,1,0,149756.71,1,0,0,1
6,7,15592531,Bartlett,822,Male,50,7,0.0,2,1,1,10062.8,0,1,0,0
7,8,15656148,Obinna,376,Female,29,4,115046.74,4,1,0,119346.88,1,0,1,0
8,9,15792365,He,501,Male,44,4,142051.07,2,0,1,74940.5,0,1,0,0
9,10,15592389,H?,684,Male,27,2,134603.88,1,1,1,71725.73,0,1,0,0


In [5]:
data['EstimatedSalary'] = data['EstimatedSalary'].astype(float)
data['Balance'] = data['Balance'].astype(float)


In [6]:
# X contains the features (all columns except 'Exited')
X = data.drop(columns=['Exited','Surname','CustomerId','RowNumber','Gender'])
# y contains the target variable 'Exited'
y = data['Exited']

In [7]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy=0.4)
X, y = smote.fit_resample(X, y)

In [8]:
from sklearn.model_selection import train_test_split

# Split the data into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 'X_train' and 'y_train' are the features and target variable for the training set
# 'X_test' and 'y_test' are the features and target variable for the testing set

In [9]:
data.columns.values

array(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Geography_France',
       'Geography_Germany', 'Geography_Spain'], dtype=object)

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Assuming you have X_train, y_train, X_test, y_test already prepared

# Build Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions
dt_predictions = dt_model.predict(X_test)

# Evaluate Decision Tree
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_predictions))
print("\nClassification Report:\n", classification_report(y_test, dt_predictions))


Decision Tree Accuracy: 0.7928251121076233

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85      1602
           1       0.62      0.66      0.64       628

    accuracy                           0.79      2230
   macro avg       0.74      0.75      0.75      2230
weighted avg       0.80      0.79      0.79      2230



In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define hyperparameters to tune
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Perform GridSearchCV
grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Print the best parameters
print("Best Hyperparameters for Decision Tree:", best_params)

# Use the best parameters to create the final model
dt_model_tuned = DecisionTreeClassifier(random_state=42, **best_params)
dt_model_tuned.fit(X_train, y_train)

# Make predictions on the test set
dt_tuned_predictions = dt_model_tuned.predict(X_test)

# Evaluate the tuned model
accuracy_tuned = accuracy_score(y_test, dt_tuned_predictions)
print("Tuned Decision Tree Accuracy:", accuracy_tuned)

# Print classification report for tuned model
print("\nClassification Report for Tuned Decision Tree:\n", classification_report(y_test, dt_tuned_predictions))


Best Hyperparameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Tuned Decision Tree Accuracy: 0.837219730941704

Classification Report for Tuned Decision Tree:
               precision    recall  f1-score   support

           0       0.86      0.93      0.89      1602
           1       0.76      0.61      0.68       628

    accuracy                           0.84      2230
   macro avg       0.81      0.77      0.78      2230
weighted avg       0.83      0.84      0.83      2230



In [12]:
from sklearn.linear_model import LogisticRegression

# Build Logistic Regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)

# Make predictions
lr_predictions = lr_model.predict(X_test)

# Evaluate Logistic Regression
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_predictions))
print("\nClassification Report:\n", classification_report(y_test, lr_predictions))


Logistic Regression Accuracy: 0.7260089686098655

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.93      0.83      1602
           1       0.53      0.22      0.31       628

    accuracy                           0.73      2230
   macro avg       0.64      0.57      0.57      2230
weighted avg       0.69      0.73      0.68      2230

