# Building Model 

### Importing packages

In [57]:
import os
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

### Importing Dataset

In [29]:
os.chdir(r"/Users/apple/Documents/Credit_Card_Churn_Model_Local/credit-card-churn/Input")
data = pd.read_csv("/Users/apple/Documents/Credit_Card_Churn_Model_Local/credit-card-churn/Input/credit_card_churn_cleaned_feature_selected.csv")
data_scaled = pd.read_csv("/Users/apple/Documents/Credit_Card_Churn_Model_Local/credit-card-churn/Input/credit_card_churn_cleaned_feature_selected_scaled.csv") 

In [30]:
y = data['Attrition_Flag']
X = data.drop(['Attrition_Flag'], axis=1)

In [35]:
X_scaled = data_scaled.drop(['Attrition_Flag'], axis=1)
y_scaled = data_scaled['Attrition_Flag']

In [32]:
print("data shape:", data.shape)
print("data_scaled shape:", data_scaled.shape)


data shape: (10127, 33)
data_scaled shape: (9143, 33)


### Splitting Dataset into Train and Test Sets with Stratify

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42, stratify=y_scaled)

## K-Nearest Neighbors

In [43]:
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn_grid = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=5, scoring='accuracy')
knn_grid.fit(X_train_scaled, y_train_scaled)

print("Best KNN Params:", knn_grid.best_params_)
print("Best KNN Accuracy:", knn_grid.best_score_)


Best KNN Params: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best KNN Accuracy: 0.8945862043493262


In [48]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='manhattan')
knn.fit(X_train_scaled, y_train_scaled)
y_pred_train = knn.predict(X_train_scaled)
y_pred_train

array([0, 1, 0, ..., 0, 0, 0])

#### Accuracy of KNN on Training Set

In [51]:
#Accuracy Score
accuracy_knn_train = accuracy_score(y_train_scaled, y_pred_train)
print(f"Accuracy of KNN model: {accuracy_knn_train:.2f}")

Accuracy of KNN model: 1.00


In [None]:
# Confusion Matrix
cm_knn = confusion_matrix(y_test, y_pred)
confusion_matrix_display = px.imshow(cm_knn, color_continuous_scale='Blues')
confusion_matrix_display.update_layout(title='Confusion Matrix for KNN', xaxis_title='Predicted', yaxis_title='Actual')
confusion_matrix_display.show()

## Decision Tree

In [None]:
from sklearn.model_selection import RandomizedSearchCV  
param_dist = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
dt = DecisionTreeClassifier(random_state=42)
dt_random_search = RandomizedSearchCV(estimator=dt, param_distributions=param_dist, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
dt_random_search.fit(X_train, y_train)
best_params = dt_random_search.best_params_
best_params

Fitting 3 folds for each of 54 candidates, totalling 162 fits




[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split

{'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}

In [52]:
dt = DecisionTreeClassifier(random_state=42, min_samples_split=5, min_samples_leaf= 1, max_depth = 10)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_train)
accuracy_dt_train = accuracy_score(y_train, dt_pred)
print("Accuracy:",accuracy_dt_train)

Accuracy: 0.9650660412294778


In [None]:
#Confusion Matrix for Decision Tree
cm_DT = confusion_matrix(y_test, dt_pred)
confusion_matrix_display = px.imshow(cm_DT, color_continuous_scale='Blues')
confusion_matrix_display.update_layout(title='Confusion Matrix for Decision Tree', xaxis_title='Predicted', yaxis_title='Actual')
confusion_matrix_display.show()

## Random Forest

In [53]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=30)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_train)
print("Accuracy:", accuracy_score(y_train, rf_pred))

Accuracy: 1.0


In [54]:
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    'n_estimators': [90,100,110],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=30)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, 
                               n_iter=100, cv=5, n_jobs=-1, scoring= 'accuracy')
rf_random.fit(X_train, y_train)
rf_random.best_params_


{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_depth': 20}

In [55]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=30, n_estimators=100, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            max_depth=20)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_train)
accuracy_rf_train = accuracy_score(y_train, rf_pred)
print("Accuracy:", accuracy_rf_train)

Accuracy: 1.0


In [None]:
# BEGIN: Visualizing Feature Importance of Random Forest
import plotly.express as px
rf.fit(X_train, y_train)
importances = rf.feature_importances_
# Get feature importances from the Random Forest model
importances = rf.feature_importances_
feature_names = X.columns

# Create a DataFrame for visualization
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plotting
fig = px.bar(importance_df, x='Importance', y='Feature', title='Feature Importance of Random Forest Model', orientation='h')
fig.show()

## Logistic Regression

In [None]:
#Accuracy Score
print("Accuracy:", accuracy_score(y_test, y_pred))

In [58]:
log_reg_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1','l2'],
    'solver': ['liblinear', 'lbfgs']
}

log_reg_grid = GridSearchCV(LogisticRegression(max_iter=1000), log_reg_param_grid, cv=5, scoring='accuracy')
log_reg_grid.fit(X_train_scaled, y_train_scaled)

print("Best Logistic Regression Params:", log_reg_grid.best_params_)


Best Logistic Regression Params: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}


25 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/apple/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/apple/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/apple/Library/Python/3.9/lib/python/site-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/apple/Library/Python/3.9/lib/python/site-packag

In [59]:
from sklearn.metrics import accuracy_score
model = LogisticRegression(C=1, penalty='l2', solver='lbfgs', max_iter=1000)
model.fit(X_train_scaled, y_train_scaled)
y_pred = model.predict(X_test_scaled)
accuracy_lr_train = accuracy_score(y_test_scaled, y_pred)
print("Accuracy :", accuracy_lr_train)


Accuracy : 0.9294696555494806


## SVM

In [61]:
from sklearn.svm import SVC

svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.01, 0.1,1]
}

svm_grid = RandomizedSearchCV(SVC(), svm_param_grid, cv=5,n_iter= 20, scoring='accuracy')
svm_grid.fit(X_train_scaled, y_train_scaled)

print("Best SVM Params:", svm_grid.best_params_)


Best SVM Params: {'kernel': 'rbf', 'gamma': 'auto', 'C': 10}


In [62]:
from sklearn.svm import SVC
svm_model = SVC(random_state=42, kernel='rbf', C=10, gamma='auto')
svm_model.fit(X_train_scaled, y_train_scaled)
y_pred_svm = svm_model.predict(X_train_scaled)
accuracy_svm_train = accuracy_score(y_train_scaled, y_pred_svm)
print("Accuracy:", accuracy_svm_train)

Accuracy: 0.961033634126333


## Accuracy Summary of all Classification Models

In [67]:
import pandas as pd

results = {
    "Model": ["KNN", "Decision Tree", "Random Forest", "Logistic Regression", "SVM"],

    "Test Accuracy": [
        accuracy_knn_train,
        accuracy_dt_train,
        accuracy_rf_train,
        accuracy_lr_train,
        accuracy_svm_train]
    
}

summary_df = pd.DataFrame(results)
summary_df = summary_df.sort_values(by="Test Accuracy", ascending=False).reset_index(drop=True)
print(summary_df)


                 Model  Test Accuracy
0                  KNN       1.000000
1        Random Forest       1.000000
2        Decision Tree       0.965066
3                  SVM       0.961034
4  Logistic Regression       0.929470


## Applying Test Split in these models

## KNN

In [68]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='manhattan')
knn.fit(X_train_scaled, y_train_scaled)
y_pred_train = knn.predict(X_test_scaled)
accuracy_knn_test = accuracy_score(y_test_scaled, y_pred_train)
print(f"Accuracy : {accuracy_knn_test:.2f}")

Accuracy : 0.90


## Decision Tree

In [69]:
dt = DecisionTreeClassifier(random_state=42, min_samples_split=5, min_samples_leaf= 1, max_depth = 10)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
accuracy_dt_test = accuracy_score(y_test, dt_pred)
print("Accuracy:",accuracy_dt_test)

Accuracy: 0.9136229022704837


## Random Forest

In [70]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=30, n_estimators=100, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            max_depth=20)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
accuracy_rf_test = accuracy_score(y_test, rf_pred)
print("Accuracy:", accuracy_rf_test)

Accuracy: 0.9397828232971372
