In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import pandas as pd
import numpy as np


In [4]:
bank_data = pd.read_csv('cleaned-A.csv')

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
# Selecting additional features
features = ['Age', 'CreditScore', 'Balance', 'NumOfProducts']
X = bank_data[features]
y = bank_data['Exited']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fitting Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# Making predictions
y_pred = log_reg.predict(X_test_scaled)

# Evaluating the model
conf_matrix = confusion_matrix(y_test, y_pred)
acc_score = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

conf_matrix, acc_score, class_report

(array([[37524,  1609],
        [ 8235,  2143]], dtype=int64),
 0.8011754963543455,
 '              precision    recall  f1-score   support\n\n           0       0.82      0.96      0.88     39133\n           1       0.57      0.21      0.30     10378\n\n    accuracy                           0.80     49511\n   macro avg       0.70      0.58      0.59     49511\nweighted avg       0.77      0.80      0.76     49511\n')

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Fitting Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_scaled, y_train)

# Fitting Random Forest
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train_scaled, y_train)

# Making predictions with Decision Tree
dt_y_pred = decision_tree.predict(X_test_scaled)

# Making predictions with Random Forest
rf_y_pred = random_forest.predict(X_test_scaled)

# Evaluating Decision Tree
dt_conf_matrix = confusion_matrix(y_test, dt_y_pred)
dt_acc_score = accuracy_score(y_test, dt_y_pred)
dt_class_report = classification_report(y_test, dt_y_pred)

# Evaluating Random Forest
rf_conf_matrix = confusion_matrix(y_test, rf_y_pred)
rf_acc_score = accuracy_score(y_test, rf_y_pred)
rf_class_report = classification_report(y_test, rf_y_pred)

# Printing the Decision Tree results
print("Decision Tree Model Performance")
print("-------------------------------")
print("Accuracy:", dt_acc_score)
print("Confusion Matrix:")
print(pd.DataFrame(dt_conf_matrix, index=['Actual: No', 'Actual: Yes'], columns=['Predicted: No', 'Predicted: Yes']))
print("\nClassification Report:")
print(dt_class_report)
print("\n")

# Printing the Random Forest results
print("Random Forest Model Performance")
print("-------------------------------")
print("Accuracy:", rf_acc_score)
print("Confusion Matrix:")
print(pd.DataFrame(rf_conf_matrix, index=['Actual: No', 'Actual: Yes'], columns=['Predicted: No', 'Predicted: Yes']))
print("\nClassification Report:")
print(rf_class_report)


Decision Tree Model Performance
-------------------------------
Accuracy: 0.7928541132273637
Confusion Matrix:
             Predicted: No  Predicted: Yes
Actual: No           34544            4589
Actual: Yes           5667            4711

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87     39133
           1       0.51      0.45      0.48     10378

    accuracy                           0.79     49511
   macro avg       0.68      0.67      0.67     49511
weighted avg       0.79      0.79      0.79     49511



Random Forest Model Performance
-------------------------------
Accuracy: 0.8253923370564117
Confusion Matrix:
             Predicted: No  Predicted: Yes
Actual: No           35920            3213
Actual: Yes           5432            4946

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.89     39133
           1       0.61      0.48 

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd

# Defining the models
dt_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_gini = DecisionTreeClassifier(criterion='gini', random_state=42)

# Fitting the models
dt_entropy.fit(X_train_scaled, y_train)
dt_gini.fit(X_train_scaled, y_train)

# Making predictions
dt_entropy_pred = dt_entropy.predict(X_test_scaled)
dt_gini_pred = dt_gini.predict(X_test_scaled)

# Evaluating the models
# Entropy
entropy_conf_matrix = confusion_matrix(y_test, dt_entropy_pred)
entropy_acc_score = accuracy_score(y_test, dt_entropy_pred)
entropy_class_report = classification_report(y_test, dt_entropy_pred)

# Gini
gini_conf_matrix = confusion_matrix(y_test, dt_gini_pred)
gini_acc_score = accuracy_score(y_test, dt_gini_pred)
gini_class_report = classification_report(y_test, dt_gini_pred)

# Printing results
print("Decision Tree with Entropy")
print("Accuracy:", entropy_acc_score)
print("Confusion Matrix:")
print(pd.DataFrame(entropy_conf_matrix, index=['Actual: No', 'Actual: Yes'], columns=['Predicted: No', 'Predicted: Yes']))
print("Classification Report:")
print(entropy_class_report)

print("\nDecision Tree with Gini Impurity")
print("Accuracy:", gini_acc_score)
print("Confusion Matrix:")
print(pd.DataFrame(gini_conf_matrix, index=['Actual: No', 'Actual: Yes'], columns=['Predicted: No', 'Predicted: Yes']))
print("Classification Report:")
print(gini_class_report)

Decision Tree with Entropy
Accuracy: 0.7948940639453859
Confusion Matrix:
             Predicted: No  Predicted: Yes
Actual: No           34622            4511
Actual: Yes           5644            4734
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87     39133
           1       0.51      0.46      0.48     10378

    accuracy                           0.79     49511
   macro avg       0.69      0.67      0.68     49511
weighted avg       0.79      0.79      0.79     49511


Decision Tree with Gini Impurity
Accuracy: 0.7928541132273637
Confusion Matrix:
             Predicted: No  Predicted: Yes
Actual: No           34544            4589
Actual: Yes           5667            4711
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87     39133
           1       0.51      0.45      0.48     10378

    accuracy                           0.79     49

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

# Defining the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fitting the model
gb_model.fit(X_train_scaled, y_train)

# Making predictions
gb_pred = gb_model.predict(X_test_scaled)

# Evaluating the model
gb_conf_matrix = confusion_matrix(y_test, gb_pred)
gb_acc_score = accuracy_score(y_test, gb_pred)
gb_class_report = classification_report(y_test, gb_pred)

# Printing results
print("Gradient Boosting Model Performance")
print("Accuracy:", gb_acc_score)
print("Confusion Matrix:")
print(pd.DataFrame(gb_conf_matrix, index=['Actual: No', 'Actual: Yes'], columns=['Predicted: No', 'Predicted: Yes']))
print("Classification Report:")
print(gb_class_report)

Gradient Boosting Model Performance
Accuracy: 0.8484175233786432
Confusion Matrix:
             Predicted: No  Predicted: Yes
Actual: No           36847            2286
Actual: Yes           5219            5159
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     39133
           1       0.69      0.50      0.58     10378

    accuracy                           0.85     49511
   macro avg       0.78      0.72      0.74     49511
weighted avg       0.84      0.85      0.84     49511



In [17]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Define base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42))
]

# Define meta-model
meta_model = LogisticRegression()

# Create the stacking ensemble model
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Fit the model on the training data
stacked_model.fit(X_train_scaled, y_train)

# Make predictions
stacked_pred = stacked_model.predict(X_test_scaled)

# Evaluate the model
stacked_conf_matrix = confusion_matrix(y_test, stacked_pred)
stacked_acc_score = accuracy_score(y_test, stacked_pred)
stacked_class_report = classification_report(y_test, stacked_pred)

# Print results
print("Stacked Ensemble Model Performance")
print("Accuracy:", stacked_acc_score)
print("Confusion Matrix:")
print(pd.DataFrame(stacked_conf_matrix, index=['Actual: No', 'Actual: Yes'], columns=['Predicted: No', 'Predicted: Yes']))
print("Classification Report:")
print(stacked_class_report)

Stacked Ensemble Model Performance
Accuracy: 0.8491648320575226
Confusion Matrix:
             Predicted: No  Predicted: Yes
Actual: No           36892            2241
Actual: Yes           5227            5151
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     39133
           1       0.70      0.50      0.58     10378

    accuracy                           0.85     49511
   macro avg       0.79      0.72      0.74     49511
weighted avg       0.84      0.85      0.84     49511



**The Stacked Ensemble Model** has the highest accuracy and performs slightly better overall than the Gradient Boosting model, with similar precision, recall, and F1-scores for predicting Class 1. This suggests it might be the most reliable model, combining the strengths of Random Forest, Gradient Boosting, and Decision Trees effectively.

**Precision for Class 1:** The Stacked Ensemble also has the highest precision for predicting Class 1, tied closely with the Gradient Boosting model. Higher precision is crucial if the cost of false positives (incorrectly predicting that a customer will exit when they will not) is high.

**In the Random Forest model**, the recall for Class 1 is the highest, albeit only slightly better than the Stacked Ensemble and Gradient Boosting models. High recall is important if missing out on actual churners (false negatives) is more detrimental.

Given the slight improvements in performance metrics and the balanced approach of handling different errors (precision and recall), the Stacked Ensemble Model appears to be the best choice. It effectively leverages the individual strengths of various models, improving the overall decision-making process.

**The Stacked Ensemble Model** is more complex and may require more computational resources and time to train than simpler models like Logistic Regression or a single Decision Tree.

We must also consider the interpretability of your model. Ensemble methods, especially stacking, can be harder to interpret compared to simpler models.

In a business context, where both the costs of false positives and false negatives can be significant (such as in customer churn prediction), it is typically the prefered a model that balances precision and recall while providing the highest overall accuracy. Therefore, implementing the Stacked Ensemble Model will be the best strategy provided the computational resources are adequate for the job.