**Model used:** </br>
Random Forest </br>
Random Forest Hyper-parameter tuning </br>
Ensemble Model </br>

**Features used:** </br>
'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_France', 'Geography_Germany', 'Geography_Spain' </br>



## Initializing

In [37]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
import plotly.graph_objects as pgo
import matplotlib.pyplot as plt

In [38]:
!pip install --upgrade scikit-learn
!pip install --upgrade imbalanced-learn



## Importing Data

In [39]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving Churn_Modelling.csv to Churn_Modelling (2).csv
User uploaded file "Churn_Modelling (2).csv" with length 786732 bytes


In [40]:
import pandas as pd

# Load the dataset into a Pandas DataFrame
file_path = 'Churn_Modelling.csv'
df_full = pd.read_csv(file_path)

# Display the first few rows of the DataFrame to understand its structure
df_full.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,$0.00,1,1,1,"$101,348.88",1
1,2,15647311,Hill,608,Spain,Female,41,1,"$83,807.86",1,0,1,"$112,542.58",0
2,3,15619304,Onio,502,France,Female,42,8,"$159,660.80",3,1,0,"$113,931.57",1
3,4,15701354,Boni,699,France,Female,39,1,$0.00,2,0,0,"$93,826.63",0
4,5,15737888,Mitchell,850,Spain,Female,43,2,"$125,510.82",1,1,1,"$79,084.10",0


## Data Checking and Cleaning

In [41]:
# Check the data types of all columns
df_full.dtypes

RowNumber           int64
CustomerId          int64
Surname            object
CreditScore         int64
Geography          object
Gender             object
Age                 int64
Tenure              int64
Balance            object
NumOfProducts       int64
HasCrCard           int64
IsActiveMember      int64
EstimatedSalary    object
Exited              int64
dtype: object

## Data Cleaning

In [42]:
# Check for missing values in a DataFrame
missing_values = df_full.isna()

# Count missing values in each column
missing_count = df_full.isna().sum()

# Remove rows with missing values
df_full.dropna(axis=0, inplace=True)

In [43]:
missing_count = df_full.isna().sum()
print(missing_count)

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


## Data Encoding

In [44]:
df_full['Balance'] = df_full['Balance'].str.replace('$', '').str.replace(',', '').astype(float)

  df_full['Balance'] = df_full['Balance'].str.replace('$', '').str.replace(',', '').astype(float)


In [45]:
df_full['EstimatedSalary'] = df_full['EstimatedSalary'].str.replace('$', '').str.replace(',', '').astype(float)

  df_full['EstimatedSalary'] = df_full['EstimatedSalary'].str.replace('$', '').str.replace(',', '').astype(float)


In [46]:
df_full.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [47]:
df_full.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [48]:
# Create dummy variables for 'Geography'
geography_dummies = pd.get_dummies(df_full['Geography'], prefix='Geography')

# Create dummy variables for 'Gender'
gender_dummies = pd.get_dummies(df_full['Gender'], prefix='Gender')

# Concatenate the dummy variables with the original DataFrame
df_with_dummies = pd.concat([df_full, geography_dummies, gender_dummies], axis=1)

# Drop the original categorical columns ('Geography' and 'Gender') if needed
df_with_dummies.drop(['Geography', 'Gender','Surname'], axis=1, inplace=True)

In [49]:
df_with_dummies.head()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,1,15634602,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
1,2,15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,3,15619304,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0
3,4,15701354,699,39,1,0.0,2,0,0,93826.63,0,1,0,0,1,0
4,5,15737888,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1,1,0


In [51]:
df_with_dummies.dtypes

RowNumber              int64
CustomerId             int64
CreditScore            int64
Age                    int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
Geography_France       uint8
Geography_Germany      uint8
Geography_Spain        uint8
Gender_Female          uint8
Gender_Male            uint8
dtype: object

## Train-Test-Split

In [52]:
from sklearn.model_selection import train_test_split

specified_columns = [
            'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography_France', 'Geography_Germany', 'Geography_Spain'
           ]


# Split the data into features and target
X = df_with_dummies[specified_columns]
y = df_with_dummies['Exited']  # 'Exited' is the target variable

In [57]:
# Specify the size of the test set (e.g., 20% of the data)
test_size = 0.2

# Set a random seed for reproducibility (optional)
random_seed = 42

# Split the data into training and test sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_seed)

## Random Forest

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, classification_report

# Custom scorers for evaluation
scorers = {
    'precision': make_scorer(precision_score, pos_label=1),
    'recall': make_scorer(recall_score, pos_label=1),
    'f1': make_scorer(f1_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(roc_auc_score)
}

# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Fitting the model
rf_classifier.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_pred_rf = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1_score = f1_score(y_test, y_pred_rf, pos_label=1)

rf_accuracy, rf_precision, rf_recall, rf_f1_score

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.8715

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.92      1607
           1       0.77      0.49      0.60       393

    accuracy                           0.87      2000
   macro avg       0.83      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000



## Random Forest Hyper Tunning

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

# Custom scorers for evaluation
scorers = {
    'precision': make_scorer(precision_score, pos_label=1),
    'recall': make_scorer(recall_score, pos_label=1),
    'f1': make_scorer(f1_score, pos_label=1),
    'accuracy': make_scorer(accuracy_score),
    'auc': make_scorer(roc_auc_score)
}

# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],         # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],        # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],        # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],          # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt'],       # The number of features to consider when looking for the best split
    'class_weight': [None, 'balanced']      # Weights associated with classes in the form {class_label: weight}
}

# Grid Search for hyperparameter tuning with custom scorers
grid_search_rf = GridSearchCV(estimator=rf_classifier,
                              param_grid=param_grid,
                              scoring=scorers,
                              refit='f1',  # Refitting on the basis of F1 score, you can choose any other metric
                              cv=5,
                              verbose=2,
                              n_jobs=-1)

# Fitting the model
grid_search_rf.fit(X_train, y_train)

# Extract the best parameters
best_rf_params = grid_search_rf.best_params_
print("Best Parameters:", best_rf_params)

# Predicting and evaluating on the test set
y_pred_rf = grid_search_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1_score = f1_score(y_test, y_pred_rf, pos_label=1)

# rf_accuracy, rf_precision, rf_recall, rf_f1_score

print("Ensemble 1 Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


1080 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(


Best Parameters: {'class_weight': 'balanced', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Ensemble 1 Accuracy: 0.8495

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.91      0.91      1607
           1       0.62      0.62      0.62       393

    accuracy                           0.85      2000
   macro avg       0.76      0.76      0.76      2000
weighted avg       0.85      0.85      0.85      2000



In [65]:
grid_search_rf

In [66]:
# After fitting, you can get the results for all iterations
results = pd.DataFrame(grid_search_rf.cv_results_)

# Extract the scores for each target class and test accuracy
# results[['mean_test_f1_score_0', 'mean_test_f1_score_1', 'mean_test_test_accuracy']]

results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,...,std_test_accuracy,rank_test_accuracy,split0_test_auc,split1_test_auc,split2_test_auc,split3_test_auc,split4_test_auc,mean_test_auc,std_test_auc,rank_test_auc
0,0.002517,0.000659,0.000000,0.000000,,,auto,1,2,50,...,,217,,,,,,,,217
1,0.003204,0.002109,0.000000,0.000000,,,auto,1,2,100,...,,217,,,,,,,,217
2,0.003861,0.001495,0.000000,0.000000,,,auto,1,2,200,...,,217,,,,,,,,217
3,0.002331,0.000227,0.000000,0.000000,,,auto,1,5,50,...,,217,,,,,,,,217
4,0.002207,0.000329,0.000000,0.000000,,,auto,1,5,100,...,,217,,,,,,,,217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,2.105957,0.430040,0.120795,0.036401,balanced,30,sqrt,4,5,100,...,0.010104,165,0.741726,0.751721,0.780990,0.762161,0.752293,0.757778,0.013285,39
428,2.856253,0.339731,0.110171,0.002671,balanced,30,sqrt,4,5,200,...,0.008049,171,0.744823,0.742942,0.774857,0.761035,0.758819,0.756495,0.011688,47
429,0.681771,0.009836,0.041890,0.001098,balanced,30,sqrt,4,10,50,...,0.010042,187,0.739416,0.742441,0.781276,0.760981,0.743352,0.753493,0.015818,55
430,1.818046,0.429128,0.090189,0.026480,balanced,30,sqrt,4,10,100,...,0.008809,185,0.744727,0.742549,0.781276,0.768580,0.749146,0.757255,0.015128,45


## Ensemble Models

### Ensemble 1

In [59]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Creating individual models with specified parameters
decision_tree_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
decision_tree_clf2 = DecisionTreeClassifier(max_depth=3, random_state=42)
logit_clf = LogisticRegression(C=100, max_iter=1000, random_state=42)


# Combining the models in a voting ensemble
# We use 'soft' voting to predict the class label based on the argmax of the sums of the predicted probabilities
ensemble_clf = VotingClassifier(
    estimators=[('dt', decision_tree_clf), ('logit', logit_clf), ('dt2', decision_tree_clf2)],
    voting='soft'
)

# Training the ensemble model
ensemble_clf.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_pred_ensemble = ensemble_clf.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_precision = precision_score(y_test, y_pred_ensemble)
ensemble_recall = recall_score(y_test, y_pred_ensemble)
ensemble_f1_score = f1_score(y_test, y_pred_ensemble, pos_label=1)

# ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1_score

print("Ensemble 1 Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ensemble))


Ensemble 1 Accuracy: 0.844

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.98      0.91      1607
           1       0.77      0.29      0.42       393

    accuracy                           0.84      2000
   macro avg       0.81      0.64      0.67      2000
weighted avg       0.83      0.84      0.81      2000



### Ensemble 2

In [62]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Creating individual models with specified parameters
decision_tree_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
decision_tree_clf2 = DecisionTreeClassifier(max_depth=4, random_state=42)
logit_clf = LogisticRegression(C=100, max_iter=1000, random_state=42)


# Combining the models in a voting ensemble
# We use 'soft' voting to predict the class label based on the argmax of the sums of the predicted probabilities
ensemble_clf = VotingClassifier(
    estimators=[('dt', decision_tree_clf), ('dt2', decision_tree_clf2)],
    voting='soft'
)

# Training the ensemble model
ensemble_clf.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_pred_ensemble = ensemble_clf.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_precision = precision_score(y_test, y_pred_ensemble)
ensemble_recall = recall_score(y_test, y_pred_ensemble)
ensemble_f1_score = f1_score(y_test, y_pred_ensemble, pos_label=1)

# ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1_score

print("Ensemble 2 Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ensemble))

Ensemble 2 Accuracy: 0.8535

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.97      0.91      1607
           1       0.77      0.37      0.50       393

    accuracy                           0.85      2000
   macro avg       0.81      0.67      0.70      2000
weighted avg       0.84      0.85      0.83      2000



- TA Comments in Template:
- ensemble could increase performance when selecting the best performing models
- but the ensemble performance follows that of baseline models

### Ensemble 3

In [63]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Creating individual models with specified parameters
decision_tree_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
decision_tree_clf2 = DecisionTreeClassifier(max_depth=4, random_state=42)
logit_clf = LogisticRegression(C=100, max_iter=1000, random_state=42)

# Initialize Gaussian Naive Bayes model
gnb = GaussianNB()


# Combining the models in a voting ensemble
# We use 'soft' voting to predict the class label based on the argmax of the sums of the predicted probabilities
ensemble_clf = VotingClassifier(
    estimators=[('dt', decision_tree_clf), ('dt2', decision_tree_clf2), ('nb', gnb)],
    voting='soft'
)

# Training the ensemble model
ensemble_clf.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_pred_ensemble = ensemble_clf.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_precision = precision_score(y_test, y_pred_ensemble)
ensemble_recall = recall_score(y_test, y_pred_ensemble)
ensemble_f1_score = f1_score(y_test, y_pred_ensemble, pos_label=1)

# ensemble_accuracy, ensemble_precision, ensemble_recall, ensemble_f1_score

print("Ensemble 3 Accuracy:", accuracy_score(y_test, y_pred_ensemble))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ensemble))

Ensemble 3 Accuracy: 0.8525

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.97      0.91      1607
           1       0.76      0.36      0.49       393

    accuracy                           0.85      2000
   macro avg       0.81      0.67      0.70      2000
weighted avg       0.84      0.85      0.83      2000

