In [1]:
import os
import numpy as np
import pandas as pd
import pickle

from time import time
import multiprocessing

from scipy.stats import ks_2samp

from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE, KMeansSMOTE, SMOTEN, SMOTENC, SVMSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier


from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, classification_report, confusion_matrix

# Column names for one result row: the run identifiers come first, followed by
# the metrics in the same order in which Evaluation.main() lists them.
metrics_data_columns = (
    ['model', 'sampling_method', 'k_neighbour']
    + ['train_accuracy', 'test_accuracy', 'roc_auc']
    + [f'{metric}_{label}' for label in (0, 1) for metric in ('precision', 'recall', 'f1')]
    + ['ks_stat', 'p_value']
    + ['tp', 'tn', 'fp', 'fn']
)

import warnings
warnings.filterwarnings('ignore')

## Data preparation and evaluation

In [4]:
# Data preparation and Evaluation

def features_target_split(df, target_col='Exited'):
    """
    Separate a DataFrame into its feature columns and its target column.

    Parameters:
        df (DataFrame): Table holding both the features and the target.
        target_col (str): Name of the target column. Default is 'Exited'.

    Returns:
        x (DataFrame): Every column of ``df`` except ``target_col``.
        y (Series): The ``target_col`` column.
    """
    # ``drop`` returns a new frame, so the caller's DataFrame keeps its target column.
    features = df.drop(columns=target_col)
    target = df[target_col]
    return features, target


def train_test_split(x, y, df=None, target_col='Exited', test_size=0.2, random_state=42):
    """
    Split the features and target into stratified training and testing sets.

    Parameters:
        x (DataFrame): The features.
        y (Series): The target variable.
        df (DataFrame, optional): The original DataFrame. When given, its
            ``target_col`` column supplies the stratification labels; when
            omitted, ``y`` is used for stratification instead.
        target_col (str): The name of the target column in ``df``. Default is 'Exited'.
        test_size (float or int): The proportion or absolute number of samples to include in the testing set. Default is 0.2.
        random_state (int): The seed used by the random number generator. Default is 42.

    Returns:
        x_train (DataFrame): The training set features.
        x_test (DataFrame): The testing set features.
        y_train (Series): The training set target variable.
        y_test (Series): The testing set target variable.
    """
    # Imported under an alias because this wrapper has the same name as the
    # scikit-learn function it delegates to.
    from sklearn.model_selection import train_test_split as _sklearn_train_test_split

    # Stratification keeps the class proportions the same in both splits.
    strata = y if df is None else df[target_col]

    x_train, x_test, y_train, y_test = _sklearn_train_test_split(
        x, y,
        test_size=test_size,
        random_state=random_state,
        stratify=strata,
    )

    return x_train, x_test, y_train, y_test


def prediction(model, x_train, x_test):
    """
    Run a fitted classifier over the training and testing features.

    Parameters:
        model: A fitted classifier exposing ``predict`` and ``predict_proba``.
        x_train (array-like or sparse matrix): The training set features.
        x_test (array-like or sparse matrix): The testing set features.

    Returns:
        y_pred_train (array-like): Predicted labels for the training set.
        y_pred_test (array-like): Predicted labels for the testing set.
        y_pred_test_proba (array-like): Predicted probabilities for the testing set.
    """
    # Evaluated left to right: train labels, test labels, then test probabilities.
    return (
        model.predict(x_train),
        model.predict(x_test),
        model.predict_proba(x_test),
    )


class Evaluation():
    """
    Computes the classification metrics reported for one fitted model.

    The instance stores the true labels and the predictions for the training
    and testing sets; ``main`` derives accuracy, ROC AUC, per-class
    precision/recall/F1, the Kolmogorov-Smirnov statistic and the
    confusion-matrix counts from them.
    """

    def __init__(self,y_train, y_test, y_pred_train, y_pred_test, y_pred_test_proba):
        """
        Store the labels and predictions to be scored.

        Parameters:
            y_train: True labels of the training set.
            y_test: True labels of the testing set.
            y_pred_train: Predicted labels for the training set.
            y_pred_test: Predicted labels for the testing set.
            y_pred_test_proba: Predicted probabilities for the testing set;
                column 1 is read as the score of class 1.
        """
        self.y_train = y_train
        self.y_test = y_test
        self.y_pred_train = y_pred_train
        self.y_pred_test = y_pred_test
        self.y_pred_test_proba = y_pred_test_proba

    def __ks_stats_value__(self):
        """
        Calculate the Kolmogorov-Smirnov (KS) statistic and p-value.

        The two samples compared are the column-1 probabilities of the test
        rows whose true label is 0 and of those whose true label is 1.

        Returns:
            ks_stat (float): The KS statistic.
            p_value (float): The p-value.
        """
        class_1_scores = self.y_pred_test_proba[:, 1]

        # Scores given to instances that did not churn / that actually churned
        scores_non_churn = class_1_scores[self.y_test == 0]
        scores_churn = class_1_scores[self.y_test == 1]

        ks_stat, p_value = ks_2samp(scores_non_churn, scores_churn)
        return ks_stat, p_value

    def __accuracy_value__(self):
        """
        Calculate accuracy on the training and testing sets.

        Returns:
            train_accuracy (float): Accuracy of ``y_pred_train`` against ``y_train``.
            test_accuracy (float): Accuracy of ``y_pred_test`` against ``y_test``.
        """
        return (
            accuracy_score(self.y_train, self.y_pred_train),
            accuracy_score(self.y_test, self.y_pred_test),
        )

    def __prec_rec_f1_value__(self, pos_label):
        """
        Calculate test-set precision, recall, and F1-score for a given label.

        Parameters:
            pos_label: The label for which metrics are calculated.

        Returns:
            precision (float): Precision score.
            recall (float): Recall score.
            f1 (float): F1-score.
        """
        scorers = (precision_score, recall_score, f1_score)
        return tuple(
            scorer(self.y_test, self.y_pred_test, pos_label=pos_label)
            for scorer in scorers
        )

    def __roc_value__(self):
        """
        Calculate the ROC AUC on the testing set.

        Note that the score is computed from the predicted test labels
        (``y_pred_test``), not from the predicted probabilities.

        Returns:
            roc_auc (float): The ROC AUC score.
        """
        return roc_auc_score(self.y_test, self.y_pred_test)

    def __confusion_matrix_value__(self):
        """
        Flatten the test-set confusion matrix into its four counts.

        Returns:
            tn, fp, fn, tp: The flattened confusion-matrix entries, in that order.
        """
        flat_counts = confusion_matrix(self.y_test, self.y_pred_test).ravel()
        tn, fp, fn, tp = flat_counts
        return tn, fp, fn, tp

    def main(self):
        """
        Compute every metric and return them as a flat list and as a nested dict.

        Returns:
            all_metrics (list): train accuracy, test accuracy, ROC AUC,
                precision/recall/F1 for class 0, precision/recall/F1 for
                class 1, KS statistic, p-value, tp, tn, fp, fn — each rounded
                to 6 digits.
            all_metrics_dict (dict): The same rounded values keyed by name,
                with the per-class scores nested under 'class_0' and 'class_1'.
        """
        train_accuracy, test_accuracy = self.__accuracy_value__()

        class_0_scores = self.__prec_rec_f1_value__(pos_label=0)
        class_1_scores = self.__prec_rec_f1_value__(pos_label=1)

        ks_stat, p_value = self.__ks_stats_value__()

        roc_auc = self.__roc_value__()

        tn, fp, fn, tp = self.__confusion_matrix_value__()

        unrounded = [train_accuracy, test_accuracy, roc_auc,
                     *class_0_scores,
                     *class_1_scores,
                     ks_stat, p_value,
                     tp, tn, fp, fn]
        all_metrics = [round(value, ndigits=6) for value in unrounded]

        # Unpack the rounded values by name instead of by position.
        (acc_train, acc_test, auc,
         prec_0, rec_0, f1_class_0,
         prec_1, rec_1, f1_class_1,
         ks, pval,
         n_tp, n_tn, n_fp, n_fn) = all_metrics

        all_metrics_dict = {
            'train_acc': acc_train,
            'test_acc': acc_test,
            'roc_auc': auc,
            'class_0': {'precision': prec_0, 'recall': rec_0, 'f1': f1_class_0},
            'class_1': {'precision': prec_1, 'recall': rec_1, 'f1': f1_class_1},
            'ks_stats': ks,
            'p_value': pval,
            'tp': n_tp,
            'tn': n_tn,
            'fp': n_fp,
            'fn': n_fn,
        }

        return all_metrics, all_metrics_dict

# Models

In [5]:
def logistic_model_train(x_train, y_train, random_state=42, max_iter=1000):
    """
    Fit a logistic regression classifier on the training data.

    Parameters:
        x_train (DataFrame): The training set features.
        y_train (Series): The training set target variable.
        random_state (int): The seed used by the random number generator. Default is 42.
        max_iter (int): The maximum number of iterations for the solver to converge. Default is 1000.

    Returns:
        log_reg_model (LogisticRegression): The trained logistic regression model.
    """
    # ``fit`` returns the estimator itself, so the fitted model is returned directly.
    return LogisticRegression(
        max_iter=max_iter,
        random_state=random_state,
    ).fit(x_train, y_train)


def gnb_model_train(x_train, y_train):
    """Fit a Gaussian Naive Bayes classifier with default settings and return it."""
    # ``fit`` returns the estimator itself.
    return GaussianNB().fit(x_train, y_train)

def svc_model_train(x_train, y_train, random_state=42):
    """
    Fit a support vector classifier and return it.

    The classifier is created with ``probability=True`` and the given
    ``random_state`` (default 42).
    """
    # ``fit`` returns the estimator itself.
    return SVC(random_state=random_state, probability=True).fit(x_train, y_train)

def adaboost_model_train(x_train, y_train, random_state=42):
    """Fit an AdaBoost classifier seeded with ``random_state`` and return it."""
    # ``fit`` returns the estimator itself.
    return AdaBoostClassifier(random_state=random_state).fit(x_train, y_train)

def etc_model_train(x_train, y_train, random_state=42):
    """Fit an extra-trees classifier seeded with ``random_state`` and return it."""
    # ``fit`` returns the estimator itself.
    return ExtraTreesClassifier(random_state=random_state).fit(x_train, y_train)

def gbc_model_train(x_train, y_train, random_state=42):
    """Fit a gradient boosting classifier seeded with ``random_state`` and return it."""
    # ``fit`` returns the estimator itself.
    return GradientBoostingClassifier(random_state=random_state).fit(x_train, y_train)

def hgbc_model_train(x_train, y_train, random_state=42):
    """Fit a histogram-based gradient boosting classifier seeded with ``random_state`` and return it."""
    # ``fit`` returns the estimator itself.
    return HistGradientBoostingClassifier(random_state=random_state).fit(x_train, y_train)

def rfc_model_train(x_train, y_train, random_state=42):
    """Fit a random forest classifier seeded with ``random_state`` and return it."""
    # ``fit`` returns the estimator itself.
    return RandomForestClassifier(random_state=random_state).fit(x_train, y_train)

def bbc_model_train(x_train, y_train, random_state=42):
    """Fit a balanced bagging classifier seeded with ``random_state`` and return it."""
    # ``fit`` returns the estimator itself.
    return BalancedBaggingClassifier(random_state=random_state).fit(x_train, y_train)

def brfc_model_train(x_train, y_train, random_state=42):
    """Fit a balanced random forest classifier seeded with ``random_state`` and return it."""
    # ``fit`` returns the estimator itself.
    return BalancedRandomForestClassifier(random_state=random_state).fit(x_train, y_train)

def eec_model_train(x_train, y_train, random_state=42):
    """Fit an easy-ensemble classifier seeded with ``random_state`` and return it."""
    # ``fit`` returns the estimator itself.
    return EasyEnsembleClassifier(random_state=random_state).fit(x_train, y_train)

def lgbm_model_train(x_train, y_train, random_state=42):
    """Fit a LightGBM classifier seeded with ``random_state`` and return it."""
    booster = LGBMClassifier(random_state=random_state)
    booster.fit(x_train, y_train)
    return booster

def catboost_model_train(x_train, y_train, random_state=42):
    """
    Fit a CatBoost classifier seeded with ``random_state`` and return it.

    ``verbose=False`` is passed to ``fit``.
    """
    booster = CatBoostClassifier(random_state=random_state)
    booster.fit(x_train, y_train, verbose=False)
    return booster

def train_all_models(x_train, y_train, model_name):
    """
    Train the classifier identified by ``model_name`` on the given data.

    Parameters:
        x_train: The training set features.
        y_train: The training set target variable.
        model_name (str): One of 'logistic_regression', 'gaussian_naive_bayes',
            'support_vector_classifier', 'ada_boost', 'extra_trees_classifier',
            'gradient_boosting_classifier', 'hist_gradient_boosting_classifier',
            'random_forest_classifier', 'balanced_bagging_classifier',
            'balanced_random_forest_classifier', 'easy_ensemble_classifier',
            'lgbm_classifier' or 'catboost_classifier'.

    Returns:
        The fitted model produced by the matching ``*_model_train`` function.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'logistic_regression':
        return logistic_model_train(x_train, y_train)

    if model_name == 'gaussian_naive_bayes':
        return gnb_model_train(x_train, y_train)

    if model_name == 'support_vector_classifier':
        return svc_model_train(x_train, y_train)

    if model_name == 'ada_boost':
        return adaboost_model_train(x_train, y_train)

    if model_name == 'extra_trees_classifier':
        return etc_model_train(x_train, y_train)

    if model_name == 'gradient_boosting_classifier':
        return gbc_model_train(x_train, y_train)

    if model_name == 'hist_gradient_boosting_classifier':
        return hgbc_model_train(x_train, y_train)

    if model_name == 'random_forest_classifier':
        return rfc_model_train(x_train, y_train)

    if model_name == 'balanced_bagging_classifier':
        return bbc_model_train(x_train, y_train)

    if model_name == 'balanced_random_forest_classifier':
        return brfc_model_train(x_train, y_train)

    if model_name == 'easy_ensemble_classifier':
        return eec_model_train(x_train, y_train)

    if model_name == 'lgbm_classifier':
        return lgbm_model_train(x_train, y_train)

    if model_name == 'catboost_classifier':
        return catboost_model_train(x_train, y_train)

    # Fail with a clear message instead of falling through to an unbound
    # ``model`` variable.
    raise ValueError(f"Check model name: {model_name!r} is not a supported model")

## Class Balancing Methods

In [6]:
def smote_method(x,y,neighbour):
    """
    Resample ``x``/``y`` with SMOTE.

    Parameters:
        x: The features to resample.
        y: The target labels to resample.
        neighbour (int): Value passed as ``k_neighbors``.

    Returns:
        The resampled features and the resampled labels.
    """
    sampler = SMOTE(k_neighbors=neighbour, random_state=42)
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

def adasyn_method(x,y,neighbour):
    """
    Resample ``x``/``y`` with ADASYN.

    Parameters:
        x: The features to resample.
        y: The target labels to resample.
        neighbour (int): Value passed as ``n_neighbors``.

    Returns:
        The resampled features and the resampled labels.
    """
    sampler = ADASYN(n_neighbors=neighbour, random_state=42)
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

def borderline_smote_method(x,y,neighbour):
    """
    Resample ``x``/``y`` with Borderline SMOTE.

    Parameters:
        x: The features to resample.
        y: The target labels to resample.
        neighbour (int): Value passed as ``k_neighbors``.

    Returns:
        The resampled features and the resampled labels.
    """
    sampler = BorderlineSMOTE(k_neighbors=neighbour, random_state=42)
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

def kmeans_smote_method(x,y,neighbour):
    """
    Resample ``x``/``y`` with KMeans SMOTE.

    The sampler is created with ``cluster_balance_threshold=0.2``.

    Parameters:
        x: The features to resample.
        y: The target labels to resample.
        neighbour (int): Value passed as ``k_neighbors``.

    Returns:
        The resampled features and the resampled labels.
    """
    sampler = KMeansSMOTE(
        k_neighbors=neighbour,
        cluster_balance_threshold=0.2,
        random_state=42,
    )
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

def smoten_method(x,y,neighbour):
    """
    Resample ``x``/``y`` with SMOTEN.

    Parameters:
        x: The features to resample.
        y: The target labels to resample.
        neighbour (int): Value passed as ``k_neighbors``.

    Returns:
        The resampled features and the resampled labels.
    """
    sampler = SMOTEN(k_neighbors=neighbour, random_state=42)
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

def smotenc_method(x,y,neighbour, approach_type):
    """
    Oversample the churn dataset with SMOTENC, then drop the categorical columns.

    The ``x`` and ``y`` arguments are not used: the function reloads
    ``./Churn_Modelling.csv`` itself and derives features and target from it.

    Parameters:
        x: Ignored (kept for signature compatibility with the other samplers).
        y: Ignored (kept for signature compatibility with the other samplers).
        neighbour (int): Value passed as ``k_neighbors`` to SMOTENC.
        approach_type (int): 1 resamples the whole dataset; 2 splits into
            train/test first and resamples only the training part.

    Returns:
        approach_type 1: ``(x_new, y_new)``.
        approach_type 2: ``(x_train_new, x_test, y_train_new, y_test)``.
        In both cases the 'Surname', 'Geography' and 'Gender' columns are
        removed from the returned feature frames.

    Raises:
        ValueError: If ``approach_type`` is neither 1 nor 2.
    """
    # Reject unsupported modes up front instead of silently returning None
    # after the dataset has been read.
    if approach_type not in (1, 2):
        raise ValueError(f"approach_type must be 1 or 2, got {approach_type!r}")

    categorical_columns = ['Surname', 'Geography', 'Gender']

    # File location of the dataset
    data_loc = "./Churn_Modelling.csv"

    # Read the CSV using the first column as the index and discard the identifier
    df = pd.read_csv(data_loc, index_col=0).drop(columns=['CustomerId'])

    x, y = features_target_split(df)

    # NOTE(review): positions 0, 2 and 3 are presumably Surname, Geography and
    # Gender in the feature frame — confirm against the CSV's column order.
    sm = SMOTENC(categorical_features=[0,2,3],random_state=42, k_neighbors=neighbour)

    if approach_type == 1:
        x_new, y_new = sm.fit_resample(x, y)
        return x_new.drop(columns=categorical_columns), y_new

    # approach_type == 2: only the training portion is resampled
    x_train, x_test, y_train, y_test = train_test_split(x, y, df)
    x_train_new, y_train_new = sm.fit_resample(x_train, y_train)

    return (
        x_train_new.drop(columns=categorical_columns),
        x_test.drop(columns=categorical_columns),
        y_train_new,
        y_test,
    )

def svm_smote_method(x,y,neighbour):
    """
    Resample ``x``/``y`` with SVMSMOTE.

    Parameters:
        x: The features to resample.
        y: The target labels to resample.
        neighbour (int): Value passed as ``k_neighbors``.

    Returns:
        The resampled features and the resampled labels.
    """
    sampler = SVMSMOTE(k_neighbors=neighbour, random_state=42)
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

def smote_enn_method(x,y,neighbour):
    """
    Resample ``x``/``y`` with SMOTEENN.

    Parameters:
        x: The features to resample.
        y: The target labels to resample.
        neighbour (int): Value passed as ``k_neighbors`` to the inner SMOTE.

    Returns:
        The resampled features and the resampled labels.
    """
    inner_smote = SMOTE(random_state=42, k_neighbors=neighbour)
    sampler = SMOTEENN(smote=inner_smote, random_state=42)
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

def smote_tomek_method(x,y,neighbour):
    """
    Resample ``x``/``y`` with SMOTETomek.

    Parameters:
        x: The features to resample.
        y: The target labels to resample.
        neighbour (int): Value passed as ``k_neighbors`` to the inner SMOTE.

    Returns:
        The resampled features and the resampled labels.
    """
    inner_smote = SMOTE(random_state=42, k_neighbors=neighbour)
    sampler = SMOTETomek(smote=inner_smote, random_state=42)
    resampled_x, resampled_y = sampler.fit_resample(x, y)
    return resampled_x, resampled_y

def sampling_method(method_name, neighbour, x_train, y_train):
    """
    Resample the training data with the sampler identified by ``method_name``.

    Parameters:
        method_name (str): One of 'smote', 'adasyn', 'borderline_smote',
            'kmeans_smote', 'smoten', 'svmsmote', 'smoteenn' or 'smotetomek'.
            ('smotenc' is not dispatched from here.)
        neighbour (int): Neighbour count forwarded to the chosen sampler.
        x_train: The training set features.
        y_train: The training set target variable.

    Returns:
        x_train_new, y_train_new: The resampled features and labels.

    Raises:
        ValueError: If ``method_name`` is not one of the supported names.
    """
    if method_name == 'smote':
        return smote_method(x_train, y_train, neighbour)

    if method_name == 'adasyn':
        return adasyn_method(x_train, y_train, neighbour)

    if method_name == 'borderline_smote':
        return borderline_smote_method(x_train, y_train, neighbour)

    if method_name == 'kmeans_smote':
        return kmeans_smote_method(x_train, y_train, neighbour)

    if method_name == 'smoten':
        return smoten_method(x_train, y_train, neighbour)

    if method_name == 'svmsmote':
        return svm_smote_method(x_train, y_train, neighbour)

    if method_name == 'smoteenn':
        return smote_enn_method(x_train, y_train, neighbour)

    if method_name == 'smotetomek':
        return smote_tomek_method(x_train, y_train, neighbour)

    # Fail with a clear message instead of reaching a return of unbound locals.
    raise ValueError(f"Unknown sampling method: {method_name!r}")

# Sampling and Training with Multiprocessing

In [5]:
def multi_processing(neighbour):
    """
    Resample, train and score one model for a single neighbour count.

    Reads ``mth``, ``model_name``, ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` from the module-level namespace.

    Parameters:
        neighbour (int): Neighbour count forwarded to the sampling method.

    Returns:
        list: ``[model_name, mth, neighbour]`` followed by the metrics list
        returned by ``Evaluation.main()``.
    """
    resampled_x, resampled_y = sampling_method(mth, neighbour, x_train, y_train)

    fitted_model = train_all_models(resampled_x, resampled_y, model_name)

    # Labels for the resampled training set, labels and probabilities for the test set
    train_labels, test_labels, test_proba = prediction(fitted_model, resampled_x, x_test)

    scorer = Evaluation(resampled_y, y_test, train_labels, test_labels, test_proba)
    metric_values = scorer.main()[0]

    # Prefix the metrics with the identifiers of this run
    return [model_name, mth, neighbour] + metric_values

# Training with numerical data only

## Reading Data

In [7]:
# Location of the churn dataset
data_loc = "./Churn_Modelling.csv"

# Load the CSV with its first column as the index and drop the identifier
# together with all categorical columns.
df = pd.read_csv(data_loc, index_col=0).drop(
    columns=['CustomerId', 'Surname', 'Geography', 'Gender']
)
df.head()

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,619,42,2,0.0,1,1,1,101348.88,1
2,608,41,1,83807.86,1,0,1,112542.58,0
3,502,42,8,159660.8,3,1,0,113931.57,1
4,699,39,1,0.0,2,0,0,93826.63,0
5,850,43,2,125510.82,1,1,1,79084.1,0


## Train Test Split

In [11]:
# Separate the feature columns from the target column of `df`.
x,y = features_target_split(df)

# Split features and target into training and testing sets. The wrapper
# stratifies on the target column of `df` and uses its default test size and seed.
# These module-level names are read by the cells that follow.
x_train, x_test, y_train, y_test = train_test_split(x,y,df)

In [12]:
# Folder that receives the pickled models; created if it does not exist yet.
# ``exist_ok=True`` avoids the check-then-create race of a separate exists() test.
loc = "./final_results/potential_models/weights/fn_num/"
os.makedirs(loc, exist_ok=True)

# Table of candidate models: each row names a model, a sampling method and a
# neighbour count, along with the metrics recorded for that combination.
pm = pd.read_csv("./final_results/potential_models/fn_num.csv")
pm

Unnamed: 0,method,k_neighbour,train_accuracy,test_accuracy,roc_auc,precision_0,recall_0,f1_0,precision_1,recall_1,f1_1,ks_stat,p_value,tp,tn,fp,fn,model,sampling_method
0,Approach_ada_BOOST,243,0.801256,0.7735,0.755377,0.917889,0.785938,0.846804,0.463836,0.724816,0.565676,0.526516,0.0,295,1252,341,112,ada_boost,borderline_smote
1,Approach_easy_ENSEMBLE,243,0.801256,0.7735,0.755377,0.917889,0.785938,0.846804,0.463836,0.724816,0.565676,0.526516,0.0,295,1252,341,112,easy_ensemble_classifier,borderline_smote
2,Approach_gradient_BOOSTING,354,0.819007,0.7885,0.762049,0.917857,0.806654,0.85867,0.486667,0.717445,0.57994,0.525354,0.0,292,1285,308,115,gradient_boosting_classifier,adasyn
3,Approach_lgbm_CLASSIFIER,486,0.895234,0.783,0.753109,0.913633,0.803515,0.855043,0.477462,0.702703,0.568588,0.518287,0.0,286,1280,313,121,lgbm_classifier,smotetomek
4,Approach_hist_GRADIENT,354,0.862917,0.7805,0.74971,0.912143,0.801632,0.853324,0.473333,0.697789,0.564052,0.514629,0.0,284,1277,316,123,hist_gradient_boosting_classifier,adasyn
5,Approach_balanced_RANDOM,244,0.999785,0.7845,0.747648,0.909732,0.809793,0.856858,0.479381,0.685504,0.564206,0.501054,0.0,279,1290,303,128,balanced_random_forest_classifier,smotetomek
6,Approach_balanced_BAGGING,316,0.988383,0.785,0.733328,0.900758,0.820465,0.858739,0.479053,0.646192,0.550209,0.466656,0.0,263,1307,286,144,balanced_bagging_classifier,borderline_smote


In [13]:
for row in pm.values:
    # Each row ends with the model name and the sampling method; position 1
    # holds the neighbour count.
    mth = row[-1]
    model_name = row[-2]
    neighbour = row[1]

    x_train_new, y_train_new = sampling_method(mth, neighbour, x_train, y_train)

    model = train_all_models(x_train_new, y_train_new, model_name)

    # Generate predictions
    y_pred_train, y_pred_test, y_pred_test_proba = prediction(model, x_train_new, x_test)

    # Calculate evaluation metrics
    model_evaluation = Evaluation(y_train_new, y_test, y_pred_train, y_pred_test, y_pred_test_proba)

    all_metrics,_ = model_evaluation.main()
    all_metrics.insert(0, model_name)
    all_metrics.insert(1, mth)
    all_metrics.insert(2, neighbour)

    metrics_data = list(all_metrics)

    # Save the model only when the retrained metrics reproduce the stored row
    # exactly (neighbour count through fn); CatBoost models are saved regardless.
    if np.all(metrics_data[2:] == row[1:-2]) or model_name == 'catboost_classifier':
        weights_path = os.path.join(loc, '_'.join(row[-2:]) + f'_{neighbour}.pkl')
        # The context manager closes the file even if pickling fails.
        with open(weights_path, 'wb') as weights_file:
            pickle.dump(model, weights_file)
    else:
        print("Error")

# Training with numerical and categorical data

## Reading Data

In [9]:
# Location of the churn dataset
data_loc = "./Churn_Modelling.csv"

# Load the CSV with its first column as the index
df = pd.read_csv(data_loc, index_col=0)

# Replace each categorical column by the integer codes of its categories,
# keeping the column in its original position.
for column in ['Geography', 'Gender', 'Surname']:
    df[column] = df[column].astype('category').cat.codes

# The customer identifier is not used as a feature
df = df.drop(columns=['CustomerId'])
df.head()

Unnamed: 0_level_0,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1115,619,0,0,42,2,0.0,1,1,1,101348.88,1
2,1177,608,2,0,41,1,83807.86,1,0,1,112542.58,0
3,2040,502,0,0,42,8,159660.8,3,1,0,113931.57,1
4,289,699,0,0,39,1,0.0,2,0,0,93826.63,0
5,1822,850,2,0,43,2,125510.82,1,1,1,79084.1,0


## Train Test Split

In [10]:
# Separate the feature columns from the target column of `df`.
x,y = features_target_split(df)

# Split features and target into training and testing sets. The wrapper
# stratifies on the target column of `df` and uses its default test size and seed.
# These module-level names are read by the cells that follow.
x_train, x_test, y_train, y_test = train_test_split(x,y,df)

In [11]:
# Folder that receives the pickled models; created if it does not exist yet.
# ``exist_ok=True`` avoids the check-then-create race of a separate exists() test.
loc = "./final_results/potential_models/weights/fn_num_cat/"
os.makedirs(loc, exist_ok=True)

# Table of candidate models: each row names a model, a sampling method and a
# neighbour count, along with the metrics recorded for that combination.
pm = pd.read_csv("./final_results/potential_models/fn_num_cat.csv")
pm

Unnamed: 0,method,k_neighbour,train_accuracy,test_accuracy,roc_auc,precision_0,recall_0,f1_0,precision_1,recall_1,f1_1,ks_stat,p_value,tp,tn,fp,fn,model,sampling_method
0,Approach_lgbm_CLASSIFIER,907,0.899529,0.822,0.774847,0.916498,0.854363,0.88434,0.549515,0.695332,0.613883,0.550322,0.0,283,1361,232,124,lgbm_classifier,borderline_smote
1,Approach_gradient_BOOSTING,414,0.851823,0.8045,0.768435,0.917361,0.829253,0.871085,0.514286,0.707617,0.595657,0.550248,0.0,288,1321,272,119,gradient_boosting_classifier,adasyn
2,Approach_ada_BOOST,375,0.829042,0.802,0.773268,0.921182,0.82172,0.868613,0.509499,0.724816,0.598377,0.548993,0.0,295,1309,284,112,ada_boost,borderline_smote
3,Approach_easy_ENSEMBLE,375,0.829042,0.802,0.773268,0.921182,0.82172,0.868613,0.509499,0.724816,0.598377,0.548993,0.0,295,1309,284,112,easy_ensemble_classifier,borderline_smote
4,Approach_balanced_RANDOM,254,1.0,0.81,0.758168,0.909521,0.845574,0.876383,0.526012,0.670762,0.589633,0.531842,0.0,273,1347,246,134,balanced_random_forest_classifier,smote
5,Approach_balanced_BAGGING,200,0.989708,0.805,0.744968,0.90288,0.846202,0.873623,0.516765,0.643735,0.573304,0.503491,0.0,262,1348,245,145,balanced_bagging_classifier,smotetomek


## Training

In [12]:
for row in pm.values:
    # Each row ends with the model name and the sampling method; position 1
    # holds the neighbour count.
    mth = row[-1]
    model_name = row[-2]
    neighbour = row[1]

    x_train_new, y_train_new = sampling_method(mth, neighbour, x_train, y_train)

    model = train_all_models(x_train_new, y_train_new, model_name)

    # Generate predictions
    y_pred_train, y_pred_test, y_pred_test_proba = prediction(model, x_train_new, x_test)

    # Calculate evaluation metrics
    model_evaluation = Evaluation(y_train_new, y_test, y_pred_train, y_pred_test, y_pred_test_proba)

    all_metrics,_ = model_evaluation.main()
    all_metrics.insert(0, model_name)
    all_metrics.insert(1, mth)
    all_metrics.insert(2, neighbour)

    metrics_data = list(all_metrics)

    # Save the model only when the retrained metrics reproduce the stored row
    # exactly (neighbour count through fn); CatBoost models are saved regardless.
    if np.all(metrics_data[2:] == row[1:-2]) or model_name == 'catboost_classifier':
        weights_path = os.path.join(loc, '_'.join(row[-2:]) + f'_{neighbour}.pkl')
        # The context manager closes the file even if pickling fails.
        with open(weights_path, 'wb') as weights_file:
            pickle.dump(model, weights_file)
    else:
        print("Error")

# Start Point

# Ensemble Models

### Each ensemble combines two models, weighted according to the importance given to precision and recall for class 1.

**Model 1** : Focus on decreasing FP.

**Model 2** : Focus on decreasing FN.


| Combination Name | Model 1                      | Model 2                      |
| ---------------- | ---------------------------- | ---------------------------- |
| Test 1           | numerical features only      | numerical features only      |
| Test 2           | numerical + categorical features | numerical + categorical features |
| Test 3           | categorical features treated as categorical** | categorical features treated as categorical** |
| Test 4           | numerical features only      | numerical + categorical features |
| Test 5           | numerical + categorical features | categorical features treated as categorical** |
| Test 6           | categorical features treated as categorical** | numerical features only      |
| Test 7           | numerical features only      | categorical features treated as categorical** |
| Test 8           | numerical + categorical features | numerical features only      |
| Test 9           | categorical features treated as categorical** | numerical + categorical features |


**\*\*Valid for CatBoost model only**


In [7]:
# Parent folder of the pickled models; one sub-folder per experiment.
_weights_root = "./final_results/potential_models/weights"

# "fp_*" folders — presumably the models focused on decreasing FP (verify).
fp_num_models_loc = f"{_weights_root}/fp_num/"
fp_num_cat_models_loc = f"{_weights_root}/fp_num_cat/"
fp_cat_cat_models_loc = f"{_weights_root}/fp_cat_cat/"

# "fn_*" folders — presumably the models focused on decreasing FN (verify).
fn_num_models_loc = f"{_weights_root}/fn_num/"
fn_num_cat_models_loc = f"{_weights_root}/fn_num_cat/"
fn_cat_cat_models_loc = f"{_weights_root}/fn_cat_cat/"

In [8]:
def model_ensembler(model_1_name, model_2_name, variable_value):
    """
    Score weighted blends of two fitted models over a grid of weights.

    Reads ``model_1``, ``model_2``, ``x_train``, ``x_test``, ``y_train``,
    ``y_test``, ``i_range``, ``j_range`` and ``threshold`` from the
    module-level namespace and appends one row per weight pair to the
    module-level ``metrics_data`` list.

    Parameters:
        model_1_name: Label recorded for the first model.
        model_2_name: Label recorded for the second model.
        variable_value: Extra value recorded in every row.

    Returns:
        None. Each appended row is ``[model_1_name, model_2_name,
        variable_value, precision_weightage, recall_weightage, threshold]``
        followed by the metrics list from ``Evaluation.main()``.
    """
    # Full probability matrices (all class columns) from both models
    proba_1_test = model_1.predict_proba(x_test)
    proba_2_test = model_2.predict_proba(x_test)
    proba_1_train = model_1.predict_proba(x_train)
    proba_2_train = model_2.predict_proba(x_train)

    for i in i_range:
        precision_weightage = round(i, ndigits=2)

        for j in j_range:
            recall_weightage = round(j, ndigits=2)

            # Weighted sum of the two models' probabilities
            blended_test = (precision_weightage * proba_1_test) + (recall_weightage * proba_2_test)
            blended_train = (precision_weightage * proba_1_train) + (recall_weightage * proba_2_train)

            # Binarise every column against the threshold
            binary_test = np.where(blended_test >= threshold, 1, 0)
            binary_train = np.where(blended_train >= threshold, 1, 0)

            # Column 1 of the binarised blends serves as the predicted labels;
            # the unthresholded test blend is passed as the probabilities.
            scorer = Evaluation(y_train, y_test,
                                binary_train[:, 1],
                                binary_test[:, 1],
                                blended_test)
            metric_values = scorer.main()[0]

            run_identifiers = [model_1_name, model_2_name, variable_value,
                               precision_weightage, recall_weightage, threshold]
            metrics_data.append(run_identifiers + metric_values)

In [9]:
def model_ensembler_1(model_1_name, model_2_name, variable_value):
    """
    Grid-search blending weights for two already-loaded classifiers whose inputs differ.

    The global `model_1` is scored on the feature frames with the 'Surname',
    'Geography' and 'Gender' columns removed, while the global `model_2` is
    scored on the full frames. For every weight pair drawn from the global
    `i_range` and `j_range` (each rounded to 2 decimals), the two probability
    arrays are combined as `w1 * model_1 + w2 * model_2` (the weights are not
    normalised), binarised against the global `threshold`, and passed to
    `Evaluation`. One result row per weight pair is appended to the global
    `metrics_data` list.

    Parameters:
        model_1_name (str): Label stored in the first column of each row.
        model_2_name (str): Label stored in the second column of each row.
        variable_value: Tag stored in the third column of each row.

    Returns:
        None. Rows are accumulated in `metrics_data`.
    """
    categorical_cols = ['Surname', 'Geography', 'Gender']

    # Probability arrays do not depend on the weights, so compute them once up front.
    proba_1_test = model_1.predict_proba(x_test.drop(categorical_cols, axis=1))
    proba_2_test = model_2.predict_proba(x_test)
    proba_1_train = model_1.predict_proba(x_train.drop(categorical_cols, axis=1))
    proba_2_train = model_2.predict_proba(x_train)

    for raw_weight_1 in i_range:
        precision_weightage = round(raw_weight_1, ndigits=2)

        for raw_weight_2 in j_range:
            recall_weightage = round(raw_weight_2, ndigits=2)

            # Weighted sum of the two models' probability arrays.
            blended_test = (precision_weightage * proba_1_test) + (recall_weightage * proba_2_test)
            blended_train = (precision_weightage * proba_1_train) + (recall_weightage * proba_2_train)

            # Element-wise 0/1 labels; column 1 is the one handed to the evaluator
            # (presumably the positive class -- confirm against Evaluation).
            labels_test = np.where(blended_test >= threshold, 1, 0)
            labels_train = np.where(blended_train >= threshold, 1, 0)

            evaluator = Evaluation(y_train, y_test,
                                   labels_train[:, 1],
                                   labels_test[:, 1],
                                   blended_test)
            all_metrics, _ = evaluator.main()

            # Prepend the six descriptor fields in column order.
            descriptors = (model_1_name, model_2_name, variable_value,
                           precision_weightage, recall_weightage, threshold)
            for position, value in enumerate(descriptors):
                all_metrics.insert(position, value)

            metrics_data.append(all_metrics)

In [10]:
# Candidate blending weights: start at 0.1 and advance in steps of 0.01 up to just under 1.0.
# Both names refer to the same array object.
j_range = np.arange(0.1, 1.0, 0.01)
i_range = j_range

# Cut-off applied to the blended scores to obtain 0/1 labels.
threshold = 0.5

## Test 1

In [11]:
# Path to the churn dataset
data_loc = "./Churn_Modelling.csv"

# Load the CSV; its first column becomes the row index
df = pd.read_csv(data_loc, index_col=0)

# Test 1 works on numeric features only: remove CustomerId and the categorical columns
df = df.drop(columns=['CustomerId', 'Surname', 'Geography', 'Gender'])

# Separate the predictors from the target
x, y = features_target_split(df)

# Partition predictors and target into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, df)

In [12]:

metrics_data = []

# Timestamps consumed by the `et-st` cell further down; without them that cell
# raises NameError on a fresh kernel.
st = time()

# Pair every model in fp_num_models_loc with every model in fn_num_models_loc.
# model_ensembler() only receives the file names, so the loaded estimators must be
# bound to the global names `model_1` / `model_2`.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
for model_1_name in os.listdir(fp_num_models_loc):
    with open(os.path.join(fp_num_models_loc, model_1_name), 'rb') as model_file:
        model_1 = pickle.load(model_file)
    print(model_1_name)

    for model_2_name in os.listdir(fn_num_models_loc):
        with open(os.path.join(fn_num_models_loc, model_2_name), 'rb') as model_file:
            model_2 = pickle.load(model_file)
        print('\t',model_2_name)

        model_ensembler(model_1_name, model_2_name, 'num_num')

et = time()

balanced_random_forest_classifier_smoten_375.pkl
	 ada_boost_borderline_smote_243.pkl
	 gradient_boosting_classifier_adasyn_354.pkl
	 hist_gradient_boosting_classifier_adasyn_354.pkl
	 easy_ensemble_classifier_borderline_smote_243.pkl
	 lgbm_classifier_smotetomek_486.pkl
	 balanced_bagging_classifier_borderline_smote_316.pkl
	 balanced_random_forest_classifier_smotetomek_244.pkl
hist_gradient_boosting_classifier_smoten_376.pkl
	 ada_boost_borderline_smote_243.pkl
	 gradient_boosting_classifier_adasyn_354.pkl
	 hist_gradient_boosting_classifier_adasyn_354.pkl
	 easy_ensemble_classifier_borderline_smote_243.pkl
	 lgbm_classifier_smotetomek_486.pkl
	 balanced_bagging_classifier_borderline_smote_316.pkl
	 balanced_random_forest_classifier_smotetomek_244.pkl
gradient_boosting_classifier_smoten_820.pkl
	 ada_boost_borderline_smote_243.pkl
	 gradient_boosting_classifier_adasyn_354.pkl
	 hist_gradient_boosting_classifier_adasyn_354.pkl
	 easy_ensemble_classifier_borderline_smote_243.pkl
	 lgbm

In [16]:
# Elapsed seconds between the `st` and `et` timestamps (both must already be defined)
et - st

57.80266737937927

In [13]:
# Layout of one result row: six ensemble descriptors followed by the evaluation metrics
metrics_data_columns = [
    'model_1', 'model_2', 'type', 'precision_weight', 'recall_weight', 'threshold',
    'train_accuracy', 'test_accuracy', 'roc_auc',
    'precision_0', 'recall_0', 'f1_0',
    'precision_1', 'recall_1', 'f1_1',
    'ks_stat', 'p_value',
    'tp', 'tn', 'fp', 'fn',
]

# Write the Test 1 grid-search rows to disk
pd.DataFrame(data=metrics_data, columns=metrics_data_columns).to_csv(
    "../Desktop/test_1.csv", header=True, index=False
)

## Test 2

In [14]:
# File location of the dataset
data_loc = "./Churn_Modelling.csv"

# Read the CSV file into a Pandas DataFrame, using the first column as the index
df = pd.read_csv(data_loc, index_col=0)

# Replace each categorical column with its integer category codes
# (no temporary `*_new` columns needed).
for categorical_col in ['Geography', 'Gender', 'Surname']:
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Drop CustomerId so it is not passed to the models as a feature.
df = df.drop(columns=['CustomerId'])

# Split the DataFrame into features and target variables.
x, y = features_target_split(df)

# Split the features and target variables into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, df)

In [15]:

metrics_data = []

# Pair every model in fp_num_cat_models_loc with every model in fn_num_cat_models_loc.
# model_ensembler() only receives the file names, so the loaded estimators must be
# bound to the global names `model_1` / `model_2`.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
for model_1_name in os.listdir(fp_num_cat_models_loc):
    with open(os.path.join(fp_num_cat_models_loc, model_1_name), 'rb') as model_file:
        model_1 = pickle.load(model_file)
    print(model_1_name)

    for model_2_name in os.listdir(fn_num_cat_models_loc):
        with open(os.path.join(fn_num_cat_models_loc, model_2_name), 'rb') as model_file:
            model_2 = pickle.load(model_file)
        print('\t',model_2_name)

        model_ensembler(model_1_name, model_2_name, 'num_cat')

balanced_bagging_classifier_smoten_290.pkl
	 lgbm_classifier_borderline_smote_907.pkl
	 ada_boost_borderline_smote_375.pkl
	 balanced_random_forest_classifier_smote_254.pkl
	 gradient_boosting_classifier_adasyn_414.pkl
	 balanced_bagging_classifier_smotetomek_200.pkl
	 easy_ensemble_classifier_borderline_smote_375.pkl
catboost_classifier_smoten_818.pkl
	 lgbm_classifier_borderline_smote_907.pkl
	 ada_boost_borderline_smote_375.pkl
	 balanced_random_forest_classifier_smote_254.pkl
	 gradient_boosting_classifier_adasyn_414.pkl
	 balanced_bagging_classifier_smotetomek_200.pkl
	 easy_ensemble_classifier_borderline_smote_375.pkl
extra_trees_classifier_smoten_405.pkl
	 lgbm_classifier_borderline_smote_907.pkl
	 ada_boost_borderline_smote_375.pkl
	 balanced_random_forest_classifier_smote_254.pkl
	 gradient_boosting_classifier_adasyn_414.pkl
	 balanced_bagging_classifier_smotetomek_200.pkl
	 easy_ensemble_classifier_borderline_smote_375.pkl
lgbm_classifier_smoten_1208.pkl
	 lgbm_classifier_bor

In [16]:
# Layout of one result row: six ensemble descriptors followed by the evaluation metrics
metrics_data_columns = [
    'model_1', 'model_2', 'type', 'precision_weight', 'recall_weight', 'threshold',
    'train_accuracy', 'test_accuracy', 'roc_auc',
    'precision_0', 'recall_0', 'f1_0',
    'precision_1', 'recall_1', 'f1_1',
    'ks_stat', 'p_value',
    'tp', 'tn', 'fp', 'fn',
]

# Write the Test 2 grid-search rows to disk
pd.DataFrame(data=metrics_data, columns=metrics_data_columns).to_csv(
    "../Desktop/test_2.csv", header=True, index=False
)



## Test 3

In [17]:
# File location of the dataset
data_loc = "./Churn_Modelling.csv"

# Read the CSV file into a Pandas DataFrame, using the first column as the index
df = pd.read_csv(data_loc, index_col=0)

# Replace each categorical column with its integer category codes.
for categorical_col in ['Geography', 'Gender', 'Surname']:
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Drop CustomerId so it is not passed to the models as a feature.
df = df.drop(columns=['CustomerId'])

# Split the DataFrame into features and target variables.
x, y = features_target_split(df)

# Split the features and target variables into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, df)


metrics_data = []

# Pair every model in fp_cat_cat_models_loc with every model in fn_cat_cat_models_loc.
# The ensembler reads the loaded estimators from the globals `model_1` / `model_2`.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
for model_1_name in os.listdir(fp_cat_cat_models_loc):
    with open(os.path.join(fp_cat_cat_models_loc, model_1_name), 'rb') as model_file:
        model_1 = pickle.load(model_file)
    print(model_1_name)

    for model_2_name in os.listdir(fn_cat_cat_models_loc):
        with open(os.path.join(fn_cat_cat_models_loc, model_2_name), 'rb') as model_file:
            model_2 = pickle.load(model_file)
        print('\t',model_2_name)

        model_ensembler(model_1_name, model_2_name, 'cat_cat')

# Write the Test 3 grid-search rows to disk.
pd.DataFrame(metrics_data, columns=metrics_data_columns).to_csv("../Desktop/test_3.csv", index=False, header=True)

catboost_classifier_smoten_1201.pkl
	 catboost_classifier_smoteenn_457.pkl
catboost_classifier_smoten_375.pkl
	 catboost_classifier_smoteenn_457.pkl
catboost_classifier_smoten_376.pkl
	 catboost_classifier_smoteenn_457.pkl
catboost_classifier_smoten_285.pkl
	 catboost_classifier_smoteenn_457.pkl
catboost_classifier_smoten_384.pkl
	 catboost_classifier_smoteenn_457.pkl


## Test 4

In [18]:
# File location of the dataset
data_loc = "./Churn_Modelling.csv"

# Read the CSV file into a Pandas DataFrame, using the first column as the index
df = pd.read_csv(data_loc, index_col=0)

# Replace each categorical column with its integer category codes.
for categorical_col in ['Geography', 'Gender', 'Surname']:
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Drop CustomerId so it is not passed to the models as a feature.
df = df.drop(columns=['CustomerId'])

# Split the DataFrame into features and target variables.
x, y = features_target_split(df)

# Split the features and target variables into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, df)


metrics_data = []

# Pair every model in fp_num_models_loc with every model in fn_num_cat_models_loc.
# model_ensembler_1() feeds `model_1` the frames without the categorical columns
# and `model_2` the full frames; both are read from globals.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
for model_1_name in os.listdir(fp_num_models_loc):
    with open(os.path.join(fp_num_models_loc, model_1_name), 'rb') as model_file:
        model_1 = pickle.load(model_file)
    print(model_1_name)

    for model_2_name in os.listdir(fn_num_cat_models_loc):
        with open(os.path.join(fn_num_cat_models_loc, model_2_name), 'rb') as model_file:
            model_2 = pickle.load(model_file)
        print('\t',model_2_name)

        model_ensembler_1(model_1_name, model_2_name, 'num_num_num_cat')

# Write the Test 4 grid-search rows to disk.
pd.DataFrame(metrics_data, columns=metrics_data_columns).to_csv("../Desktop/test_4.csv", index=False, header=True)

balanced_random_forest_classifier_smoten_375.pkl
	 lgbm_classifier_borderline_smote_907.pkl
	 ada_boost_borderline_smote_375.pkl
	 balanced_random_forest_classifier_smote_254.pkl
	 gradient_boosting_classifier_adasyn_414.pkl
	 balanced_bagging_classifier_smotetomek_200.pkl
	 easy_ensemble_classifier_borderline_smote_375.pkl
hist_gradient_boosting_classifier_smoten_376.pkl
	 lgbm_classifier_borderline_smote_907.pkl
	 ada_boost_borderline_smote_375.pkl
	 balanced_random_forest_classifier_smote_254.pkl
	 gradient_boosting_classifier_adasyn_414.pkl
	 balanced_bagging_classifier_smotetomek_200.pkl
	 easy_ensemble_classifier_borderline_smote_375.pkl
gradient_boosting_classifier_smoten_820.pkl
	 lgbm_classifier_borderline_smote_907.pkl
	 ada_boost_borderline_smote_375.pkl
	 balanced_random_forest_classifier_smote_254.pkl
	 gradient_boosting_classifier_adasyn_414.pkl
	 balanced_bagging_classifier_smotetomek_200.pkl
	 easy_ensemble_classifier_borderline_smote_375.pkl
easy_ensemble_classifier_sm

## Test 5

In [19]:
# File location of the dataset
data_loc = "./Churn_Modelling.csv"

# Read the CSV file into a Pandas DataFrame, using the first column as the index
df = pd.read_csv(data_loc, index_col=0)

# Replace each categorical column with its integer category codes.
for categorical_col in ['Geography', 'Gender', 'Surname']:
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Drop CustomerId so it is not passed to the models as a feature.
df = df.drop(columns=['CustomerId'])

# Split the DataFrame into features and target variables.
x, y = features_target_split(df)

# Split the features and target variables into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, df)


metrics_data = []

# Pair every model in fp_num_cat_models_loc with every model in fn_cat_cat_models_loc.
# The ensembler reads the loaded estimators from the globals `model_1` / `model_2`.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
for model_1_name in os.listdir(fp_num_cat_models_loc):
    with open(os.path.join(fp_num_cat_models_loc, model_1_name), 'rb') as model_file:
        model_1 = pickle.load(model_file)
    print(model_1_name)

    for model_2_name in os.listdir(fn_cat_cat_models_loc):
        with open(os.path.join(fn_cat_cat_models_loc, model_2_name), 'rb') as model_file:
            model_2 = pickle.load(model_file)
        print('\t',model_2_name)

        model_ensembler(model_1_name, model_2_name, 'num_cat_cat_cat')

# Write the Test 5 grid-search rows to disk.
pd.DataFrame(metrics_data, columns=metrics_data_columns).to_csv("../Desktop/test_5.csv", index=False, header=True)

balanced_bagging_classifier_smoten_290.pkl
	 catboost_classifier_smoteenn_457.pkl
catboost_classifier_smoten_818.pkl
	 catboost_classifier_smoteenn_457.pkl
extra_trees_classifier_smoten_405.pkl
	 catboost_classifier_smoteenn_457.pkl
lgbm_classifier_smoten_1208.pkl
	 catboost_classifier_smoteenn_457.pkl
balanced_random_forest_classifier_smoten_821.pkl
	 catboost_classifier_smoteenn_457.pkl
gradient_boosting_classifier_smoten_1173.pkl
	 catboost_classifier_smoteenn_457.pkl
random_forest_classifier_smoten_466.pkl
	 catboost_classifier_smoteenn_457.pkl
easy_ensemble_classifier_smoten_844.pkl
	 catboost_classifier_smoteenn_457.pkl
ada_boost_smoten_950.pkl
	 catboost_classifier_smoteenn_457.pkl


## Test 6

In [20]:
# File location of the dataset
data_loc = "./Churn_Modelling.csv"

# Read the CSV file into a Pandas DataFrame, using the first column as the index
df = pd.read_csv(data_loc, index_col=0)

# Replace each categorical column with its integer category codes.
for categorical_col in ['Geography', 'Gender', 'Surname']:
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Drop CustomerId so it is not passed to the models as a feature.
df = df.drop(columns=['CustomerId'])

# Split the DataFrame into features and target variables.
x, y = features_target_split(df)

# Split the features and target variables into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, df)


metrics_data = []

# Pair every model in fn_num_models_loc with every model in fp_cat_cat_models_loc.
# model_ensembler_1() feeds `model_1` the frames without the categorical columns
# and `model_2` the full frames; both are read from globals.
# NOTE(review): here model_1 comes from the fn_* directory and model_2 from the
# fp_* directory, so the 'precision_weight' column of this CSV holds the weight
# applied to the fn_* model -- confirm this is intended when comparing tests.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
for model_1_name in os.listdir(fn_num_models_loc):
    with open(os.path.join(fn_num_models_loc, model_1_name), 'rb') as model_file:
        model_1 = pickle.load(model_file)
    print(model_1_name)

    for model_2_name in os.listdir(fp_cat_cat_models_loc):
        with open(os.path.join(fp_cat_cat_models_loc, model_2_name), 'rb') as model_file:
            model_2 = pickle.load(model_file)
        print('\t',model_2_name)

        model_ensembler_1(model_1_name, model_2_name, 'cat_cat_num_num')

# Write the Test 6 grid-search rows to disk.
pd.DataFrame(metrics_data, columns=metrics_data_columns).to_csv("../Desktop/test_6.csv", index=False, header=True)

ada_boost_borderline_smote_243.pkl
	 catboost_classifier_smoten_1201.pkl
	 catboost_classifier_smoten_375.pkl
	 catboost_classifier_smoten_376.pkl
	 catboost_classifier_smoten_285.pkl
	 catboost_classifier_smoten_384.pkl
gradient_boosting_classifier_adasyn_354.pkl
	 catboost_classifier_smoten_1201.pkl
	 catboost_classifier_smoten_375.pkl
	 catboost_classifier_smoten_376.pkl
	 catboost_classifier_smoten_285.pkl
	 catboost_classifier_smoten_384.pkl
hist_gradient_boosting_classifier_adasyn_354.pkl
	 catboost_classifier_smoten_1201.pkl
	 catboost_classifier_smoten_375.pkl
	 catboost_classifier_smoten_376.pkl
	 catboost_classifier_smoten_285.pkl
	 catboost_classifier_smoten_384.pkl
easy_ensemble_classifier_borderline_smote_243.pkl
	 catboost_classifier_smoten_1201.pkl
	 catboost_classifier_smoten_375.pkl
	 catboost_classifier_smoten_376.pkl
	 catboost_classifier_smoten_285.pkl
	 catboost_classifier_smoten_384.pkl
lgbm_classifier_smotetomek_486.pkl
	 catboost_classifier_smoten_1201.pkl
	 cat

## Test 7

In [21]:
# File location of the dataset
data_loc = "./Churn_Modelling.csv"

# Read the CSV file into a Pandas DataFrame, using the first column as the index
df = pd.read_csv(data_loc, index_col=0)

# Replace each categorical column with its integer category codes.
for categorical_col in ['Geography', 'Gender', 'Surname']:
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Drop CustomerId so it is not passed to the models as a feature.
df = df.drop(columns=['CustomerId'])

# Split the DataFrame into features and target variables.
x, y = features_target_split(df)

# Split the features and target variables into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, df)


metrics_data = []

# Pair every model in fp_num_models_loc with every model in fn_cat_cat_models_loc.
# model_ensembler_1() feeds `model_1` the frames without the categorical columns
# and `model_2` the full frames; both are read from globals.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
for model_1_name in os.listdir(fp_num_models_loc):
    with open(os.path.join(fp_num_models_loc, model_1_name), 'rb') as model_file:
        model_1 = pickle.load(model_file)
    print(model_1_name)

    for model_2_name in os.listdir(fn_cat_cat_models_loc):
        with open(os.path.join(fn_cat_cat_models_loc, model_2_name), 'rb') as model_file:
            model_2 = pickle.load(model_file)
        print('\t',model_2_name)

        model_ensembler_1(model_1_name, model_2_name, 'num_num_cat_cat')

# Write the Test 7 grid-search rows to disk.
pd.DataFrame(metrics_data, columns=metrics_data_columns).to_csv("../Desktop/test_7.csv", index=False, header=True)

balanced_random_forest_classifier_smoten_375.pkl
	 catboost_classifier_smoteenn_457.pkl
hist_gradient_boosting_classifier_smoten_376.pkl
	 catboost_classifier_smoteenn_457.pkl
gradient_boosting_classifier_smoten_820.pkl
	 catboost_classifier_smoteenn_457.pkl
easy_ensemble_classifier_smoten_949.pkl
	 catboost_classifier_smoteenn_457.pkl
lgbm_classifier_smoten_1215.pkl
	 catboost_classifier_smoteenn_457.pkl
ada_boost_smoten_949.pkl
	 catboost_classifier_smoteenn_457.pkl


## Test 8

In [22]:
# File location of the dataset
data_loc = "./Churn_Modelling.csv"

# Read the CSV file into a Pandas DataFrame, using the first column as the index
df = pd.read_csv(data_loc, index_col=0)

# Replace each categorical column with its integer category codes.
for categorical_col in ['Geography', 'Gender', 'Surname']:
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Drop CustomerId so it is not passed to the models as a feature.
df = df.drop(columns=['CustomerId'])

# Split the DataFrame into features and target variables.
x, y = features_target_split(df)

# Split the features and target variables into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, df)


metrics_data = []

# Pair every model in fn_num_models_loc with every model in fp_num_cat_models_loc.
# model_ensembler_1() feeds `model_1` the frames without the categorical columns
# and `model_2` the full frames; both are read from globals.
# NOTE(review): here model_1 comes from the fn_* directory and model_2 from the
# fp_* directory, so the 'precision_weight' column of this CSV holds the weight
# applied to the fn_* model -- confirm this is intended when comparing tests.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
for model_1_name in os.listdir(fn_num_models_loc):
    with open(os.path.join(fn_num_models_loc, model_1_name), 'rb') as model_file:
        model_1 = pickle.load(model_file)
    print(model_1_name)

    for model_2_name in os.listdir(fp_num_cat_models_loc):
        with open(os.path.join(fp_num_cat_models_loc, model_2_name), 'rb') as model_file:
            model_2 = pickle.load(model_file)
        print('\t',model_2_name)

        model_ensembler_1(model_1_name, model_2_name, 'num_cat_num_num')

# Write the Test 8 grid-search rows to disk.
pd.DataFrame(metrics_data, columns=metrics_data_columns).to_csv("../Desktop/test_8.csv", index=False, header=True)

ada_boost_borderline_smote_243.pkl
	 balanced_bagging_classifier_smoten_290.pkl
	 catboost_classifier_smoten_818.pkl
	 extra_trees_classifier_smoten_405.pkl
	 lgbm_classifier_smoten_1208.pkl
	 balanced_random_forest_classifier_smoten_821.pkl
	 gradient_boosting_classifier_smoten_1173.pkl
	 random_forest_classifier_smoten_466.pkl
	 easy_ensemble_classifier_smoten_844.pkl
	 ada_boost_smoten_950.pkl
gradient_boosting_classifier_adasyn_354.pkl
	 balanced_bagging_classifier_smoten_290.pkl
	 catboost_classifier_smoten_818.pkl
	 extra_trees_classifier_smoten_405.pkl
	 lgbm_classifier_smoten_1208.pkl
	 balanced_random_forest_classifier_smoten_821.pkl
	 gradient_boosting_classifier_smoten_1173.pkl
	 random_forest_classifier_smoten_466.pkl
	 easy_ensemble_classifier_smoten_844.pkl
	 ada_boost_smoten_950.pkl
hist_gradient_boosting_classifier_adasyn_354.pkl
	 balanced_bagging_classifier_smoten_290.pkl
	 catboost_classifier_smoten_818.pkl
	 extra_trees_classifier_smoten_405.pkl
	 lgbm_classifier_sm

## Test 9

In [23]:
# File location of the dataset
data_loc = "./Churn_Modelling.csv"

# Read the CSV file into a Pandas DataFrame, using the first column as the index
df = pd.read_csv(data_loc, index_col=0)

# Replace each categorical column with its integer category codes.
for categorical_col in ['Geography', 'Gender', 'Surname']:
    df[categorical_col] = df[categorical_col].astype('category').cat.codes

# Drop CustomerId so it is not passed to the models as a feature.
df = df.drop(columns=['CustomerId'])

# Split the DataFrame into features and target variables.
x, y = features_target_split(df)

# Split the features and target variables into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, df)


metrics_data = []

# Pair every model in fp_cat_cat_models_loc with every model in fn_num_cat_models_loc.
# The ensembler reads the loaded estimators from the globals `model_1` / `model_2`.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
for model_1_name in os.listdir(fp_cat_cat_models_loc):
    with open(os.path.join(fp_cat_cat_models_loc, model_1_name), 'rb') as model_file:
        model_1 = pickle.load(model_file)
    print(model_1_name)

    for model_2_name in os.listdir(fn_num_cat_models_loc):
        with open(os.path.join(fn_num_cat_models_loc, model_2_name), 'rb') as model_file:
            model_2 = pickle.load(model_file)
        print('\t',model_2_name)

        model_ensembler(model_1_name, model_2_name, 'cat_cat_num_cat')

# Write the Test 9 grid-search rows to disk.
pd.DataFrame(metrics_data, columns=metrics_data_columns).to_csv("../Desktop/test_9.csv", index=False, header=True)

catboost_classifier_smoten_1201.pkl
	 lgbm_classifier_borderline_smote_907.pkl
	 ada_boost_borderline_smote_375.pkl
	 balanced_random_forest_classifier_smote_254.pkl
	 gradient_boosting_classifier_adasyn_414.pkl
	 balanced_bagging_classifier_smotetomek_200.pkl
	 easy_ensemble_classifier_borderline_smote_375.pkl
catboost_classifier_smoten_375.pkl
	 lgbm_classifier_borderline_smote_907.pkl
	 ada_boost_borderline_smote_375.pkl
	 balanced_random_forest_classifier_smote_254.pkl
	 gradient_boosting_classifier_adasyn_414.pkl
	 balanced_bagging_classifier_smotetomek_200.pkl
	 easy_ensemble_classifier_borderline_smote_375.pkl
catboost_classifier_smoten_376.pkl
	 lgbm_classifier_borderline_smote_907.pkl
	 ada_boost_borderline_smote_375.pkl
	 balanced_random_forest_classifier_smote_254.pkl
	 gradient_boosting_classifier_adasyn_414.pkl
	 balanced_bagging_classifier_smotetomek_200.pkl
	 easy_ensemble_classifier_borderline_smote_375.pkl
catboost_classifier_smoten_285.pkl
	 lgbm_classifier_borderline