In [247]:
# numpy pandas matplotlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import for Decision Tree
from sklearn.tree import DecisionTreeClassifier 

# import for Random forest
from sklearn.ensemble import RandomForestClassifier

# import for KNN
from sklearn.neighbors import KNeighborsClassifier

# imports for Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB

from sklearn.model_selection import train_test_split 
from sklearn import metrics  
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.tree import export_graphviz
from IPython.display import Image  
import pydotplus
import os

In [248]:
# Class for printing with underline, colors and bold
class bcolors:
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [249]:
# create folders
directory = "Decision Tree"
parent_dir = "./"
path = os.path.join(parent_dir, directory)
if not os.path.exists(directory): 
    os.mkdir(path)
    
directory = "Best Params"
parent_dir = "./"
path = os.path.join(parent_dir, directory)
if not os.path.exists(directory): 
    os.mkdir(path)

In [250]:
# import csv file
df = pd.read_csv("./marketing_campaigns_train_after_pre_proc.csv", index_col=0)

In [251]:
df_classifier = df.copy()
l = ['age', 'account_balance', 'n_contact', 'p_days','l_call_duration','n_p_contact','p_days','status_cat','education_cat',
    'profession_cat', 'device_cat', 'month_l_date_cat', 'p_outcome_cat','age_bin','account_balance_bin',
    'education_Pre_Proc_cat','profession_Pre_Proc_cat','device_Pre_Proc_cat', 'p_outcome_Pre_Proc_cat']

df_classifier.drop(l, axis='columns', inplace=True)

In [252]:
df_classifier.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30517 entries, 512491 to 516748
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   loan                             30517 non-null  int64  
 1   mortgage                         30517 non-null  int64  
 2   credit                           30517 non-null  int64  
 3   positive                         30517 non-null  int64  
 4   isEmployed_cat                   30517 non-null  int64  
 5   age_min_max                      30517 non-null  float64
 6   n_p_contact_min_max              30517 non-null  float64
 7   p_days_min_max                   30517 non-null  float64
 8   n_contact_min_max                30517 non-null  float64
 9   account_balance_min_max          30517 non-null  float64
 10  l_call_duration_min_max          30517 non-null  float64
 11  status_cat_min_max               30517 non-null  float64
 12  education_Pr

In [253]:
# Split the data to features and target
X = df_classifier.drop('subscribed', axis=1)
y = df_classifier['subscribed']

In [254]:
# Split the data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) 

# <font color = 'red'> Decision Tree </font> 

In [269]:
# open the file Best_DT 
f_r = open("./Best Params/Best_DT.txt", "r")
Lines = f_r.read().splitlines()
f_r.close()
best_DT = []
for line in Lines:
    best_DT.append(line)

In [270]:
best_DT

['90.29', 'gini', '3', '2', '4']

### Find the hyperparameters 

#### Randomized Search

In [256]:
dt = DecisionTreeClassifier(splitter='best',random_state=42)

criterion= ['gini','entropy']

max_depth = [k for k in range(2,50)]

min_samples_split = [2,3,4,10,50,100,150,200,250,300,350,400]

min_samples_leaf = [2,3,4,10,50,100,150,200,250,300,350,400]

random_grid = {
               'criterion': criterion, 
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               }
dt_random = RandomizedSearchCV(estimator = dt, param_distributions = random_grid, n_iter = 200,cv = 10,
                               verbose = 2, random_state = 42, n_jobs = -1)
dt_random.fit(X_train, y_train)
parameters = dt_random.best_params_

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


#### Grid Search

In [257]:
dt = DecisionTreeClassifier(splitter='best',random_state=42)
criterion = [parameters['criterion']]

if parameters['max_depth'] > 2 :
    max_depth = [parameters['max_depth'] - 1 , parameters['max_depth'], parameters['max_depth'] + 2,parameters['max_depth']+3]
else:
    max_depth = [parameters['max_depth'], parameters['max_depth'] + 2,parameters['max_depth']+3]


min_samples_split = [parameters['min_samples_split'],parameters['min_samples_split']+1,parameters['min_samples_split']+2,
                    parameters['min_samples_split']+3]

min_samples_leaf = [parameters['min_samples_leaf'],parameters['min_samples_leaf']+1,parameters['min_samples_leaf']+2,
                   parameters['min_samples_leaf']+3]

param_grid = {
               'criterion': criterion, 
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               }
dt_grid_search = GridSearchCV(estimator = dt, param_grid = param_grid, cv = 10, n_jobs = -1, verbose = 2, scoring = 'accuracy')
dt_grid_search.fit(X_train, y_train)
parameters_dt = dt_grid_search.best_params_
parameters_dt

Fitting 10 folds for each of 64 candidates, totalling 640 fits


{'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 4,
 'min_samples_split': 2}

#### Save the best parameters

In [276]:
accuracy = round((dt_grid_search.best_score_)*100,2)
print(bcolors.OKCYAN + 'accuracy after that train:'+ bcolors.ENDC + " ", accuracy ,"%")
if accuracy > float(best_DT[0]):
    f_w = open("./Best Params/Best_DT.txt", "w")
    f_w.write(str(accuracy)+'\n')
    f_w.write(str(parameters_dt['criterion']) + '\n')
    f_w.write(str(parameters_dt['max_depth']) + '\n')
    f_w.write(str(parameters_dt['min_samples_split']) + '\n')
    f_w.write(str(parameters_dt['min_samples_leaf']))
    f_w.close()

else:
    accuracy = float(best_DT[0])
    parameters_dt['criterion'] = best_DT[1]
    parameters_dt['max_depth'] = int(best_DT[2])
    parameters_dt['min_samples_split'] = int(best_DT[3])
    parameters_dt['min_samples_leaf'] = int(best_DT[4])

    
dt = DecisionTreeClassifier(criterion=parameters_dt['criterion'], splitter='best', 
                          max_depth=parameters_dt['max_depth'], min_samples_split=parameters_dt['min_samples_split'],
                            min_samples_leaf=parameters_dt['min_samples_leaf'], random_state=42)
dt.fit(X_train, y_train)
print(dt)

[96maccuracy after that train:[0m  90.29 %
DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)


# <font color = 'red'> Random Forest </font> 

In [13]:
# open the file Best_RF 
f_r = open("./Best Params/Best_RF.txt", "r")
Lines = f_r.read().splitlines()
f_r.close()
best_RF = []
for line in Lines:
    best_RF.append(line)

#### Randomized Search

In [None]:
rf = RandomForestClassifier(random_state = 42)
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

max_features = ['auto', 'log2', 2, 5, 8]

max_depth = [int(x) for x in np.linspace(5, 50, num = 10)]
max_depth.append(None)

min_samples_split = [1, 2, 5, 10, 15, 20]

min_samples_leaf = [1, 2, 5, 10]

max_samples = [0.1, 0.3, 0.5, 0.7, 0.9]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_samples': max_samples}
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = 10,
                               verbose = 2, random_state = 42, n_jobs = -1)

rf_random.fit(X_train, y_train)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


#### Grid Search

In [None]:
parameters_RF = rf_random.best_params_

n_estimators = [parameters_RF['n_estimators']]

max_features = [parameters_RF['max_features']]

if parameters_RF['max_depth'] != None :
    max_depth = [parameters_RF['max_depth'], parameters_RF['max_depth'] + 1, parameters_RF['max_depth'] + 2]
else:
    max_depth = [None]    

min_samples_split = [parameters_RF['min_samples_split'] ,parameters_RF['min_samples_split']+1,
                     parameters_RF['min_samples_split']+2]

min_samples_leaf = [parameters_RF['min_samples_leaf'],parameters_RF['min_samples_leaf']+1,
                    parameters_RF['min_samples_leaf']+2]

max_samples = [parameters_RF['max_samples'],parameters_RF['max_samples']+0.1,parameters_RF['max_samples']+0.2]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'max_samples': max_samples}
grid_search_RF = GridSearchCV(estimator = rf, param_grid = random_grid , cv = 10, n_jobs = -1, verbose = 2,scoring='accuracy')
grid_search_RF.fit(X_train, y_train)
grid_search_RF.best_params_

#### Save the best parameters

In [None]:
parameters_RF = grid_search_RF.best_params_
rf_accuracy = grid_search_RF.best_score_
accuracy = round(rf_accuracy*100,2)
print(bcolors.OKCYAN + 'accuracy after that train:'+ bcolors.ENDC + " ", accuracy ,"%")
if accuracy > float(best_RF[0]):
    f_w = open("./Best Params/Best_RF.txt", "w")
    f_w.write(str(accuracy)+'\n')
    f_w.write(str(parameters_RF['max_depth']) + '\n')
    f_w.write(str(parameters_RF['max_features']) + '\n')
    f_w.write(str(parameters_RF['max_samples']) + '\n')
    f_w.write(str(parameters_RF['min_samples_leaf']))
    f_w.write(str(parameters_RF['min_samples_split']))
    f_w.write(str(parameters_RF['n_estimators']))
    f_w.close()

else:
    accuracy = float(best_RF[0])
    parameters_RF['max_depth'] = best_RF[1]
    parameters_RF['max_features'] = int(best_RF[2])
    parameters_RF['max_samples'] = int(best_RF[3])
    parameters_RF['min_samples_leaf'] = int(best_RF[4])
    parameters_RF['min_samples_split'] = int(best_RF[5])
    parameters_RF['n_estimators'] = int(best_RF[6])

    
rf = RandomForestClassifier(max_depth = parameters_RF['max_depth'],max_features = parameters_RF['max_features'],
                            max_samples=parameters_RF['max_samples'] ,min_samples_leaf= parameters_RF['min_samples_leaf'],
                            min_samples_split= parameters_RF['min_samples_split'],n_estimators= parameters_RF['n_estimators'],
                            random_state=42)

rf.fit(X_train, y_train)

# <font color = 'red'> SVM </font>

In [None]:
# open the file Best_SVM 
f_r = open("./Best Params/Best_SVM.txt", "r")
Lines = f_r.read().splitlines()
f_r.close()
best_SVM = []
for line in Lines:
    best_SVM.append(line)

### Find the hyperparameters

#### Randomized Search

In [None]:
SVM = svm.SVC(probability=True)

random_grid = {'C': [0.1, 1, 10, 100, 1000],
               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
               'kernel': ['rbf', 'poly', 'sigmoid'],
               'degree':[k for k in range(2,10)]
              }

SVM_random = RandomizedSearchCV(estimator = SVM, param_distributions = random_grid, n_iter = 200,cv = 10,
                               verbose = 2, random_state = 42, n_jobs = -1)
SVM_random.fit(X_train, y_train)
parameters = SVM_random.best_params_

#### Grid Search

In [None]:
clf_svm_GS = svm.SVC(probability=True)

param_grid = {'C': [parameters['C']/10, parameters['C']*10, parameters['C']],
              'gamma': [parameters['gamma'], parameters['gamma']/10, parameters['gamma']*10],
              'kernel': [parameters['kernel']],
              'degree':[parameters['degree'], parameters['degree'] + 1, parameters['degree'] - 1]
             }
svm_grid_search = GridSearchCV(estimator = clf_svm_GS, param_grid = param_grid, cv = 10, n_jobs = -1,
                               verbose = 2, scoring = 'accuracy')
svm_grid_search.fit(X_train, y_train)
parameters_SVM = svm_grid_search.best_params_
accuracy = round(svm_grid_search.best_score_*100,2)

#### Save the best parameters

In [None]:
print(bcolors.OKCYAN + 'accuracy after that train:'+ bcolors.ENDC + " ", accuracy ,"%")
if accuracy > float(best_SVM[0]):
    f_w = open("./Best Params/Best_SVM.txt", "w")
    f_w.write(str(accuracy)+'\n')
    f_w.write(str(parameters_SVM['C']) + '\n')
    f_w.write(str(parameters_SVM['gamma']) + '\n')
    f_w.write(str(parameters_SVM['kernel']) + '\n')
    f_w.write(str(parameters_SVM['degree']) + '\n')
    f_w.close()

else:
    accuracy = float(Best_SVM[0])
    parameters_SVM['C'] = int(Best_SVM[1])
    parameters_SVM['gamma'] = int(Best_SVM[2])
    parameters_SVM['kernel'] = str(Best_SVM[3])
    parameters_SVM['degree'] = int(Best_SVM[4])
    
SVM = svm.SVC(kernel=parameters_SVM['kernel'], C=parameters_SVM['C'], gamma = parameters_SVM['gamma'],
                       degree = parameters_SVM['degree'] probability=True)

SVM.fit(X_train, y_train)

# <font color = 'red'> K-Nearest Neighbors </font> 

In [259]:
# open the file Best_KNN 
f_r = open("./Best Params/Best_KNN.txt", "r")
Lines = f_r.read().splitlines()
f_r.close()
best_KNN = []
for line in Lines:
    best_KNN.append(line)

### Find the hyperparameters

#### Randomized Search

In [260]:
KNN = KNeighborsClassifier()
n_neighbors = [i for i in range(5,40)]
random_grid = {'n_neighbors' : n_neighbors}
KNN_random = RandomizedSearchCV(estimator = KNN, param_distributions = random_grid, n_iter = 200,cv = 10,
                               verbose = 2, random_state = 42, n_jobs = -1)
KNN_random.fit(X_train, y_train)
parameters = KNN_random.best_params_

Fitting 10 folds for each of 35 candidates, totalling 350 fits




#### Grid Search

In [261]:
n_neighbors = [parameters['n_neighbors'] - 1 ,parameters['n_neighbors'],parameters['n_neighbors'] + 1]
param_grid = {'n_neighbors' : n_neighbors}
KNN_grid_search = GridSearchCV(estimator = KNN, param_grid = param_grid, cv = 10, n_jobs = -1, verbose = 2, scoring = 'accuracy')
KNN_grid_search.fit(X_train, y_train)
parameters_KNN = KNN_grid_search.best_params_
accuracy = round(KNN_grid_search.best_score_*100,2)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


#### Save the best parameters

In [262]:
print('accuracy after that train =' + ' '+ str(accuracy)+"%")
if accuracy > float(best_KNN[0]):
    f_w = open("./Best Params/Best_KNN.txt", "w")
    f_w.write(str(accuracy)+'\n')
    f_w.write(str(parameters_KNN['n_neighbors']) + '\n')
    f_w.close()

else:
    accuracy = float(Best_KNN[0])
    parameters_KNN['n_neighbors'] = int(Best_KNN[1])
    
KNN = KNeighborsClassifier(n_neighbors = parameters_KNN['n_neighbors'])


KNN.fit(X_train, y_train)

accuracy after that train = 89.54%


KNeighborsClassifier(n_neighbors=23)

# <font color = 'red'> Naive Bayes classifier </font> 

In [263]:
GNB = GaussianNB()
MNB = MultinomialNB()
COPNB = ComplementNB()
BNB = BernoulliNB()
CNB = CategoricalNB()

GNB.fit(X_train, y_train)
MNB.fit(X_train, y_train)
CNB.fit(X_train, y_train)
BNB.fit(X_train, y_train)
COPNB.fit(X_train, y_train)

ComplementNB()

# <font color = 'red'> Find the best classifier </font> 

### Comparison between best classifiers:

In [265]:
# K Fold
clf_list = [dt,KNN,GNB,MNB,CNB,BNB,COPNB]
cm_dict, auc_dict = {}, {}
X_train_npy = X_train.to_numpy()
y_train_npy = y_train.to_numpy()
kf = KFold(n_splits=10, shuffle=True, random_state=42) #for cross validation
k = 0
for train_index, test_index in kf.split(X_train):
    k+=1
    print(bcolors.HEADER + bcolors.BOLD + '_____________________________________',k,'_____________________________________'
          + bcolors.ENDC)
    print("")
    # split to train and test
    X_train_KF, X_test_KF = X_train_npy[train_index], X_train_npy[test_index]
    y_train_KF, y_test_KF = y_train_npy[train_index], y_train_npy[test_index]
    
    for clf in clf_list:
        # train the model and make a prediction
        clf.fit(X_train_KF,y_train_KF)
        y_pred = clf.predict(X_test_KF)
        cm = metrics.confusion_matrix(y_test_KF, y_pred)
        
        y_probs = clf.predict_proba(X_test_KF) # probability prediction
        fpr, tpr, _ = metrics.roc_curve(y_test_KF, y_probs[:,1])
        auc = metrics.auc(fpr, tpr)
        
        # save the results
        cm_list = cm_dict.get(clf,[])
        cm_list.append(cm)
        cm_dict[clf] = cm_list
        
        auc_list = auc_dict.get(clf,[])
        auc_list.append(auc)
        auc_dict[clf] = auc_list
        
        # print the results
        print(bcolors.UNDERLINE + bcolors.OKCYAN + "Classifier:" + bcolors.ENDC + " "  , clf)
        print(bcolors.UNDERLINE + bcolors.OKCYAN + "Accuracy:" + bcolors.ENDC + " " ,
              round((metrics.accuracy_score(y_test_KF, y_pred))*100,2),"%")
        print(bcolors.UNDERLINE + bcolors.OKCYAN + "Confusion matrix:" + bcolors.ENDC)
        display(pd.DataFrame(cm))
        print("")

[95m[1m_____________________________________ 1 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  90.39 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1985,42
1,178,84



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  88.82 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1987,40
1,216,46



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  82.7 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1763,264
1,132,130



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  88.51 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2025,2
1,261,1



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  88.03 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1967,60
1,214,48



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  87.68 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1968,59
1,223,39



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  64.57 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1295,732
1,79,183



[95m[1m_____________________________________ 2 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  90.04 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1967,42
1,186,94



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  88.95 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1984,25
1,228,52



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  83.92 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1772,237
1,131,149



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  87.81 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2009,0
1,279,1



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  88.16 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1960,49
1,222,58



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  86.59 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1940,69
1,238,42



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  66.06 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1311,698
1,79,201



[95m[1m_____________________________________ 3 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  90.87 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1988,31
1,178,92



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  89.52 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1996,23
1,217,53



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  85.54 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1814,205
1,126,144



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  88.34 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2019,0
1,267,3



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  89.21 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1982,37
1,210,60



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  88.34 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1977,42
1,225,45



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  65.57 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1307,712
1,76,194



[95m[1m_____________________________________ 4 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  89.82 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1972,53
1,180,84



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  89.17 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2000,25
1,223,41



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  83.92 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1789,236
1,132,132



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  88.47 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2023,2
1,262,2



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  88.51 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1972,53
1,210,54



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  87.55 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1964,61
1,224,40



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  64.74 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1288,737
1,70,194



[95m[1m_____________________________________ 5 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  89.47 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1964,54
1,187,84



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  89.03 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1988,30
1,221,50



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  82.48 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1744,274
1,127,144



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  88.16 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2018,0
1,271,0



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  88.16 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1958,60
1,211,60



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  86.54 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1934,84
1,224,47



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  65.09 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1307,711
1,88,183



[95m[1m_____________________________________ 6 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  90.56 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1983,43
1,173,90



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  89.86 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2000,26
1,206,57



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  84.49 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1791,235
1,120,143



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  88.51 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2026,0
1,263,0



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  88.86 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1971,55
1,200,63



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  86.5 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1944,82
1,227,36



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  65.22 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1294,732
1,64,199



[95m[1m_____________________________________ 7 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  91.26 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2018,47
1,153,71



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  91.35 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2039,26
1,172,52



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  85.06 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1823,242
1,100,124



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  90.21 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2064,1
1,223,1



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  90.65 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2017,48
1,166,58



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  89.82 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2010,55
1,178,46



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  63.43 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1284,781
1,56,168



[95m[1m_____________________________________ 8 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  90.25 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1986,36
1,187,79



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  89.55 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1998,24
1,215,51



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  85.27 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1804,218
1,119,147



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  88.42 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2022,0
1,265,1



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  88.77 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1972,50
1,207,59



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  87.81 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1960,62
1,217,49



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  65.95 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1307,715
1,64,202



[95m[1m_____________________________________ 9 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  90.47 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1979,47
1,171,91



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  89.47 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1995,31
1,210,52



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  83.26 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1755,271
1,112,150



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  88.55 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2026,0
1,262,0



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  88.37 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1967,59
1,207,55



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  86.67 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1947,79
1,226,36



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  66.22 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1314,712
1,61,201



[95m[1m_____________________________________ 10 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mAccuracy:[0m  89.73 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1979,53
1,182,74



[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mAccuracy:[0m  89.82 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2008,24
1,209,47



[4m[96mClassifier:[0m  GaussianNB()
[4m[96mAccuracy:[0m  84.35 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1804,228
1,130,126



[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mAccuracy:[0m  88.77 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,2031,1
1,256,0



[4m[96mClassifier:[0m  CategoricalNB()
[4m[96mAccuracy:[0m  88.77 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1983,49
1,208,48



[4m[96mClassifier:[0m  BernoulliNB()
[4m[96mAccuracy:[0m  87.89 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1974,58
1,219,37



[4m[96mClassifier:[0m  ComplementNB()
[4m[96mAccuracy:[0m  63.85 %
[4m[96mConfusion matrix:[0m


Unnamed: 0,0,1
0,1292,740
1,87,169





### Print average results:

In [267]:
acc_dict = {}
prec_dict = {}
k = 0
Best_clf = 0
Best_acc = 0
for clf in clf_list:
    k+=1
    acc = [(cm[0][0]+cm[1][1])/sum(sum(cm)) for cm in cm_dict[clf]]
    acc_dict[clf] = acc
    m_acc = np.mean(acc)
    precision = [(cm[1][1])/(cm[1][1]+cm[0][1]) for cm in cm_dict[clf]]
    prec_dict[clf] = precision
    m_p = np.mean(precision)
    m_auc = np.mean(auc_dict[clf])
    
    # Get the best clf
    if m_acc > Best_acc:
        Best_acc = m_acc
        Best_clf = clf

    # Print the result
    print(bcolors.HEADER + bcolors.BOLD + '_____________________________________' + " Classifier:",k,
          '_____________________________________' + bcolors.ENDC)
    print("")
    print(bcolors.UNDERLINE + bcolors.OKCYAN + "Classifier:" + bcolors.ENDC + " "  , clf)
    print(bcolors.UNDERLINE + bcolors.OKCYAN + "mean accuracy:" + bcolors.ENDC + " " , round(m_acc*100,2))
    print(bcolors.UNDERLINE + bcolors.OKCYAN + "mean precision:" + bcolors.ENDC + " " , round(m_p*100,2))
    print(bcolors.UNDERLINE + bcolors.OKCYAN + "mean AUC:" + bcolors.ENDC + " " , round(m_auc*100,2))

[95m[1m_____________________________________ Classifier: 1 _____________________________________[0m

[4m[96mClassifier:[0m  DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)
[4m[96mmean accuracy:[0m  90.29
[4m[96mmean precision:[0m  65.35
[4m[96mmean AUC:[0m  78.49
[95m[1m_____________________________________ Classifier: 2 _____________________________________[0m

[4m[96mClassifier:[0m  KNeighborsClassifier(n_neighbors=23)
[4m[96mmean accuracy:[0m  89.55
[4m[96mmean precision:[0m  64.76
[4m[96mmean AUC:[0m  82.4
[95m[1m_____________________________________ Classifier: 3 _____________________________________[0m

[4m[96mClassifier:[0m  GaussianNB()
[4m[96mmean accuracy:[0m  84.1
[4m[96mmean precision:[0m  36.64
[4m[96mmean AUC:[0m  80.59
[95m[1m_____________________________________ Classifier: 4 _____________________________________[0m

[4m[96mClassifier:[0m  MultinomialNB()
[4m[96mmean accuracy:[0m  88.57
[4m

  precision = [(cm[1][1])/(cm[1][1]+cm[0][1]) for cm in cm_dict[clf]]


### Print the best classifier we found: 

In [268]:
print(Best_clf)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=4, random_state=42)


# Test the best classifier

In [244]:
y_pred = Best_clf.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred)
print(bcolors.OKCYAN + 'confusion matrix:'+ bcolors.ENDC)
cm = pd.DataFrame(metrics.confusion_matrix(y_test, dt_y_pred))
display(pd.DataFrame(cm))
print(bcolors.OKCYAN + 'accuracy:'+bcolors.ENDC + " "
      ,round(((cm[0][0]+cm[1][1])/(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1]))*100,2))
print(bcolors.OKCYAN + 'sensitivity:'+bcolors.ENDC + " "
      ,round(((cm[1][1])/(cm[1][1]+cm[0][1]))*100,2))
print(bcolors.OKCYAN + 'specificity:'+bcolors.ENDC + " "
      ,round(((cm[0][0])/(cm[0][0]+cm[1][0]))*100,2))
print(bcolors.OKCYAN + 'precision:'+bcolors.ENDC + " "
      ,round(((cm[1][1])/(cm[0][0]+cm[1][0]))*100,2))

[96mconfusion matrix:[0m


Unnamed: 0,0,1
0,6337,429
1,811,53


[96maccuracy:[0m  83.75
[96msensitivity:[0m  6.13
[96mspecificity:[0m  93.66
[96mprecision:[0m  0.78


# <font color = 'red'> Classifier the test file </font> 

In [291]:
# import test file
df_test = pd.read_csv("./csv files/marketing_campaigns_test_after_pre_proc.csv", index_col=0)

In [292]:
df_classifier_test = df_test.copy()
l = ['age', 'account_balance', 'n_contact', 'p_days','l_call_duration','n_p_contact','p_days','status_cat','education_cat',
    'profession_cat', 'device_cat', 'month_l_date_cat', 'p_outcome_cat','age_bin','account_balance_bin',
    'education_Pre_Proc_cat','profession_Pre_Proc_cat','device_Pre_Proc_cat', 'p_outcome_Pre_Proc_cat']

df_classifier_test.drop(l, axis='columns', inplace=True)

In [293]:
df_classifier_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14694 entries, 512496 to 487062
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   loan                             14694 non-null  int64  
 1   mortgage                         14694 non-null  int64  
 2   credit                           14694 non-null  int64  
 3   positive                         14694 non-null  int64  
 4   isEmployed_cat                   14694 non-null  int64  
 5   age_min_max                      14694 non-null  float64
 6   n_p_contact_min_max              14694 non-null  float64
 7   p_days_min_max                   14694 non-null  float64
 8   n_contact_min_max                14694 non-null  float64
 9   account_balance_min_max          14694 non-null  float64
 10  l_call_duration_min_max          14694 non-null  float64
 11  status_cat_min_max               14694 non-null  float64
 12  education_Pr