In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectPercentile
from sklearn.dummy import DummyClassifier

In [3]:
df = pd.read_csv('data_20319681.csv')
#Getting rid of the column Unnamed: 0
df1=df.loc[:,'age':'y']
df1.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,duration,campaign,pdays,previous,poutcome,y
0,35,management,single,tertiary,no,704,no,no,cellular,21,164,1,-1,0,unknown,yes
1,44,entrepreneur,married,tertiary,no,121,no,no,cellular,9,248,1,91,1,success,yes
2,63,management,married,tertiary,no,3115,no,no,cellular,16,432,5,-1,0,unknown,yes
3,35,blue-collar,married,secondary,no,149,yes,no,cellular,29,507,3,349,1,other,yes
4,30,services,single,secondary,no,140,yes,no,cellular,15,760,1,-1,0,unknown,yes


In [4]:
#Converting the target value to numeric
df1.loc[df1['y']=='yes','y']=1
df1.loc[df1['y']=='no','y']=0

In [5]:
df1.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,duration,campaign,pdays,previous,poutcome,y
0,35,management,single,tertiary,no,704,no,no,cellular,21,164,1,-1,0,unknown,1
1,44,entrepreneur,married,tertiary,no,121,no,no,cellular,9,248,1,91,1,success,1
2,63,management,married,tertiary,no,3115,no,no,cellular,16,432,5,-1,0,unknown,1
3,35,blue-collar,married,secondary,no,149,yes,no,cellular,29,507,3,349,1,other,1
4,30,services,single,secondary,no,140,yes,no,cellular,15,760,1,-1,0,unknown,1


In [6]:
#Removing default feature
df2 = df1.drop(columns='duration')
df2 = df2.drop(columns='default')
#df2 = df2.drop(columns='previous')

In [7]:
df2[df2['balance']>28000]

Unnamed: 0,age,job,marital,education,balance,housing,loan,contact,day,campaign,pdays,previous,poutcome,y
288,61,retired,married,tertiary,32685,no,no,cellular,2,2,-1,0,unknown,1
4350,50,services,married,secondary,57435,yes,no,cellular,21,3,-1,0,unknown,0
4830,31,management,single,tertiary,38279,no,no,cellular,16,2,-1,0,unknown,0


In [8]:
#Removing these outliers from balance
df2 = df2.drop([288,4350,4830])

In [9]:
len(df2)

5997

In [10]:
#Changing our categorical variables to dummy variables
df2_dummies = pd.get_dummies(df2)
print('Features after get_dummies: ',list(df2_dummies.columns))
print(len(list(df2_dummies.columns)))

Features after get_dummies:  ['age', 'balance', 'day', 'campaign', 'pdays', 'previous', 'y', 'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married', 'marital_single', 'education_primary', 'education_secondary', 'education_tertiary', 'education_unknown', 'housing_no', 'housing_yes', 'loan_no', 'loan_yes', 'contact_cellular', 'contact_telephone', 'contact_unknown', 'poutcome_failure', 'poutcome_other', 'poutcome_success', 'poutcome_unknown']
37


In [11]:
#split my data into data and target
target = df2_dummies['y']
data = df2_dummies.drop(columns='y')

In [12]:
#Extract Numpy arrays
X = data
y = target

In [13]:
#Print the shape of our data and target
print(X.shape)
print(y.shape)

(5997, 36)
(5997,)


In [14]:
#Divide our data into training set and testing set
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)

In [76]:
#Using function to select only some features.
#We set the percentile of how many features we want to keep
select = SelectPercentile(percentile=30)
select.fit(X_train,y_train)
#get_support can show you which features have been chosen and which have been not
#select.get_support()
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

#standardize our training data and testing data
scaler = StandardScaler()
scaler.fit(X_train_selected)
X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

In [28]:
#Using a model basedline 
dummy_clf = DummyClassifier(strategy="stratified")
#Train and test and display a confusion matrix
dummy = dummy_clf.fit(X_train_scaled,y_train)
pred = dummy.predict(X_test_scaled)
print('Confusion matrix ')
print(confusion_matrix(y_test,pred))
print()
#print a classification report that includes precision,recall and F-1
print('Logistic model classification report')
print(classification_report(y_test,pred))

Confusion matrix 
[[934 260]
 [241  65]]

Logistic model classification report
              precision    recall  f1-score   support

           0       0.79      0.78      0.79      1194
           1       0.20      0.21      0.21       306

    accuracy                           0.67      1500
   macro avg       0.50      0.50      0.50      1500
weighted avg       0.67      0.67      0.67      1500



In [31]:
#Using stratified k-fold cross validation
kfold = KFold(n_splits=10)

In [32]:
#Using our stratified cross validation on our three models and displaying the averages scores
#First we start with our logistic regression
# We will use score(which uses accuracy) and we will deploy other evaluation metrics such as confusion matrices 
#and performances metrics such as precision, recall and F-1

logreg = LogisticRegression(C=200)
scores = cross_val_score(logreg,X_train_scaled,y_train,cv=kfold)
print('Scores from the cross validation ', scores)
print('Average cross validation score on logistic regression is ',scores.mean())

#Train and test and display a confusion matrix
logreg1 = logreg.fit(X_train_scaled,y_train)
pred = logreg1.predict(X_test_scaled)
print('Score on test set: ', logreg1.score(X_test_scaled,y_test))
print('Confusion matrix ')
print(confusion_matrix(y_test,pred))
print()
#print a classification report that includes precision,recall and F-1
print('Logistic model classification report')
print(classification_report(y_test,pred))

Scores from the cross validation  [0.82888889 0.83777778 0.80666667 0.79111111 0.82       0.81777778
 0.81555556 0.82405345 0.83518931 0.81514477]
Average cross validation score on logistic regression is  0.8192165305617423
Score on test set:  0.8166666666666667
Confusion matrix 
[[1177   17]
 [ 258   48]]

Logistic model classification report
              precision    recall  f1-score   support

           0       0.82      0.99      0.90      1194
           1       0.74      0.16      0.26       306

    accuracy                           0.82      1500
   macro avg       0.78      0.57      0.58      1500
weighted avg       0.80      0.82      0.77      1500



In [85]:
#Decision tree
#Applying cross validation
tree = DecisionTreeClassifier(max_depth=7, random_state=0)
scores = cross_val_score(tree,X_train_scaled,y_train,cv=kfold)
print('Scores from the decision tree ', scores)
print('Average validation score on decision tree is ',scores.mean())
#Training on the training set and testing on the testing set
#Display confusion matrix and classification report
print('Training and testing ')
tree1 = tree.fit(X_train_scaled,y_train)
pred_tree = tree1.predict(X_test_scaled)
print('Score on test set: ',tree1.score(X_test_scaled,y_test))
print('Confusion matrix ')
print(confusion_matrix(y_test,pred_tree))
print()
print('Classification report ')
print(classification_report(y_test,pred_tree))

Scores from the decision tree  [0.82222222 0.81555556 0.80444444 0.77777778 0.81777778 0.80888889
 0.82       0.81959911 0.81959911 0.81514477]
Average validation score on decision tree is  0.8121009651076466
Training and testing 
Score on test set:  0.818
Confusion matrix 
[[1180   14]
 [ 259   47]]

Classification report 
              precision    recall  f1-score   support

           0       0.82      0.99      0.90      1194
           1       0.77      0.15      0.26       306

    accuracy                           0.82      1500
   macro avg       0.80      0.57      0.58      1500
weighted avg       0.81      0.82      0.77      1500



In [69]:
#Random forest
forest = RandomForestClassifier(n_estimators = 40,random_state =2)
scores = cross_val_score(forest,X_train_scaled,y_train,cv=kfold)
print('Scores from the decision tree ', scores)
print('Average cross validation score on random forest is ',scores.mean())
#Training on the training set and testing on the testing set
#Display confusion matrix and classification report
print('Training and testing ')
forest1 = forest.fit(X_train_scaled,y_train)
pred_forest = forest1.predict(X_test_scaled)
print('Score on test set: ',forest1.score(X_test_scaled,y_test))
print('Confusion matrix ')
print(confusion_matrix(y_test,pred_forest))
print()
print('Classification report ')
print(classification_report(y_test,pred_forest))

Scores from the decision tree  [0.81333333 0.81555556 0.79555556 0.78666667 0.82888889 0.81111111
 0.80888889 0.81737194 0.83296214 0.79287305]
Average cross validation score on random forest is  0.8103207126948775
Training and testing 
Score on test set:  0.814
Confusion matrix 
[[1163   31]
 [ 248   58]]

Classification report 
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1194
           1       0.65      0.19      0.29       306

    accuracy                           0.81      1500
   macro avg       0.74      0.58      0.59      1500
weighted avg       0.79      0.81      0.77      1500



# Function to automate the process

In [74]:
#Automating the training process for checking best parameters for logistic regression
def tuning_best_model_param(filename):
    df = pd.read_csv(filename)
    #Getting rid of the column Unnamed: 0
    df1=df.loc[:,'age':'y']
    #Converting the target value to numeric
    df1.loc[df1['y']=='yes','y']=1
    df1.loc[df1['y']=='no','y']=0
    #Removing default feature
    df2 = df1.drop(columns='duration')
    df2 = df2.drop(columns='default')
    #Drop the columns with balance over 28000
    df2 = df2[df2['balance']<28000]
    #Apply get dummies on categorical variable
    #print('Features before get_dummies: ')
    #print(list(df2.columns))
    df2_dummies = pd.get_dummies(df2)
    #print('Features after get_dummies: ')
    #print(list(df2_dummies.columns))
    #print(len(list(df2_dummies.columns)))
    #split my data into data and target
    target = df2_dummies['y']
    data = df2_dummies.drop(columns='y')
    #Extract Numpy arrays
    X = data.values
    y = target.values
    #Print the shape of our data and target
    #print(X.shape)
    #print(y.shape)
    #Divide our data into training set and testing set
    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)
    #Trying many parameters
    percent = [100,80,70,60,50,40,30]
    param = [0.01,0.05,0.1,0.5,1,10,50,100,200]
    max_precision = 0.69
    min_FP = 25
    max_TP = 40
    #best_cross_val = 0.82
    best_percent = 50
    best_param = 100
    #best_score_test = 0.82
    for i in percent:
        for j in param:
            #Using select percentile
            select = SelectPercentile(percentile=i)
            select.fit(X_train,y_train)
            X_train_selected = select.transform(X_train)
            X_test_selected = select.transform(X_test)
            #standardize our training data and testing data
            scaler = StandardScaler()
            scaler.fit(X_train_selected)
            X_train_scaled = scaler.transform(X_train_selected)
            X_test_scaled = scaler.transform(X_test_selected)
    
            kfold = KFold(n_splits=10) 
            logreg = LogisticRegression(C=j)
            scores = cross_val_score(logreg,X_train_scaled,y_train,cv=kfold)
            
            logreg1 = logreg.fit(X_train_scaled,y_train)
            pred = logreg1.predict(X_test_scaled)
            score_test = logreg1.score(X_test_scaled,y_test)
            FP = confusion_matrix(y_test,pred)[0][1]
            TP = confusion_matrix(y_test,pred)[1][1]
            precision = float(classification_report(y_test,pred)[128:132])
            if  FP <= min_FP :
                min_FP = confusion_matrix(y_test,pred)[0][1]
                max_TP = confusion_matrix(y_test,pred)[1][1]
                #best_cross_val = scores.mean()
                #best_score_test = score_test
                max_precision = precision
                best_percent = i
                best_param = j
    print('Min FP is : ',min_FP)
    print('Max TP is : ',max_TP)
    print('Max precision is ', max_precision)
    print("Percent is ",best_percent)
    print('Param is ',best_param)
    

In [75]:
tuning_best_model_param('data_20319681.csv')

Min FP is :  17
Max TP is :  48
Max precision is  0.74
Percent is  30
Param is  200


In [72]:
#Automating the training process for decision tree
def tuning_best_model_param_dt(filename):
    df = pd.read_csv(filename)
    #Getting rid of the column Unnamed: 0
    df1=df.loc[:,'age':'y']
    #Converting the target value to numeric
    df1.loc[df1['y']=='yes','y']=1
    df1.loc[df1['y']=='no','y']=0
    #Removing default feature
    df2 = df1.drop(columns='duration')
    df2 = df2.drop(columns='default')
    #Drop the columns with balance over 28000
    df2 = df2[df2['balance']<28000]
    #Apply get dummies on categorical variable
    #print('Features before get_dummies: ')
    #print(list(df2.columns))
    df2_dummies = pd.get_dummies(df2)
    #print('Features after get_dummies: ')
    #print(list(df2_dummies.columns))
    #print(len(list(df2_dummies.columns)))
    #split my data into data and target
    target = df2_dummies['y']
    data = df2_dummies.drop(columns='y')
    #Extract Numpy arrays
    X = data.values
    y = target.values
    #Print the shape of our data and target
    #print(X.shape)
    #print(y.shape)
    #Divide our data into training set and testing set
    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)
    #Trying many parameters
    percent = [80,70,60,50,40,30]
    param = [2,3,4,5,6,7,8]
    max_precision = 0.69
    min_FP = 25
    max_TP = 40
    #best_cross_val = 0.82
    best_percent = 80
    best_param = 4
    #best_score_test = 0.82
    for i in percent:
        for j in param:
            #Using select percentile
            select = SelectPercentile(percentile=i)
            select.fit(X_train,y_train)
            X_train_selected = select.transform(X_train)
            X_test_selected = select.transform(X_test)
            #standardize our training data and testing data
            scaler = StandardScaler()
            scaler.fit(X_train_selected)
            X_train_scaled = scaler.transform(X_train_selected)
            X_test_scaled = scaler.transform(X_test_selected)
    
            kfold = KFold(n_splits=10) 
            tree = DecisionTreeClassifier(max_depth=j, random_state=0)
            scores = cross_val_score(tree,X_train_scaled,y_train,cv=kfold)
            
            tree1 = tree.fit(X_train_scaled,y_train)
            pred = tree1.predict(X_test_scaled)
            score_test = tree1.score(X_test_scaled,y_test)
            FP = confusion_matrix(y_test,pred)[0][1]
            TP = confusion_matrix(y_test,pred)[1][1]
            precision = float(classification_report(y_test,pred)[128:132])
            if  FP <= min_FP :
                min_FP = confusion_matrix(y_test,pred)[0][1]
                max_TP = confusion_matrix(y_test,pred)[1][1]
                #best_cross_val = scores.mean()
                #best_score_test = score_test
                max_precision = precision
                best_percent = i
                best_param = j
    print('Min FP is : ',min_FP)
    print('Max TP is : ',max_TP)
    print('Max precision is ', max_precision)
    print("Percent is ",best_percent)
    print('Param is ',best_param)

In [73]:
tuning_best_model_param_dt('data_20319681.csv')

Min FP is :  14
Max TP is :  47
Max precision is  0.77
Percent is  30
Param is  7


In [66]:
#Automating the training process for decision tree
def tuning_best_model_param_rf(filename):
    df = pd.read_csv(filename)
    #Getting rid of the column Unnamed: 0
    df1=df.loc[:,'age':'y']
    #Converting the target value to numeric
    df1.loc[df1['y']=='yes','y']=1
    df1.loc[df1['y']=='no','y']=0
    #Removing default feature
    df2 = df1.drop(columns='duration')
    df2 = df2.drop(columns='default')
    #Drop the columns with balance over 28000
    df2 = df2[df2['balance']<28000]
    #Apply get dummies on categorical variable
    #print('Features before get_dummies: ')
    #print(list(df2.columns))
    df2_dummies = pd.get_dummies(df2)
    #print('Features after get_dummies: ')
    #print(list(df2_dummies.columns))
    #print(len(list(df2_dummies.columns)))
    #split my data into data and target
    target = df2_dummies['y']
    data = df2_dummies.drop(columns='y')
    #Extract Numpy arrays
    X = data.values
    y = target.values
    #Print the shape of our data and target
    #print(X.shape)
    #print(y.shape)
    #Divide our data into training set and testing set
    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)
    #Trying many parameters
    percent = [100,80,70,60,50,40,30]
    param = [10,20,40,50,60,100,150,200]
    max_precision = 0.69
    min_FP = 40
    max_TP = 30
    #best_cross_val = 0.82
    best_percent = 100
    best_param = 10
    #best_score_test = 0.82
    for i in percent:
        for j in param:
            #Using select percentile
            select = SelectPercentile(percentile=i)
            select.fit(X_train,y_train)
            X_train_selected = select.transform(X_train)
            X_test_selected = select.transform(X_test)
            #standardize our training data and testing data
            scaler = StandardScaler()
            scaler.fit(X_train_selected)
            X_train_scaled = scaler.transform(X_train_selected)
            X_test_scaled = scaler.transform(X_test_selected)
    
            kfold = KFold(n_splits=10) 
            forest = RandomForestClassifier(n_estimators = j,random_state =2)
            scores = cross_val_score(forest,X_train_scaled,y_train,cv=kfold)
            
            forest1 = forest.fit(X_train_scaled,y_train)
            pred = forest1.predict(X_test_scaled)
            score_test = forest1.score(X_test_scaled,y_test)
            FP = confusion_matrix(y_test,pred)[0][1]
            TP = confusion_matrix(y_test,pred)[1][1]
            precision = float(classification_report(y_test,pred)[128:132])
            if  FP <= min_FP :
                min_FP = confusion_matrix(y_test,pred)[0][1]
                max_TP = confusion_matrix(y_test,pred)[1][1]
                #best_cross_val = scores.mean()
                #best_score_test = score_test
                max_precision = precision
                best_percent = i
                best_param = j
    print('Min FP is : ',min_FP)
    print('Max TP is : ',max_TP)
    print('Max precision is ', max_precision)
    print("Percent is ",best_percent)
    print('Param is ',best_param)

In [67]:
tuning_best_model_param_rf('data_20319681.csv')

Min FP is :  31
Max TP is :  58
Max precision is  0.65
Percent is  30
Param is  40
