Import packages

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns 
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV

Import dataset

In [2]:
data = pd.read_csv('term-deposit-marketing-2020.csv')

Data Exploration

In [3]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


In [4]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,40000.0,40.5446,9.641776,19.0,33.0,39.0,48.0,95.0
balance,40000.0,1274.27755,2903.769716,-8019.0,54.0,407.0,1319.0,102127.0
day,40000.0,16.017225,8.278127,1.0,8.0,17.0,21.0,31.0
duration,40000.0,254.8243,259.366498,0.0,100.0,175.0,313.0,4918.0
campaign,40000.0,2.882175,3.239051,1.0,1.0,2.0,3.0,63.0


Check for missing Values

In [5]:
data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
y            0
dtype: int64

Examining Categorical Variables

In [6]:
data['job'].value_counts()

blue-collar      9383
management       8166
technician       6852
admin            4483
services         3910
retired          1437
self-employed    1414
entrepreneur     1405
unemployed       1104
housemaid        1087
student           524
unknown           235
Name: job, dtype: int64

In [7]:
data['marital'].value_counts()

married     24386
single      10889
divorced     4725
Name: marital, dtype: int64

In [8]:
data['default'].value_counts()

no     39191
yes      809
Name: default, dtype: int64

In [9]:
data['housing'].value_counts()

yes    24031
no     15969
Name: housing, dtype: int64

In [10]:
data['contact'].value_counts()

cellular     24914
unknown      12765
telephone     2321
Name: contact, dtype: int64

In [11]:
data['month'].value_counts()

may    13532
jul     6380
aug     5215
jun     4734
nov     3598
apr     2718
feb     2296
jan     1176
mar      258
oct       80
dec       13
Name: month, dtype: int64

In [12]:
data['education'].value_counts()

secondary    20993
tertiary     11206
primary       6270
unknown       1531
Name: education, dtype: int64

In [13]:
data['y'].value_counts()

no     37104
yes     2896
Name: y, dtype: int64

Note the dataset is imbalanced

ETL

In [14]:
# One hot encoding

In [15]:
data_encoded = pd.get_dummies(data, columns = ['job','marital','default','housing','contact','month','education','loan','day','month'])

In [16]:
data_encoded.head()

Unnamed: 0,age,balance,duration,campaign,y,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct
0,58,2143,261,1,no,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,44,29,151,1,no,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,33,2,76,1,no,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,47,1506,92,1,no,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,33,1,198,1,no,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
#Scale data

In [18]:
scaler = MinMaxScaler()
data_encoded[['age','balance','duration','campaign']] = scaler.fit_transform(data_encoded[['age','balance','duration','campaign']])

In [19]:
X,Y = data_encoded.drop('y',axis = 1),data_encoded['y']
#X_train,y_train,X_test,y_test = train_test_split()

lr = LogisticRegression()

# Create the parameter grid
param_grid = {'penalty':['l2'],
              'C':[1,1.5,2],
              'class_weight':['balanced', None],
              'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
              'max_iter':[10000000]}

# Create a GridSearchCV object
grid_lr_class = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    refit=True, return_train_score=True)

grid_lr_class.fit(X,Y)


cv_results_df = pd.DataFrame(grid_rf_class.grid_scores_)
print(cv_results_df)

In [None]:
rfc = RandomForestClassifier(random_state = 42)
param_grid = { 
    'n_estimators': [10, 25, 50, 100, 200, 250, 300, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [2,4,8,10,15,20,25,50,None],
    'class_weight':['balanced',"balanced_subsample", None],
    'criterion' :['gini','entropy']
}


GS_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5,refit=True, return_train_score=True)
GS_rfc.fit(X,Y)

print("Best parameters",GS_rfc.best_params_,"\n")

rf_best = GS_rfc.best_estimator_


In [None]:
cv_results_df = pd.DataFrame(GS_rfc.grid_scores_)
print(cv_results_df)

In [None]:
cross_validate(rfc,X,Y, cv=5)