Import packages

In [83]:
import numpy as np
import pandas as pd

import seaborn as sns 
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_validate,GridSearchCV

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

Import dataset

In [57]:
data = pd.read_csv('term-deposit-marketing-2020.csv')

Data Exploration

In [58]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


In [59]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,40000.0,40.5446,9.641776,19.0,33.0,39.0,48.0,95.0
balance,40000.0,1274.27755,2903.769716,-8019.0,54.0,407.0,1319.0,102127.0
day,40000.0,16.017225,8.278127,1.0,8.0,17.0,21.0,31.0
duration,40000.0,254.8243,259.366498,0.0,100.0,175.0,313.0,4918.0
campaign,40000.0,2.882175,3.239051,1.0,1.0,2.0,3.0,63.0


Check for missing Values

In [60]:
data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
y            0
dtype: int64

Examining Categorical Variables

In [61]:
data['job'].value_counts()

blue-collar      9383
management       8166
technician       6852
admin            4483
services         3910
retired          1437
self-employed    1414
entrepreneur     1405
unemployed       1104
housemaid        1087
student           524
unknown           235
Name: job, dtype: int64

In [62]:
data['marital'].value_counts()

married     24386
single      10889
divorced     4725
Name: marital, dtype: int64

In [63]:
data['default'].value_counts()

no     39191
yes      809
Name: default, dtype: int64

In [64]:
data['housing'].value_counts()

yes    24031
no     15969
Name: housing, dtype: int64

In [65]:
data['contact'].value_counts()

cellular     24914
unknown      12765
telephone     2321
Name: contact, dtype: int64

In [66]:
data['month'].value_counts()

may    13532
jul     6380
aug     5215
jun     4734
nov     3598
apr     2718
feb     2296
jan     1176
mar      258
oct       80
dec       13
Name: month, dtype: int64

In [67]:
data['education'].value_counts()

secondary    20993
tertiary     11206
primary       6270
unknown       1531
Name: education, dtype: int64

In [68]:
data['y'].value_counts()

no     37104
yes     2896
Name: y, dtype: int64

Note the dataset is imbalanced

ETL

In [69]:
# One hot encoding

In [71]:
data_encoded = pd.get_dummies(data, columns = ['job','marital','default','housing','contact','month','education','loan','day','month'])
data_encoded['y'] = pd.get_dummies(data_encoded['y'], columns = ['y'], drop_first=True)

In [72]:
data_encoded.head()

Unnamed: 0,age,balance,duration,campaign,y,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct
0,58,2143,261,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,44,29,151,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,33,2,76,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,47,1506,92,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,33,1,198,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [73]:
#Scale data


Unnamed: 0,age,balance,duration,campaign,y,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,month_jun,month_jun.1,month_mar,month_mar.1,month_may,month_may.1,month_nov,month_nov.1,month_oct,month_oct.1
0,58,2143,261,1,0,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0
1,44,29,151,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
2,33,2,76,1,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,0
3,47,1506,92,1,0,0,1,0,0,0,...,0,0,0,0,1,1,0,0,0,0
4,33,1,198,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,53,395,107,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
39996,30,3340,238,3,1,0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,0
39997,54,200,170,1,1,1,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
39998,34,1047,342,1,0,0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,0


In [78]:
X, Y = data_encoded.drop('y',axis = 1), data_encoded['y'] 

scaler = MinMaxScaler()
X.loc[:,X.columns] = scaler.fit_transform(X.loc[:,X.columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [84]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify = Y, 
                                                    random_state = 3)

#base model
rfc = RandomForestClassifier(random_state = 3)

rfc.fit(X_train, Y_train)
pred = rfc.predict(X_test)

print("Confusion Matrix\n",confusion_matrix(Y_test, pred))
print("\nAccuracy_score:",accuracy_score(Y_test, pred))

Confusion Matrix
 [[7297  124]
 [ 395  184]]

Accuracy_score: 0.935125


# Create the parameter grid
param_grid = {'penalty':['l2'],
              'C':[1,1.5,2],
              'class_weight':['balanced', None],
              'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
              'max_iter':[10000000]}

# Create a GridSearchCV object
grid_lr_class = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    refit=True, return_train_score=True)

grid_lr_class.fit(X,Y)


cv_results_df = pd.DataFrame(grid_rf_class.grid_scores_)
print(cv_results_df)

In [None]:
rfc = RandomForestClassifier(random_state = 42)
param_grid = { 
    'n_estimators': [10, 25, 50, 100, 200, 250, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [2,6,10,15,20,None],
    'class_weight':['balanced',"balanced_subsample", None],
    'criterion' :['gini','entropy'],
    'random_state' = [42]
}


GS_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5,refit=True, return_train_score=True)
GS_rfc.fit(X,Y)

print("Best parameters",GS_rfc.best_params_,"\n")

rf_best = GS_rfc.best_estimator_


In [None]:
cv_results_df = pd.DataFrame(GS_rfc.grid_scores_)
print(cv_results_df)

In [None]:
cross_validate(rfc,X,Y, cv=5)

In [None]:
from sklearn.model_selection import KFold
from sklearn.manifold import TSNE

kf = KFold(n_splits=5, shuffle= True, random_state = 42)


scores = []
for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = y[train_index], y[test_index]
    
    rfc = RandomForestClassifier(**GS_rfc.best_params_)
    
    rfc.fit(X_train, Y_train)
    pred = rfc.predict(X_test)

    acc = accuracy_score(Y_test, pred)
    scores.append(acc)
    
    print("Confusion Matrix\n",confusion_matrix(Y_test, pred))
    print("\nAccuracy_score:",acc)


    tsne = TSNE(random_state=42).fit_transform(X_test)
    tsne = pd.DataFrame(tsne)
    sns.scatterplot(x=tsne.iloc[:,0],y=tsne.iloc[:,1],hue = Y_test.to_list(),alpha =1 )
    plt.legend(loc='upper left')
    plt.title('TSNE Plot')
    plt.show()
    