In [20]:
import pandas as pd
import numpy as np
import lazypredict
import sklearn
import xgboost
import lightgbm
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from lazypredict.Supervised import LazyClassifier

In [2]:
# Location of the term-deposit marketing CSV.
# NOTE(review): this is an absolute path on one machine; anyone else running
# the notebook has to point it at their own copy of the file.
data_url = "/Users/vidyakumar/Desktop/python/apziva/term-deposit-marketing-2020.csv"

In [3]:
def load_data(path=None):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. When omitted, the notebook-level ``data_url`` is
        used, so existing ``load_data()`` calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    # Resolve the default at call time so a later reassignment of `data_url`
    # is honoured without redefining this function.
    if path is None:
        path = data_url
    return pd.read_csv(path)

# Load the raw dataset into `data`.
data = load_data()

# Preview the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


In [4]:
# Summarise the raw frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  int64 
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  object
 5   balance    40000 non-null  int64 
 6   housing    40000 non-null  object
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  y          40000 non-null  object
dtypes: int64(5), object(9)
memory usage: 4.3+ MB


In [5]:
# One-hot encode `job`, keeping an indicator column for every category.
data_job1 = pd.get_dummies(data['job'], prefix = 'job', prefix_sep = '_')
data_job1.head()

Unnamed: 0,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1


In [6]:
# One-hot encode `job` with drop_first=True, which omits the indicator column
# of the first category so k categories yield k-1 columns.
data_job2 = pd.get_dummies(data['job'], prefix = 'job', prefix_sep = '_', drop_first = True)
data_job2.head()

Unnamed: 0,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown
0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1


In [7]:
# Build one dummy-encoded frame per categorical column. Every column gets the
# same treatment: its own name as prefix, '_' as separator, and the first
# category dropped. The target names on the left line up one-to-one with the
# column names iterated on the right.
(
    data_job,
    data_marital,
    data_education,
    data_default,
    data_housing,
    data_loan,
    data_contact,
    data_month,
    data_y,
) = (
    pd.get_dummies(data[column], prefix=column, prefix_sep='_', drop_first=True)
    for column in (
        'job',
        'marital',
        'education',
        'default',
        'housing',
        'loan',
        'contact',
        'month',
        'y',
    )
)

In [8]:
# Swap the raw categorical columns for their dummy-encoded counterparts:
# drop the originals, then append the encoded frames column-wise in the same
# order.
encoded_column_names = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'y',
]
dummy_frames = [
    data_job, data_marital, data_education, data_default, data_housing,
    data_loan, data_contact, data_month, data_y,
]
data_dc = pd.concat(
    [data.drop(columns=encoded_column_names), *dummy_frames],
    axis=1,
)
data_dc.head()

Unnamed: 0,age,balance,day,duration,campaign,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,y_yes
0,58,2143,5,261,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,44,29,5,151,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,33,2,5,76,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,47,1506,5,92,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,33,1,5,198,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [9]:
# Check the encoded frame: column names, non-null counts and dtypes.
data_dc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  40000 non-null  int64
 1   balance              40000 non-null  int64
 2   day                  40000 non-null  int64
 3   duration             40000 non-null  int64
 4   campaign             40000 non-null  int64
 5   job_blue-collar      40000 non-null  uint8
 6   job_entrepreneur     40000 non-null  uint8
 7   job_housemaid        40000 non-null  uint8
 8   job_management       40000 non-null  uint8
 9   job_retired          40000 non-null  uint8
 10  job_self-employed    40000 non-null  uint8
 11  job_services         40000 non-null  uint8
 12  job_student          40000 non-null  uint8
 13  job_technician       40000 non-null  uint8
 14  job_unemployed       40000 non-null  uint8
 15  job_unknown          40000 non-null  uint8
 16  marital_married      4

In [10]:
# Count the rows in each target class to check whether the dataset is
# balanced. It is not: class 0 far outnumbers class 1.
data_dc['y_yes'].value_counts()

0    37104
1     2896
Name: y_yes, dtype: int64

In [11]:
# Summary statistics for every column of the encoded frame.
data_dc.describe()

Unnamed: 0,age,balance,day,duration,campaign,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,y_yes
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,...,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,40.54,1274.28,16.02,254.82,2.88,0.23,0.04,0.03,0.2,0.04,...,0.0,0.06,0.03,0.16,0.12,0.01,0.34,0.09,0.0,0.07
std,9.64,2903.77,8.28,259.37,3.24,0.42,0.18,0.16,0.4,0.19,...,0.02,0.23,0.17,0.37,0.32,0.08,0.47,0.29,0.04,0.26
min,19.0,-8019.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,33.0,54.0,8.0,100.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39.0,407.0,17.0,175.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,48.0,1319.0,21.0,313.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Separate the target indicator from the feature columns.
target_column = 'y_yes'
y = data_dc[target_column]
X = data_dc.drop(columns=[target_column])

In [13]:
# Seed passed to train_test_split here and reused wherever `rs` is referenced.
rs = 12

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=rs,
)

# Show the train and test feature-matrix shapes, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(32000, 36)
(8000, 36)


In [14]:
# Random oversampling, applied to the training split only; X_test / y_test
# are not passed to the sampler and stay as they were.
ros = RandomOverSampler(random_state = rs)     
X_ros, y_ros = ros.fit_resample(X_train, y_train) 

In [15]:
# Class counts after oversampling. Uses the Series method rather than the
# top-level pd.value_counts, which is deprecated in recent pandas releases.
y_ros.value_counts()

1    29692
0    29692
Name: y_yes, dtype: int64

In [16]:
# Inspect lazypredict's CLASSIFIERS list of (name, estimator class) pairs.
lazypredict.Supervised.CLASSIFIERS

[('AdaBoostClassifier', sklearn.ensemble._weight_boosting.AdaBoostClassifier),
 ('BaggingClassifier', sklearn.ensemble._bagging.BaggingClassifier),
 ('BernoulliNB', sklearn.naive_bayes.BernoulliNB),
 ('CalibratedClassifierCV', sklearn.calibration.CalibratedClassifierCV),
 ('CategoricalNB', sklearn.naive_bayes.CategoricalNB),
 ('DecisionTreeClassifier', sklearn.tree._classes.DecisionTreeClassifier),
 ('DummyClassifier', sklearn.dummy.DummyClassifier),
 ('ExtraTreeClassifier', sklearn.tree._classes.ExtraTreeClassifier),
 ('ExtraTreesClassifier', sklearn.ensemble._forest.ExtraTreesClassifier),
 ('GaussianNB', sklearn.naive_bayes.GaussianNB),
 ('KNeighborsClassifier',
  sklearn.neighbors._classification.KNeighborsClassifier),
 ('LabelPropagation',
  sklearn.semi_supervised._label_propagation.LabelPropagation),
 ('LabelSpreading', sklearn.semi_supervised._label_propagation.LabelSpreading),
 ('LinearDiscriminantAnalysis',
  sklearn.discriminant_analysis.LinearDiscriminantAnalysis),
 ('Linear

In [None]:
# Fit the LazyClassifier model suite on the oversampled training data and
# score it against the original test split.
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_ros, X_test, y_ros, y_test)

# Leave the results as the cell's last expression so the notebook renders them
# with its rich display instead of print()'s plain-text dump.
models