In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import warnings
warnings.filterwarnings('ignore')

In [2]:
# The file uses ';' as its field delimiter, so the separator is given explicitly.
df = pd.read_csv(
    "bank-additional-full.csv",
    sep=";",
)
# Preview the first rows to confirm the file parsed into separate columns.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [53]:
import copy  # left in place in case another cell references the `copy` module

# Work on an independent deep copy so the frame loaded from disk stays untouched
# by the casts and recodings applied to `clean_df` below.
clean_df = df.copy(deep=True)
# clean_df.dtypes

In [54]:
# Store every text-valued feature as a pandas categorical.
for _col in (
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
):
    clean_df[_col] = clean_df[_col].astype("category")


def _pdays_score(days):
    """Bucket a raw ``pdays`` value into the score 0 / 10 / 5 / 3.

    999 maps to 0 (presumably the "never contacted" sentinel -- confirm against
    the dataset description); up to 7 maps to 10, up to 14 maps to 5, and
    anything larger maps to 3.
    """
    if days == 999:
        return 0
    if days <= 7:
        return 10
    if days <= 14:
        return 5
    return 3


def _encode_target(label):
    """Encode the target: 'no' -> 0, 'yes' -> 1, any other value -> -1."""
    if label == 'no':
        return 0
    if label == 'yes':
        return 1
    return -1


clean_df['pdays'] = clean_df['pdays'].apply(_pdays_score)
clean_df['y'] = clean_df['y'].apply(_encode_target)

In [55]:
# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

In [56]:
# Inspect the first ten rows after the dtype casts and the pdays / y recoding.
clean_df.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
5,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,198,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
6,59,admin.,married,professional.course,no,no,no,telephone,may,mon,139,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
7,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,217,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,380,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
9,25,services,single,high.school,no,yes,no,telephone,may,mon,50,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [57]:
# Feature names = every column except the last one (the target `y` in the preview above).
# This must run before later cells drop or pop columns from `clean_df`, because
# `deploy_preprocessing` expects 'month' and 'day_of_week' to be present here.
input_col = list(clean_df.columns[:-1])

In [58]:
# One hand-written record, with values listed in the same order as `input_col`.
input_list = [
    25,             # age
    'services',     # job
    'married',      # marital
    'high.school',  # education
    'no',           # default
    'no',           # housing
    'no',           # loan
    'telephone',    # contact
    'may',          # month
    'mon',          # day_of_week
    123,            # duration
    2,              # campaign
    0,              # pdays
    0,              # previous
    'nonexistent',  # poutcome
    1.3,            # emp.var.rate
    93.994,         # cons.price.idx
    -36.4,          # cons.conf.idx
    4.857,          # euribor3m
    5191.0,         # nr.employed
]

In [59]:
# A flat list becomes a single column, so transpose it into a single row
# before attaching the feature names.
input_df = pd.DataFrame(input_list).transpose()
input_df.columns = input_col

In [60]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

In [61]:
# Remove the two calendar columns from `clean_df` in place.
# Re-running this cell raises KeyError because the columns are already gone.
clean_df.drop(columns=['month', 'day_of_week'], inplace=True)

In [62]:
# Split off the target: `pop` returns the 'y' column and removes it from `clean_df`
# in place, leaving only features. The cell cannot be re-run without rebuilding
# `clean_df`, since 'y' no longer exists afterwards.
y = clean_df.pop('y')

In [63]:
# Display the feature frame after the calendar columns and the target were removed.
clean_df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,housemaid,married,basic.4y,no,no,no,telephone,261,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
1,57,services,married,high.school,unknown,no,no,telephone,149,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
2,37,services,married,high.school,no,yes,no,telephone,226,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
3,40,admin.,married,basic.6y,no,no,no,telephone,151,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
4,56,services,married,high.school,no,no,yes,telephone,307,1,0,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,334,1,0,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6
41184,46,blue-collar,married,professional.course,no,no,no,cellular,383,1,0,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6
41185,56,retired,married,university.degree,no,yes,no,cellular,189,2,0,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6
41186,44,technician,married,professional.course,no,no,no,cellular,442,1,0,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6


In [69]:
# Split the features into two views by dtype: categoricals and numerics.
cat_df = clean_df.select_dtypes(include=['category'])
num_df = clean_df.select_dtypes(include=[np.number])

In [70]:
# Column-name lists handed to the ColumnTransformer.
# The text features were cast to the pandas `category` dtype earlier in the notebook,
# so selecting only `object` columns returned an empty list and the one-hot branch of
# the pipeline received no columns, silently dropping every categorical feature.
# Select `category` as well; `object` is kept for any text column that was not cast.
cat_cols = list(clean_df.select_dtypes(include=['object', 'category']).columns)
num_cols = list(clean_df.select_dtypes(include=[np.number]).columns)

In [71]:
# Display the categorical view of the features.
cat_df

Unnamed: 0,job,marital,education,default,housing,loan,contact,poutcome
0,housemaid,married,basic.4y,no,no,no,telephone,nonexistent
1,services,married,high.school,unknown,no,no,telephone,nonexistent
2,services,married,high.school,no,yes,no,telephone,nonexistent
3,admin.,married,basic.6y,no,no,no,telephone,nonexistent
4,services,married,high.school,no,no,yes,telephone,nonexistent
...,...,...,...,...,...,...,...,...
41183,retired,married,professional.course,no,yes,no,cellular,nonexistent
41184,blue-collar,married,professional.course,no,no,no,cellular,nonexistent
41185,retired,married,university.degree,no,yes,no,cellular,nonexistent
41186,technician,married,professional.course,no,no,no,cellular,nonexistent


In [72]:
# Display the numeric view of the features.
num_df

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,261,1,0,0,1.1,93.994,-36.4,4.857,5191.0
1,57,149,1,0,0,1.1,93.994,-36.4,4.857,5191.0
2,37,226,1,0,0,1.1,93.994,-36.4,4.857,5191.0
3,40,151,1,0,0,1.1,93.994,-36.4,4.857,5191.0
4,56,307,1,0,0,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...,...
41183,73,334,1,0,0,-1.1,94.767,-50.8,1.028,4963.6
41184,46,383,1,0,0,-1.1,94.767,-50.8,1.028,4963.6
41185,56,189,2,0,0,-1.1,94.767,-50.8,1.028,4963.6
41186,44,442,1,0,0,-1.1,94.767,-50.8,1.028,4963.6


In [73]:
# eco = OneHotEncoder()
# eco_df=eco.fit_transform(cat_df)

In [74]:
# Single seed shared by every stochastic estimator so repeated runs give the same model.
RANDOM_STATE = 42

# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising, so a deployment record with a new label can
# still be scored.
OHE = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

# Level-0 (base) learners; their cross-validated predictions feed the final estimator.
level0 = [
    ('RF', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GBM', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('XGB', XGBClassifier(random_state=RANDOM_STATE)),
    ('Bagging', BaggingClassifier(random_state=RANDOM_STATE)),
    ('LightGBM', LGBMClassifier(random_state=RANDOM_STATE)),
]

# Level-1 (meta) learner, trained on the base learners' out-of-fold predictions.
level1 = LGBMClassifier(random_state=RANDOM_STATE)
stacking_model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)

In [75]:
# Route categorical columns through the one-hot encoder and numeric columns
# through the scaler.
transformer = ColumnTransformer(
    transformers=[
        ('cat_cols', OHE, cat_cols),
        ('num_cols', scaler, num_cols),
    ]
)

# Preprocessing and the stacked classifier are chained so that fit / predict
# apply the same transformations.
pipeline = Pipeline(
    steps=[
        ('preprocessing', transformer),
        ('classifier', stacking_model),
    ]
)

In [None]:
# Fit the preprocessing steps and the stacked classifier on the full feature frame.
pipeline.fit(X=clean_df, y=y)

In [76]:
def _bucket_pdays(days):
    """Bucket a raw ``pdays`` value: 999 -> 0, <=7 -> 10, <=14 -> 5, otherwise 3."""
    if days == 999:
        return 0
    if days <= 7:
        return 10
    if days <= 14:
        return 5
    return 3


def deploy_preprocessing(input_list, input_col):
    """Turn one raw record into a one-row DataFrame shaped like the training features.

    The same casts and ``pdays`` bucketing used on the training frame are applied,
    then the ``month`` and ``day_of_week`` columns are dropped.

    Parameters
    ----------
    input_list : sequence
        Flat sequence of raw values for a single record, ordered like ``input_col``.
        Numeric fields may be numbers or numeric strings.
    input_col : sequence of str
        Column labels for ``input_list``. Must contain every column named below,
        including ``month`` and ``day_of_week``.

    Returns
    -------
    pandas.DataFrame
        One row with integer, float and categorical columns cast explicitly,
        ``pdays`` replaced by its bucket score, and the calendar columns removed.

    Raises
    ------
    ValueError
        If ``input_list`` and ``input_col`` differ in length, or a numeric field
        cannot be parsed as a number.
    KeyError
        If an expected column is missing from ``input_col``.
    """
    integer_cols = ['age', 'duration', 'campaign', 'previous']
    float_cols = ['emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
                  'euribor3m', 'nr.employed']
    categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
                        'loan', 'contact', 'poutcome']
    dropped_cols = ['month', 'day_of_week']

    # A flat list becomes a single column; transpose it into a single row.
    input_data = pd.DataFrame(input_list).T
    input_data.columns = input_col

    dtype_map = {}
    dtype_map.update({col: 'int' for col in integer_cols})
    dtype_map.update({col: 'float' for col in float_cols})
    dtype_map.update({col: 'category' for col in categorical_cols})
    input_data = input_data.astype(dtype_map)

    # Coerce pdays to a number before bucketing: every other numeric field is cast,
    # and a string such as '999' would otherwise raise TypeError in the comparisons.
    input_data['pdays'] = pd.to_numeric(input_data['pdays']).apply(_bucket_pdays)

    return input_data.drop(columns=dropped_cols)

    

In [77]:
# Run the sample record through the same preprocessing used for deployment.
input_data = deploy_preprocessing(input_list=input_list, input_col=input_col)

In [78]:
# Display the preprocessed single-row frame that will be passed to the model.
input_data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,25,services,married,high.school,no,no,no,telephone,123,2,10,0,nonexistent,1.3,93.994,-36.4,4.857,5191.0


In [None]:
# Score the preprocessed sample record with the fitted pipeline.
pipeline.predict(X=input_data)