# 0. Load dataset

In [1]:
import numpy as np
import pandas as pd
import researchpy as rp
import seaborn as sns
from sklearn.linear_model import LogisticRegression as LR
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from matplotlib import pyplot as plt
%matplotlib inline
from utils.stats_utils import OLS_model


In [2]:
# Load the cleaned dataset and remove the Component1-4 and 'k-means cluster'
# columns in the same expression, so `df` is only ever bound to the trimmed frame.
# NOTE(review): these look like leftovers of the clustering-based fill named in
# the file path -- confirm.
df = (
    pd.read_csv('data/cleaned_data_by_clustering_fill.csv', delimiter=",")
    .drop(columns=['Component1', 'Component2', 'Component3', 'Component4', 'k-means cluster'])
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,31,admin.,single,high.school,no,yes,no,telephone,dec,mon,...,1,999,0,nonexistent,-0.2,92.756,-45.9,3.816,5176.3,no
1,39,housemaid,married,basic.4y,no,yes,no,telephone,dec,wed,...,1,999,1,failure,-0.2,92.756,-45.9,3.743,5176.3,no
2,41,technician,divorced,professional.course,no,no,yes,cellular,dec,thu,...,1,999,0,nonexistent,-0.2,92.756,-45.9,3.669,5176.3,no
3,37,admin.,married,high.school,no,yes,no,telephone,dec,fri,...,1,999,0,nonexistent,-0.2,92.756,-45.9,3.563,5176.3,no
4,48,admin.,married,high.school,no,yes,yes,telephone,dec,fri,...,1,999,0,nonexistent,-0.2,92.756,-45.9,3.563,5176.3,no


# 1. Standardize numeric data

In [4]:
# Select the numeric columns that are standardized in the next cell.
df_numeric = df.loc[
    :,
    [
        "age",
        "campaign",
        "pdays",
        "previous",
        "emp.var.rate",
        "cons.price.idx",
        "cons.conf.idx",
        "euribor3m",
        "nr.employed",
    ],
]
# Bare expression so the notebook renders the selection.
df_numeric

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,31,1,999,0,-0.2,92.756,-45.9,3.816,5176.3
1,39,1,999,1,-0.2,92.756,-45.9,3.743,5176.3
2,41,1,999,0,-0.2,92.756,-45.9,3.669,5176.3
3,37,1,999,0,-0.2,92.756,-45.9,3.563,5176.3
4,48,1,999,0,-0.2,92.756,-45.9,3.563,5176.3
...,...,...,...,...,...,...,...,...,...
41183,54,2,10,1,-1.1,94.767,-50.8,1.035,4963.6
41184,40,2,999,4,-1.1,94.767,-50.8,1.030,4963.6
41185,62,1,1,6,-1.1,94.767,-50.8,1.031,4963.6
41186,62,2,6,3,-1.1,94.767,-50.8,1.031,4963.6


In [5]:
# Standardize the numeric columns with scikit-learn's StandardScaler.
ss = StandardScaler()
# By default fit_transform returns a plain NumPy array, so the column labels are
# re-attached here. The original index is passed explicitly as well: without it
# the new frame gets a fresh 0..n-1 index, and the later column-wise pd.concat
# with the dummy-encoded frame would misalign rows whenever df's index is not
# the default one.
df_numeric_scaled = pd.DataFrame(
    ss.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,
)

In [6]:
# Preview the first five standardized rows.
df_numeric_scaled.head(n=5)

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,-0.865939,-0.565922,0.195414,-0.349494,-0.179437,-1.416064,-1.166213,0.112261,0.128222
1,-0.098268,-0.565922,0.195414,1.671136,-0.179437,-1.416064,-1.166213,0.070173,0.128222
2,0.09365,-0.565922,0.195414,-0.349494,-0.179437,-1.416064,-1.166213,0.027507,0.128222
3,-0.290186,-0.565922,0.195414,-0.349494,-0.179437,-1.416064,-1.166213,-0.033608,0.128222
4,0.765363,-0.565922,0.195414,-0.349494,-0.179437,-1.416064,-1.166213,-0.033608,0.128222


# 2. Transform categorical data

In [7]:
# One-hot encode every categorical column, including the target `y`.
# The order of this list fixes the column order of df_categorical.
categorical_columns = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
    "y",
]

# drop_first=True removes the first level of each column, which becomes the
# implicit reference category; the remaining indicators are named
# "<column>_<level>".
# dtype=int keeps the indicators as 0/1 integers on every pandas version:
# pandas >= 2.0 returns booleans by default, which would put bool columns next
# to the float columns when the encoded and scaled frames are concatenated.
dummies = {
    column: pd.get_dummies(df[column], drop_first=True, dtype=int).add_prefix(column + "_")
    for column in categorical_columns
}

# Keep the per-column frames bound to their original names in case later cells
# refer to them individually.
(
    job,
    marital,
    education,
    default,
    housing,
    loan,
    contact,
    month,
    day_of_week,
    poutcome,
    y,
) = (dummies[column] for column in categorical_columns)

df_categorical = pd.concat([dummies[column] for column in categorical_columns], axis=1)


In [8]:
# Preview the first five rows of the dummy-encoded frame.
df_categorical.head(n=5)

Unnamed: 0,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# 3. Do the first predictive analysis

In [9]:
# Place the standardized numeric columns and the dummy-encoded columns side by
# side; rows are matched on the index.
df1 = pd.concat(
    [df_numeric_scaled, df_categorical],
    axis="columns",
)
df1.head(n=5)

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_blue-collar,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,-0.865939,-0.565922,0.195414,-0.349494,-0.179437,-1.416064,-1.166213,0.112261,0.128222,0,...,0,0,0,1,0,0,0,1,0,0
1,-0.098268,-0.565922,0.195414,1.671136,-0.179437,-1.416064,-1.166213,0.070173,0.128222,0,...,0,0,0,0,0,0,1,0,0,0
2,0.09365,-0.565922,0.195414,-0.349494,-0.179437,-1.416064,-1.166213,0.027507,0.128222,0,...,0,0,0,0,1,0,0,1,0,0
3,-0.290186,-0.565922,0.195414,-0.349494,-0.179437,-1.416064,-1.166213,-0.033608,0.128222,0,...,0,0,0,0,0,0,0,1,0,0
4,0.765363,-0.565922,0.195414,-0.349494,-0.179437,-1.416064,-1.166213,-0.033608,0.128222,0,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Predictors: every column of df1 except the target and the five columns
# emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m and nr.employed.
X = df1.drop(
    columns=[
        "y_yes",
        "emp.var.rate",
        "cons.price.idx",
        "cons.conf.idx",
        "euribor3m",
        "nr.employed",
    ]
)
# Target: the 0/1 indicator derived from the original `y` column.
y = df1.loc[:, 'y_yes']
# Last expression of the cell, so whatever the project helper returns is rendered.
OLS_model(X, y)

                            OLS Regression Results                            
Dep. Variable:                  y_yes   R-squared:                       0.167
Model:                            OLS   Adj. R-squared:                  0.166
Method:                 Least Squares   F-statistic:                     200.7
Date:                Sat, 10 Sep 2022   Prob (F-statistic):               0.00
Time:                        13:58:00   Log-Likelihood:                -7262.4
No. Observations:               41188   AIC:                         1.461e+04
Df Residuals:                   41146   BIC:                         1.497e+04
Df Model:                          41                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         