In [238]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("psvishnu/bank-direct-marketing")

print("Path to dataset files:", path)

Path to dataset files: /Users/jeongho/.cache/kagglehub/datasets/psvishnu/bank-direct-marketing/versions/1


In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [240]:
df = pd.read_csv(os.path.join(path, "bank-full.csv"), delimiter=";")

In [241]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [242]:
{column: list(df[column]) for column in df.columns}

{'age': [58,
  44,
  33,
  47,
  33,
  35,
  28,
  42,
  58,
  43,
  41,
  29,
  53,
  58,
  57,
  51,
  45,
  57,
  60,
  33,
  28,
  56,
  32,
  25,
  40,
  44,
  39,
  52,
  46,
  36,
  57,
  49,
  60,
  59,
  51,
  57,
  25,
  53,
  36,
  37,
  44,
  50,
  60,
  54,
  58,
  36,
  58,
  44,
  55,
  29,
  54,
  48,
  32,
  42,
  24,
  38,
  38,
  47,
  40,
  46,
  32,
  53,
  57,
  33,
  49,
  51,
  60,
  59,
  55,
  35,
  57,
  31,
  54,
  55,
  43,
  53,
  44,
  55,
  49,
  55,
  45,
  47,
  42,
  59,
  46,
  51,
  56,
  41,
  46,
  57,
  42,
  30,
  60,
  60,
  57,
  36,
  55,
  60,
  39,
  46,
  44,
  53,
  52,
  59,
  27,
  44,
  47,
  34,
  59,
  45,
  29,
  46,
  56,
  36,
  59,
  44,
  41,
  33,
  59,
  57,
  56,
  51,
  34,
  43,
  52,
  33,
  29,
  34,
  31,
  55,
  55,
  32,
  38,
  55,
  28,
  23,
  32,
  43,
  32,
  46,
  53,
  34,
  57,
  37,
  59,
  33,
  56,
  48,
  43,
  54,
  51,
  26,
  40,
  39,
  50,
  41,
  51,
  60,
  52,
  48,
  48,
  39,
  47,
  40,
  45,
  2

In [243]:
y = df["y"]
X = df.drop(["y"], axis=1)

In [244]:
get_categorical_feature = X.select_dtypes("object").columns

In [245]:
get_categorical_feature

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

In [246]:
{column: list(X[column].unique()) for column in get_categorical_feature}

{'job': ['management',
  'technician',
  'entrepreneur',
  'blue-collar',
  'unknown',
  'retired',
  'admin.',
  'services',
  'self-employed',
  'unemployed',
  'housemaid',
  'student'],
 'marital': ['married', 'single', 'divorced'],
 'education': ['tertiary', 'secondary', 'unknown', 'primary'],
 'default': ['no', 'yes'],
 'housing': ['yes', 'no'],
 'loan': ['no', 'yes'],
 'contact': ['unknown', 'cellular', 'telephone'],
 'month': ['may',
  'jun',
  'jul',
  'aug',
  'oct',
  'nov',
  'dec',
  'jan',
  'feb',
  'mar',
  'apr',
  'sep'],
 'poutcome': ['unknown', 'failure', 'other', 'success']}

In [247]:
X = X.replace("unknown", np.NaN)

In [248]:
X.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
dtype: int64

In [249]:
X = X.drop(["poutcome"], axis=1)

In [250]:
get_categorical_feature = X.select_dtypes("object").columns
{column: list(X[column].unique()) for column in get_categorical_feature}

{'job': ['management',
  'technician',
  'entrepreneur',
  'blue-collar',
  nan,
  'retired',
  'admin.',
  'services',
  'self-employed',
  'unemployed',
  'housemaid',
  'student'],
 'marital': ['married', 'single', 'divorced'],
 'education': ['tertiary', 'secondary', nan, 'primary'],
 'default': ['no', 'yes'],
 'housing': ['yes', 'no'],
 'loan': ['no', 'yes'],
 'contact': [nan, 'cellular', 'telephone'],
 'month': ['may',
  'jun',
  'jul',
  'aug',
  'oct',
  'nov',
  'dec',
  'jan',
  'feb',
  'mar',
  'apr',
  'sep']}

In [251]:
binary_feature = ["loan", "housing", "default"]

ordinal_feature = ["education", "month"]

nominal_feature = ["job", "marital", "contact"]

In [252]:
def binary_encode(df, columns, postive_label):
    df = df.copy()
    for column in columns:
        df[column] = df[column].apply(lambda x: 1 if x == postive_label else 0)
    return df


X = binary_encode(X, binary_feature, "yes")

In [253]:
education_ordering = ["primary", "secondary", "tertiary"]

month_ordering = [
    "jan",
    "feb",
    "mar",
    "apr",
    "may",
    "jun",
    "jul",
    "aug",
    "sep",
    "oct",
    "nov",
    "dec",
]

In [254]:
def ordinal_encode(df, column, orderings):
    df = df.copy()
    for column, ordering in zip(column, orderings):
        df[column] = df[column].apply(
            lambda x: ordering.index(x) if str(x) != "nan" else x
        )
    return df

In [255]:
orderings = [education_ordering, month_ordering]
X = ordinal_encode(X, ordinal_feature, orderings)

In [256]:
def onehot_encode(df, columns):
    df = df.copy()
    for column in columns:
        dummies = pd.get_dummies(df[column], dtype=int)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop([column], axis=1)
    return df


X = onehot_encode(X, nominal_feature)

In [257]:
X["month"]

0         4
1         4
2         4
3         4
4         4
         ..
45206    10
45207    10
45208    10
45209    10
45210    10
Name: month, Length: 45211, dtype: int64

In [258]:
X["education"] = X["education"].fillna(X["education"].median())

In [259]:
scaler = StandardScaler()

X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [260]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [261]:
X

Unnamed: 0,age,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,admin.,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed,divorced,married,single,cellular,telephone
0,1.606965,1.314507,-0.13549,0.256419,0.893915,-0.436803,-1.298476,-0.475354,0.011016,-0.569351,-0.411453,-0.251940,-0.359369,-0.523740,-0.184415,-0.16793,1.944270,-0.229600,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,-1.356030,-0.262091
1,0.288529,-0.218740,-0.13549,-0.437895,0.893915,-0.436803,-1.298476,-0.475354,-0.416127,-0.569351,-0.411453,-0.251940,-0.359369,-0.523740,-0.184415,-0.16793,-0.514332,-0.229600,-0.190234,-0.318082,-0.145557,2.225121,-0.172266,-0.360780,-1.229691,1.592128,-1.356030,-0.262091
2,-0.747384,-0.218740,-0.13549,-0.446762,0.893915,2.289359,-1.298476,-0.475354,-0.707361,-0.569351,-0.411453,-0.251940,-0.359369,-0.523740,5.422561,-0.16793,-0.514332,-0.229600,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,-1.356030,-0.262091
3,0.571051,-0.218740,-0.13549,0.047205,0.893915,-0.436803,-1.298476,-0.475354,-0.645231,-0.569351,-0.411453,-0.251940,-0.359369,1.909346,-0.184415,-0.16793,-0.514332,-0.229600,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,-1.356030,-0.262091
4,-0.747384,-0.218740,-0.13549,-0.447091,-1.118674,-0.436803,-1.298476,-0.475354,-0.233620,-0.569351,-0.411453,-0.251940,-0.359369,-0.523740,-0.184415,-0.16793,-0.514332,-0.229600,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,-1.229691,1.592128,-1.356030,-0.262091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.947747,1.314507,-0.13549,-0.176460,-1.118674,-0.436803,0.143418,2.016333,2.791329,0.076230,-0.411453,-0.251940,-0.359369,-0.523740,-0.184415,-0.16793,-0.514332,-0.229600,-0.190234,-0.318082,-0.145557,2.225121,-0.172266,-0.360780,0.813212,-0.628090,0.737447,-0.262091
45207,2.831227,-1.751986,-0.13549,0.120447,-1.118674,-0.436803,0.143418,2.016333,0.768224,-0.246560,-0.411453,-0.251940,-0.359369,-0.523740,-0.184415,-0.16793,-0.514332,4.355402,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,2.771775,-1.229691,-0.628090,0.737447,-0.262091
45208,2.925401,-0.218740,-0.13549,1.429593,-1.118674,-0.436803,0.143418,2.016333,3.373797,0.721811,1.436189,1.050473,-0.359369,-0.523740,-0.184415,-0.16793,-0.514332,4.355402,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,0.737447,-0.262091
45209,1.512791,-0.218740,-0.13549,-0.228024,-1.118674,-0.436803,0.143418,2.016333,0.970146,0.399020,-0.411453,-0.251940,-0.359369,1.909346,-0.184415,-0.16793,-0.514332,-0.229600,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,-1.356030,3.815470


In [262]:
y

array([0, 0, 0, ..., 1, 0, 0])

In [263]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [264]:
model = LogisticRegression()

model.fit(X_train, y_train)

In [265]:
model_acc = model.score(X_test, y_test)
print("Model Accurary", model_acc)

Model Accurary 0.8891919787673253
