In [18]:
import pandas as pd

df = pd.read_csv("bank-full.csv", delimiter=";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [20]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [21]:
df.describe(include="object")

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
count,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211
unique,12,3,4,2,2,2,3,12,4,2
top,blue-collar,married,secondary,no,yes,no,cellular,may,unknown,no
freq,9732,27214,23202,44396,25130,37967,29285,13766,36959,39922


In [22]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    df.drop("y", axis=1), df["y"], test_size=0.3, random_state=26)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(31647, 16)
(13564, 16)
(31647,)
(13564,)


In [23]:
# create model with pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

numeric_features = ["age", "balance", "day", "duration",
                    "campaign", "pdays", "previous"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# categorical transfomer
categorical_features = ["job", "marital", "education",
                        "default", "housing", "loan", "contact",
                        "month", "poutcome"]

# categorical transfomer
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, n_jobs=10, max_depth=34))
])

model.fit(x_train, y_train)


In [24]:
# testing

y_pred = model.predict(x_test)

# confussion matrix with label
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          no       0.92      0.97      0.95     11955
         yes       0.67      0.40      0.50      1609

    accuracy                           0.91     13564
   macro avg       0.80      0.69      0.72     13564
weighted avg       0.89      0.91      0.89     13564



In [25]:
from sklearn.tree import DecisionTreeClassifier

model_dt = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier())
])

model_dt.fit(x_train, y_train)

y_pred = model_dt.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          no       0.93      0.93      0.93     11955
         yes       0.48      0.48      0.48      1609

    accuracy                           0.88     13564
   macro avg       0.70      0.71      0.70     13564
weighted avg       0.88      0.88      0.88     13564



In [26]:
# create another model with adaboost

from sklearn.ensemble import AdaBoostClassifier

model_ada = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", AdaBoostClassifier())
])

model_ada.fit(x_train, y_train)

y_pred = model_ada.predict(x_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

          no       0.92      0.97      0.94     11955
         yes       0.61      0.33      0.43      1609

    accuracy                           0.90     13564
   macro avg       0.76      0.65      0.69     13564
weighted avg       0.88      0.90      0.88     13564



# bongkar model

In [27]:
model["classifier"].estimators_

[DecisionTreeClassifier(max_depth=34, max_features='sqrt',
                        random_state=1529737416),
 DecisionTreeClassifier(max_depth=34, max_features='sqrt',
                        random_state=200311226),
 DecisionTreeClassifier(max_depth=34, max_features='sqrt',
                        random_state=1143563392),
 DecisionTreeClassifier(max_depth=34, max_features='sqrt',
                        random_state=1213926291),
 DecisionTreeClassifier(max_depth=34, max_features='sqrt',
                        random_state=594819530),
 DecisionTreeClassifier(max_depth=34, max_features='sqrt',
                        random_state=204383415),
 DecisionTreeClassifier(max_depth=34, max_features='sqrt',
                        random_state=271232978),
 DecisionTreeClassifier(max_depth=34, max_features='sqrt', random_state=39596097),
 DecisionTreeClassifier(max_depth=34, max_features='sqrt',
                        random_state=1837947254),
 DecisionTreeClassifier(max_depth=34, max_feature

In [28]:
from sklearn.metrics import f1_score

y_test_ = y_test.apply(lambda x: 1 if x=="yes" else 0)
for dt in model["classifier"].estimators_:
    x = model["preprocessor"].transform(x_test)
    y_pred = dt.predict(x)
    print(f1_score(y_test_, y_pred))

0.43586550435865506
0.41935483870967744
0.43889955214331416
0.461961503208066
0.4261986837981824
0.45414572864321606
0.45784654246921375
0.40933459476505835
0.4448540706605223
0.43216392424712824
0.4418006430868167
0.45247740729199126
0.417910447761194
0.42111801242236024
0.467459324155194
0.444583203246956
0.425585023400936
0.43685365089313694
0.43988816402609504
0.43390259329538267
0.4322274881516588
0.4443753884400249
0.43744090765836746
0.4545738370277865
0.44751728472658703
0.45796178343949046
0.44548872180451127
0.4277638190954774
0.4370066308809599
0.4358647096362476
0.4643304130162703
0.44941030415890754
0.4382093316519546
0.43057722308892354
0.45946811919256647
0.4709897610921502
0.4463171036204744
0.4137487953742371
0.4398003742981909
0.45885286783042395
0.4683812405446294
0.44199243379571246
0.4507836990595611
0.4448030987734022
0.4297674418604651
0.4352791878172589
0.4563926226945921
0.44299065420560746
0.46272811630064953
0.43993660855784467
0.41511771995043373
0.441085749

In [29]:
model["classifier"].estimators_[0].tree_.max_depth

34

In [30]:
model_dt["classifier"].tree_.max_depth

32

In [31]:
# prompt: build model using xgboost

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import xgboost as xgb

# Load the dataset
df = pd.read_csv("bank-full.csv", delimiter=";")

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
    df.drop("y", axis=1), df["y"], test_size=0.3, random_state=26)

# Define numerical and categorical features
numeric_features = ["age", "balance", "day", "duration",
                    "campaign", "pdays", "previous"]
categorical_features = ["job", "marital", "education",
                        "default", "housing", "loan", "contact",
                        "month", "poutcome"]

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Additional evaluation (if needed)
y_test_ = y_test.apply(lambda x: 1 if x=="yes" else 0)
y_pred_num = [1 if x=="yes" else 0 for x in y_pred] #convert string to numeric
print(f1_score(y_test_, y_pred_num))

0.0


In [32]:
# build model using xgboost


import xgboost as xgb
# Create and train the XGBoost model
model_xgb = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", xgb.XGBClassifier()) # Use XGBClassifier
])
y_train_ = y_train.apply(lambda x: 1 if x=="yes" else 0)
y_test_ = y_test.apply(lambda x: 1 if x=="yes" else 0)
model_xgb.fit(x_train, y_train_)
# Make predictions and evaluate the model
y_pred = model_xgb.predict(x_test)
print(classification_report(y_test_, y_pred))



AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [34]:
# prompt: stacking model

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Define the base estimators
estimators = [
    ('rf', model),  # RandomForestClassifier model
    ('dt', model_dt), # DecisionTreeClassifier model
    ('ada', model_ada) # AdaBoostClassifier model
]

# Define the stacking classifier
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression() # XGBoost as the final estimator
)


In [35]:
stacking_model

In [36]:

# Train the stacking model
stacking_model.fit(x_train, y_train_)

# Make predictions
y_pred = stacking_model.predict(x_test)

# Evaluate the stacking model
print(classification_report(y_test_, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     11955
           1       0.64      0.47      0.54      1609

    accuracy                           0.91     13564
   macro avg       0.79      0.72      0.74     13564
weighted avg       0.90      0.91      0.90     13564

