In [4]:
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

--2024-10-26 16:41:08--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [    <=>             ] 999.85K  1.04MB/s    in 0.9s    

2024-10-26 16:41:11 (1.04 MB/s) - ‘bank+marketing.zip’ saved [1023843]



In [5]:
!unzip *.zip

Archive:  bank+marketing.zip
 extracting: bank.zip                
 extracting: bank-additional.zip     


In [6]:
!unzip bank.zip
!unzip bank-add*

Archive:  bank.zip
  inflating: bank-full.csv           
  inflating: bank-names.txt          
  inflating: bank.csv                
Archive:  bank-additional.zip
   creating: bank-additional/
  inflating: bank-additional/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/bank-additional/
  inflating: __MACOSX/bank-additional/._.DS_Store  
  inflating: bank-additional/.Rhistory  
  inflating: bank-additional/bank-additional-full.csv  
  inflating: bank-additional/bank-additional-names.txt  
  inflating: bank-additional/bank-additional.csv  
  inflating: __MACOSX/._bank-additional  


Data preparation

In [91]:
import pandas as pd
import numpy as np

In [38]:
df = pd.read_csv("bank-full.csv", sep=";")

target = df["y"].map({"no": 0, "yes": 1})
features = df.drop(["default", "loan", "y"], axis=1)

features.isnull().sum().mean()

np.float64(0.0)

Question 1

In [51]:
features.education.value_counts().index[0]

'secondary'

Question 2

In [78]:
features.corr(numeric_only=True)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [89]:
features.corr(numeric_only=True).abs().unstack().sort_values(
    ascending=False
).drop_duplicates()

age       age         1.000000
pdays     previous    0.454820
day       campaign    0.162490
balance   age         0.097783
pdays     day         0.093044
          campaign    0.088628
campaign  duration    0.084570
previous  day         0.051710
          campaign    0.032855
day       duration    0.030206
pdays     age         0.023758
balance   duration    0.021560
previous  balance     0.016674
balance   campaign    0.014578
age       day         0.009120
campaign  age         0.004760
age       duration    0.004648
balance   day         0.004503
pdays     balance     0.003435
          duration    0.001565
age       previous    0.001288
duration  previous    0.001203
dtype: float64

In [93]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)  # 0.25 x 0.8 = 0.2

Question 3

In [109]:
from sklearn.metrics import mutual_info_score

X_train.select_dtypes(include="object").apply(
    lambda col: mutual_info_score(col, y_train)
).sort_values(ascending=False).round(2)

poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

Question 4

In [120]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [131]:
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(drop="first")),
    ]
)
categorical_features = features.select_dtypes(include="object").columns.to_list()

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_features),
    ]
)

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

In [132]:
pipeline.fit(X_train, y_train)

In [135]:
full_model_accuracy = (pipeline.predict(X_val) == y_val).mean()
full_model_accuracy.round(2)

np.float64(0.89)

Question 5

In [153]:
def calculate_feature_importance(feature):
    X_train_ = X_train.drop([feature], axis=1).copy()
    X_val_ = X_val.drop([feature], axis=1).copy()

    categorical_transformer = Pipeline(
        steps=[
            ("encoder", OneHotEncoder(drop="first")),
        ]
    )
    categorical_features = X_train_.select_dtypes(include="object").columns.to_list()

    preprocessor = ColumnTransformer(
        transformers=[
            ("categorical", categorical_transformer, categorical_features),
        ]
    )

    model = LogisticRegression(
        solver="liblinear", C=1.0, max_iter=1000, random_state=42
    )

    pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

    pipeline.fit(X_train_, y_train)
    restricted_model_accuracy = (pipeline.predict(X_val_) == y_val).mean()

    return full_model_accuracy - restricted_model_accuracy


accuracy_gain = {}
for feature in X_train.columns.to_list():
    accuracy_gain[feature] = calculate_feature_importance(feature)

In [163]:
dict(sorted(accuracy_gain.items(), key=lambda item: abs(item[1]), reverse=True))

{'poutcome': np.float64(0.010395930103959228),
 'education': np.float64(0.0008847600088475183),
 'month': np.float64(-0.0006635700066357497),
 'job': np.float64(-0.0002211900022119906),
 'housing': np.float64(-0.0002211900022119906),
 'marital': np.float64(0.00011059500110588427),
 'age': np.float64(0.0),
 'balance': np.float64(0.0),
 'contact': np.float64(0.0),
 'day': np.float64(0.0),
 'duration': np.float64(0.0),
 'campaign': np.float64(0.0),
 'pdays': np.float64(0.0),
 'previous': np.float64(0.0)}

Question 6

In [165]:
alphas = [0.01, 0.1, 1, 10, 100]

accuracy_gain = {}
for alpha in alphas:
    categorical_transformer = Pipeline(
        steps=[
            ("encoder", OneHotEncoder(drop="first")),
        ]
    )
    categorical_features = features.select_dtypes(include="object").columns.to_list()

    preprocessor = ColumnTransformer(
        transformers=[
            ("categorical", categorical_transformer, categorical_features),
        ]
    )

    model = LogisticRegression(
        solver="liblinear",
        C=alpha,
        max_iter=1000,
        random_state=42,
    )

    pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

    pipeline.fit(X_train, y_train)
    model_accuracy = (pipeline.predict(X_val) == y_val).mean()

    accuracy_gain[alpha] = model_accuracy

In [168]:
dict(sorted(accuracy_gain.items(), key=lambda item: abs(item[1]), reverse=True))

{0.1: np.float64(0.8902897589028976),
 1: np.float64(0.8895155938951559),
 10: np.float64(0.88940499889405),
 100: np.float64(0.88940499889405),
 0.01: np.float64(0.8879672638796726)}