# Week 3 Homework

## Dataset

In [2]:
# One-time data fetch (uncomment to run): downloads the UCI "Bank Marketing"
# archive, extracts `bank.csv` from the nested `bank.zip`, then removes both zip files.
# !wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip -O bank-marketing.zip && unzip bank-marketing.zip 'bank.zip' && unzip bank.zip 'bank.csv' && rm bank-marketing.zip bank.zip

In [3]:
import pandas as pd


# Read the semicolon-delimited bank marketing data and preview the frame.
df = pd.read_csv("bank.csv", delimiter=";")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


## Data Preparation

Keep only the features required for the homework, plus the target `y`; the `default` and `loan` columns are dropped.

In [4]:
# Columns kept for modelling; `y` is the target.
selected_columns = [
    "age", "job", "marital", "education", "balance",
    "housing", "contact", "day", "month", "duration",
    "campaign", "pdays", "previous", "poutcome",
    "y",
]
df = df.loc[:, selected_columns]

In [5]:
# Partition the column names by dtype: numeric columns vs. string (object) columns.
# Note that the still-unencoded target `y` ends up in `categorical`.
numerical = list(df.select_dtypes(include="number").columns)
categorical = list(df.select_dtypes(include="object").columns)

In [6]:
# Show the numeric column names collected by `select_dtypes("number")`.
numerical

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [7]:
# Show the object-dtype column names collected by `select_dtypes("object")`.
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome', 'y']

Check whether any of the features contain missing values:

In [8]:
# Per column: is at least one value null?
df.isnull().any()

age          False
job          False
marital      False
education    False
balance      False
housing      False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

There seem to be no missing values in the features. However, let's take a look at the unique values per feature:

In [9]:
# Distinct values of every column, one array per column.
df.apply(pd.Series.unique, axis=0)

age          [30, 33, 35, 59, 36, 39, 41, 43, 20, 31, 40, 5...
job          [unemployed, services, management, blue-collar...
marital                            [married, single, divorced]
education              [primary, secondary, tertiary, unknown]
balance      [1787, 4789, 1350, 1476, 0, 747, 307, 147, 221...
housing                                              [no, yes]
contact                         [cellular, unknown, telephone]
day          [19, 11, 16, 3, 5, 23, 14, 6, 17, 20, 13, 30, ...
month        [oct, may, apr, jun, feb, aug, jan, jul, nov, ...
duration     [79, 220, 185, 199, 226, 141, 341, 151, 57, 31...
campaign     [1, 4, 2, 5, 3, 6, 18, 10, 9, 7, 12, 14, 13, 2...
pdays        [-1, 339, 330, 176, 147, 241, 152, 105, 342, 1...
previous     [0, 4, 1, 3, 2, 5, 20, 7, 6, 10, 9, 8, 18, 19,...
poutcome                    [unknown, failure, other, success]
y                                                    [no, yes]
dtype: object

The following columns use `unknown` as a placeholder for missing values (`job`, `education`, `contact` and `poutcome`); their unique values are printed below:

In [10]:
# Print the distinct values of every string column in which "unknown" occurs.
for _, column_values in df.select_dtypes(include="object").items():
    if column_values.str.contains("unknown").any():
        print(column_values.unique())

['unemployed' 'services' 'management' 'blue-collar' 'self-employed'
 'technician' 'entrepreneur' 'admin.' 'student' 'housemaid' 'retired'
 'unknown']
['primary' 'secondary' 'tertiary' 'unknown']
['cellular' 'unknown' 'telephone']
['unknown' 'failure' 'other' 'success']


## Question 1

In [11]:
# Most frequent value(s) of the `education` column.
df["education"].mode()

0    secondary
Name: education, dtype: object

## Question 2

In [12]:
# Pairwise Pearson correlations between the numerical features.
correlation_matrix = df.loc[:, numerical].corr(method="pearson")
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.08382,-0.017853,-0.002367,-0.005148,-0.008894,-0.003511
balance,0.08382,1.0,-0.008677,-0.01595,-0.009976,0.009437,0.026196
day,-0.017853,-0.008677,1.0,-0.024629,0.160706,-0.094352,-0.059114
duration,-0.002367,-0.01595,-0.024629,1.0,-0.068382,0.01038,0.01808
campaign,-0.005148,-0.009976,0.160706,-0.068382,1.0,-0.093137,-0.067833
pdays,-0.008894,0.009437,-0.094352,0.01038,-0.093137,1.0,0.577562
previous,-0.003511,0.026196,-0.059114,0.01808,-0.067833,0.577562,1.0


In [13]:
# Flatten the matrix to (feature, feature) -> |correlation|, drop the entries
# equal to 1 (the diagonal), and report the pair with the largest magnitude.
abs_correlations = correlation_matrix.abs().unstack()
abs_correlations[abs_correlations < 1].idxmax()

('pdays', 'previous')

## Target Encoding

In [14]:
# Encode the target as 1 for "yes" and 0 for anything else.
# Guarded so that re-running the cell is a no-op: once `y` is already integer,
# comparing it with "yes" again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df.y):
    df = df.assign(y=(df.y == "yes").astype(int))

In [16]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% for test, then 25% of the remaining 80% for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Extract the target of each partition as an array before removing it from the frames.
y_full_train, y_train, y_test, y_val = (
    part["y"].to_numpy() for part in (df_full_train, df_train, df_test, df_val)
)

# Drop the target so the feature frames cannot leak it into the model.
df_full_train, df_train, df_test, df_val = (
    part.drop(columns="y") for part in (df_full_train, df_train, df_test, df_val)
)

In [18]:
from sklearn.metrics import mutual_info_score

# Mutual information between the training target and each candidate
# categorical feature, rounded to two decimals (printed in list order).
candidate_features = ["contact", "education", "housing", "poutcome"]
for feature in candidate_features:
    score = mutual_info_score(y_train, df_train[feature])
    print(round(score, 2))

0.01
0.0
0.01
0.03


## Question 4

In [21]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


# Convert every partition to a list of per-row dicts, the input format used
# with DictVectorizer below.
dicts_full_train, dicts_train, dicts_test, dicts_val = (
    frame.to_dict(orient="records")
    for frame in (df_full_train, df_train, df_test, df_val)
)

# Fit the vectorizer on the training rows only, then reuse it for validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts_train)
X_val = dv.transform(dicts_val)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_pred_val = model.predict(X_val)

# Validation accuracy: share of predictions that match the true labels.
val_accuracy = (y_pred_val == y_val).mean()
val_accuracy.round(2)


np.float64(0.89)

In [24]:
# Unrounded validation accuracy of the current `y_pred_val`, kept as the
# reference value for the feature-elimination comparison.
accuracy_all = (y_val == y_pred_val).mean()

## Question 5

In [38]:
# Leave-one-feature-out ablation: retrain the same logistic regression with each
# feature removed in turn and record how far the validation accuracy moves from
# the all-features baseline (`accuracy_all`).
results = []

for feature_to_exclude in df_train.columns:
    # Loop-local names (suffix `_excl`) so the baseline `dv`, `X_train`, `X_val`,
    # `model` and `y_pred_val` are not overwritten by the ablation models.
    dicts_train_excl = df_train.drop(columns=feature_to_exclude).to_dict(orient="records")
    dicts_val_excl = df_val.drop(columns=feature_to_exclude).to_dict(orient="records")

    dv_excl = DictVectorizer(sparse=False)
    X_train_excl = dv_excl.fit_transform(dicts_train_excl)
    X_val_excl = dv_excl.transform(dicts_val_excl)

    model_excl = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_excl.fit(X_train_excl, y_train)

    accuracy = (model_excl.predict(X_val_excl) == y_val).mean()
    # Absolute change in accuracy relative to the baseline.
    difference = abs(accuracy - accuracy_all)
    results.append((feature_to_exclude, accuracy, difference))

# Build the summary table once, after every feature has been evaluated.
df_results = pd.DataFrame(data=results, columns=["excluded feature", "accuracy", "difference"])

In [42]:
# Rank the rows by ascending accuracy difference (smallest first).
df_results.sort_values("difference", ascending=True)

Unnamed: 0,excluded feature,accuracy,difference
7,day,0.887168,0.0
5,housing,0.887168,0.0
0,age,0.888274,0.001106
3,education,0.888274,0.001106
4,balance,0.886062,0.001106
6,contact,0.886062,0.001106
10,campaign,0.888274,0.001106
8,month,0.886062,0.001106
11,pdays,0.888274,0.001106
12,previous,0.888274,0.001106


Out of the available options, `age` is the least important feature. 

## Question 6

In [43]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


# Rebuild the full-feature matrices from the untouched feature frames so this
# cell does not depend on whatever `X_train` / `X_val` currently hold.
dicts_full_train = df_full_train.to_dict(orient="records")
dicts_train = df_train.to_dict(orient="records")
dicts_test = df_test.to_dict(orient="records")
dicts_val = df_val.to_dict(orient="records")

dv = DictVectorizer(sparse=False)
dv.fit(dicts_train)

X_train = dv.transform(dicts_train)
X_val = dv.transform(dicts_val)

# Sweep the regularisation parameter C and record the validation accuracy
# (rounded to 3 decimals) for each value.
results = []

for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)

    accuracy = (y_pred_val == y_val).mean().round(3)

    results.append((c, accuracy))

# Build the summary table once, after the sweep has finished.
df_results = pd.DataFrame(data=results, columns=["C", "accuracy"])


In [52]:
# `idxmax` returns an index *label*, so the row is looked up with label-based
# `.loc` rather than positional `.iloc` (the two only coincide for a default
# RangeIndex). On ties `idxmax` returns the first row holding the maximum.
df_results.loc[df_results["accuracy"].idxmax()]

C           1.000
accuracy    0.887
Name: 2, dtype: float64