# Homework 03. Classification

# Downloading the dataset

In [5]:
import zipfile
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from IPython.display import display
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

--2024-10-16 15:24:17--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip.3’

bank+marketing.zip.     [    <=>             ] 999.85K  1.02MB/s    in 1.0s    

2024-10-16 15:24:19 (1.02 MB/s) - ‘bank+marketing.zip.3’ saved [1023843]



In [7]:
with zipfile.ZipFile('bank.zip', 'r') as zip_ref:
    zip_ref.extractall('bank_dataset') 

In [8]:
extracted_files = os.listdir()
print(extracted_files)

['bank+marketing.zip', 'bank+marketing.zip.1', 'bank_dataset', 'bank.zip', 'path_to_extract', 'bank-additional.zip', 'bank+marketing.zip.2', '.ipynb_checkpoints', 'Untitled.ipynb', 'Downloads', 'homework-03.ipynb', 'bank+marketing.zip.3']


In [9]:
csv_path = 'bank_dataset/bank-full.csv'

df = pd.read_csv(csv_path, sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


# Selecting the features

In [10]:
features = [
    'age', 'job', 'marital', 'education', 'balance', 
    'housing', 'contact', 'day', 'month', 'duration', 
    'campaign', 'pdays', 'previous', 'poutcome', 'y'
]

In [11]:
df = df[features]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


# Data Preparation

In [12]:
missing_values = df.isnull().sum()
print(missing_values)

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


# Question 1

In [13]:
# What is the most frequent observation (mode) for the column education?

In [14]:
education_mode = df['education'].mode()[0]

In [15]:
education_mode

'secondary'

# Question 2

In [16]:
# Create the correlation matrix for the numerical features of your dataset.

In [17]:
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

correlation_matrix = df[numerical_features].corr()

correlation_matrix


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [18]:
# What are the two features that have the biggest correlation?

In [19]:
correlation_matrix_abs = correlation_matrix.abs()
np.fill_diagonal(correlation_matrix_abs.values, 0)
max_corr = correlation_matrix_abs.unstack().idxmax()

In [20]:
max_corr

('pdays', 'previous')

In [21]:
# Target encoding

In [22]:
df.y = (df.y == 'yes').astype(int)

In [23]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


In [24]:
# Splitting the data

In [25]:
df_full_train, df_test = train_test_split(df, test_size = 0.2, random_state = 42)

In [26]:
len(df_full_train), len(df_test)

(36168, 9043)

In [27]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 42)

In [28]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [29]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [30]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

In [31]:
del df_train['y']
del df_val['y']
del df_test['y']

# Question 3 

In [32]:
# Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
# Round the scores to 2 decimals using round(score, 2).

In [33]:
categorical = df_train.select_dtypes(include=['object']).columns.tolist()

In [34]:
categorical

['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [35]:
def calculate_mi(series):
    return mutual_info_score(series, df_full_train.y)

df_mi = df_full_train[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')

In [36]:
display(df_mi)

Unnamed: 0,MI
poutcome,0.029257
month,0.024774
contact,0.014164
housing,0.0098
job,0.007765
education,0.002458
marital,0.002019


# Question 4

In [37]:
# Now let's train a logistic regression.
# Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.

In [47]:
accuracy = accuracy_score(y_val, y_pred)
accuracy = round(accuracy, 2)

print("Accuracy:", accuracy)

Accuracy: 0.9


# Question 5

In [48]:
# Let's find the least useful feature using the feature elimination technique.
# Train a model with all these features (using the same parameters as in Q4).
#Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
# For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [80]:
features = ["age", "balance", "marital", "previous"]  # features we've specifically chosen
scores = pd.DataFrame(columns=["eliminated_feature", "accuracy", "difference"])

full_dv = DictVectorizer(sparse=False)
full_train_dict = df_train[features].to_dict(orient="records")
full_X_train = full_dv.fit_transform(full_train_dict)

full_val_dict = df_val[features].to_dict(orient="records")
full_X_val = full_dv.transform(full_val_dict)

model = LogisticRegression(solver="liblinear", max_iter=1000, C=1.0, random_state=42)
model.fit(full_X_train, y_train)
y_pred = model.predict(full_X_val)
original_score = accuracy_score(y_val, y_pred)

for feature in features:

    keep_cols = [col for col in full_dv.get_feature_names_out() if not col.startswith(feature)]
    X_train = full_X_train[:, [full_dv.vocabulary_[col] for col in keep_cols]]
    X_val = full_X_val[:, [full_dv.vocabulary_[col] for col in keep_cols]]

    model = LogisticRegression(solver="liblinear", max_iter=1000, C=1.0, random_state=SEED)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    scores.loc[len(scores)] = [feature, score, original_score - score]
    
scores = scores.sort_values(by="difference", ascending=True).reset_index(drop=True)
print(scores)

smallest_diff_feature = scores.iloc[0]["eliminated_feature"]
print(f"\nFeature with the smallest difference is: {smallest_diff_feature}")


  eliminated_feature  accuracy  difference
0           previous  0.881663   -0.001327
1                age  0.880447   -0.000111
2            balance  0.880336    0.000000
3            marital  0.880226    0.000111

Feature with the smallest difference is: previous


# Question 6

In [None]:
# Now let's train a regularized logistic regression.
# Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].
# Train models using all the features as in Q4.
# Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
# Which of these C leads to the best accuracy on the validation set?

In [82]:
dicts_train = df_train.to_dict(orient="records")
dicts_val = df_val.to_dict(orient="records")

dv = DictVectorizer(sparse=False)
dv.fit(dicts_train)

X_train = dv.transform(dicts_train)
X_val = dv.transform(dicts_val)
results = []

for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred_val)
    results.append((c, round(accuracy, 3)))


df_results = pd.DataFrame(data=results, columns=["C", "accuracy"])
print(df_results.sort_values("accuracy", ascending=False))


        C  accuracy
2    1.00     0.901
4  100.00     0.901
3   10.00     0.901
1    0.10     0.900
0    0.01     0.899
