<a href="https://colab.research.google.com/github/vperezguti/machine-learning-zoomcamp/blob/03-Classification/MLZ_Homework3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import zipfile
import requests
from io import BytesIO

# Download the zip file
url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
response = requests.get(url)
zip_file = zipfile.ZipFile(BytesIO(response.content))

# Find the inner zip file
inner_zip_filename = [
    name for name in zip_file.namelist() if name.endswith("bank.zip")
][0]
inner_zip_data = zip_file.read(inner_zip_filename)
inner_zip_file = zipfile.ZipFile(BytesIO(inner_zip_data))

# Find the CSV file and read it into a DataFrame
csv_filename = [
    name for name in inner_zip_file.namelist() if name.endswith("bank-full.csv")
][0]
csv_data = inner_zip_file.read(csv_filename).decode("utf-8")
df = pd.read_csv(BytesIO(csv_data.encode("utf-8")), sep=";")

# Now df contains the data from bank-full.csv
print(df.head())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


In [None]:
df.y = (df.y == 'yes').astype(int)

In [None]:
df.y.mean()

0.11698480458295547

In [None]:
selected_columns = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
df_selected = df[selected_columns]
print(df_selected.head())

   age           job  marital  education  balance housing  contact  day month  \
0   58    management  married   tertiary     2143     yes  unknown    5   may   
1   44    technician   single  secondary       29     yes  unknown    5   may   
2   33  entrepreneur  married  secondary        2     yes  unknown    5   may   
3   47   blue-collar  married    unknown     1506     yes  unknown    5   may   
4   33       unknown   single    unknown        1      no  unknown    5   may   

   duration  campaign  pdays  previous poutcome  y  
0       261         1     -1         0  unknown  0  
1       151         1     -1         0  unknown  0  
2        76         1     -1         0  unknown  0  
3        92         1     -1         0  unknown  0  
4       198         1     -1         0  unknown  0  


In [None]:
# Check for missing values in each feature
missing_values_count = df_selected.isnull().sum()

# Print the count of missing values for each feature
print(missing_values_count)

# Check if any feature has missing values
has_missing_values = missing_values_count.any()

if has_missing_values:
  print("There are missing values in the features.")
else:
  print("There are no missing values in the features.")

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64
There are no missing values in the features.


In [None]:

mode_education = df_selected['education'].mode()[0]
print(f"The most frequent observation (mode) for the column education is: {mode_education}")

The most frequent observation (mode) for the column education is: secondary


In [None]:
numerical_features = df_selected.select_dtypes(include=['number'])
correlation_matrix = numerical_features.corr()
print(correlation_matrix)

               age   balance       day  duration  campaign     pdays  \
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758   
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435   
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044   
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565   
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628   
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000   
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820   
y         0.025155  0.052838 -0.028348  0.394521 -0.073172  0.103621   

          previous         y  
age       0.001288  0.025155  
balance   0.016674  0.052838  
day      -0.051710 -0.028348  
duration  0.001203  0.394521  
campaign -0.032855 -0.073172  
pdays     0.454820  0.103621  
previous  1.000000  0.093236  
y         0.093236  1.000000  


In [None]:
import numpy as np

# Find the features with the biggest correlation (excluding 1.0 which is self-correlation)
correlation_matrix_abs = correlation_matrix.abs()
np.fill_diagonal(correlation_matrix_abs.values, 0)  # Set diagonal to 0 to exclude self-correlation
max_value = np.nanmax(correlation_matrix_abs)
row, col = correlation_matrix_abs[correlation_matrix_abs == max_value].stack().idxmax()

print(f"The two features with the biggest correlation are: {row} and {col}")

The two features with the biggest correlation are: pdays and previous


### Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the biggest mutual information score?

contact

education

housing

poutcome

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_full_train, df_test = train_test_split(df_selected, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [None]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [None]:
df_train

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
20326,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown,0
24301,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown,0
38618,49,blue-collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown,0
18909,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown,0
23081,31,self-employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13264,27,services,single,secondary,167,no,cellular,8,jul,606,2,-1,0,unknown,0
28829,40,technician,single,tertiary,693,no,cellular,30,jan,427,1,-1,0,unknown,0
3844,54,technician,divorced,secondary,0,yes,unknown,16,may,161,1,-1,0,unknown,0
15597,25,services,single,secondary,2311,no,cellular,21,jul,1105,2,-1,0,unknown,1


In [None]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

In [None]:
y_train

array([0, 0, 0, ..., 0, 1, 0])

In [None]:
# prompt: find the categorical characteristics in df

categorical = df_selected.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical characteristics: {categorical}")

Categorical characteristics: ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']


In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
def mutual_info_y_score(series):
    return round(mutual_info_score(series, y_train),2)

In [None]:
mi = df_train[categorical].apply(mutual_info_y_score)
mi.sort_values(ascending=False)

Unnamed: 0,0
month,0.03
poutcome,0.03
job,0.01
housing,0.01
contact,0.01
marital,0.0
education,0.0


### **Question 4**
Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.

Fit the model on the training dataset.

To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

0.6

0.7

0.8

0.9



In [None]:
from sklearn.feature_extraction import DictVectorizer

In [None]:
numerical= df_train.select_dtypes(include=['number']).columns.tolist()
print(f"Numerical characteristics: {numerical}")

Numerical characteristics: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']


In [None]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [None]:
model.intercept_[0]

-0.524224912558905

In [None]:
model.coef_[0].round(3)

array([ 0.352,  0.066, -0.942, -0.294, -0.151,  0.033, -0.113,  0.042,
       -0.566, -0.003, -0.126, -0.152, -0.29 , -0.088,  0.359, -0.192,
       -0.08 ,  0.23 , -0.129,  0.271, -0.324, -0.167, -0.332, -0.025,
        0.173, -0.83 ,  0.68 , -0.438, -0.954, -0.766,  0.141,  1.183,
       -0.464, -0.793,  0.758,  0.785, -0.777, -0.499,  1.513, -0.761])

In [None]:
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
y_subsc = (y_pred >= 0.5)

In [None]:
accuracy = round((y_val == y_subsc).mean(),2)
print (accuracy)

0.89


### **Question 5**
Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?

age

balance

marital

previous

Note: The difference doesn't have to be positive.

In [None]:
def train_and_evaluate(df_train, y_train, df_val, y_val, features_to_use):
  dv = DictVectorizer(sparse=False)

  train_dict = df_train[features_to_use].to_dict(orient='records')
  X_train = dv.fit_transform(train_dict)

  val_dict = df_val[features_to_use].to_dict(orient='records')
  X_val = dv.transform(val_dict)

  model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
  model.fit(X_train, y_train)

  y_pred = model.predict_proba(X_val)[:, 1]
  y_subsc = (y_pred >= 0.5)
  return (y_val == y_subsc).mean()

In [None]:
original_accuracy = (y_val == y_subsc).mean()

In [None]:
feature_differences = {}
for feature_to_remove in categorical + numerical:
  features_to_use = [f for f in categorical + numerical if f != feature_to_remove]
  accuracy_without_feature = train_and_evaluate(df_train, y_train, df_val, y_val, features_to_use)
  difference = original_accuracy - accuracy_without_feature
  feature_differences[feature_to_remove] = difference



In [None]:
min_diff_feature = min(feature_differences, key=feature_differences.get)
print(f"Feature with smallest difference: {min_diff_feature}")

Feature with smallest difference: age


In [None]:
feature_differences

{'job': -0.011501880115018848,
 'marital': -0.011280690112806857,
 'education': -0.011280690112806857,
 'housing': -0.011501880115018848,
 'contact': -0.010838310108383098,
 'month': -0.01017474010174746,
 'poutcome': -0.0037602300376022857,
 'age': -0.011723070117230727,
 'balance': -0.011391285113912852,
 'day': -0.011723070117230727,
 'duration': -0.0001105950011059953,
 'campaign': -0.010617120106171218,
 'pdays': -0.011280690112806857,
 'previous': -0.011280690112806857}

In [None]:

sorted_feature_differences = dict(sorted(feature_differences.items(), key=lambda item: item[1]))
sorted_feature_differences

{'age': -0.011723070117230727,
 'day': -0.011723070117230727,
 'job': -0.011501880115018848,
 'housing': -0.011501880115018848,
 'balance': -0.011391285113912852,
 'marital': -0.011280690112806857,
 'education': -0.011280690112806857,
 'pdays': -0.011280690112806857,
 'previous': -0.011280690112806857,
 'contact': -0.010838310108383098,
 'campaign': -0.010617120106171218,
 'month': -0.01017474010174746,
 'poutcome': -0.0037602300376022857,
 'duration': -0.0001105950011059953}

### **Question 6**
Now let's train a regularized logistic regression.

Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].

Train models using all the features as in Q4.
Calculate the accuracy on the validation dataset and round it to 3 decimal digits.
Which of these C leads to the best accuracy on the validation set?

0.01
0.1
1
10
100
Note: If there are multiple options, select the smallest C.

In [None]:
c_values = [0.01, 0.1, 1, 10, 100]
best_c = None
best_accuracy = 0

for c in c_values:
  model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
  model.fit(X_train, y_train)
  y_pred = model.predict_proba(X_val)[:, 1]
  y_subsc = (y_pred >= 0.5)
  accuracy = round((y_val == y_subsc).mean(), 3)

  print (f"C: {c}, Accuracy: {accuracy}")
  if accuracy > best_accuracy:
    best_accuracy = accuracy
    best_c = c

print(f"Best C value: {best_c}, Best Accuracy: {best_accuracy}")

C: 0.01, Accuracy: 0.888
C: 0.1, Accuracy: 0.89
C: 1, Accuracy: 0.89
C: 10, Accuracy: 0.889
C: 100, Accuracy: 0.889
Best C value: 0.1, Best Accuracy: 0.89
