In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from matplotlib import pyplot as plt
%matplotlib inline

**Summary of Code**

The code in this Jupyter Notebook performs the following tasks:

1. **Imports and Data Loading**:
    - Imports necessary libraries such as `pandas`, `numpy`, `seaborn`, `sklearn`, and `matplotlib`.
    - Loads the dataset from a CSV file and preprocesses the column names.

2. **Data Exploration**:
    - Displays the data types of the columns in the dataframe.
    - Identifies categorical and numerical columns.
    - Displays the first few rows of the dataframe.

3. **Data Analysis**:
    - Counts the occurrences of each category in the 'education' column.
    - Calculates the correlation between 'age' and 'balance', between 'day' and each of 'pdays' and 'campaign', and between 'pdays' and 'previous'.

4. **Target Encoding and Data Splitting**:
    - Converts the target variable 'y' to a binary format.
    - Splits the data into training and testing sets.
    - Further splits the training set into training and validation sets.
    - Deletes the target variable 'y' from the feature sets.

5. **Mutual Information Calculation**:
    - Defines a function to calculate mutual information between features and the target variable.
    - Applies this function to selected categorical features.

6. **Model Training**:
    - One-hot encodes categorical variables.
    - Trains a logistic regression model using the training data.
    - Transforms the validation data using the same encoder.
    - Predicts probabilities and calculates the accuracy of the model on the validation set.

7. **Feature Importance Analysis**:
    - Trains logistic regression models excluding one feature at a time (age, balance, marital, previous).
    - Calculates the accuracy of these models on the validation set.
    - Compares the accuracy of these models to the global model to determine the importance of each feature.

8. **Hyperparameter Tuning**:
    - Trains logistic regression models with different values of the regularization parameter `C`.
    - Evaluates the accuracy of these models on the validation set.

The notebook provides a comprehensive analysis of the dataset, including data preprocessing, feature engineering, model training, and evaluation. It also includes feature importance analysis and hyperparameter tuning to optimize the logistic regression model.

In [2]:
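# Dataset source (UCI Bank Marketing). Download and extract the archive so that
# 'bank+marketing/bank/bank-full.csv' exists before the data-loading cell is run:
# !wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip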

In [3]:
# Read the semicolon-separated bank marketing file, then normalise every column
# label to lower case with spaces replaced by underscores.
df = pd.read_csv('bank+marketing/bank/bank-full.csv', sep=';')
df.columns = [label.lower().replace(' ', '_') for label in df.columns]


In [4]:
# Show the dtype of every column (object vs. int64) to tell categorical from numerical features.
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [5]:
# Feature columns used for modelling, grouped by type.
# NOTE(review): the object-typed columns 'default' and 'loan' appear in neither list,
# so they are never passed to the models — confirm this is intentional.
categorical =['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Transposed preview of the first five records (one output column per record).
df.head().T




Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


**Q. 1**

In [6]:
# Number of rows per education level.
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

**Q. 2**

In [7]:
# Correlation of `age` with `balance`, shown as a one-column table.
df[['age']].corrwith(df.balance).to_frame('correlation with balance')

Unnamed: 0,correlation with balance
age,0.097783


In [8]:
# Correlation of `pdays` and of `campaign` with `day`.
df[['pdays','campaign']].corrwith(df.day).to_frame('correlation with day')

Unnamed: 0,correlation with day
pdays,-0.093044
campaign,0.16249


In [9]:
# Correlation of `pdays` with `previous`.
df[['pdays']].corrwith(df.previous).to_frame('correlation with previous')

Unnamed: 0,correlation with previous
pdays,0.45482


In [10]:
# Preview of the binary target (1 where y == 'yes', otherwise 0).
# The result is only displayed here, not stored; the per-split targets are built after splitting.
(df.y == 'yes').astype(int)

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [11]:
from sklearn.metrics import mutual_info_score

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
# Row counts of the two parts.
len(df_train_full), len(df_test)

(36168, 9043)

In [13]:
# Carve the validation set out of the remaining rows. test_size is given as a row count
# (the size of the test set), so validation and test end up with the same number of rows.
df_train, df_val = train_test_split(df_train_full, test_size=len(df_test), random_state=42)
len(df_train), len(df_val), len(df_test)

(27125, 9043, 9043)

In [14]:
# Detach the target from each split and encode it as 1 for 'yes', 0 otherwise.
# `pop` removes the 'y' column from the frame in place, so the feature frames no
# longer carry the label. As a consequence this cell cannot be re-run without first
# re-creating the splits.
y_train = (df_train.pop('y') == 'yes').astype(int).values
y_val = (df_val.pop('y') == 'yes').astype(int).values
y_test = (df_test.pop('y') == 'yes').astype(int).values



In [15]:
def mi(series, target):
    """Mutual information between `series` and one column of `df_train`.

    Despite the parameter names, `target` is the *name* of a `df_train` column
    and `series` is the array it is scored against (the caller below passes the
    training labels).
    """
    column = df_train[target]
    return mutual_info_score(series, column)


# Rounded score of each candidate feature against the training labels.
for c in ['contact', 'education', 'housing', 'poutcome']:
    score = mi(y_train, c)
    print(c, round(score, 2))

def mutual_info_score_calc(series):
    """Mutual information between a feature column and the training labels `y_train`."""
    return mutual_info_score(series, y_train)

# The same four scores at full precision, computed column by column.
df_train[['contact', 'education', 'housing', 'poutcome']].apply(mutual_info_score_calc)


contact 0.01
education 0.0
housing 0.01
poutcome 0.03


contact      0.013358
education    0.002698
housing      0.010342
poutcome     0.029533
dtype: float64

**Q. 4**

In [None]:
"""
Trains a logistic regression model using one-hot encoding for categorical variables.

Steps:
1. One-hot encode categorical variables in the dataset.
2. Fit the logistic regression model on the training dataset with specified parameters.
3. Calculate and return the accuracy on the validation dataset, rounded to 2 decimal places.

Parameters:
- solver: 'liblinear'
- C: 1.0
- max_iter: 1000
- random_state: 42

Returns:
- accuracy (float): The accuracy of the model on the validation dataset, rounded to 2 decimal places.
"""

In [17]:
from sklearn.feature_extraction import DictVectorizer

# Turn each training row into a {column: value} dict and let DictVectorizer build the
# dense feature matrix. `dv` is fitted here on the full feature set and is reused to
# encode the validation rows.
train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)


In [18]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression on all selected features.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [19]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only — no refit — so the columns line up with X_train).
dicts_val = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(dicts_val)


In [20]:
# Positive-class probability for every validation row.
y_pred = model.predict_proba(X_val)[:, 1]
# Predict the positive class when that probability reaches 0.5.
y_decision = (y_pred >= 0.5).astype(int)
# Validation accuracy of the all-features model; kept unrounded for later comparisons.
global_acc = (y_decision == y_val).mean()
round(global_acc, 2)

np.float64(0.9)

**Q. 5**

In [21]:
# One-Hot Encoding + Model Training, once per left-out feature
def _fit_without(feature):
    """Train the logistic regression on the training data with `feature` removed.

    Parameters
    ----------
    feature : str
        Name of the column to drop from `categorical + numerical`.

    Returns
    -------
    tuple
        (records, vectorizer, matrix, model): the row dicts, the DictVectorizer
        fitted on them, the resulting training matrix, and the fitted model.
    """
    records = df_train[categorical + numerical].drop(columns=[feature]).to_dict(orient='records')
    vectorizer = DictVectorizer(sparse=False)
    matrix = vectorizer.fit_transform(records)
    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(matrix, y_train)
    return records, vectorizer, matrix, clf


train_dict_no_age, dv_age_extracted, X_train_no_age, model_no_age = _fit_without('age')
train_dict_no_balance, dv_balance_extracted, X_train_no_balance, model_no_balance = _fit_without('balance')
# "martial" is a typo for "marital"; the name is kept because the evaluation cell refers to it.
train_dict_no_marital, dv_martial_extracted, X_train_no_marital, model_no_marital = _fit_without('marital')
train_dict_no_previous, dv_previous_extracted, X_train_no_previous, model_no_previous = _fit_without('previous')

# Keep the last fitted estimator as the cell's displayed value.
model_no_previous


In [32]:
# Validation accuracy of the model trained without 'age'.
# `dv_age_extracted` and `model_no_age` are already fitted on the reduced training
# data where they are created, so they are only applied here. The shared `dv` is
# deliberately NOT refitted: it must keep matching the all-features `model` / `X_val`.
dicts_val_no_age = df_val[categorical + numerical].drop(columns=['age']).to_dict(orient='records')
X_val_no_age = dv_age_extracted.transform(dicts_val_no_age)

# Positive-class probabilities, thresholded at 0.5
y_pred = model_no_age.predict_proba(X_val_no_age)[:, 1]
y_decision = (y_pred >= 0.5).astype(int)

# Share of validation rows classified correctly
no_age_acc = (y_decision == y_val).mean()



In [23]:
# Validation accuracy of the model trained without 'balance'.
# `dv_balance_extracted` and `model_no_balance` are already fitted where they are
# created, so they are only applied here; the shared all-features `dv` is left untouched.
dicts_val_no_balance = df_val[categorical + numerical].drop(columns=['balance']).to_dict(orient='records')
X_val_no_balance = dv_balance_extracted.transform(dicts_val_no_balance)
y_pred = model_no_balance.predict_proba(X_val_no_balance)[:, 1]
y_decision = (y_pred >= 0.5).astype(int)
no_balance_acc = (y_decision == y_val).mean()

In [24]:
# Validation accuracy of the model trained without 'marital'.
# `dv_martial_extracted` (sic) and `model_no_marital` are already fitted where they are
# created, so they are only applied here; the shared all-features `dv` is left untouched.
dicts_val_no_marital = df_val[categorical + numerical].drop(columns=['marital']).to_dict(orient='records')
X_val_no_marital = dv_martial_extracted.transform(dicts_val_no_marital)
y_pred = model_no_marital.predict_proba(X_val_no_marital)[:, 1]
y_decision = (y_pred >= 0.5).astype(int)
no_marital_acc = (y_decision == y_val).mean()

In [25]:
# Validation accuracy of the model trained without 'previous'.
# `dv_previous_extracted`, `X_train_no_previous` and `model_no_previous` are already
# built from the reduced training data where they are created, so nothing is refitted
# or recomputed here; the shared all-features `dv` is left untouched.
dicts_val_no_previous = df_val[categorical + numerical].drop(columns=['previous']).to_dict(orient='records')
X_val_no_previous = dv_previous_extracted.transform(dicts_val_no_previous)
y_pred = model_no_previous.predict_proba(X_val_no_previous)[:, 1]
y_decision = (y_pred >= 0.5).astype(int)
no_previous_acc = (y_decision == y_val).mean()

In [26]:
# Accuracy lost by removing each feature (all-features accuracy minus reduced-model accuracy),
# in the order: age, balance, marital, previous.
global_acc - no_age_acc, global_acc - no_balance_acc, global_acc - no_marital_acc, global_acc - no_previous_acc

(np.float64(0.0005529138560211866),
 np.float64(0.00033174831361271195),
 np.float64(0.00022116554240847464),
 np.float64(0.0005529138560211866))

**Q. 6**

Training a regularized logistic regression with specific C parameters.

In [28]:
# Sweep the regularization parameter C and print the validation accuracy for each value.
# Note: `model`, `y_pred` and `y_decision` are rebound on every pass, so after the loop
# they refer to the last value of C.
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    y_decision = (y_pred >= 0.5).astype(int)
    accuracy = (y_decision == y_val).mean()
    print(c, accuracy)

0.01 0.8980426849496849
0.1 0.9011390025434037
1 0.9013601680858122
10 0.9009178370009953
100 0.9005860886873825
