In [70]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [71]:
# Location of the input CSV, kept in one named constant so it is easy to repoint.
DATA_PATH = 'bank-full.csv'

# The file is read with ';' as the field separator.
df = pd.read_csv(DATA_PATH, sep=';')

In [72]:
# Count missing values per column.
df.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [73]:
# Encode the target column as 1 ('yes') / 0 (anything else).
# The dtype guard makes the cell safe to re-run: once y is numeric, comparing it
# with 'yes' a second time would be False everywhere and zero out every label.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)

In [74]:
#Q1
# Most frequent value(s) of the education column.
df.education.mode()

0    secondary
Name: education, dtype: object

In [75]:
# Partition the columns by dtype: numeric on one side, everything else on the other.
# NOTE: the target y has already been cast to int in an earlier cell, so it is
# selected as a numeric column here and takes part in the correlation matrix.
df_numerical = df.select_dtypes(include=np.number)
df_categorical = df.select_dtypes(exclude=np.number)

# Pairwise correlation between every pair of numeric columns (pandas default method).
correlation_matrix = df_numerical.corr()

In [76]:
# Boolean mask that is True strictly below the main diagonal (k=-1 excludes the
# diagonal itself), so each unordered pair is kept once and self-correlations are dropped.
below_diagonal = np.tril(np.ones(correlation_matrix.shape, dtype=bool), k=-1)

# Cells outside the mask become NaN.
correlation_matrix_lower = correlation_matrix.where(below_diagonal)

In [77]:
# Flatten the masked matrix into a Series keyed by (column, row) pairs and discard
# the NaN cells (upper triangle and diagonal). The diagonal is already masked out,
# so no extra "< 1" filter is needed; such a filter would also hide a genuinely
# perfectly-correlated pair.
correlations = correlation_matrix_lower.unstack().dropna()

# Rank pairs by the magnitude of the correlation so that a strong negative
# relationship is not pushed to the bottom; the signed value itself is kept.
correlations = correlations.reindex(
    correlations.abs().sort_values(ascending=False).index
)

top_two_correlations = correlations.head(2)

In [78]:

#Q2
# Each index entry is a (column, row) pair of names from the correlation matrix.
for pair, corr_value in top_two_correlations.items():
    feature1, feature2 = pair
    print(f"{feature1} and {feature2}: {corr_value}")

pdays and previous: 0.4548196354805043
duration and y: 0.3945210158945639


In [79]:
# First cut: 20% of the rows are held out as the test set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Second cut: 25% of the remaining 80% (i.e. 20% of all rows) becomes validation.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# The intermediate frame is not needed once it has been split.
del df_full_train

In [80]:
# Detach the target from each split: pop() returns the column and removes it
# from the frame, so the feature frames no longer contain y afterwards.
y_train = df_train.pop('y').values
y_test = df_test.pop('y').values
y_val = df_val.pop('y').values

In [81]:
# Column names of the dtype-based partitions built earlier.
# NOTE: numerical_features still lists 'y', because df_numerical was derived
# from df after the target had been cast to int.
categorical_features, numerical_features = df_categorical.columns, df_numerical.columns

In [82]:
# Show the names of the non-numeric columns (taken from df_categorical).
categorical_features

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

In [83]:
# Show the names of the numeric columns (taken from df_numerical).
# The target 'y' appears in this list because it was converted to int before
# the dtype-based selection ran; it has since been removed from df_train/df_val/df_test.
numerical_features

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'y'],
      dtype='object')

In [84]:
#Q3
# Mutual information between the training target and each categorical column,
# rounded to two decimals.
for col in categorical_features:
    mi = mutual_info_score(y_train, df_train[col])
    print(f'MI value for feature {col}', round(mi, 2))

MI value for feature job 0.01
MI value for feature marital 0.0
MI value for feature education 0.0
MI value for feature default 0.0
MI value for feature housing 0.01
MI value for feature loan 0.0
MI value for feature contact 0.01
MI value for feature month 0.03
MI value for feature poutcome 0.03


In [85]:
# Same names as a plain Python list.
categorical_features.tolist()

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [61]:
#Q4
# DictVectorizer and LogisticRegression come from the import cell at the top of
# the notebook; the repeated (and partly unused) imports that used to sit here
# have been removed.

# One-hot encode the record dicts; the vectorizer is fitted on the training split only.
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# The validation split is transformed with the vocabulary learned from training.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Probability of the positive class, thresholded at 0.5 to get a hard decision.
y_pred = model.predict_proba(X_val)[:, 1]
decision = y_pred >= 0.5

# Validation accuracy: fraction of rows where the decision matches the label.
print(round((y_val == decision).mean(), 2))

0.9


In [68]:
# All column names seen so far: categorical ones first, then the numeric ones.
# Built from names defined in earlier cells so that this cell runs on a fresh
# kernel; the bare `subset` shown here before is only assigned in a later cell
# and raised NameError under Restart & Run All.
list(categorical_features) + list(numerical_features)

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'age',
 'balance',
 'day',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'y']

In [86]:
# accuracy_score, DictVectorizer and LogisticRegression are imported in the
# import cell at the top of the notebook.


def evaluate_feature_subset(features, train_df, train_target, val_df, val_target, C=1.0):
    """Train on the given columns and return the validation accuracy.

    Parameters
    ----------
    features : list of str
        Column names to keep from both frames.
    train_df, val_df : DataFrame
        Feature frames for the training and validation splits.
    train_target, val_target : array-like
        Labels aligned row-by-row with ``train_df`` and ``val_df``.
    C : float, default 1.0
        Value passed to ``LogisticRegression(C=...)``.

    Returns
    -------
    float
        ``accuracy_score`` of the fitted model's predictions on ``val_df``.
    """
    # The vectorizer is fitted on the training records only and reused for validation.
    vectorizer = DictVectorizer(sparse=False)
    X_fit = vectorizer.fit_transform(train_df[features].to_dict(orient='records'))
    X_eval = vectorizer.transform(val_df[features].to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    clf.fit(X_fit, train_target)
    return accuracy_score(val_target, clf.predict(X_eval))


# Features taking part in the elimination experiment.
subset = ['age','balance','marital','previous']

# Baseline: accuracy with every feature of the subset present.
accuracy_base = evaluate_feature_subset(subset, df_train, y_train, df_val, y_val)

# For each feature, retrain without it and record (baseline - reduced) accuracy.
# A negative value means the model scored higher once the feature was removed.
accuracy_diff = {}
for feature in subset:
    remaining = [f for f in subset if f != feature]
    accuracy_without = evaluate_feature_subset(remaining, df_train, y_train, df_val, y_val)
    accuracy_diff[feature] = accuracy_base - accuracy_without

# The feature with the smallest (signed) difference is reported as least useful.
least_useful_feature = min(accuracy_diff, key=accuracy_diff.get)
accuracy_diff, least_useful_feature

({'age': -0.0001105950011059953,
  'balance': 0.0,
  'marital': 0.00011059500110588427,
  'previous': -0.0013271400132713884},
 'previous')

In [89]:
# Candidate values for the LogisticRegression C parameter.
C_values = [0.01, 0.1, 1, 10, 100]

# Validation accuracy (rounded to 3 decimals) keyed by the C value that produced it.
accuracy_results = {}

for C in C_values:
    # fit() returns the estimator itself, so construction and training can be chained.
    model_regularized = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    ).fit(X_train, y_train)

    # Hard class predictions on the validation matrix built in the Q4 cell.
    y_pred = model_regularized.predict(X_val)

    accuracy_results[C] = round(accuracy_score(y_val, y_pred), 3)

# Show the collected results.
accuracy_results

{0.01: 0.898, 0.1: 0.9, 1: 0.901, 10: 0.9, 100: 0.901}