In [7]:
import pandas as pd
import numpy as np
import requests
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import io

import matplotlib.pyplot as plt

In [8]:
url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'

def csv_in_zip(url, outer, csv_name, sep=","):
    response = requests.get(url)
    # if response.raise_for_status():
    #     pass # Ensure the request was successful
    # else:
    #     return
    # Step 2: Extract the outer zip file from the response
    outer_zip = zipfile.ZipFile(io.BytesIO(response.content))
    
    # Step 3: Open the inner zip file from the outer zip (assuming there's one zip file inside)
    inner_zip_name = [name for name in outer_zip.namelist() if name.endswith(outer+'.zip')][0]
    with outer_zip.open(inner_zip_name) as inner_zip_file:
        inner_zip = zipfile.ZipFile(io.BytesIO(inner_zip_file.read()))
    
        # Step 4: Find the CSV file within the inner zip file
        csv_file_name = [name for name in inner_zip.namelist() if name.endswith(csv_name+'.csv')][0]
        with inner_zip.open(csv_file_name) as csv_file:
            # Step 5: Read the CSV file into a pandas DataFrame
            df = pd.read_csv(csv_file, sep=sep)
    return df

In [9]:
df = csv_in_zip(url, 'bank', 'bank-full', ";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Data Preparation

In [10]:
data = df[['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous',
'poutcome','y']] #filtering columns
print(data.isna().sum()) #checking for missing values
data.y = (data.y == 'yes').astype(int)

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.y = (data.y == 'yes').astype(int)


In [11]:
# Splitting data
def train_test_val(df, y, test=0.2, val=0.2, random_state=42):
    df_full_train, df_test = train_test_split(df, test_size=test, random_state=random_state)
    df_train, df_val = train_test_split(df_full_train, test_size= val/(1-test) , random_state=random_state)
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    y_train = df_train.y.values
    y_val = df_val.y.values
    y_test = df_test.y.values
    del df_train[y]
    del df_val[y]
    del df_test[y]
    return df_train, df_val, df_test, y_train, y_val, y_test

In [12]:
df_train, df_val, df_test, y_train, y_val, y_test = train_test_val(data, 'y')

### Question 1

In [24]:
# Defining variables
numerical = ['age', 'balance', 'day', 'duration']
categorical = ['job','marital','education','default','housing','loan','contact','month','poutcome']
for i in numerical:
    fpr, tpr, thresholds = roc_curve(y_train, df_train[i])
    print(auc(fpr,tpr))

0.4927845383108439
0.5868318909887845
0.4749832093850958
0.8065630842212128


### Question 2

In [9]:
# One-hot encoding of variables
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Training logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Calculating accuracy
y_pred = model.predict_proba(X_val)[:, 1]
term_deposit = (y_pred) >= 0.5