In [21]:
import pandas as pd
import numpy as np
import requests
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
import io

import matplotlib.pyplot as plt

In [2]:
url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'

def csv_in_zip(url, csv_name, sep=","):
    response = requests.get(url)
    # if response.raise_for_status():
    #     pass # Ensure the request was successful
    # else:
    #     return
    # Step 2: Extract the outer zip file from the response
    outer_zip = zipfile.ZipFile(io.BytesIO(response.content))
    
    # Step 3: Open the inner zip file from the outer zip (assuming there's one zip file inside)
    inner_zip_name = [name for name in outer_zip.namelist() if name.endswith(csv_name+'.zip')][0]
    with outer_zip.open(inner_zip_name) as inner_zip_file:
        inner_zip = zipfile.ZipFile(io.BytesIO(inner_zip_file.read()))
    
        # Step 4: Find the CSV file within the inner zip file
        csv_file_name = [name for name in inner_zip.namelist() if name.endswith(csv_name+'.csv')][0]
        with inner_zip.open(csv_file_name) as csv_file:
            # Step 5: Read the CSV file into a pandas DataFrame
            df = pd.read_csv(csv_file, sep=sep)
    return df

In [3]:
df = csv_in_zip(url, 'bank', ";")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


### Data Preparation

In [4]:
data = df[['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous',
'poutcome','y']] #filtering columns
print(data.isna().sum()) #checking for missing values
data.y = (data.y == 'yes').astype(int)
data.head()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.y = (data.y == 'yes').astype(int)


Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,1787,no,cellular,19,oct,79,1,-1,0,unknown,0
1,33,services,married,secondary,4789,yes,cellular,11,may,220,1,339,4,failure,0
2,35,management,single,tertiary,1350,yes,cellular,16,apr,185,1,330,1,failure,0
3,30,management,married,tertiary,1476,yes,unknown,3,jun,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,0,yes,unknown,5,may,226,1,-1,0,unknown,0


### Question 1

In [5]:
data.education.mode()

0    secondary
Name: education, dtype: object

### Question 2

In [23]:
numerical = ['age','balance','day','duration','campaign','pdays','previous']
categorical = ['job','marital','education','housing','contact','month','poutcome']
data[numerical].corrwith(data.y)[-2:] #highest 2 features

pdays       0.104087
previous    0.116714
dtype: float64

In [18]:
# encoding y/n variables
data.housing = (data.housing == 'yes').astype(int)
# Splitting data
def train_test_val(df, y, test=0.2, val=0.2, random_state=42):
    df_full_train, df_test = train_test_split(df, test_size=test, random_state=random_state)
    df_train, df_val = train_test_split(df_full_train, test_size= val/(1-test) , random_state=random_state)
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    y_train = df_train.y.values
    y_val = df_val.y.values
    y_test = df_test.y.values
    del df_train[y]
    del df_val[y]
    del df_test[y]
    return df_train, df_val, df_test, y_train, y_val, y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.housing = (data.housing == 'yes').astype(int)


In [20]:
df_train, df_val, df_test, y_train, y_val, y_test = train_test_val(data, 'y')

### Question 3

In [31]:
#mutual info

def mutual_info_y_score(series):
    return mutual_info_score(series, y_train)
mi = df_train[categorical].apply(mutual_info_y_score)
mi[-1:]

poutcome    0.030355
dtype: float64

### Question 4