In [96]:
import os.path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
sns.set()
warnings.filterwarnings("ignore")

### Loading the data

In [97]:
bank_additional_full = pd.read_csv(r'C:\Users\TALEHOUSE\Documents\bank-additional-full.csv',sep=';') 

In [98]:
bank_additional_full.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [99]:
bank_additional_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [100]:
num_features = bank_additional_full.select_dtypes(exclude='object').columns.values
cat_features = bank_additional_full.select_dtypes(include='object').columns.values
print(f"Number of numerical features: {len(num_features)}")
print(f"Number of categorical features: {len(cat_features)}")

Number of numerical features: 10
Number of categorical features: 11


### Dealing with categorical features

#### Checking for unknown values

In [101]:
for cat_feature in cat_features:
    if "unknown" in bank_additional_full[cat_feature].value_counts():
        print(f'count of unknown in {cat_feature}: {bank_additional_full[cat_feature].value_counts()["unknown"]}')
        print(f'% of unknown in {cat_feature}: {(bank_additional_full[cat_feature].value_counts()["unknown"]/bank_additional_full.shape[0])*100:.3f}%')
        print("-----------------------------------------------------------------")

count of unknown in job: 330
% of unknown in job: 0.801%
-----------------------------------------------------------------
count of unknown in marital: 80
% of unknown in marital: 0.194%
-----------------------------------------------------------------
count of unknown in education: 1731
% of unknown in education: 4.203%
-----------------------------------------------------------------
count of unknown in default: 8597
% of unknown in default: 20.873%
-----------------------------------------------------------------
count of unknown in housing: 990
% of unknown in housing: 2.404%
-----------------------------------------------------------------
count of unknown in loan: 990
% of unknown in loan: 2.404%
-----------------------------------------------------------------


In [102]:
import random

In [103]:
bank_additional_full.columns = ['age', 'job', 'marital', 'education', 'default', 
                                'housing', 'loan', 'contact', 'month', 'day_of_week', 
                                'duration', 'campaign', 'pdays', 'previous', 'poutcome', 
                                'emp_var_rate', 'cons_price_idx','cons_conf_idx', 'euribor3m', 'nr_employed', 'y']

#### For the categorical feature Job:

In [104]:
print(bank_additional_full.job.value_counts())
bank_additional_full.loc[bank_additional_full["job"] == "unknown", "job"] = "admin."

admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64


In [105]:
print(bank_additional_full.job.value_counts())

admin.           10752
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
Name: job, dtype: int64


#### For the categorical feature Marital:

In [106]:
print(bank_additional_full.marital.value_counts())
bank_additional_full.loc[bank_additional_full["marital"] == "unknown", "marital"] = "married"

married     24928
single      11568
divorced     4612
unknown        80
Name: marital, dtype: int64


In [107]:
print(bank_additional_full.marital.value_counts())

married     25008
single      11568
divorced     4612
Name: marital, dtype: int64


#### For the categorical feature Education:

In [108]:
print(bank_additional_full.education.value_counts())
bank_additional_full.loc[bank_additional_full["education"] == "unknown", "education"] = "university.degree"

university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: education, dtype: int64


In [109]:
print(bank_additional_full.education.value_counts())

university.degree      13899
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
illiterate                18
Name: education, dtype: int64


#### For the categorical feature Default:

Due to class disparity amongst the default column with no entries making 79.1%  
whilst the yes entries making 0.007% of the entire column suggest there is no information   
for this feature hence we dropped it from the data set.    

In [110]:
bank_additional_full.drop(["default"], axis = 1)

Unnamed: 0,age,job,marital,education,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,yes,no,cellular,nov,fri,334,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,cellular,nov,fri,383,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,yes,no,cellular,nov,fri,189,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,cellular,nov,fri,442,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


#### For the categorical feature housing:

In [111]:
print(bank_additional_full.housing.value_counts())
bank_additional_full.loc[bank_additional_full["housing"] == "unknown", "housing"] = random.choice(["yes", "no"])

yes        21576
no         18622
unknown      990
Name: housing, dtype: int64


In [112]:
print(bank_additional_full.housing.value_counts())

yes    21576
no     19612
Name: housing, dtype: int64


#### For the categorical feature loan:

In [113]:
print(bank_additional_full.loan.value_counts())
bank_additional_full.loc[bank_additional_full["loan"] == "unknown", "loan"] = "no"

no         33950
yes         6248
unknown      990
Name: loan, dtype: int64


In [114]:
print(bank_additional_full.loan.value_counts())

no     34940
yes     6248
Name: loan, dtype: int64
