In [18]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [19]:
# Load the raw CSV; the file uses ";" as its field separator.
raw_csv_path = "../data/raw/bank-additional-full.csv"
df = pd.read_csv(raw_csv_path, sep=";")
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [20]:
# (rows, columns) of the freshly loaded data, as a baseline for the cleaning steps that follow.
df.shape

(41188, 21)

In [21]:
# Remove rows that are exact copies of an earlier row (the first occurrence is kept).
df = df.drop_duplicates(keep="first")
df.shape

(41176, 21)

In [22]:
# Discard every row that contains at least one NaN value.
df = df.dropna(axis="index", how="any")

# Set impute strategy

In [23]:
# Imputer that treats the literal string "unknown" as the missing-value marker
# and replaces it with the most frequent value of the column it is fitted on.
impute_missing = SimpleImputer(
    strategy="most_frequent",
    missing_values="unknown",
)

# Now, for each column, check its values. If a column contains the "unknown" value, we either delete the affected rows or impute them: if the percentage of "unknown" values is greater than 10% -> impute, otherwise -> delete.

In [24]:
# Frequency of each job category, including the "unknown" placeholder.
df["job"].value_counts()

admin.           10419
blue-collar       9253
technician        6739
services          3967
management        2924
retired           1718
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64

In [25]:
# "unknown" is well below the 10% cut-off for this column, so the affected rows are deleted.
is_unknown_job = df["job"] == "unknown"
df = df.drop(index=df.index[is_unknown_job])
df.shape

(40846, 21)

In [26]:
# Frequency of each marital status, including the "unknown" placeholder.
df["marital"].value_counts()

married     24687
single      11490
divorced     4598
unknown        71
Name: marital, dtype: int64

In [27]:
# "unknown" is well below the 10% cut-off for this column, so the affected rows are deleted.
is_unknown_marital = df["marital"] == "unknown"
df = df.drop(index=df.index[is_unknown_marital])
df.shape

(40775, 21)

In [28]:
# Frequency of each value of the "default" column, including the "unknown" placeholder.
df["default"].value_counts()

no         32337
unknown     8435
yes            3
Name: default, dtype: int64

In [29]:
# "unknown" makes up more than 10% of this column, so it is imputed with the
# column mode rather than dropping the rows.
# NOTE(review): after imputation the column is almost constant (40772 "no" vs
# 3 "yes") - consider keeping "unknown" as its own category instead.
default_imputed = impute_missing.fit_transform(df[["default"]])
df["default"] = np.ravel(default_imputed)  # flatten the single-column 2-D result
df["default"].value_counts()

no     40772
yes        3
Name: default, dtype: int64

In [30]:
# Frequency of each value of the "loan" column, including the "unknown" placeholder.
df["loan"].value_counts()

no         33608
yes         6183
unknown      984
Name: loan, dtype: int64

In [31]:
# "unknown" is below the 10% cut-off for this column, so the affected rows are deleted.
is_unknown_loan = df["loan"] == "unknown"
df = df.drop(index=df.index[is_unknown_loan])
df.shape

(39791, 21)

In [32]:
# Frequency of each value of the "housing" column.
df["housing"].value_counts()

yes    21371
no     18420
Name: housing, dtype: int64

In [33]:
# No "unknown" housing values remain at this point, so this drop removes nothing;
# it is kept as a safeguard in case the earlier cleaning steps change.
is_unknown_housing = df["housing"] == "unknown"
df = df.drop(index=df.index[is_unknown_housing])
df.shape

(39791, 21)

In [34]:
# Number of rows per distinct age value.
df["age"].value_counts()

31    1895
32    1778
33    1765
36    1733
35    1700
      ... 
89       2
91       2
87       1
94       1
95       1
Name: age, Length: 78, dtype: int64

In [35]:
# Frequency of each education level, including the "unknown" placeholder.
df["education"].value_counts()

university.degree      11817
high.school             9241
basic.9y                5856
professional.course     5097
basic.4y                4002
basic.6y                2203
unknown                 1557
illiterate                18
Name: education, dtype: int64

In [36]:
# "unknown" is below the 10% cut-off for this column, so the affected rows are deleted.
is_unknown_education = df["education"] == "unknown"
df = df.drop(index=df.index[is_unknown_education])
df.shape

(38234, 21)

In [37]:
# Frequency of each contact channel.
df["contact"].value_counts()

cellular     24432
telephone    13802
Name: contact, dtype: int64

In [38]:
# Number of rows per month value.
df["month"].value_counts()

may    12792
jul     6626
aug     5820
jun     4846
nov     3897
apr     2435
oct      658
sep      502
mar      497
dec      161
Name: month, dtype: int64

In [39]:
# Number of rows per day of the week.
df["day_of_week"].value_counts()

thu    8007
mon    7925
wed    7602
tue    7477
fri    7223
Name: day_of_week, dtype: int64

In [40]:
# Number of rows per distinct duration value.
df["duration"].value_counts()

90      163
85      161
111     152
72      152
87      151
       ... 
1710      1
1934      1
1966      1
879       1
2129      1
Name: duration, Length: 1515, dtype: int64

In [41]:
# Number of rows per distinct campaign value.
df["campaign"].value_counts()

1     16368
2      9816
3      4955
4      2462
5      1502
6       911
7       584
8       372
9       258
10      207
11      166
12      115
13       76
14       64
17       54
15       47
16       45
18       31
20       30
19       23
21       19
22       17
23       16
24       15
27       11
29        9
28        8
31        7
26        7
25        7
30        7
35        5
33        4
32        4
34        3
40        2
42        2
43        2
37        1
39        1
41        1
Name: campaign, dtype: int64

In [42]:
# Number of rows per distinct pdays value.
# NOTE(review): 999 dominates the column - presumably a sentinel rather than a
# real day count; confirm against the dataset documentation before modelling.
df["pdays"].value_counts()

999    36868
3        393
6        379
4        106
2         57
9         55
12        53
7         52
5         45
10        44
13        33
11        26
1         24
15        22
14        18
0         15
8         14
16         8
17         6
18         5
19         3
22         3
21         2
26         1
25         1
27         1
Name: pdays, dtype: int64

In [43]:
# Number of rows per distinct value of "previous".
df["previous"].value_counts()

0    33055
1     4234
2      672
3      195
4       57
5       16
6        4
7        1
Name: previous, dtype: int64

In [44]:
# Frequency of each poutcome category; the "nonexistent" count equals the
# number of rows whose "previous" value is 0.
df["poutcome"].value_counts()

nonexistent    33055
failure         3936
success         1243
Name: poutcome, dtype: int64

In [45]:
# List the column names; several of them contain dots (e.g. 'emp.var.rate').
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [46]:
# Swap the dots for underscores so the column name is a valid Python identifier.
df = df.rename({'emp.var.rate': 'emp_var_rate'}, axis="columns")

In [47]:
# Number of rows per distinct emp_var_rate value.
df["emp_var_rate"].value_counts()

 1.4    15095
-1.8     8620
 1.1     7104
-0.1     3516
-2.9     1496
-3.4      969
-1.7      701
-1.1      572
-3.0      151
-0.2       10
Name: emp_var_rate, dtype: int64

In [48]:
# Swap the dots for underscores so the column name is a valid Python identifier.
df = df.rename({'cons.price.idx': 'cons_price_idx'}, axis="columns")

In [49]:
# Number of rows per distinct cons_price_idx value.
df["cons_price_idx"].value_counts()

93.994    7104
93.918    6187
92.893    5506
93.444    4921
94.465    3987
93.200    3453
93.075    2285
92.201     696
92.963     642
92.431     408
92.649     328
94.215     281
92.843     269
94.199     269
92.379     233
93.369     228
94.055     217
94.027     203
94.601     187
93.876     182
92.469     158
92.713     151
93.749     150
94.767     116
93.798      63
92.756      10
Name: cons_price_idx, dtype: int64

In [50]:
# Swap the dots for underscores so the column name is a valid Python identifier.
df = df.rename({'cons.conf.idx': 'cons_conf_idx'}, axis="columns")

In [51]:
# Number of rows per distinct cons_conf_idx value.
df["cons_conf_idx"].value_counts()

-36.4    7104
-42.7    6187
-46.2    5506
-36.1    4921
-41.8    3987
-42.0    3453
-47.1    2285
-31.4     696
-40.8     642
-26.9     408
-30.1     328
-40.3     281
-37.5     269
-50.0     269
-29.8     233
-34.8     228
-39.8     217
-38.3     203
-49.5     187
-40.0     182
-33.6     158
-33.0     151
-34.6     150
-50.8     116
-40.4      63
-45.9      10
Name: cons_conf_idx, dtype: int64

In [52]:
# Number of rows per distinct euribor3m value.
df["euribor3m"].value_counts()

4.857    2635
4.962    2399
4.963    2349
4.961    1742
4.964    1107
         ... 
3.053       1
1.045       1
0.937       1
3.669       1
3.329       1
Name: euribor3m, Length: 314, dtype: int64

In [53]:
# Swap the dot for an underscore so the column name is a valid Python identifier.
df = df.rename({'nr.employed': 'nr_employed'}, axis="columns")

In [54]:
# Number of rows per distinct nr_employed value.
df["nr_employed"].value_counts()

5228.1    15095
5099.1     8060
5191.0     7104
5195.8     3516
5076.2     1496
5017.5      969
4991.6      701
4963.6      572
5008.7      560
5023.5      151
5176.3       10
Name: nr_employed, dtype: int64

In [55]:
# Class balance of the "y" column after cleaning.
df["y"].value_counts()

no     33977
yes     4257
Name: y, dtype: int64

In [56]:
# Display the cleaned frame; the index keeps its original labels because rows
# were dropped without resetting it.
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [57]:
# Persist the cleaned data without the row index (it has gaps after the drops).
# The target folder is created first so that this cell also succeeds when
# ../data/clean does not exist yet, e.g. on a fresh checkout.
clean_csv_path = Path("../data/clean/bank-additional-clean.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(clean_csv_path, index=False)
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
