In [1]:
import numpy as np
from sklearn import datasets
import pandas as pd 
from sklearn import preprocessing

In [2]:
# adult data comes in two different files, one for training and one for testing, however, we will combine data from both the files

data = pd.read_csv("dataset/bank/bank.csv", skipinitialspace=True, delimiter=';') 

In [3]:
data.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
5,35,management,single,tertiary,no,747,no,no,cellular,23,feb,141,2,176,3,failure,no
6,36,self-employed,married,tertiary,no,307,yes,no,cellular,14,may,341,1,330,2,other,no
7,39,technician,married,secondary,no,147,yes,no,cellular,6,may,151,2,-1,0,unknown,no
8,41,entrepreneur,married,tertiary,no,221,yes,no,unknown,14,may,57,2,-1,0,unknown,no
9,43,services,married,primary,no,-88,yes,yes,cellular,17,apr,313,1,147,2,failure,no


In [4]:
data.shape

(4521, 17)

In [5]:
data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [6]:
for column in data.columns:
    print("Column:", column, " -->", data[column].unique())

Column: age  --> [30 33 35 59 36 39 41 43 20 31 40 56 37 25 38 42 44 26 55 67 53 68 32 49
 78 23 52 34 61 45 48 57 54 63 51 29 50 27 60 28 21 58 22 46 24 77 75 47
 70 65 64 62 66 19 81 83 80 71 72 69 79 73 86 74 76 87 84]
Column: job  --> ['unemployed' 'services' 'management' 'blue-collar' 'self-employed'
 'technician' 'entrepreneur' 'admin.' 'student' 'housemaid' 'retired'
 'unknown']
Column: marital  --> ['married' 'single' 'divorced']
Column: education  --> ['primary' 'secondary' 'tertiary' 'unknown']
Column: default  --> ['no' 'yes']
Column: balance  --> [ 1787  4789  1350 ...  -333 -3313  1137]
Column: housing  --> ['no' 'yes']
Column: loan  --> ['no' 'yes']
Column: contact  --> ['cellular' 'unknown' 'telephone']
Column: day  --> [19 11 16  3  5 23 14  6 17 20 13 30 29 27  7 18 12 21 26 22  2  4 15  8
 28  9  1 10 31 25 24]
Column: month  --> ['oct' 'may' 'apr' 'jun' 'feb' 'aug' 'jan' 'jul' 'nov' 'sep' 'mar' 'dec']
Column: duration  --> [  79  220  185  199  226  141  341  151   5

In [7]:
g = data.columns.to_series().groupby(data.dtypes).groups

In [8]:
g

{dtype('int64'): Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object'),
 dtype('O'): Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
        'month', 'poutcome', 'y'],
       dtype='object')}

In [10]:
data['month-seasonal'] = data['month'].apply(lambda x: 'q1' if x in ["jan", "feb", "mar"] else x )
data['month-seasonal'] = data['month-seasonal'].apply(lambda x: 'q2' if x in ["apr", "may", "jun"] else x )
data['month-seasonal'] = data['month-seasonal'].apply(lambda x: 'q3' if x in ["jul", "aug", "sep"] else x )
data['month-seasonal'] = data['month-seasonal'].apply(lambda x: 'q4' if x in ["oct", "nov", "dec"] else x )

data.drop(['month'], inplace=True, axis=1)


In [11]:
data.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,duration,campaign,pdays,previous,poutcome,y,month-seasonal
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,79,1,-1,0,unknown,no,q4
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,220,1,339,4,failure,no,q2
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,185,1,330,1,failure,no,q2
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,199,4,-1,0,unknown,no,q2
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,226,1,-1,0,unknown,no,q2


In [12]:
encoded_object_df = pd.DataFrame()

for column in ['job', 'marital', 'education', 'default', 'housing', 'loan','contact', 'poutcome','y','month-seasonal']:
    encoded_object_df = pd.concat([encoded_object_df,pd.get_dummies(data[column], prefix=column, drop_first=True)] ,axis=1)

In [15]:
data.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,duration,campaign,pdays,previous,poutcome,y,month-seasonal
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,79,1,-1,0,unknown,no,q4
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,220,1,339,4,failure,no,q2
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,185,1,330,1,failure,no,q2
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,199,4,-1,0,unknown,no,q2
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,226,1,-1,0,unknown,no,q2


In [16]:
min_max_scaler = preprocessing.MinMaxScaler()

cols_to_scale = ['age','balance', 'day','duration', 'campaign', 'pdays','previous']

encoded_int_df = data[cols_to_scale]

encoded_int_df[cols_to_scale] = min_max_scaler.fit_transform(encoded_int_df[cols_to_scale])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [19]:
final_df = pd.concat([encoded_object_df, encoded_int_df], axis=1)

In [20]:
final_df.shape

(4521, 35)

In [22]:
final_df.columns

Index(['job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'education_secondary',
       'education_tertiary', 'education_unknown', 'default_yes', 'housing_yes',
       'loan_yes', 'contact_telephone', 'contact_unknown', 'poutcome_other',
       'poutcome_success', 'poutcome_unknown', 'y_yes', 'month-seasonal_q2',
       'month-seasonal_q3', 'month-seasonal_q4', 'age', 'balance', 'day',
       'duration', 'campaign', 'pdays', 'previous'],
      dtype='object')

In [23]:
y = np.where(final_df['y_yes'] == 0, -1, 1)

In [25]:
X_control = final_df['age']