In [18]:
import numpy as np 
import pandas as pd

In [2]:
train_df = pd.read_csv('data/train.csv',sep = ';')
test_df = pd.read_csv("data/test.csv",sep = ';')

## Data preprocessing 

In [3]:
train_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
target_col = 'y'

In [5]:
# getting the categorical columns for one hote encoding 
cat_cols = set(train_df.dtypes[train_df.dtypes=='object'].index)
cat_cols = list(cat_cols-set(target_col))

# numerical cols 
num_cols = list(set(train_df.columns)-set(cat_cols)-set(target_col))

print(cat_cols)
print(num_cols)

['marital', 'default', 'poutcome', 'loan', 'job', 'month', 'housing', 'contact', 'education']
['balance', 'day', 'pdays', 'duration', 'campaign', 'previous', 'age']


In [6]:
cat_df = train_df[cat_cols]
num_df = train_df[num_cols]
target_df = train_df[target_col]

In [7]:
cat_df.head()

Unnamed: 0,marital,default,poutcome,loan,job,month,housing,contact,education
0,married,no,unknown,no,management,may,yes,unknown,tertiary
1,single,no,unknown,no,technician,may,yes,unknown,secondary
2,married,no,unknown,yes,entrepreneur,may,yes,unknown,secondary
3,married,no,unknown,no,blue-collar,may,yes,unknown,unknown
4,single,no,unknown,no,unknown,may,no,unknown,unknown


In [8]:
num_df.head()

Unnamed: 0,balance,day,pdays,duration,campaign,previous,age
0,2143,5,-1,261,1,0,58
1,29,5,-1,151,1,0,44
2,2,5,-1,76,1,0,33
3,1506,5,-1,92,1,0,47
4,1,5,-1,198,1,0,33


In [9]:
encoded_cat_df = pd.get_dummies(cat_df,drop_first = False)
encoded_cat_df.head()

Unnamed: 0,marital_divorced,marital_married,marital_single,default_no,default_yes,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,loan_no,...,month_sep,housing_no,housing_yes,contact_cellular,contact_telephone,contact_unknown,education_primary,education_secondary,education_tertiary,education_unknown
0,0,1,0,1,0,0,0,0,1,1,...,0,0,1,0,0,1,0,0,1,0
1,0,0,1,1,0,0,0,0,1,1,...,0,0,1,0,0,1,0,1,0,0
2,0,1,0,1,0,0,0,0,1,0,...,0,0,1,0,0,1,0,1,0,0
3,0,1,0,1,0,0,0,0,1,1,...,0,0,1,0,0,1,0,0,0,1
4,0,0,1,1,0,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,1


In [10]:
cat_encoded_cols = encoded_cat_df.columns

In [11]:
cat_encoded_cols

Index(['marital_divorced', 'marital_married', 'marital_single', 'default_no',
       'default_yes', 'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown', 'loan_no', 'loan_yes', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'housing_no', 'housing_yes',
       'contact_cellular', 'contact_telephone', 'contact_unknown',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown'],
      dtype='object')

In [12]:
to_target = lambda x: 1 if x=='yes' else 0
final_train = pd.concat([num_df,encoded_cat_df],axis = 1)
final_train['target'] = [to_target(i) for i in target_df]

In [13]:
final_train.head()

Unnamed: 0,balance,day,pdays,duration,campaign,previous,age,marital_divorced,marital_married,marital_single,...,housing_no,housing_yes,contact_cellular,contact_telephone,contact_unknown,education_primary,education_secondary,education_tertiary,education_unknown,target
0,2143,5,-1,261,1,0,58,0,1,0,...,0,1,0,0,1,0,0,1,0,0
1,29,5,-1,151,1,0,44,0,0,1,...,0,1,0,0,1,0,1,0,0,0
2,2,5,-1,76,1,0,33,0,1,0,...,0,1,0,0,1,0,1,0,0,0
3,1506,5,-1,92,1,0,47,0,1,0,...,0,1,0,0,1,0,0,0,1,0
4,1,5,-1,198,1,0,33,0,0,1,...,1,0,0,0,1,0,0,0,1,0


In [14]:
## testing data preparation 
cat_df_test = test_df[cat_cols]
num_df_test = test_df[num_cols]
target_df_test = test_df[target_col]
encoded_cat_df_test = pd.get_dummies(cat_df_test)
encoded_cat_df_test = encoded_cat_df_test[cat_encoded_cols]
final_test = pd.concat([num_df_test,encoded_cat_df_test],axis = 1)
final_test['target'] = [to_target(i) for i in target_df_test]

In [15]:
final_test.head()

Unnamed: 0,balance,day,pdays,duration,campaign,previous,age,marital_divorced,marital_married,marital_single,...,housing_no,housing_yes,contact_cellular,contact_telephone,contact_unknown,education_primary,education_secondary,education_tertiary,education_unknown,target
0,1787,19,-1,79,1,0,30,0,1,0,...,1,0,1,0,0,1,0,0,0,0
1,4789,11,339,220,1,4,33,0,1,0,...,0,1,1,0,0,0,1,0,0,0
2,1350,16,330,185,1,1,35,0,0,1,...,0,1,1,0,0,0,0,1,0,0
3,1476,3,-1,199,4,0,30,0,1,0,...,0,1,0,0,1,0,0,1,0,0
4,0,5,-1,226,1,0,59,0,1,0,...,0,1,0,0,1,0,1,0,0,0


In [16]:
final_train.shape

(45211, 52)

In [17]:
final_test.shape

(4521, 52)

In [20]:
final_train.to_csv('preprocessed_data/final_train.csv',index = False)
final_test.to_csv('preprocessed_data/final_test.csv',index = False)