In this notebook, we will prepare our data for our model.

In [2]:
# Imports

import pandas as pd
import numpy as np

# sklearn imports

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the training data. The previous bare `df.reset_index()` call was a
# no-op: it returns a new frame (it does not modify `df`) and the result was
# discarded, so it has been removed.
df = pd.read_csv('../data/input/train.csv')
# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,32,admin_,single,university_degree,no,yes,no,cellular,mar,mon,...,2,999,0,nonexistent,-1.8,92.843,-50.0,1.52,5099.1,no
1,50,retired,married,university_degree,no,no,no,cellular,nov,fri,...,3,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,no
2,24,services,single,high_school,no,yes,no,cellular,apr,fri,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no
3,46,blue-collar,married,basic_9y,unknown,no,no,cellular,jul,fri,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.959,5228.1,no
4,40,technician,married,professional_course,no,no,no,cellular,aug,mon,...,7,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,no


In [4]:
# Remove the 'duration' column so it is not used as a model feature.
# NOTE(review): presumably dropped because it is not known before a call is
# made (target leakage) - confirm against the dataset description.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError.
df = df.drop(columns='duration', errors='ignore')

In [5]:
# Count missing values per column, then display only the columns that
# actually contain any (an empty Series means the data has no nulls).
nulls = df.isnull().sum()
nulls.loc[nulls.gt(0)]

Series([], dtype: int64)

In [6]:
# Separate the target column 'y' from the feature columns.
y = df['y']
X = df.drop(columns=['y'])

In [7]:
# Show the names of all feature columns as a plain Python list.
X.columns.tolist()

['age',
 'job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'emp_var_rate',
 'cons_price_idx',
 'cons_conf_idx',
 'euribor3m',
 'nr_employed']

In [8]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every remaining column as numerical (original order is kept).
categorical = list(X.select_dtypes(include=['object']).columns)
numerical = list(X.select_dtypes(exclude=['object']).columns)

In [9]:
# Convert every row into a {column: value} dict (categorical columns first,
# then numerical) - the record format that DictVectorizer consumes.
train_dict = X.loc[:, categorical + numerical].to_dict(orient='records')

In [10]:
# Inspect the first record to confirm the {column: value} structure.
train_dict[0]

{'job': 'admin_',
 'marital': 'single',
 'education': 'university_degree',
 'default': 'no',
 'housing': 'yes',
 'loan': 'no',
 'contact': 'cellular',
 'month': 'mar',
 'day_of_week': 'mon',
 'poutcome': 'nonexistent',
 'age': 32,
 'campaign': 2,
 'pdays': 999,
 'previous': 0,
 'emp_var_rate': -1.8,
 'cons_price_idx': 92.843,
 'cons_conf_idx': -50.0,
 'euribor3m': 1.52,
 'nr_employed': 5099.1}

In [11]:
# Fit a DictVectorizer on the training records. sparse=False makes it emit a
# dense array rather than a SciPy sparse matrix. `fit` returns the vectorizer
# itself, so construction and fitting can be chained.
dv = DictVectorizer(sparse=False).fit(train_dict)
dv

In [12]:
# Encode the training records with the fitted vectorizer; because `dv` was
# created with sparse=False the result is a dense array.
X_train = dv.transform(train_dict)

In [13]:
# Display the dimensions of the encoded training matrix.
X_train.shape

(32950, 62)

In [14]:
# List the feature names produced by the vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old method
# only when running on a scikit-learn version that predates the new one.
# Wrapping in list() keeps the displayed output a plain Python list.
list(
    dv.get_feature_names_out()
    if hasattr(dv, 'get_feature_names_out')
    else dv.get_feature_names()
)



['age',
 'campaign',
 'cons_conf_idx',
 'cons_price_idx',
 'contact=cellular',
 'contact=telephone',
 'day_of_week=fri',
 'day_of_week=mon',
 'day_of_week=thu',
 'day_of_week=tue',
 'day_of_week=wed',
 'default=no',
 'default=unknown',
 'default=yes',
 'education=basic_4y',
 'education=basic_6y',
 'education=basic_9y',
 'education=high_school',
 'education=illiterate',
 'education=professional_course',
 'education=university_degree',
 'education=unknown',
 'emp_var_rate',
 'euribor3m',
 'housing=no',
 'housing=unknown',
 'housing=yes',
 'job=admin_',
 'job=blue-collar',
 'job=entrepreneur',
 'job=housemaid',
 'job=management',
 'job=retired',
 'job=self-employed',
 'job=services',
 'job=student',
 'job=technician',
 'job=unemployed',
 'job=unknown',
 'loan=no',
 'loan=unknown',
 'loan=yes',
 'marital=divorced',
 'marital=married',
 'marital=single',
 'marital=unknown',
 'month=apr',
 'month=aug',
 'month=dec',
 'month=jul',
 'month=jun',
 'month=mar',
 'month=may',
 'month=nov',
 'mont