In [14]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

# from feature-engine
from feature_engine.imputation import (
    MeanMedianImputer,
    CategoricalImputer,
    DropMissingData
)

from feature_engine.encoding import (
    OrdinalEncoder,
)

from  feature_engine.outliers import(
    OutlierTrimmer
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper


In [2]:
# Load the bank-campaign dataset; the file uses ';' as its field separator.
data = pd.read_csv('Bank_Campaign.csv', sep=';')
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [9]:
def replaceWNAN(df):
    """Return a copy of ``df`` with placeholder labels turned into NaN.

    Every cell equal to the string 'unknown' or 'nonexistent', in any column,
    is replaced by ``np.nan``. The input frame is not modified.
    """
    without_unknown = df.replace('unknown', np.nan)
    return without_unknown.replace('nonexistent', np.nan)
def binarize(df):
    """Return a copy of ``df`` with binary string labels encoded as 1/0.

    Across every column: 'yes' and 'cellular' become 1, 'no' and 'telephone'
    become 0. All other values pass through unchanged, and the input frame
    is not modified.
    """
    # Applied one label at a time, in this order.
    label_codes = (
        ('yes', 1),
        ('no', 0),
        ('cellular', 1),
        ('telephone', 0),
    )
    for label, code in label_codes:
        df = df.replace(label, code)
    return df

In [11]:
# Clean the frame: placeholder labels become NaN first, then the yes/no and
# contact-channel labels become 1/0.
data = binarize(replaceWNAN(data))

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['poutcome', 'month', 'day_of_week', 'subscribed']),  # predictive variables
    data['subscribed'],  # target
    test_size=0.2,
    random_state=0,
)

X_train.shape, X_test.shape

((32950, 17), (8238, 17))

In [13]:
def knn_imput(data, k_neighbors):
    """Fill missing values in ``data`` with scikit-learn's ``KNNImputer``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute.
        NOTE(review): KNNImputer presumably needs all-numeric input, so
        categorical columns would have to be encoded before calling this --
        confirm at the call site.
    k_neighbors : int
        Forwarded to ``KNNImputer`` as ``n_neighbors``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column labels and the same index as ``data``.

    Notes
    -----
    The imputer is fitted on ``data`` itself on every call (``fit_transform``),
    so nothing learned from one frame is reused on another.
    """
    imputer = KNNImputer(n_neighbors=k_neighbors)
    imputed_values = imputer.fit_transform(data)
    # Keep the caller's index. Without it the result gets a fresh 0..n-1 index
    # and no longer lines up with targets that carry the original row labels
    # (e.g. y_train after a shuffled train/test split).
    return pd.DataFrame(imputed_values, columns=data.columns, index=data.index)

In [5]:
# categorical variables that contain NA in the train set
DROP_OBS_WITH_NA = [
    'job',
    'marital',
    'education',
    'housing',
    'loan',
]

# ordered (qualitative) variables to map to integers via EDU_MAPPINGS
QUAL_VARS = ['education']

# categorical variables to encode
CATEGORICAL_VARS = [
    'job',
    'marital',
]

# variables to binarize
BINARIZE_VARS = ['pdays']

# education level -> ordinal rank, lowest to highest
EDU_MAPPINGS = {
    'illiterate': 0,
    'basic.4y': 1,
    'basic.6y': 2,
    'basic.9y': 3,
    'high.school': 4,
    'professional.course': 5,
    'university.degree': 6,
}

# numeric variables that receive outlier handling
OUTLIERS_VARS = [
    'age',
    'duration',
    'campaign',
    'previous',
]

# the selected variables
FEATURES = [
    'age',
    'education',
    'duration',
    'campaign',
    'pdays',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
]