In [5]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder, BinaryEncoder, BaseNEncoder, OrdinalEncoder

from sklearn.preprocessing import FunctionTransformer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

In [6]:
df = pd.read_csv('bank.csv', delimiter = ';', na_values = 'unknown')

In [7]:
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [20]:
df['y'].replace({'yes':1, 'no':0}, inplace = True)

In [9]:
class SinCosTransformer():
    def __init__(self, weekday, hour):
        self.wd = weekday
        self.hr = hour
        self.wd_T = max(self.wd) - min(self.wd) + 1
        self.hr_T = max(self.hr) - min(self.hr) + 1
        
    def transform(self):
        wd, hr, wd_T, hr_T = self.wd, self.hr, self.wd_T, self.hr_T
        transformed = np.round(np.array([
                    np.sin(2*np.pi/wd_T*wd),
                        np.sin(2*np.pi/hr_T*hr)]
                        ), 3)
        return transformed
        

In [10]:
data_types = df.dtypes
int_cols = [df.columns[i] for i in range(df.shape[1]) if data_types[i] == 'int64']
float_cols = [df.columns[i] for i in range(df.shape[1]) if data_types[i] == 'float64']
cat_cols = [df.columns[i] for i in range(df.shape[1]) if data_types[i] == 'O']

При импорте значение признака unknown было принято за пропуск и заменено None.
Проверим есть ли пропуски у целочисленных признаков.

In [11]:
for col in int_cols:
    print(col, len(df[col][df[col].isna()]))
# нет, но у pdays 999 значит, что контакта не было, я не знаю что с этим делать, поэтому не буду его кодировать
int_cols.remove('pdays')

age 0
duration 0
campaign 0
pdays 0
previous 0
y 0


In [12]:
for col in float_cols:
    print(col, len(df[col][df[col].isna()]))

emp.var.rate 0
cons.price.idx 0
cons.conf.idx 0
euribor3m 0
nr.employed 0


In [13]:
for col in cat_cols:
    print(col, len(df[col][df[col].isna()]))

job 330
marital 80
education 1731
default 8597
housing 990
loan 990
contact 0
month 0
day_of_week 0
poutcome 0


In [17]:
df[cat_cols[1]].value_counts()

married     24928
single      11568
divorced     4612
Name: marital, dtype: int64

In [19]:
cat_cols[1]

'marital'

In [14]:
mon_num = {'mar':1, 'apr':2, 'may':3, 'jun': 4, 'jul':5, 'aug':6, 'nov':7, 'sep':7, 'oct':8, 'nov':9, 'dec':10}
day_num = {day:i for day, i in zip(df['day_of_week'].unique(), range(1, 6))}
education_sorted = {'illiterate': 0, 'unknown': 1, 'basic.4y': 2,
                    'basic.6y': 3, 'basic.9y': 4, 'high.school': 5,
                    'professional.course': 6, 'university.degree': 7}

In [112]:
ord_mon = OrdinalEncoder(mapping=[{'col': 'month', 'mapping': mon_num}])
ord_day =  OrdinalEncoder(mapping=[{'col': 'day_of_week', 'mapping': day_num}])
ord_edu = OrdinalEncoder(mapping=[{'col': 'education', 'mapping': education_sorted}])

cat_steps = [('encoder', TargetEncoder()),
            ('imputer', KNNImputer()),
              ('scaler', MinMaxScaler())
             ]


In [118]:
transf = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

transformer = ColumnTransformer(
    [
    ('func_trans', transf, ['age', 'duration']),
    ('scale', StandardScaler(), [*float_cols, 'previous', 'campaign']),
    ('month', ord_mon, 'month'),
    ('day', ord_day, 'day_of_week'),
    ('encode_scale', Pipeline(steps = cat_steps), list(set(cat_cols)-set(['day_of_week', 'month', 'y'])))
    ]
)
    

In [119]:
transformer.fit(df, df['y'])

  elif pd.api.types.is_categorical(cols):


ColumnTransformer(transformers=[('func_trans',
                                 FunctionTransformer(func=<ufunc 'log1p'>,
                                                     inverse_func=<ufunc 'expm1'>),
                                 ['age', 'duration']),
                                ('scale', StandardScaler(),
                                 ['emp.var.rate', 'cons.price.idx',
                                  'cons.conf.idx', 'euribor3m', 'nr.employed',
                                  'previous', 'campaign']),
                                ('month',
                                 OrdinalEncoder(mapping=[{'col': 'month',
                                                          'mapping': {'apr': 2,
                                                                      'aug': 6,
                                                                      'dec': 10,
                                                                      'jul':...
                                   

In [120]:
df1 = transformer.transform(df)

In [125]:
df1 = pd.DataFrame(df1, columns = list(set(df.columns) - set(['y', 'pdays'])))
df1

Unnamed: 0,duration,euribor3m,campaign,day_of_week,nr.employed,emp.var.rate,housing,cons.price.idx,cons.conf.idx,age,month,poutcome,marital,contact,default,education,job,previous,loan
0,4.043051,5.568345,0.648092,0.722722,0.886447,0.712460,0.331680,-0.349494,-0.565922,3.0,1.0,1.0,1.000000,1.0,0.000000,0.831611,0.873414,0.911842,0.000000
1,4.060443,5.010635,0.648092,0.722722,0.886447,0.712460,0.331680,-0.349494,-0.565922,3.0,1.0,1.0,1.000000,1.0,0.669725,0.790876,0.949305,0.911842,0.000000
2,3.637586,5.424950,0.648092,0.722722,0.886447,0.712460,0.331680,-0.349494,-0.565922,3.0,1.0,1.0,1.000000,1.0,0.000000,0.790876,0.949305,0.000000,0.000000
3,3.713572,5.023881,0.648092,0.722722,0.886447,0.712460,0.331680,-0.349494,-0.565922,3.0,1.0,1.0,1.000000,1.0,0.000000,0.973760,0.752255,0.911842,0.000000
4,4.043051,5.730100,0.648092,0.722722,0.886447,0.712460,0.331680,-0.349494,-0.565922,3.0,1.0,1.0,1.000000,1.0,0.000000,0.790876,0.949305,0.911842,0.768067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,4.304065,5.814131,-0.752343,2.058168,-2.224953,-1.495186,-2.815697,-0.349494,-0.565922,9.0,5.0,1.0,1.000000,0.0,0.000000,0.755249,0.252545,0.000000,0.000000
41184,3.850148,5.950643,-0.752343,2.058168,-2.224953,-1.495186,-2.815697,-0.349494,-0.565922,9.0,5.0,1.0,1.000000,0.0,0.000000,0.755249,1.000000,0.911842,0.000000
41185,4.043051,5.247024,-0.752343,2.058168,-2.224953,-1.495186,-2.815697,-0.349494,-0.204909,9.0,5.0,1.0,1.000000,0.0,0.000000,0.590217,0.252545,0.000000,0.000000
41186,3.806662,6.093570,-0.752343,2.058168,-2.224953,-1.495186,-2.815697,-0.349494,-0.565922,9.0,5.0,1.0,1.000000,0.0,0.000000,0.755249,0.839745,0.911842,0.000000
