Обработка фичей с помощью инструментов sklearn: Pipeline, FeatureUnion, ColumnTransformer, ...

Фиксируем numeric и categorical columns. Ищем datetime переменные.

Для datetime (каждый пункт строится независимо):
    - выделение дней, месяцев, ...

Для numeric (каждый пункт строится независимо):
    - lag-фичи по каждой datetime переменной
    - agg-фичи по месяцам, неделям и дням каждой datetime переменной

Для categorical (каждый пункт строится независимо):
    - dummies
    
Отдельный пайплайн на обработку пропусков в категориальных и числовых признаках:
http://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py

In [80]:
import time
from functools import partial
from IPython.display import display
from itertools import product

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

In [4]:
from utils import load

task = 4
df_x, target = load(task, 'train')
_, y_test = load(task, 'test-target')
x_test, _ = load(task, 'test')

In [5]:
df_x.shape

(114130, 142)

Разбиваем колонки по типам значений

In [73]:
def group_columns_by_type(df_x, max_categorical_levels):
    max_categorical_levels = 10
    datetime_columns, numerical_columns, categorical_columns, idx_columns, single_value_columns = [], [], [], [], []
    for col in df_x.columns:
        column_unique_values = df_x[col].nunique()
        if column_unique_values <= max_categorical_levels:
            if column_unique_values == 1:
                single_value_columns.append(col)
            else:
                categorical_columns.append(col)
        elif col.startswith('datetime'):
            datetime_columns.append(col)
        elif col.startswith('number'):
            numerical_columns.append(col)
        elif col.startswith('id_') or col.endswith('_id'):
            idx_columns.append(col)
    return datetime_columns, numerical_columns, categorical_columns, idx_columns, single_value_columns

In [74]:
datetime_columns, numerical_columns, categorical_columns, idx_columns, single_value_columns = group_columns_by_type(df_x, 10)

Выделение datetime фичей

In [133]:
import datetime
from sklearn.base import BaseEstimator, TransformerMixin

class DatetimeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):            
        self.created_features = None
        
    def transform(self, col):
        df_datetime = pd.DataFrame()
        col = col.apply(lambda x: self.parse_dt(x))
        df_datetime[f'weekday_dt'] = col.apply(lambda x: x.weekday())
        df_datetime[f'month_dt'] = col.apply(lambda x: x.month)
        df_datetime[f'day_dt'] = col.apply(lambda x: x.day)
        df_datetime[f'hour_dt'] = col.apply(lambda x: x.hour)
        df_datetime[f'hour_of_week_dt'] = col.apply(lambda x: x.hour + x.weekday() * 24)
        df_datetime[f'minute_of_day_dt'] = col.apply(lambda x: x.minute + x.hour * 60)
        
        if self.created_features is None:
            self.created_features = list(df_datetime.columns)
        else:
            assert self.created_features == list(df_datetime.columns)
        return df_datetime
    
    def fit(self, x, **fit_params):
        return self
    
    def parse_dt(self, x):        
        if isinstance(x, datetime.datetime) or isinstance(x, datetime.date):
            return x
        elif not isinstance(x, str):
            return np.nan
        elif len(x) == len('2010-01-01'):
            return datetime.datetime.strptime(x, '%Y-%m-%d')
        elif len(x) == len('2010-01-01 10:10:10'):
            return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        else:
            return np.nan
        
    def get_feature_names(self):
        return self.created_features

In [134]:
from sklearn.compose import ColumnTransformer

datetime_transformer = ColumnTransformer([
    (col, DatetimeTransformer(), col) 
    for col in datetime_columns
])

# _x = datetime_transformer.fit_transform(df_x)
# _features = datetime_transformer.get_feature_names()
# pd.DataFrame(data=_x, columns=_features)

In [135]:
class LagValues(BaseEstimator, TransformerMixin):
    def __init__(self, by):
        self.by = by
        self.created_features = None
        
    def transform(self, df_x):
        columns_to_shift = [col for col in df_x.columns if col != self.by]
        df_lags = df_x.sort_values(self.by, ascending=True)
        df_lags.drop(self.by, axis=1, inplace=True)
        df_lags = df_lags.shift(-1)
        df_lags.columns = [f'{col}_shift_1' for col in columns_to_shift]
        
        if self.created_features is None:
            self.created_features = list(df_lags.columns)
        else:
            assert self.created_features == list(df_lags.columns)
        return df_lags
        
    def fit(self, x, **fit_params):
        return self
    
    def get_feature_names(self):
        return self.created_features

In [136]:
lag_transformer = ColumnTransformer([
    (col, LagValues(by=col), [col] + numerical_columns) 
    for col in datetime_columns
])

# _x = lag_transformer.fit_transform(df_x)
# _features = lag_transformer.get_feature_names()
# pd.DataFrame(data=_x, columns=_features)

In [139]:
from sklearn.pipeline import FeatureUnion

feature_generator = FeatureUnion([
    ('datetime', datetime_transformer),
    ('lag_numeric', lag_transformer)
], n_jobs=1)

In [142]:
%%time
feature_generator.fit_transform(df_x.head(1000))

CPU times: user 153 ms, sys: 8.81 ms, total: 161 ms
Wall time: 173 ms


array([[4.00000000e+00, 3.00000000e+00, 3.10000000e+01, ...,
        6.66710206e-01, 9.70118879e-01, 7.63741154e-01],
       [3.00000000e+00, 3.00000000e+00, 3.00000000e+01, ...,
        5.40185538e-01, 4.35392339e-01, 6.73276360e-01],
       [5.00000000e+00, 4.00000000e+00, 1.50000000e+01, ...,
        8.35761686e-01, 3.09176040e-01, 3.40171207e-01],
       ...,
       [0.00000000e+00, 3.00000000e+00, 2.70000000e+01, ...,
        7.77300436e-01, 2.28450957e-03, 7.54403534e-01],
       [3.00000000e+00, 4.00000000e+00, 2.00000000e+01, ...,
        5.60477365e-01, 4.30710881e-01, 4.47019213e-01],
       [3.00000000e+00, 3.00000000e+00, 9.00000000e+00, ...,
                   nan,            nan,            nan]])

In [144]:
%%time
feature_generator.transform(df_x.tail(1000))

CPU times: user 135 ms, sys: 6.36 ms, total: 141 ms
Wall time: 145 ms


array([[4.00000000e+00, 2.00000000e+00, 2.40000000e+01, ...,
        2.47276764e-04, 3.55766894e-01, 8.83302774e-01],
       [1.00000000e+00, 8.00000000e+00, 2.90000000e+01, ...,
        3.30443470e-01, 5.75818984e-01, 7.27896819e-01],
       [0.00000000e+00, 5.00000000e+00, 2.90000000e+01, ...,
        3.76982815e-02, 4.98714372e-01, 5.85553918e-02],
       ...,
       [5.00000000e+00, 5.00000000e+00, 6.00000000e+00, ...,
        9.61862674e-01, 5.40502857e-01, 3.69302317e-01],
       [4.00000000e+00, 6.00000000e+00, 1.60000000e+01, ...,
        7.62253520e-01, 5.04556728e-01, 8.62314438e-01],
       [2.00000000e+00, 4.00000000e+00, 1.20000000e+01, ...,
                   nan,            nan,            nan]])