Обработка фичей с помощью инструментов sklearn: Pipeline, FeatureUnion, ColumnTransformer, ...<br>
http://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer<br>
http://scikit-learn.org/stable/modules/compose.html<br>
http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html<br>
http://michelleful.github.io/code-blog/2015/06/20/pipelines/<br>

Фиксируем numeric и categorical columns. Ищем datetime переменные.

Для datetime (каждый пункт строится независимо):
    - выделение дней, месяцев, ...

Для numeric (каждый пункт строится независимо):
    - lag-фичи по каждой datetime переменной
    - agg-фичи по месяцам, неделям и дням каждой datetime переменной

Для categorical (каждый пункт строится независимо):
    - dummies
   
Заполняем пропуски: http://scikit-learn.org/stable/modules/impute.html#impute
    
Отдельный пайплайн на обработку пропусков в категориальных и числовых признаках:<br>
http://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html#sphx-glr-auto-examples-compose-plot-column-transformer-mixed-types-py<br>

In [1]:
import time
from functools import partial
from IPython.display import display
from itertools import product

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
from utils import load

task = 4
df_x, target = load(task, 'train')
_, y_test = load(task, 'test-target')
x_test, _ = load(task, 'test')

In [3]:
df_x.shape

(114130, 142)

Разбиваем колонки по типам значений

In [4]:
def group_columns_by_type(df_x, max_categorical_levels):
    max_categorical_levels = 10
    datetime_columns, numerical_columns, categorical_columns, idx_columns, single_value_columns = [], [], [], [], []
    for col in df_x.columns:
        column_unique_values = df_x[col].nunique()
        if column_unique_values <= max_categorical_levels:
            if column_unique_values == 1:
                single_value_columns.append(col)
            else:
                categorical_columns.append(col)
        elif col.startswith('datetime'):
            datetime_columns.append(col)
        elif col.startswith('number'):
            numerical_columns.append(col)
        elif col.startswith('id_') or col.endswith('_id'):
            idx_columns.append(col)
    return datetime_columns, numerical_columns, categorical_columns, idx_columns, single_value_columns

In [5]:
datetime_columns, numerical_columns, categorical_columns, idx_columns, single_value_columns = group_columns_by_type(df_x, 10)

Выделение datetime фичей

In [35]:
# http://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#sklearn.compose.ColumnTransformer
# http://scikit-learn.org/stable/modules/compose.html
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
# http://michelleful.github.io/code-blog/2015/06/20/pipelines/

import datetime
from sklearn.base import BaseEstimator, TransformerMixin

class DatetimeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):            
        self.created_features = None
        
    def transform(self, col):
        df_datetime = pd.DataFrame()
        col = col.apply(lambda x: self.parse_dt(x))
        df_datetime[f'weekday_dt'] = col.apply(lambda x: x.weekday())
        df_datetime[f'month_dt'] = col.apply(lambda x: x.month)
        df_datetime[f'day_dt'] = col.apply(lambda x: x.day)
        df_datetime[f'hour_dt'] = col.apply(lambda x: x.hour)
        df_datetime[f'hour_of_week_dt'] = col.apply(lambda x: x.hour + x.weekday() * 24)
        df_datetime[f'minute_of_day_dt'] = col.apply(lambda x: x.minute + x.hour * 60)
        
        if self.created_features is None:
            self.created_features = list(df_datetime.columns)
        else:
            assert self.created_features == list(df_datetime.columns)
        return df_datetime
    
    def fit(self, x, y=None, **fit_params):
        return self
    
    def parse_dt(self, x):        
        if isinstance(x, datetime.datetime) or isinstance(x, datetime.date):
            return x
        elif not isinstance(x, str):
            return np.nan
        elif len(x) == len('2010-01-01'):
            return datetime.datetime.strptime(x, '%Y-%m-%d')
        elif len(x) == len('2010-01-01 10:10:10'):
            return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        else:
            return np.nan
        
    def get_feature_names(self):
        return self.created_features

In [36]:
from sklearn.compose import ColumnTransformer

datetime_transformer = ColumnTransformer([
    (col, DatetimeTransformer(), col) 
    for col in datetime_columns
])

# _x = datetime_transformer.fit_transform(df_x)
# _features = datetime_transformer.get_feature_names()
# pd.DataFrame(data=_x, columns=_features)

In [37]:
class LagValues(BaseEstimator, TransformerMixin):
    def __init__(self, by):
        self.by = by
        self.created_features = None
        
    def transform(self, df_x):
        columns_to_shift = [col for col in df_x.columns if col != self.by]
        df_lags = df_x.sort_values(self.by, ascending=True)
        df_lags.drop(self.by, axis=1, inplace=True)
        df_lags = df_lags.shift(-1)
        df_lags.columns = [f'{col}_shift_1' for col in columns_to_shift]
        
        if self.created_features is None:
            self.created_features = list(df_lags.columns)
        else:
            assert self.created_features == list(df_lags.columns)
        return df_lags
        
    def fit(self, x, y=None, **fit_params):
        return self
    
    def get_feature_names(self):
        return self.created_features

In [38]:
lag_transformer = ColumnTransformer([
    (col, LagValues(by=col), [col] + numerical_columns) 
    for col in datetime_columns
])

# _x = lag_transformer.fit_transform(df_x)
# _features = lag_transformer.get_feature_names()
# pd.DataFrame(data=_x, columns=_features)

In [39]:
class ColumnsSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.created_features = None
    
    def transform(self, df_x):                
        df_selected = df_x[self.columns].copy()
        self.created_features = list(df_selected)
        return df_selected
    
    def fit(self, x, y=None, **fit_params):
        return self
    
    def get_feature_names(self):
        return self.created_features

In [40]:
from sklearn.pipeline import FeatureUnion

feature_generator = FeatureUnion([
    ('numeric', ColumnsSelector(columns=numerical_columns)),
    ('datetime', datetime_transformer),
    ('lag_numeric', lag_transformer)
], n_jobs=1)

# feature_generator.fit_transform(df_x.head(1000))
# feature_generator.transform(df_x.tail(1000))
# feature_generator.get_feature_names()

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

pipeline = Pipeline([
    ('feature_generator', feature_generator),
    ('imputer', SimpleImputer(strategy='median')),
#     ('model', RandomForestClassifier(n_jobs=1))
])

In [49]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_x, target, test_size=.33, random_state=123)
# pipeline.fit(x_train, y_train)
model = RandomForestClassifier().fit(pipeline.fit_transform(x_train), y_train)



In [54]:
from sklearn.metrics import roc_auc_score

pred = model.predict_proba(pipeline.transform(x_test))[:, 1]
roc_auc_score(y_test, pred)

0.8375851027875603