### Baseline

In [117]:
from functools import partial
from IPython.display import display

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from utils import load, transform_datetime_features

Preprocessing

In [133]:
def constant_features(df_x):
    """Return the names of columns that hold exactly one distinct value.

    Distinct values are counted with ``Series.nunique()`` using its default
    settings, so missing values are not counted as a distinct value.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Column labels, in frame order, whose ``nunique()`` equals 1.
    """
    single_valued = []
    for name in df_x.columns:
        if df_x[name].nunique() == 1:
            single_valued.append(name)
    return single_valued

In [134]:
def drop_columns(df_x, cols):
    """Return a copy of ``df_x`` without the columns listed in ``cols``.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Input frame; it is not modified.
    cols : list
        Column labels to remove.

    Returns
    -------
    pandas.DataFrame
        New frame with the requested columns dropped.

    Raises
    ------
    KeyError
        If any label in ``cols`` is not a column of ``df_x``.
    """
    # Use the `cols` argument rather than a notebook-global variable, so the
    # function works on a fresh kernel and inside a stored pipeline step.
    return df_x.drop(columns=cols)

In [135]:
def select_for_encoding(df_x, max_unique=20):
    """Pick the columns to one-hot encode and record their observed levels.

    A column is selected when its number of unique values (as returned by
    ``Series.unique()``) is greater than 2 and at most ``max_unique``.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Frame whose columns are inspected.
    max_unique : int, default 20
        Upper bound (inclusive) on the number of unique values.

    Returns
    -------
    dict
        Maps each selected column label to the array of its unique values.
    """
    selected = {}
    for name in df_x.columns:
        levels = df_x[name].unique()
        n_levels = len(levels)
        # Skip columns with too few or too many levels.
        if n_levels <= 2 or n_levels > max_unique:
            continue
        selected[name] = levels
    return selected

In [136]:
def one_hot_encoding(df_x, categorical):
    """Append 0/1 indicator columns for the given categorical levels.

    For every ``(column, level)`` pair in ``categorical`` a column named
    ``onehot_<column>=<level>`` is added, holding 1 where
    ``df_x[column] == level`` and 0 elsewhere. Levels that do not occur in
    ``df_x`` produce an all-zero column, so the output columns depend only on
    ``categorical`` and not on which levels are present in the data.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Input frame; it is not modified.
    categorical : dict
        Maps a column label to an iterable of levels to encode.

    Returns
    -------
    pandas.DataFrame
        ``df_x`` followed by the indicator columns, on the same row index.
    """
    # Collect all indicators first and build the frame once: inserting
    # columns one by one fragments the DataFrame when there are many levels.
    indicators = {
        f'onehot_{col_name}={unique_value}': (df_x[col_name] == unique_value).astype(int)
        for col_name, unique_values in categorical.items()
        for unique_value in unique_values
    }
    # An explicit index keeps rows aligned with df_x even when there is
    # nothing to encode.
    df_dummies = pd.DataFrame(indicators, index=df_x.index)
    return pd.concat([df_x, df_dummies], axis=1)

In [171]:
def find_missings(df_x):
    """List the columns of ``df_x`` that contain at least one missing value.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in frame order, for which ``isna()`` is true in at
        least one row.
    """
    has_missing = df_x.isna().any(axis=0)
    return df_x.columns[has_missing.values].tolist()

In [172]:
def fill_missings(df_x, columns):
    """Return a copy of ``df_x`` with missing values in ``columns`` set to -1.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable
        Labels of the columns to fill. Other columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_x`` with the listed columns filled.

    Raises
    ------
    KeyError
        If a label in ``columns`` is not a column of ``df_x``.
    """
    df_x = df_x.copy()
    for col_name in columns:
        # Assign the result back: `df[col].fillna(..., inplace=True)` acts on
        # a temporary and does not update the frame under pandas Copy-on-Write.
        df_x[col_name] = df_x[col_name].fillna(-1)
    return df_x

In [173]:
def select_numeric_columns(df_x):
    """Return the column names that start with ``'number'`` or ``'onehot'``.

    Selection is purely by name prefix; column dtypes are not inspected.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Frame whose (string) column names are filtered.

    Returns
    -------
    list
        Matching column names in frame order.
    """
    wanted_prefixes = ('number', 'onehot')
    return [name for name in df_x.columns if name.startswith(wanted_prefixes)]

In [174]:
def keep_columns(df_x, columns):
    """Return an independent copy of ``df_x`` restricted to ``columns``.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Input frame; it is not modified.
    columns : list
        Column labels to keep, in the desired output order.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected columns.
    """
    subset = df_x[columns]
    return subset.copy()

In [185]:
def create_scaler(df_x):
    """Fit a ``StandardScaler`` on ``df_x`` and return the fitted scaler.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Data used to fit the scaler.

    Returns
    -------
    sklearn.preprocessing.StandardScaler
        The fitted scaler instance.
    """
    # `fit` returns the estimator itself, so construction and fitting chain.
    return StandardScaler().fit(df_x)

In [186]:
def scale(df_x, scaler):
    """Apply a fitted scaler to ``df_x`` and return the result as a DataFrame.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Frame to transform.
    scaler : object
        Any object with a ``transform(df)`` method returning an array-like
        with the same shape as ``df_x`` (e.g. a fitted ``StandardScaler``).

    Returns
    -------
    pandas.DataFrame
        Transformed values with the column labels and row index of ``df_x``.
    """
    # Keep the original row index: without it the frame gets a fresh
    # RangeIndex and rows no longer line up with the target after a
    # shuffled train/validation split.
    return pd.DataFrame(
        data=scaler.transform(df_x),
        columns=df_x.columns,
        index=df_x.index,
    )

In [217]:
def make_predictions(df_transformed, model):
    """Return ``model.predict(df_transformed)``.

    Parameters
    ----------
    df_transformed : pandas.DataFrame
        Features that have already been through the preprocessing steps.
    model : object
        Fitted estimator exposing a ``predict`` method.

    Returns
    -------
    object
        Whatever ``model.predict`` returns for the given features.
    """
    return model.predict(df_transformed)

### Архитектура
Функции делятся на добытчиков (miner) и преобразователей (transformer). Добытчик извлекает правила преобразования на основе обучающего множества. Преобразователь применяет правила, извлечённые добытчиком, к данным. Такая архитектура позволяет безболезненно переносить модели с x_train на x_test, даже если в данных имеются весомые различия (например, в x_test отсутствуют какие-то уровни категориальных переменных, что могло бы повлечь ошибки one-hot-encoding).

Modify

In [221]:
def create_pipeline(df_x, target):
    """Learn the preprocessing rules and a Ridge model from training data.

    Each stage first derives its rule from the current training frame (the
    "miner"), then applies it (the "transformer"), and stores the bound
    transformer so the same rule can be replayed on other data. Stages, in
    order: drop constant columns, one-hot encode low-cardinality columns,
    fill missing values, keep ``number*``/``onehot*`` columns, standard-scale,
    and predict with a fitted ``Ridge`` model.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Training features.
    target : array-like
        Training target passed to ``Ridge.fit``.

    Returns
    -------
    tuple
        ``(pipeline, df_transformed, predictions)`` where ``pipeline`` is the
        ordered list of single-argument callables (the last one returns model
        predictions), ``df_transformed`` is the fully preprocessed training
        frame and ``predictions`` are the model's predictions on it.
    """
    steps = []

    def apply_and_record(frame, step):
        # Run the bound transformer on the training frame, then remember it.
        transformed = step(frame)
        steps.append(step)
        return transformed

    const_cols = constant_features(df_x)
    df_x = apply_and_record(df_x, partial(drop_columns, cols=const_cols))

    levels_by_column = select_for_encoding(df_x, max_unique=20)
    df_x = apply_and_record(df_x, partial(one_hot_encoding, categorical=levels_by_column))

    cols_with_na = find_missings(df_x)
    df_x = apply_and_record(df_x, partial(fill_missings, columns=cols_with_na))

    numeric_cols = select_numeric_columns(df_x)
    df_x = apply_and_record(df_x, partial(keep_columns, columns=numeric_cols))

    fitted_scaler = create_scaler(df_x)
    df_transformed = apply_and_record(df_x, partial(scale, scaler=fitted_scaler))

    ridge = Ridge()
    ridge.fit(df_transformed, target)
    predictions = apply_and_record(df_transformed, partial(make_predictions, model=ridge))

    return steps, df_transformed, predictions

In [222]:
# Load the three data splits through the project-local `utils.load` helper.
# Each call returns a pair; the unused half is discarded with `_`.
# NOTE(review): the first argument (1) presumably selects the dataset —
# confirm against `utils.load`.
df_x, target = load(1, 'train')
_, y_test = load(1, 'test-target')
x_test, _ = load(1, 'test')

In [223]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(df_x, target, test_size=.2, random_state=123)

Model

In [225]:
# Fit the preprocessing steps and the model on the training split.
# Note: `x_train` is rebound here to the transformed frame returned by
# create_pipeline, so re-running this cell alone feeds it already-transformed data.
pipeline, x_train, train_predictions = create_pipeline(x_train, y_train)

In [232]:
def predict(df_x, pipeline):
    """Run ``df_x`` through every step of ``pipeline`` and return the result.

    Parameters
    ----------
    df_x : object
        Input handed to the first step (raw features in this notebook).
    pipeline : iterable of callables
        Single-argument callables applied in order; each receives the output
        of the previous one.

    Returns
    -------
    object
        Output of the last step, or ``df_x`` itself if ``pipeline`` is empty.
    """
    current = df_x
    for step in pipeline:
        current = step(current)
    return current

Assess

In [244]:
def root_mean_squared_error(y_true, y_pred):
    """Compute the square root of sklearn's ``mean_squared_error``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        Root mean squared error between ``y_true`` and ``y_pred``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [245]:
def assess(y_true, y_pred):
    """Score predictions against the true target using RMSE.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        Value returned by ``root_mean_squared_error(y_true, y_pred)``.
    """
    return root_mean_squared_error(y_true, y_pred)

In [250]:
# End-to-end run: reload the data, split, fit on train, then score all splits.
# Reloading here makes this cell independent of the earlier load/split cells.
df_x, target = load(1, 'train')
_, y_test = load(1, 'test-target')
x_test, _ = load(1, 'test')

# 80/20 train/validation split with a fixed seed.
x_train, x_valid, y_train, y_valid = train_test_split(df_x, target, test_size=.2, random_state=123)

# Fit preprocessing + model on the training split; `x_train` is rebound to
# the transformed training frame.
pipeline, x_train, train_predictions = create_pipeline(x_train, y_train)
train_rmse = assess(y_train, train_predictions)

# Replay the fitted steps on the raw validation features.
valid_predictions = predict(x_valid, pipeline)
valid_rmse = assess(y_valid, valid_predictions)

# Replay the fitted steps on the raw test features.
test_predictions = predict(x_test, pipeline)
test_rmse = assess(y_test, test_predictions)

print(f'Train RMSE: {train_rmse}, \nValidation RMSE: {valid_rmse}, \nTest RMSE: {test_rmse}')

Train RMSE: 5.491683645412526, 
Validation RMSE: 6.4206762174510885, 
Test RMSE: 11.131148284326269
