# Baseline

In [1]:
import gc
import os
import random
import math

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import RobustScaler

In [9]:
train_df = pd.read_csv(f"../data/ventilator-pressure-prediction/train.csv")
test_df = pd.read_csv(f"../data/ventilator-pressure-prediction/test.csv")

In [10]:
CATE_FEATURES = ['R_cate', 'C_cate', 'RC_dot', 'RC_sum']

c_dic = {10: 0, 20: 1, 50:2}
r_dic = {5: 0, 20: 1, 50:2}
rc_sum_dic = {v: i for i, v in enumerate([15, 25, 30, 40, 55, 60, 70, 100])}
rc_dot_dic = {v: i for i, v in enumerate([50, 100, 200, 250, 400, 500, 2500, 1000])}    

def add_category_features(df):
    df['C_cate'] = df['C'].map(c_dic)
    df['R_cate'] = df['R'].map(r_dic)
    df['RC_sum'] = (df['R'] + df['C']).map(rc_sum_dic)
    df['RC_dot'] = (df['R'] * df['C']).map(rc_dot_dic)
    return df

train_df = add_category_features(train_df)
test_df = add_category_features(test_df)

train_df[CATE_FEATURES].to_feather(f"../data/features/train_cate_v1.ftr")
test_df[CATE_FEATURES].to_feather(f"../data/features/test_cate_v1.ftr")

train_df = train_df.drop(CATE_FEATURES, axis=1)
test_df = test_df.drop(CATE_FEATURES, axis=1)
gc.collect()

In [17]:
USE_LAG = 4
LAG_FEATURES = ['breath_time']
LAG_FEATURES += [f'u_in_lag_{i}' for i in range(1, USE_LAG+1)]
LAG_FEATURES += [f'u_in_time{i}' for i in range(1, USE_LAG+1)]
LAG_FEATURES += [f'u_out_lag_{i}' for i in range(1, USE_LAG+1)]

def add_lag_feature(df):
    # https://www.kaggle.com/kensit/improvement-base-on-tensor-bidirect-lstm-0-173
    for lag in range(1, USE_LAG+1):
        df[f'breath_id_lag{lag}']=df['breath_id'].shift(lag).fillna(0)
        df[f'breath_id_lag{lag}same']=np.select([df[f'breath_id_lag{lag}']==df['breath_id']], [1], 0)

        # u_in 
        df[f'u_in_lag_{lag}'] = df['u_in'].shift(lag).fillna(0) * df[f'breath_id_lag{lag}same']
        df[f'u_in_time{lag}'] = df['u_in'] - df[f'u_in_lag_{lag}']
        df[f'u_out_lag_{lag}'] = df['u_out'].shift(lag).fillna(0) * df[f'breath_id_lag{lag}same']

    # breath_time
    df['time_step_lag'] = df['time_step'].shift(1).fillna(0) * df[f'breath_id_lag{lag}same']
    df['breath_time'] = df['time_step'] - df['time_step_lag']

    drop_columns = ['time_step_lag']
    drop_columns += [f'breath_id_lag{i}' for i in range(1, USE_LAG+1)]
    drop_columns += [f'breath_id_lag{i}same' for i in range(1, USE_LAG+1)]
    df = df.drop(drop_columns, axis=1)

    # fill na by zero
    df = df.fillna(0)
    return df

train_df = add_lag_feature(train_df)
test_df = add_lag_feature(test_df)

train_df[LAG_FEATURES].to_feather(f"../data/features/train_lag_v1.ftr")
test_df[LAG_FEATURES].to_feather(f"../data/features/test_lag_v1.ftr")

train_df = train_df.drop(LAG_FEATURES, axis=1)
test_df = test_df.drop(LAG_FEATURES, axis=1)
gc.collect()

8

In [26]:
ROLLING = [2, 4, 8]
ROLLING_FEATURES = [f"u_in_rolling_mean{w}" for w in ROLLING]
ROLLING_FEATURES += [f"u_in_rolling_max{w}" for w in ROLLING]
ROLLING_FEATURES += [f"u_in_rolling_min{w}" for w in ROLLING]
ROLLING_FEATURES += [f"u_in_rolling_std{w}" for w in ROLLING]
    
def add_rolling_features(df):
    for w in ROLLING:
        print('  ', w, 'mean')
        df[f"u_in_rolling_mean{w}"] = df[["breath_id", "u_in"]].groupby("breath_id").rolling(w).mean()["u_in"].reset_index(drop=True)
        print('  ', w, 'max')
        df[f"u_in_rolling_max{w}"] = df[["breath_id", "u_in"]].groupby("breath_id").rolling(w).max()["u_in"].reset_index(drop=True)
        print('  ', w, 'min')
        df[f"u_in_rolling_min{w}"] = df[["breath_id", "u_in"]].groupby("breath_id").rolling(w).min()["u_in"].reset_index(drop=True)
        print('  ', w, 'std')
        df[f"u_in_rolling_std{w}"] = df[["breath_id", "u_in"]].groupby("breath_id").rolling(w).std()["u_in"].reset_index(drop=True)
    df = df.fillna(0)
    return df

print('train')
train_df = add_rolling_features(train_df)
print('test')
test_df = add_rolling_features(test_df)

train_df[ROLLING_FEATURES].to_feather(f"../data/features/train_roll_v1.ftr")
test_df[ROLLING_FEATURES].to_feather(f"../data/features/test_roll_v1.ftr")

train_df = train_df.drop(ROLLING_FEATURES, axis=1)
test_df = test_df.drop(ROLLING_FEATURES, axis=1)
gc.collect()

train
   2 mean
   2 max
   2 min
   2 std
   4 mean
   4 max
   4 min
   4 std
   8 mean
   8 max
   8 min
   8 std
test
   2 mean
   2 max
   2 min
   2 std
   4 mean
   4 max
   4 min
   4 std
   8 mean
   8 max
   8 min
   8 std


0

In [23]:
CONT_FEATURES = ['u_in_cumsum', 'u_in_cummean', 'area', 'cross', 'cross2']

def add_cont_feature(df):
    df['time_delta'] = df.groupby('breath_id')['time_step'].diff().fillna(0)
    df['delta'] = df['time_delta'] * df['u_in']
    df['area'] = df.groupby('breath_id')['delta'].cumsum()

    df['cross']= df['u_in']*df['u_out']
    df['cross2']= df['time_step']*df['u_out']
    
    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()
    df['one'] = 1
    df['count'] = (df['one']).groupby(df['breath_id']).cumsum()
    df['u_in_cummean'] =df['u_in_cumsum'] / df['count']
    
    df = df.drop(['count','one'], axis=1)
    return df

train_df = add_cont_feature(train_df)
test_df = add_cont_feature(test_df)

train_df[CONT_FEATURES].to_feather(f"../data/features/train_cont_v1.ftr")
test_df[CONT_FEATURES].to_feather(f"../data/features/test_cont_v1.ftr")

train_df = train_df.drop(CONT_FEATURES, axis=1)
test_df = test_df.drop(CONT_FEATURES, axis=1)

gc.collect()

0

In [None]:
train_df[ROLLING_FEATURES].to_feather(f"../data/features/train_roll_v1.ftr")
test_df[ROLLING_FEATURES].to_feather(f"../data/features/test_roll_v1.ftr")

In [27]:
use_features = ["lag_v1", "cate_v1", "cont_v1", "roll_v1"]

In [None]:
ORG_FEATURES = ['u_in', 'u_out', 'time_step']

dfs = [train_df[ORG_FEATURES]]
for fname in use_features:
    _df = pd.read_feather(f"../data/features/train_{fname}.ftr")
    dfs.append(_df)
feat_train = pd.concat(dfs ,axis=1)

dfs = [test_df[ORG_FEATURES]]
for fname in use_features:
    _df = pd.read_feather(f"../data/features/test_{fname}.ftr")
    dfs.append(_df)
feat_test = pd.concat(dfs ,axis=1)

del dfs, _df

gc.collect()

In [38]:
NORM_FEATURES = CONT_FEATURES + LAG_FEATURES + ROLLING_FEATURES + ['u_in', 'time_step']

def norm_scale(train_df, test_df):
    scaler = RobustScaler()
    all_u_in = np.vstack([train_df[NORM_FEATURES].values, test_df[NORM_FEATURES].values])
    scaler.fit(all_u_in)
    train_df[NORM_FEATURES] = scaler.transform(train_df[NORM_FEATURES].values)
    test_df[NORM_FEATURES] = scaler.transform(test_df[NORM_FEATURES].values)
    return train_df, test_df


feat_train, feat_test = norm_scale(feat_train, feat_test)

In [39]:
feat_train.to_feather(f"../data/features/train_v1_all_norm.ftr")
feat_test.to_feather(f"../data/features/test_v1_all_norm.ftr")

In [35]:
!ls ../data/features/

test_cate_v1.ftr      test_v1_all_norm.ftr  train_roll_v1.ftr
test_cont_v1.ftr      train_cate_v1.ftr     train_v1_all_norm.ftr
test_lag_v1.ftr       train_cont_v1.ftr
test_roll_v1.ftr      train_lag_v1.ftr


In [40]:
feat_train

Unnamed: 0,u_in,u_out,time_step,breath_time,u_in_lag_1,u_in_lag_2,u_in_lag_3,u_in_lag_4,u_in_time1,u_in_time2,...,u_in_rolling_mean8,u_in_rolling_max2,u_in_rolling_max4,u_in_rolling_max8,u_in_rolling_min2,u_in_rolling_min4,u_in_rolling_min8,u_in_rolling_std2,u_in_rolling_std4,u_in_rolling_std8
0,-0.938051,-1.0,-0.989105,-15.407460,-0.864294,-0.841938,-0.818664,-0.793497,0.472113,0.218070,...,-0.843952,-1.037090,-0.915159,-0.857995,-0.814921,-0.660482,-0.289212,-0.149243,-0.201996,-0.270651
1,3.054611,-1.0,-0.963659,0.077784,-0.847520,-0.841938,-0.818664,-0.793497,113.649156,58.785756,...,-0.843952,3.179717,-0.915159,-0.857995,-0.798138,-0.660482,-0.289212,32.412620,-0.201996,-0.270651
2,3.954880,-1.0,-0.938055,15.659572,2.835938,-0.825195,-0.818664,-0.793497,25.590379,71.724949,...,-0.843952,4.126217,-0.915159,-0.857995,2.887529,-0.660482,-0.289212,7.192838,-0.201996,-0.270651
3,4.020235,-1.0,-0.912325,31.317608,3.666488,2.851513,-0.801907,-0.793497,1.815413,14.115946,...,-0.843952,4.194928,3.617129,-0.857995,3.718577,-0.643557,-0.289212,0.383756,14.809927,-0.270651
4,4.575950,-1.0,-0.886455,0.335930,3.726782,3.680541,2.877892,-0.776723,15.778870,9.061736,...,-0.843952,4.779180,4.123244,-0.857995,3.778907,3.073201,-0.289212,4.382850,3.821394,-0.270651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035995,-0.631204,0.0,0.904713,-0.033022,-0.578326,-0.570057,-0.545674,-0.519295,0.383070,0.388254,...,-0.546211,-0.695371,-0.619141,-0.583184,-0.528782,-0.385640,-0.028487,-0.026463,-0.112520,-0.210059
6035996,-0.631470,0.0,0.929935,-0.057817,-0.564437,-0.556494,-0.546555,-0.520236,-0.053195,0.168307,...,-0.539397,-0.695371,-0.619141,-0.583184,-0.515130,-0.385640,-0.004149,-0.147078,-0.111227,-0.220918
6035997,-0.616092,0.0,0.955226,-0.016786,-0.564682,-0.542631,-0.532980,-0.521117,0.392258,0.173040,...,-0.534334,-0.679483,-0.605378,-0.570407,-0.515130,-0.371929,-0.004149,-0.023832,-0.123284,-0.203294
6035998,-0.678561,0.0,0.980433,-0.067361,-0.550495,-0.542875,-0.519105,-0.507529,-1.824488,-0.739407,...,-0.536904,-0.679483,-0.605378,-0.570407,-0.558600,-0.401998,-0.021116,0.360215,-0.028664,-0.190003


In [None]:
feat_train.shape