# Make feature values

In [3]:
import gc
import os
import random
import math

import numpy as np
import pandas as pd

from sklearn.preprocessing import RobustScaler

%matplotlib inline

In [4]:
# Load the raw train and test splits; every feature cell below adds columns to these two frames.
train_df = pd.read_csv(f"../data/ventilator-pressure-prediction/train.csv")
test_df = pd.read_csv(f"../data/ventilator-pressure-prediction/test.csv")

## Categorical Features

In [None]:
# Columns produced by add_category_features.
CATE_FEATURES = ['R_cate', 'C_cate', 'RC_dot', 'RC_sum']

# Lookup tables: raw value -> integer category code.
c_dic = {10: 0, 20: 1, 50: 2}
r_dic = {5: 0, 20: 1, 50: 2}
rc_sum_dic = dict(zip([15, 25, 30, 40, 55, 60, 70, 100], range(8)))
# NOTE(review): 2500 precedes 1000, so these codes do not follow numeric order - confirm intended.
rc_dot_dic = dict(zip([50, 100, 200, 250, 400, 500, 2500, 1000], range(8)))


def add_category_features(df):
    """Add integer category codes derived from the R and C columns.

    Writes four columns onto ``df`` itself and returns the same object:
    ``C_cate`` and ``R_cate`` (codes for C and R), ``RC_sum`` (code for R + C)
    and ``RC_dot`` (code for R * C). Values missing from a lookup table map
    to NaN.
    """
    encodings = (
        ('C_cate', df['C'], c_dic),
        ('R_cate', df['R'], r_dic),
        ('RC_sum', df['R'] + df['C'], rc_sum_dic),
        ('RC_dot', df['R'] * df['C'], rc_dot_dic),
    )
    for column, raw_values, table in encodings:
        df[column] = raw_values.map(table)
    return df

# Train split: build the category codes, persist only those columns, then
# remove them again so the working frame goes back to its previous columns.
train_df = add_category_features(train_df)
train_df[CATE_FEATURES].to_feather(f"../data/features/train_cate_v1.ftr")
train_df = train_df.drop(columns=CATE_FEATURES)

# Test split: same three steps.
test_df = add_category_features(test_df)
test_df[CATE_FEATURES].to_feather(f"../data/features/test_cate_v1.ftr")
test_df = test_df.drop(columns=CATE_FEATURES)

gc.collect()

## Lag Features

In [None]:
# Number of preceding rows to look back at.
USE_LAG = 4
# Columns produced by add_lag_feature.
LAG_FEATURES = ['breath_time']
LAG_FEATURES += [f'u_in_lag_{i}' for i in range(1, USE_LAG+1)]
LAG_FEATURES += [f'u_in_time{i}' for i in range(1, USE_LAG+1)]
LAG_FEATURES += [f'u_out_lag_{i}' for i in range(1, USE_LAG+1)]


def add_lag_feature(df):
    """Add lagged u_in / u_out features and the per-row time increment.

    For every lag ``k`` in ``1..USE_LAG`` the frame gains:

    * ``u_in_lag_k``  - ``u_in`` from ``k`` rows earlier, or 0 when that row
      has a different ``breath_id`` (or does not exist);
    * ``u_in_timek``  - ``u_in`` minus ``u_in_lag_k``;
    * ``u_out_lag_k`` - ``u_out`` from ``k`` rows earlier, masked the same way.

    ``breath_time`` is ``time_step`` minus the previous row's ``time_step``,
    where the previous value counts as 0 on the first row of a breath.

    Rows are compared positionally via ``shift``.
    NOTE(review): assumes the rows of one breath are adjacent and in time
    order - confirm against the input files.

    The feature columns are written onto ``df``; the returned frame is the
    result of ``fillna(0)`` on it.
    """
    # https://www.kaggle.com/kensit/improvement-base-on-tensor-bidirect-lstm-0-173
    breath_id = df['breath_id']

    def same_breath(lag):
        # 1 where the row `lag` positions earlier has the same breath_id, else 0.
        return (breath_id.shift(lag).fillna(0) == breath_id).astype(int)

    for lag in range(1, USE_LAG+1):
        mask = same_breath(lag)

        # u_in
        df[f'u_in_lag_{lag}'] = df['u_in'].shift(lag).fillna(0) * mask
        df[f'u_in_time{lag}'] = df['u_in'] - df[f'u_in_lag_{lag}']
        df[f'u_out_lag_{lag}'] = df['u_out'].shift(lag).fillna(0) * mask

    # breath_time: the previous time_step is a lag-1 quantity, so it must be
    # masked with the lag-1 flag. Reusing the flag of the last loop iteration
    # (lag == USE_LAG) zeroes the previous time_step on rows 2..USE_LAG of
    # every breath and turns breath_time into the raw time_step there.
    previous_time_step = df['time_step'].shift(1).fillna(0) * same_breath(1)
    df['breath_time'] = df['time_step'] - previous_time_step

    # fill na by zero
    return df.fillna(0)

# Train split: build the lag features, persist only those columns, then
# remove them from the working frame.
train_df = add_lag_feature(train_df)
train_df[LAG_FEATURES].to_feather(f"../data/features/train_lag_v1.ftr")
train_df = train_df.drop(columns=LAG_FEATURES)

# Test split: same three steps.
test_df = add_lag_feature(test_df)
test_df[LAG_FEATURES].to_feather(f"../data/features/test_lag_v1.ftr")
test_df = test_df.drop(columns=LAG_FEATURES)

gc.collect()

## Rolling Features

In [None]:
# Window lengths (in rows) for the rolling statistics.
ROLLING = [2, 4, 8]
# Columns produced by add_rolling_features.
ROLLING_FEATURES = [f"u_in_rolling_mean{w}" for w in ROLLING]
ROLLING_FEATURES += [f"u_in_rolling_max{w}" for w in ROLLING]
ROLLING_FEATURES += [f"u_in_rolling_min{w}" for w in ROLLING]
ROLLING_FEATURES += [f"u_in_rolling_std{w}" for w in ROLLING]


def add_rolling_features(df):
    """Add per-breath rolling mean / max / min / std of ``u_in``.

    For every window length ``w`` in ``ROLLING`` the columns
    ``u_in_rolling_mean{w}``, ``u_in_rolling_max{w}``, ``u_in_rolling_min{w}``
    and ``u_in_rolling_std{w}`` are written onto ``df``. Windows never cross a
    ``breath_id`` boundary; rows without a full window get 0 from the final
    ``fillna(0)``, which is applied to the whole frame.

    Returns the frame produced by ``fillna(0)``.
    """
    for w in ROLLING:
        # One grouped rolling object per window length, shared by all four statistics.
        windowed = df.groupby("breath_id")["u_in"].rolling(w)
        # Dropping only the breath_id level keeps the original row labels, so
        # the assignment aligns on the index. A positional reset_index(drop=True)
        # is correct only when rows are already sorted by breath_id and carry
        # a default RangeIndex.
        df[f"u_in_rolling_mean{w}"] = windowed.mean().reset_index(level=0, drop=True)
        df[f"u_in_rolling_max{w}"] = windowed.max().reset_index(level=0, drop=True)
        df[f"u_in_rolling_min{w}"] = windowed.min().reset_index(level=0, drop=True)
        df[f"u_in_rolling_std{w}"] = windowed.std().reset_index(level=0, drop=True)
    return df.fillna(0)

# Train split: build the rolling statistics, persist only those columns, then
# remove them from the working frame.
train_df = add_rolling_features(train_df)
train_df[ROLLING_FEATURES].to_feather(f"../data/features/train_roll_v1.ftr")
train_df = train_df.drop(columns=ROLLING_FEATURES)

# Test split: same three steps.
test_df = add_rolling_features(test_df)
test_df[ROLLING_FEATURES].to_feather(f"../data/features/test_roll_v1.ftr")
test_df = test_df.drop(columns=ROLLING_FEATURES)

gc.collect()

## Continuous Features

In [None]:
# Columns produced by add_cont_feature that are saved as features.
CONT_FEATURES = ['u_in_cumsum', 'u_in_cummean', 'area', 'cross', 'cross2']


def add_cont_feature(df):
    """Add cumulative and interaction features, computed per ``breath_id``.

    Columns written onto ``df`` (the same object is returned):

    * ``time_delta``   - difference between consecutive ``time_step`` values
      inside a breath, 0 on the first row of each breath;
    * ``delta``        - ``time_delta * u_in``;
    * ``area``         - running sum of ``delta`` inside the breath;
    * ``cross``        - ``u_in * u_out``;
    * ``cross2``       - ``time_step * u_out``;
    * ``u_in_cumsum``  - running sum of ``u_in`` inside the breath;
    * ``u_in_cummean`` - ``u_in_cumsum`` divided by the 1-based row position
      inside the breath.

    ``time_delta`` and ``delta`` are intermediate values that stay on the
    frame; they are not listed in ``CONT_FEATURES``.
    """
    df['time_delta'] = df.groupby('breath_id')['time_step'].diff().fillna(0)
    df['delta'] = df['time_delta'] * df['u_in']
    df['area'] = df.groupby('breath_id')['delta'].cumsum()

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()
    # cumcount() is 0-based, so +1 gives the number of rows seen so far in the
    # breath. This replaces the scratch columns 'one' / 'count', which were
    # left behind on the caller's frame and needed a full-frame drop() copy.
    rows_so_far = df.groupby('breath_id').cumcount() + 1
    df['u_in_cummean'] = df['u_in_cumsum'] / rows_so_far
    return df

# Train split: build the cumulative features, persist only the CONT_FEATURES
# columns, then remove those columns from the working frame.
train_df = add_cont_feature(train_df)
train_df[CONT_FEATURES].to_feather(f"../data/features/train_cont_v1.ftr")
train_df = train_df.drop(columns=CONT_FEATURES)

# Test split: same three steps.
test_df = add_cont_feature(test_df)
test_df[CONT_FEATURES].to_feather(f"../data/features/test_cont_v1.ftr")
test_df = test_df.drop(columns=CONT_FEATURES)

gc.collect()

## Extract V1
### Re-Load Features

In [None]:
# Suffixes of the feature files to reload; each one names a
# train_<suffix>.ftr / test_<suffix>.ftr pair under ../data/features/.
use_features = ["lag_v1", "cate_v1", "cont_v1", "roll_v1"]

In [None]:
# Raw input columns that go into the combined feature table unchanged.
ORG_FEATURES = ['u_in', 'u_out', 'time_step']

# Train: raw columns first, then every saved feature file, joined column-wise.
feat_train = pd.concat(
    [train_df[ORG_FEATURES]]
    + [pd.read_feather(f"../data/features/train_{fname}.ftr") for fname in use_features],
    axis=1,
)

# Test: same layout.
feat_test = pd.concat(
    [test_df[ORG_FEATURES]]
    + [pd.read_feather(f"../data/features/test_{fname}.ftr") for fname in use_features],
    axis=1,
)

gc.collect()

### Normalize

In [None]:
# Continuous columns to scale; every other column is left untouched.
NORM_FEATURES = CONT_FEATURES + LAG_FEATURES + ROLLING_FEATURES + ['u_in', 'time_step']


def norm_scale(train_df, test_df):
    """Scale the ``NORM_FEATURES`` columns of both frames with one RobustScaler.

    The scaler is fit on the train and test rows stacked together and then
    applied to each frame. Both frames are modified in place; the same two
    objects are returned as ``(train_df, test_df)``.
    """
    stacked = np.concatenate(
        [train_df[NORM_FEATURES].values, test_df[NORM_FEATURES].values],
        axis=0,
    )
    scaler = RobustScaler().fit(stacked)
    for frame in (train_df, test_df):
        frame[NORM_FEATURES] = scaler.transform(frame[NORM_FEATURES].values)
    return train_df, test_df


feat_train, feat_test = norm_scale(feat_train, feat_test)

In [None]:
# Persist the combined V1 feature tables as returned by norm_scale.
feat_train.to_feather(f"../data/features/train_v1_all_norm.ftr")
feat_test.to_feather(f"../data/features/test_v1_all_norm.ftr")

!ls ../data/features/

## Lag Back Features

In [None]:
# Number of following rows to look ahead at.
USE_LAG = 4
# Columns produced by add_lag_back_feature.
LAG_BACK_FEATURES = (
    [f'u_in_lag_{i}_back' for i in range(1, USE_LAG+1)]
    + [f'u_in_time{i}_back' for i in range(1, USE_LAG+1)]
    + [f'u_out_lag_{i}_back' for i in range(1, USE_LAG+1)]
)


def add_lag_back_feature(df):
    """Add look-ahead versions of the u_in / u_out lag features.

    For every ``k`` in ``1..USE_LAG`` the frame gains:

    * ``u_in_lag_k_back``  - ``u_in`` from ``k`` rows later, or 0 when that row
      has a different ``breath_id`` (or does not exist);
    * ``u_in_timek_back``  - ``u_in`` minus ``u_in_lag_k_back``;
    * ``u_out_lag_k_back`` - ``u_out`` from ``k`` rows later, masked the same way.

    Helper columns are written onto ``df`` while computing; the returned frame
    has them dropped and remaining NaN values replaced by 0.
    """
    # https://www.kaggle.com/kensit/improvement-base-on-tensor-bidirect-lstm-0-173
    helper_columns = []
    for lag in range(1, USE_LAG+1):
        next_id = f'breath_id_lag{lag}_back'
        same_flag = f'breath_id_lag{lag}same_back'
        helper_columns += [next_id, same_flag]

        # breath_id of the row `lag` positions ahead, and a 1/0 flag for "same breath".
        df[next_id] = df['breath_id'].shift(-lag).fillna(0)
        df[same_flag] = np.where(df[next_id] == df['breath_id'], 1, 0)

        # u_in
        future_u_in = df['u_in'].shift(-lag).fillna(0)
        df[f'u_in_lag_{lag}_back'] = future_u_in * df[same_flag]
        df[f'u_in_time{lag}_back'] = df['u_in'] - df[f'u_in_lag_{lag}_back']

        # u_out
        future_u_out = df['u_out'].shift(-lag).fillna(0)
        df[f'u_out_lag_{lag}_back'] = future_u_out * df[same_flag]

    # Drop the helpers, then fill na by zero.
    return df.drop(columns=helper_columns).fillna(0)

# Train split: build the look-ahead features, persist only those columns, then
# remove them from the working frame.
train_df = add_lag_back_feature(train_df)
train_df[LAG_BACK_FEATURES].to_feather(f"../data/features/train_lag_back_v1.ftr")
train_df = train_df.drop(columns=LAG_BACK_FEATURES)

# Test split: same three steps.
test_df = add_lag_back_feature(test_df)
test_df[LAG_BACK_FEATURES].to_feather(f"../data/features/test_lag_back_v1.ftr")
test_df = test_df.drop(columns=LAG_BACK_FEATURES)

gc.collect()

## Continuous Features V2

In [None]:
# Columns produced by add_cont_feature_v2.
CONT_FEATURES_V2 = ['u_in_mean', 'u_in_std', 'breath_id_u_out_mean', 'last_value_u_in', 'first_value_u_in']


def add_cont_feature_v2(df):
    """Add whole-breath summary statistics, repeated on every row of the breath.

    Columns written onto ``df`` (the same object is returned):

    * ``u_in_mean``            - mean of ``u_in`` over the breath;
    * ``u_in_std``             - standard deviation of ``u_in`` over the breath;
    * ``breath_id_u_out_mean`` - mean of ``u_out`` over the breath;
    * ``last_value_u_in``      - last ``u_in`` value of the breath;
    * ``first_value_u_in``     - first ``u_in`` value of the breath.
    """
    # Build both grouped views before any column is added to the frame.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_out_by_breath = df.groupby('breath_id')['u_out']

    df['u_in_mean'] = u_in_by_breath.transform('mean')
    df['u_in_std'] = u_in_by_breath.transform('std')

    # transform() returns one value per row. Assigning the plain groupby
    # aggregate aligns its breath_id index against the row index, which puts
    # values on unrelated rows and NaN elsewhere. The statistic is the mean,
    # as the column name states.
    df['breath_id_u_out_mean'] = u_out_by_breath.transform('mean')

    df['last_value_u_in'] = u_in_by_breath.transform('last')
    df['first_value_u_in'] = u_in_by_breath.transform('first')

    return df

# Train split: build the per-breath summaries, persist only those columns, then
# remove them from the working frame.
train_df = add_cont_feature_v2(train_df)
train_df[CONT_FEATURES_V2].to_feather(f"../data/features/train_cont_v2.ftr")
train_df = train_df.drop(columns=CONT_FEATURES_V2)

# Test split: same three steps.
test_df = add_cont_feature_v2(test_df)
test_df[CONT_FEATURES_V2].to_feather(f"../data/features/test_cont_v2.ftr")
test_df = test_df.drop(columns=CONT_FEATURES_V2)

gc.collect()

## EWM

In [None]:
# Half-lives (in rows) for the exponentially weighted statistics.
EWM_LST = [8, 16, 32]
# Columns produced by add_ewm_feature.
EWM_FEATURES = [f'ewm_u_in_mean_{i}' for i in EWM_LST]
EWM_FEATURES += [f'ewm_u_in_std_{i}' for i in EWM_LST]
EWM_FEATURES += [f'ewm_u_in_corr_{i}' for i in EWM_LST]


def add_ewm_feature(df):
    """Add per-breath exponentially weighted statistics of ``u_in``.

    For every half-life ``e`` in ``EWM_LST`` the columns ``ewm_u_in_mean_{e}``,
    ``ewm_u_in_std_{e}`` and ``ewm_u_in_corr_{e}`` are written onto ``df``,
    each computed with ``halflife=e``. The same object is returned.

    NaN values produced by the window functions are left as they are.
    NOTE(review): ``corr()`` is called without a second series - confirm that
    this is the intended feature.
    """
    for e in EWM_LST:
        # The half-life must follow the loop variable. With fixed 8 / 16 / 32
        # values, the three columns of each statistic come out identical.
        weighted = df.groupby('breath_id')['u_in'].ewm(halflife=e)
        # Dropping the breath_id level restores the row index for aligned assignment.
        df[f'ewm_u_in_mean_{e}'] = weighted.mean().reset_index(level=0, drop=True)
        df[f'ewm_u_in_std_{e}'] = weighted.std().reset_index(level=0, drop=True)
        df[f'ewm_u_in_corr_{e}'] = weighted.corr().reset_index(level=0, drop=True)
    return df

# Train split: build the EWM features, persist only those columns, then
# remove them from the working frame.
train_df = add_ewm_feature(train_df)
train_df[EWM_FEATURES].to_feather(f"../data/features/train_ewm_v1.ftr")
train_df = train_df.drop(columns=EWM_FEATURES)

# Test split: same three steps.
test_df = add_ewm_feature(test_df)
test_df[EWM_FEATURES].to_feather(f"../data/features/test_ewm_v1.ftr")
test_df = test_df.drop(columns=EWM_FEATURES)

gc.collect()

## Extract V2

In [None]:
# Suffixes of the V2 feature files to reload; each one names a
# train_<suffix>.ftr / test_<suffix>.ftr pair under ../data/features/.
use_features = ["lag_back_v1", "cont_v2", "ewm_v1"]

In [None]:
# Join every saved V2 feature file column-wise, once per split.
feat_train_v2 = pd.concat(
    [pd.read_feather(f"../data/features/train_{fname}.ftr") for fname in use_features],
    axis=1,
)
feat_test_v2 = pd.concat(
    [pd.read_feather(f"../data/features/test_{fname}.ftr") for fname in use_features],
    axis=1,
)

gc.collect()

### Normalize V2

In [None]:
# V2 columns to scale.
NORM_FEATURES_V2 = LAG_BACK_FEATURES + CONT_FEATURES_V2 + EWM_FEATURES


def norm_scale_v2(train_df, test_df):
    """Scale the ``NORM_FEATURES_V2`` columns of both frames with one RobustScaler.

    The scaler is fit on the train and test rows stacked together and then
    applied to each frame. Both frames are modified in place; the same two
    objects are returned as ``(train_df, test_df)``.
    """
    stacked = np.concatenate(
        [train_df[NORM_FEATURES_V2].values, test_df[NORM_FEATURES_V2].values],
        axis=0,
    )
    scaler = RobustScaler().fit(stacked)
    for frame in (train_df, test_df):
        frame[NORM_FEATURES_V2] = scaler.transform(frame[NORM_FEATURES_V2].values)
    return train_df, test_df


feat_train_v2, feat_test_v2 = norm_scale_v2(feat_train_v2, feat_test_v2)

In [None]:
# Persist the combined V2 feature tables as returned by norm_scale_v2.
feat_train_v2.to_feather(f"../data/features/train_v2_all_norm.ftr")
feat_test_v2.to_feather(f"../data/features/test_v2_all_norm.ftr")

!ls ../data/features/

## Target Encoding

In [5]:
# Pressure statistics of the training rows for each R value: over all rows of
# the group, and over the rows with u_out == 0 only.
feat = {}
for r, group in train_df.groupby('R'):
    pressure = group['pressure']
    pressure_u_out0 = group.query('u_out==0')['pressure']
    feat[r] = {
        f"R_mean": pressure.mean(),
        f"R_std": pressure.std(),
        f"R_mean_u_out0": pressure_u_out0.mean(),
        f"R_std_u_out0": pressure_u_out0.std(),
    }

# Look the statistics up per row; train and test both use the training statistics.
train_target_r = pd.DataFrame(train_df['R'].map(feat).to_list())
test_target_r = pd.DataFrame(test_df['R'].map(feat).to_list())

In [13]:
# Pressure statistics of the training rows for each C value: over all rows of
# the group, and over the rows with u_out == 0 only.
feat = {}
for c, group in train_df.groupby('C'):
    pressure = group['pressure']
    pressure_u_out0 = group.query('u_out==0')['pressure']
    feat[c] = {
        f"C_mean": pressure.mean(),
        f"C_std": pressure.std(),
        f"C_mean_u_out0": pressure_u_out0.mean(),
        f"C_std_u_out0": pressure_u_out0.std(),
    }

# Look the statistics up per row; train and test both use the training statistics.
train_target_c = pd.DataFrame(train_df['C'].map(feat).to_list())
test_target_c = pd.DataFrame(test_df['C'].map(feat).to_list())

In [7]:
# Composite "<R>_<C>" key used to group rows by the (R, C) pair.
# Built with vectorized string concatenation instead of a row-wise
# apply(axis=1), which calls a Python lambda once per row of the frame.
# Casting through float gives both splits the same label format whatever the
# integer/float dtype of R and C. It also mirrors the float-formatted labels
# that a row-wise apply yields on an all-numeric frame.
# NOTE(review): presumably the case for these frames - verify if exact label text matters.
train_df['R_C'] = train_df['R'].astype(float).astype(str) + "_" + train_df['C'].astype(float).astype(str)
test_df['R_C'] = test_df['R'].astype(float).astype(str) + "_" + test_df['C'].astype(float).astype(str)

In [9]:
# Pressure statistics of the training rows for each R_C key: over all rows of
# the group, and over the rows with u_out == 0 only.
feat = {}
# Group by the plain column name. With a one-element list, pandas >= 2.0
# yields 1-tuple keys such as ('20.0_50.0',), which never match the string
# values looked up by map() below and leave every RC feature NaN.
for rc, df in train_df.groupby('R_C'):
    pressure = df['pressure']
    pressure_u_out0 = df.query('u_out==0')['pressure']
    feat[rc] = {
        "RC_mean": pressure.mean(),
        "RC_std": pressure.std(),
        "RC_mean_u_out0": pressure_u_out0.mean(),
        "RC_std_u_out0": pressure_u_out0.std(),
    }

# Look the statistics up per row; train and test both use the training statistics.
train_target_rc = pd.DataFrame(train_df['R_C'].map(feat).tolist())
test_target_rc = pd.DataFrame(test_df['R_C'].map(feat).tolist())

In [14]:
# Put the R, C and R_C statistics side by side, one table per split.
train_target_features = pd.concat(
    [train_target_r, train_target_c, train_target_rc],
    axis="columns",
)
test_target_features = pd.concat(
    [test_target_r, test_target_c, test_target_rc],
    axis="columns",
)

# Persist the unscaled target-encoding tables.
train_target_features.to_feather(f"../data/features/train_target_feat_v1.ftr")
test_target_features.to_feather(f"../data/features/test_target_feat_v1.ftr")

In [16]:
# NOTE(review): these are the same two writes, to the same paths, as the end of
# the preceding cell, so in a top-to-bottom run this cell is redundant.
# norm_scale_v3 below scales these frames in place; running this cell after it
# would overwrite the *_target_feat_v1 files with scaled values.
train_target_features.to_feather(f"../data/features/train_target_feat_v1.ftr")
test_target_features.to_feather(f"../data/features/test_target_feat_v1.ftr")

In [18]:
# Target-encoding columns to scale: four statistics each for R, C and R_C.
TARGET_FEATURES = [
    'R_mean', 'R_std', 'R_mean_u_out0', 'R_std_u_out0',
    'C_mean', 'C_std', 'C_mean_u_out0', 'C_std_u_out0',
    'RC_mean', 'RC_std', 'RC_mean_u_out0', 'RC_std_u_out0'
]


def norm_scale_v3(train_df, test_df):
    """Scale the ``TARGET_FEATURES`` columns of both frames with one RobustScaler.

    The scaler is fit on the train and test rows stacked together and then
    applied to each frame. Both frames are modified in place; the same two
    objects are returned as ``(train_df, test_df)``.
    """
    stacked = np.concatenate(
        [train_df[TARGET_FEATURES].values, test_df[TARGET_FEATURES].values],
        axis=0,
    )
    scaler = RobustScaler().fit(stacked)
    for frame in (train_df, test_df):
        frame[TARGET_FEATURES] = scaler.transform(frame[TARGET_FEATURES].values)
    return train_df, test_df


# feat_train_v3 / feat_test_v3 are the same objects as train_target_features /
# test_target_features, which therefore hold the scaled values from here on.
feat_train_v3, feat_test_v3 = norm_scale_v3(train_target_features, test_target_features)

In [20]:
# Persist the scaled target-encoding tables as returned by norm_scale_v3.
feat_train_v3.to_feather(f"../data/features/train_v3_all_norm.ftr")
feat_test_v3.to_feather(f"../data/features/test_v3_all_norm.ftr")