In [1]:
import sys
sys.path.append('../')
from pathlib import Path
import pandas as pd
import numpy as np

from scripts.base import Feature, get_arguments, generate_features
# Class-level setting on the project's Feature base class; the path is the same
# directory as FEAT_DIR below. Presumably this is where Feature.save() writes the
# generated feature files — confirm in scripts/base.py.
Feature.dir = "F:/Kaggle/ventilator-pressure-prediction/data/input/features"

In [2]:
# Directory of the raw competition CSVs (train.csv, test.csv and train_v2.csv are read from here).
INPUT_DIR = Path("F:/Kaggle/ventilator-pressure-prediction/data/input/")
# Output directory; not referenced by any cell visible in this part of the notebook.
OUTPUT_DIR = Path('F:/Kaggle/ventilator-pressure-prediction/data/output/')
# Directory of the per-feature feather files that load_datasets reads.
FEAT_DIR = Path("F:/Kaggle/ventilator-pressure-prediction/data/input/features/")

def get_train_test():
    """Read the raw train and test CSVs from INPUT_DIR.

    Returns:
        A ``(train, test)`` tuple of DataFrames, read from ``train.csv`` and
        ``test.csv`` respectively.
    """
    # The generator is consumed left to right, so train.csv is read before test.csv.
    return tuple(pd.read_csv(INPUT_DIR / csv_name) for csv_name in ("train.csv", "test.csv"))

def load_datasets(feats):
    """Load previously saved feature files and join them column-wise.

    Args:
        feats: iterable of feature names. For each name ``f`` the files
            ``{f}_train.ftr`` and ``{f}_test.ftr`` are read from FEAT_DIR.

    Returns:
        ``(X_train, X_test)``: the train and test feature frames, each the
        ``axis=1`` concatenation of the per-feature frames in ``feats`` order.
    """
    # Train split first: read every feature file, then concatenate.
    train_parts = []
    for f in feats:
        train_parts.append(pd.read_feather(FEAT_DIR / f'{f}_train.ftr'))
    X_train = pd.concat(train_parts, axis=1)

    # Same for the test split.
    test_parts = []
    for f in feats:
        test_parts.append(pd.read_feather(FEAT_DIR / f'{f}_test.ftr'))
    X_test = pd.concat(test_parts, axis=1)

    return X_train, X_test

In [41]:
class Base(Feature):
    """Base columns kept for joining with other features.

    Copies the key/raw columns from the module-level ``train`` and ``test``
    frames. ``train`` must already carry the ``fold`` column.
    """
    def create_features(self):
        # Columns present in both splits; the train split additionally keeps target and fold.
        shared_cols = ['id', 'breath_id', 'time_step', 'u_in', 'u_out']
        train_only_cols = ['pressure', 'fold']
        self.train = train[shared_cols + train_only_cols].copy()
        self.test = test[shared_cols].copy()
        
# Load the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# train_v2.csv supplies the 'fold' column. concat(axis=1) aligns on the index, so this
# assumes train_v2 has the same row order/index as train.csv — TODO confirm.
train_v2 = pd.read_csv(INPUT_DIR / "train_v2.csv")
train = pd.concat([train, train_v2[["fold"]]],axis=1)
# Generation of the Base feature is currently disabled.
#Base().run().save()

[Base] start
[Base] done in 0 s


In [32]:
class _Area(Feature):
    """Per-breath cumulative sum of ``time_step * u_in``.

    Author's note: this resembles the area under the u_in curve, but the
    formula looks wrong.
    """
    def create_features(self):
        # (raw frame, feature frame) pairs, train first. The intermediate product is
        # stored as an 'area' column on the module-level raw frame itself.
        for raw, feature_frame in ((train, self.train), (test, self.test)):
            raw['area'] = raw['time_step'] * raw['u_in']
            feature_frame['area'] = raw.groupby('breath_id')['area'].cumsum()
# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# Disabled. NOTE(review): the class defined directly above is named _Area; the name Area
# is defined by a separate cell further down.
#Area().run().save()

[Area] start
[Area] done in 1 s


In [9]:
class Area(Feature):
    """Running integral of the u_in curve, with its per-step increment and the time increment.

    Columns produced from the module-level ``train`` / ``test`` frames:
      * ``time_delta`` - time_step difference to the previous row of the same breath (0 on the first row).
      * ``delta``      - ``time_delta * u_in``.
      * ``area``       - per-breath cumulative sum of ``delta``.
    """
    def create_features(self):
        feature_names = ['time_delta', 'delta', 'area']

        def integrate(frame):
            # The three columns are written onto the passed frame itself.
            step = frame.groupby('breath_id')['time_step'].diff()
            frame['time_delta'] = step.fillna(0)
            frame['delta'] = frame['time_delta'] * frame['u_in']
            frame['area'] = frame.groupby('breath_id')['delta'].cumsum()
            # Remaining NaNs anywhere in the frame become 0 before the feature columns are selected.
            return frame.fillna(0)[feature_names]

        self.train = integrate(train)
        self.test = integrate(test)
        
# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# Generation of the Area feature is currently disabled.
#Area().run().save()

[Area] start
[Area] done in 27 s


In [34]:
class Cross(Feature):
    """Products of u_out with u_in (``cross``) and with time_step (``cross2``).

    Author's note: the purpose of these features is not well understood.
    """
    def create_features(self):
        # (raw frame, feature frame) pairs, train first; the raw frames are only read.
        for raw, feature_frame in ((train, self.train), (test, self.test)):
            feature_frame['cross'] = raw['u_in'] * raw['u_out']
            feature_frame['cross2'] = raw['time_step'] * raw['u_out']
        
# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
Cross().run().save()

[Cross] start
[Cross] done in 1 s


In [38]:
class U_in_cumsum_mean(Feature):
    """Per-breath cumulative sum and cumulative mean of u_in.

    Columns produced from the module-level ``train`` / ``test`` frames:
      * ``u_in_cumsum``  - running sum of u_in within the breath.
      * ``u_in_cummean`` - ``u_in_cumsum`` divided by the 1-based row position within the breath.

    The input frames are only read; no scratch columns are added to them.
    """
    def create_features(self):
        def create(df):
            u_in_by_breath = df['u_in'].groupby(df['breath_id'])
            running_sum = u_in_by_breath.cumsum()
            # cumcount() is the 0-based position inside the group; +1 gives the number of
            # rows seen so far, which is what a cumulative sum over a column of ones yields.
            rows_seen = u_in_by_breath.cumcount() + 1
            return pd.DataFrame(
                {'u_in_cumsum': running_sum, 'u_in_cummean': running_sum / rows_seen},
                index=df.index,
            )
        self.train = create(train)
        self.test = create(test)
        
# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
U_in_cumsum_mean().run().save()

[U_in_cumsum_mean] start
[U_in_cumsum_mean] done in 1 s


In [15]:
class U_in_Lag(Feature):
    """u_in shifted by 1-4 rows backward (lag) and forward (lag_back) within each breath.

    Positions with no neighbour inside the breath are filled with 0. The shifted
    columns are written onto the module-level ``train`` / ``test`` frames.
    """
    def create_features(self):
        # Output column -> shift period (positive: earlier rows, negative: later rows).
        periods_by_column = {
            'u_in_lag': 1,
            'u_in_lag2': 2,
            'u_in_lag3': 3,
            'u_in_lag4': 4,
            'u_in_lag_back': -1,
            'u_in_lag_back2': -2,
            'u_in_lag_back3': -3,
            'u_in_lag_back4': -4,
        }

        def create(df):
            for column, periods in periods_by_column.items():
                df[column] = df.groupby('breath_id')['u_in'].shift(periods)
            # fillna runs over the whole frame and returns a new frame.
            return df.fillna(0)

        cols = list(periods_by_column)
        self.train = create(train)[cols]
        self.test = create(test)[cols]
        
# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
U_in_Lag().run().save()

[U_in_Lag] start
[U_in_Lag] done in 3 s


In [16]:
class U_out_Lag(Feature):
    """u_out shifted by 1-4 rows backward (lag) and forward (lag_back) within each breath.

    Positions with no neighbour inside the breath are filled with 0. The shifted
    columns are written onto the module-level ``train`` / ``test`` frames.
    """
    def create_features(self):
        # Output column -> shift period (positive: earlier rows, negative: later rows).
        periods_by_column = {
            'u_out_lag': 1,
            'u_out_lag2': 2,
            'u_out_lag3': 3,
            'u_out_lag4': 4,
            'u_out_lag_back': -1,
            'u_out_lag_back2': -2,
            'u_out_lag_back3': -3,
            'u_out_lag_back4': -4,
        }

        def create(df):
            for column, periods in periods_by_column.items():
                df[column] = df.groupby('breath_id')['u_out'].shift(periods)
            # fillna runs over the whole frame and returns a new frame.
            return df.fillna(0)

        cols = list(periods_by_column)
        self.train = create(train)[cols]
        self.test = create(test)[cols]
        
# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
U_out_Lag().run().save()

[U_out_Lag] start
[U_out_Lag] done in 3 s


In [67]:
class RC_OHE(Feature):
    """One-hot encoding of R, C and their string concatenation RC.

    R and C are converted to strings, RC is ``R + C`` (e.g. ``'20' + '10' -> '2010'``),
    and the three columns are passed to ``pd.get_dummies``. The output always has
    exactly the fixed column list below, in that order. A category that does not
    occur in a split yields an all-zero column instead of a ``KeyError``.

    The module-level ``train`` / ``test`` frames are only read.
    """
    def create_features(self):
        cols = ['R_20', 'R_5', 'R_50', 'C_10', 'C_20', 'C_50', 'RC_2010', 'RC_2020', 'RC_2050', 'RC_5010', 'RC_5020', 'RC_5050', 'RC_510', 'RC_520', 'RC_550']

        def create(df):
            r_str = df['R'].astype(str)
            c_str = df['C'].astype(str)
            # Encode only the three key columns rather than the whole frame.
            keys = pd.DataFrame({'R': r_str, 'C': c_str, 'RC': r_str + c_str}, index=df.index)
            # reindex fixes the column set and order, and adds a zero column for any absent category.
            return pd.get_dummies(keys).reindex(columns=cols, fill_value=0)

        self.train = create(train)
        self.test = create(test)
        
# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
RC_OHE().run().save()

[RC_OHE] start
[RC_OHE] done in 16 s


In [10]:
class U_out_stat(Feature):
    """Mean, std and max of u_in per breath, separately for u_out == 0 and u_out == 1.

    Every row of a breath receives the same six values
    (``u_out0_mean/std/max`` and ``u_out1_mean/std/max``).

    The statistics are attached with a left merge, so the output keeps one row per
    input row. A breath with no rows for one u_out value gets NaN for that value's
    statistics instead of having all its rows dropped.
    """
    def create_features(self):
        def create(df):
            for flag in (0, 1):
                phase_stats = (
                    df[df["u_out"] == flag]
                    .groupby('breath_id')['u_in']
                    .agg(["mean", "std", "max"])
                    .add_prefix(f"u_out{flag}_")
                    .reset_index()
                )
                # how="left": the default inner join would drop breaths missing this phase.
                df = df.merge(phase_stats, on="breath_id", how="left")
            return df
        cols = ['u_out0_mean', 'u_out0_max', 'u_out0_std', 'u_out1_mean', 'u_out1_max', 'u_out1_std']
        self.train = create(train)[cols]
        self.test = create(test)[cols]

# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
U_out_stat().run().save()

[U_out_stat] start
[U_out_stat] done in 7 s


In [47]:
class U_inout_max(Feature):
    """Per-breath maximum of u_in and of u_out, regardless of the u_out phase.

    Each row receives the maximum over its whole breath. The two columns are
    written onto the module-level ``train`` / ``test`` frames.
    """
    def create_features(self):
        # Output column -> source column whose per-breath maximum it holds.
        source_by_column = {
            'breath_id__u_in__max': 'u_in',
            'breath_id__u_out__max': 'u_out',
        }

        def create(df):
            for column, source in source_by_column.items():
                df[column] = df.groupby(['breath_id'])[source].transform('max')
            return df

        cols = list(source_by_column)
        self.train = create(train)[cols]
        self.test = create(test)[cols]

# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
U_inout_max().run().save()

[U_inout_max] start
[U_inout_max] done in 1 s


In [13]:
"""
breath_time は　Areaのtime_deltaと同じ
u_in_timeは後から　u_in changeを作る
"""
# English gloss of the note above: "breath_time is the same as Area's time_delta;
# as for u_in_time, u_in_change is created later instead."
# NOTE(review): this Time feature is disabled, yet a later cell still loads a 'Time'
# feature through load_datasets — presumably from files saved by an earlier run.
# class Time(Feature):
#     """
#     """
#     def create_features(self):
#         def create(df):
#             # breath_time
#             df['breath_time'] = df['time_step'] - df['time_step'].shift(1)
#             df.loc[df['time_step'] == 0, 'breath_time'] = 0
#             # u_in_time
#             df['u_in_time'] = df['u_in'] - df['u_in'].shift(1)
#             df.loc[df['time_step'] == 0, 'u_in_time'] = 0
#             return df
#         cols = ['breath_time', 'u_in_time']
#         self.train = create(train)[cols]
#         self.test = create(test)[cols]

# train, test = get_train_test()
# Time().run().save()

[Time] start
[Time] done in 1 s


In [14]:
class U_in_Lag_Diff(Feature):
    """Difference between u_in and each of its 1- to 4-step lag columns.

    Reads ``u_in`` and ``u_in_lag`` .. ``u_in_lag4`` from the module-level
    ``train`` / ``test`` frames, so those columns must already be present.
    The diff columns are written onto those frames as well.
    """
    def create_features(self):
        # Output column -> lag column subtracted from u_in.
        lag_by_column = {
            'u_in_lag1_diff': 'u_in_lag',
            'u_in_lag2_diff': 'u_in_lag2',
            'u_in_lag3_diff': 'u_in_lag3',
            'u_in_lag4_diff': 'u_in_lag4',
        }

        def create(df):
            for column, lag_column in lag_by_column.items():
                df[column] = df["u_in"] - df[lag_column]
            return df

        cols = list(lag_by_column)
        self.train = create(train)[cols]
        self.test = create(test)[cols]
        display(self.train)

# Build the input frames from saved feature files rather than the raw CSVs:
# Base provides u_in and U_in_Lag provides the u_in_lag* columns read by create_features.
feats = ['Base', 'U_in_Lag']
train, test = load_datasets(feats)
display(train)
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
U_in_Lag_Diff().run().save()

Unnamed: 0,id,breath_id,time_step,u_in,u_out,pressure,fold,u_in_lag,u_in_lag2,u_in_lag3,u_in_lag4,u_in_lag_back,u_in_lag_back2,u_in_lag_back3,u_in_lag_back4
0,1,1,0.000000,0.083334,0,5.837492,4,0.000000,0.000000,0.000000,0.000000,18.383041,22.509278,22.808822,25.355850
1,2,1,0.033652,18.383041,0,5.907794,4,0.083334,0.000000,0.000000,0.000000,22.509278,22.808822,25.355850,27.259866
2,3,1,0.067514,22.509278,0,7.876254,4,18.383041,0.083334,0.000000,0.000000,22.808822,25.355850,27.259866,27.127486
3,4,1,0.101542,22.808822,0,11.742872,4,22.509278,18.383041,0.083334,0.000000,25.355850,27.259866,27.127486,26.807732
4,5,1,0.135756,25.355850,0,12.234987,4,22.808822,22.509278,18.383041,0.083334,27.259866,27.127486,26.807732,27.864715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035995,6035996,125749,2.504603,1.489714,1,3.869032,0,1.420711,1.353205,1.357586,1.362261,1.488497,1.558978,1.272663,1.482739
6035996,6035997,125749,2.537961,1.488497,1,3.869032,0,1.489714,1.420711,1.353205,1.357586,1.558978,1.272663,1.482739,0.000000
6035997,6035998,125749,2.571408,1.558978,1,3.798729,0,1.488497,1.489714,1.420711,1.353205,1.272663,1.482739,0.000000,0.000000
6035998,6035999,125749,2.604744,1.272663,1,4.079938,0,1.558978,1.488497,1.489714,1.420711,1.482739,0.000000,0.000000,0.000000


[U_in_Lag_Diff] start


Unnamed: 0,u_in_lag1_diff,u_in_lag2_diff,u_in_lag3_diff,u_in_lag4_diff
0,0.083334,0.083334,0.083334,0.083334
1,18.299707,18.383041,18.383041,18.383041
2,4.126236,22.425944,22.509278,22.509278
3,0.299544,4.425781,22.725488,22.808822
4,2.547028,2.846573,6.972809,25.272516
...,...,...,...,...
6035995,0.069002,0.136509,0.132127,0.127452
6035996,-0.001217,0.067785,0.135292,0.130910
6035997,0.070481,0.069264,0.138266,0.205773
6035998,-0.286315,-0.215834,-0.217050,-0.148048


[U_in_Lag_Diff] done in 1 s


In [44]:
class U_in_Rolling(Feature):
    """Rolling mean / max / min / std of u_in within each breath, for windows of 2, 4 and 10 rows.

    Column names follow ``u_in_rolling_{stat}{window}``. Rows without a full
    window (the first ``window - 1`` rows of each breath) are NaN after rolling
    and are filled with the mean of that feature column.

    The rolled values are re-attached by index label, so the result does not
    depend on the input frame being sorted by breath_id. The module-level
    ``train`` / ``test`` frames are only read.
    """
    def create_features(self):
        windows = (2, 4, 10)
        stats = ('mean', 'max', 'min', 'std')
        # Stat-major order: mean2, mean4, mean10, max2, ..., std10.
        cols = [f'u_in_rolling_{stat}{window}' for stat in stats for window in windows]

        def create(df):
            rolled_features = pd.DataFrame(index=df.index)
            for window in windows:
                roller = df.groupby('breath_id')['u_in'].rolling(window)
                for stat in stats:
                    rolled = getattr(roller, stat)()
                    # The result is indexed by (breath_id, original index). Dropping the
                    # group level leaves the original labels so assignment aligns by row.
                    rolled = rolled.reset_index(level=0, drop=True)
                    rolled_features[f'u_in_rolling_{stat}{window}'] = rolled.fillna(rolled.mean())
            return rolled_features[cols]

        self.train = create(train)
        self.test = create(test)
        display(self.train)

# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
U_in_Rolling().run().save()

[U_in_Rolling] start


Unnamed: 0,u_in_rolling_mean2,u_in_rolling_mean4,u_in_rolling_mean10,u_in_rolling_max2,u_in_rolling_max4,u_in_rolling_max10,u_in_rolling_min2,u_in_rolling_min4,u_in_rolling_min10,u_in_rolling_std2,u_in_rolling_std4,u_in_rolling_std10
0,7.296453,7.122734,6.673223,8.130658,9.024859,10.262890,6.462249,5.530754,4.011267,1.179743,1.644297,2.304005
1,9.233188,7.122734,6.673223,18.383041,9.024859,10.262890,0.083334,5.530754,4.011267,12.939847,1.644297,2.304005
2,20.446160,7.122734,6.673223,22.509278,9.024859,10.262890,18.383041,5.530754,4.011267,2.917690,1.644297,2.304005
3,22.659050,15.946119,6.673223,22.808822,22.808822,10.262890,22.509278,0.083334,4.011267,0.211810,10.766279,2.304005
4,24.082336,22.264248,6.673223,25.355850,25.355850,10.262890,22.808822,18.383041,4.011267,1.801021,2.885502,2.304005
...,...,...,...,...,...,...,...,...,...,...,...,...
6035995,1.455213,1.405304,1.346222,1.489714,1.489714,1.489714,1.420711,1.353205,1.184357,0.048792,0.064171,0.085995
6035996,1.489105,1.438032,1.376636,1.489714,1.489714,1.489714,1.488497,1.353205,1.237674,0.000860,0.065098,0.075534
6035997,1.523737,1.489475,1.400848,1.558978,1.558978,1.558978,1.488497,1.420711,1.237674,0.049838,0.056451,0.091385
6035998,1.415821,1.452463,1.404347,1.558978,1.558978,1.558978,1.272663,1.272663,1.272663,0.202455,0.124311,0.084882


[U_in_Rolling] done in 165 s


In [16]:
class U_in_delta(Feature):
    """Forward change of u_in within each breath, plus its product and ratio with the time step.

    Idea taken from:
    https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/273974

    Columns produced from the module-level ``train`` / ``test`` frames:
      * ``u_in_change``   - next u_in of the same breath minus the current u_in.
        The last row of a breath uses 0 as the "next" value.
      * ``area_u_in_abs`` - ``u_in_change * time_delta`` (no absolute value is taken,
        despite the name).
      * ``uin_in_time``   - ``u_in_change / time_delta``, set to 0 wherever
        ``time_delta`` is 0 or ``time_step`` is 0.

    ``time_delta`` is the time_step difference to the previous row of the same
    breath (0 on the first row).

    NOTE(review): u_in_change is a forward difference while time_delta is a
    backward difference — confirm this pairing is intended.

    The input frames are only read.
    """
    def create_features(self):
        def create(df):
            by_breath = df.groupby('breath_id')
            # Shift inside each breath. A frame-wide shift would pull the first u_in of
            # the following breath into the last row of the current one.
            u_in_change = by_breath['u_in'].shift(-1, fill_value=0) - df['u_in']
            time_delta = by_breath['time_step'].diff().fillna(0)

            rate = u_in_change / time_delta
            # A zero time_delta (first row of every breath) makes the division inf/NaN,
            # so zero those rows out. The time_step == 0 rule is kept as well.
            rate.loc[(time_delta == 0) | (df['time_step'] == 0)] = 0

            return pd.DataFrame(
                {
                    'u_in_change': u_in_change,
                    'area_u_in_abs': u_in_change * time_delta,
                    'uin_in_time': rate,
                },
                index=df.index,
            )

        self.train = create(train)
        self.test = create(test)
        
# Reload the raw CSVs into the module-level train/test frames that create_features reads.
train, test = get_train_test()
# run() and save() are inherited from Feature (scripts.base); their behaviour is not visible here.
U_in_delta().run().save()

[U_in_delta] start
[U_in_delta] done in 27 s


In [15]:
# Inspection cell: load the saved Area and Time feature files and show the first rows
# of the combined train frame. Note that this rebinds the module-level train/test.
feats = ['Area', 'Time']
train, test = load_datasets(feats)
train.head()

Unnamed: 0,time_delta,delta,area,breath_time,u_in_time
0,0.0,0.0,0.0,0.0,0.0
1,0.033652,0.618632,0.618632,0.033652,18.299707
2,0.033862,0.762212,1.380843,0.033862,4.126236
3,0.034028,0.776134,2.156978,0.034028,0.299544
4,0.034213,0.867507,3.024485,0.034213,2.547028


In [49]:
# Rebind the module-level train/test names to freshly read raw CSVs.
train, test = get_train_test()