In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import gc
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

import random

from tqdm import tqdm

import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold

from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import roc_curve, auc, accuracy_score, cohen_kappa_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix

from sklearn.preprocessing import RobustScaler


import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras.models import Sequential, Model

#from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import LSTM, Bidirectional, add, concatenate, GlobalMaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Conv2DTranspose, AveragePooling1D, UpSampling1D
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, Activation, TimeDistributed
from tensorflow.keras.layers import Multiply, Add, Concatenate, Flatten, Average, Lambda

from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, Callback, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.constraints import unit_norm, max_norm

from tensorflow.keras import backend as K
from tensorflow_addons.optimizers import CyclicalLearningRate

/kaggle/input/ventilator-pressure-prediction/sample_submission.csv
/kaggle/input/ventilator-pressure-prediction/train.csv
/kaggle/input/ventilator-pressure-prediction/test.csv


### Google Brain - Ventilator Pressure Prediction

https://www.kaggle.com/c/ventilator-pressure-prediction

**Columns:**

**id** - globally-unique time step identifier across an entire file

**breath_id** - globally-unique time step for breaths

**R** - lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.

**C** - lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow.

**time_step** - the actual time stamp.

**u_in** - the control input for the inspiratory solenoid valve. Ranges from 0 to 100.

**u_out** - the control input for the exploratory solenoid valve. Either 0 or 1.

**pressure** - the airway pressure measured in the respiratory circuit, measured in cmH2O.


In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if col != 'time':
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def get_stats(df):
    stats = pd.DataFrame(index=df.columns, columns=['na_count', 'n_unique', 'type', 'memory_usage'])
    for col in df.columns:
        stats.loc[col] = [df[col].isna().sum(), df[col].nunique(dropna=False), df[col].dtypes, df[col].memory_usage(deep=True, index=False) / 1024**2]
    stats.loc['Overall'] = [stats['na_count'].sum(), stats['n_unique'].sum(), None, df.memory_usage(deep=True).sum() / 1024**2]
    return stats

def print_header():
    print('col         conversion        dtype    na    uniq  size')
    print()
    
def print_values(name, conversion, col):
    template = '{:10}  {:16}  {:>7}  {:2}  {:6}  {:1.2f}MB'
    print(template.format(name, conversion, str(col.dtypes), col.isna().sum(), col.nunique(dropna=False), col.memory_usage(deep=True, index=False) / 1024 ** 2))
    
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    #tf.random.set_seed(seed)    
    
def draw_sequence( df, start, end, filter_out=None):
    f, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(16, 8))
    for i in range(start, end):
        if filter_out is not None and i in filter_out:
            continue
        df1 = df[df.breath_id == i]
        sns.lineplot( data = df1[['u_in']], ax=ax1)
        sns.lineplot( data = df1[['u_out']], ax=ax2)    
        sns.lineplot( data = df1[['pressure']], ax=ax3)    
        
def draw_in_out_pressure( df, breath_id1, breath_id2 ):
    f, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 8))
    df1 = df[df.breath_id == breath_id1]
    sns.lineplot( data = df1[['u_in', 'u_out', 'pressure']], ax=ax1)
    df2 = df[df.breath_id == breath_id2]
    sns.lineplot( data = df2[['u_in', 'u_out', 'pressure']], ax=ax2)
        
def display_missing(df):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    missing_data.head()
    return missing_data       

def select_train_data(df, features, train_size=1000, target_col='pressure'):

    data_points = SAMPLE_SIZE*train_size
    y = df[target_col][:data_points]
    X = df[features][:data_points]
    
    groups = df['breath_id'][:data_points]
    print(f'Original sizes: train: {X.shape}, y_train: {y.shape}' )
    return X, y, groups


In [3]:
RANDOM_SEED = 42
SAMPLE_SIZE = 80
DEBUG = False
HYPER_TUNING = False

seed_everything(RANDOM_SEED)

train = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/train.csv')
test = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/test.csv')
submission = pd.read_csv('/kaggle/input/ventilator-pressure-prediction/sample_submission.csv')

if DEBUG:
    train = train[:SAMPLE_SIZE*10000]

## Feature generation

https://www.kaggle.com/cdeotte/lstm-feature-importance

https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/280471

https://www.kaggle.com/swaralipibose/interesting-feature-importance-by-lstm-gradients



In [4]:
def gen_features(df):    
    df['cross']= df['u_in'] * df['u_out']
    df['cross2']= df['time_step'] * df['u_out']
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['time_step_cumsum'] = df.groupby(['breath_id'])['time_step'].cumsum()
    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()    
    
    df['u_in_lag1'] = df.groupby('breath_id')['u_in'].shift(1)
    df['u_out_lag1'] = df.groupby('breath_id')['u_out'].shift(1)
    df['u_in_lag_back1'] = df.groupby('breath_id')['u_in'].shift(-1)
    df['u_out_lag_back1'] = df.groupby('breath_id')['u_out'].shift(-1)
    df['u_in_lag2'] = df.groupby('breath_id')['u_in'].shift(2)
    df['u_out_lag2'] = df.groupby('breath_id')['u_out'].shift(2)
    df['u_in_lag_back2'] = df.groupby('breath_id')['u_in'].shift(-2)

    df['u_in_lag3'] = df.groupby('breath_id')['u_in'].shift(3)
    df['u_out_lag3'] = df.groupby('breath_id')['u_out'].shift(3)
    df['u_in_lag_back3'] = df.groupby('breath_id')['u_in'].shift(-3)
    df['u_out_lag_back3'] = df.groupby('breath_id')['u_out'].shift(-3)        

    df['u_in_lag4'] = df.groupby('breath_id')['u_in'].shift(4)
    df['u_out_lag4'] = df.groupby('breath_id')['u_out'].shift(4)
    df['u_in_lag_back4'] = df.groupby('breath_id')['u_in'].shift(-4)
    df['u_out_lag_back4'] = df.groupby('breath_id')['u_out'].shift(-4)        
    
    df['u_in_diff1'] = df['u_in'] - df['u_in_lag1']
    df['u_out_diff1'] = df['u_out'] - df['u_out_lag1']
    df['u_in_diff2'] = df['u_in'] - df['u_in_lag2']
    df['u_out_diff2'] = df['u_out'] - df['u_out_lag2']    
    df['u_in_diff3'] = df['u_in'] - df['u_in_lag3']
    df['u_out_diff3'] = df['u_out'] - df['u_out_lag3']
    df['u_in_diff4'] = df['u_in'] - df['u_in_lag4']
    df['u_out_diff4'] = df['u_out'] - df['u_out_lag4']
    
    df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')
    df['breath_id__u_in__mean'] = df.groupby(['breath_id'])['u_in'].transform('mean')
    df['breath_id__u_in__diffmax'] = df.groupby(['breath_id'])['u_in'].transform('max') - df['u_in']
    df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']
    
    df['time_gap'] = df['time_step'] - df['time_step'].shift(1).fillna(0)
    u_in_gap = df['u_in'] - df['u_in'].shift(1).fillna(0)
    df['u_in_rate'] = u_in_gap / df['time_gap']

    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df["R"].astype(str) + '__' + df["C"].astype(str)
    df = pd.get_dummies(df)   
    
    df = df.replace([np.inf, -np.inf], np.nan)    
    df.fillna(0, inplace=True)
    df = reduce_mem_usage(df)
    gc.collect()    
    
    return df

train = gen_features(train)
test = gen_features(test)

Mem. usage decreased to 564.13 Mb (70.7% reduction)
Mem. usage decreased to 368.41 Mb (70.6% reduction)


In [5]:
ALL_FEATURES = ALL_FEATURES = [c for c in train.columns if c not in ['id', 'pressure', 'breath_id']]

targets = train[['pressure']].to_numpy().reshape(-1, SAMPLE_SIZE)
train.drop(['pressure', 'id', 'breath_id'], axis=1, inplace=True)
test = test.drop(['id', 'breath_id'], axis=1)

#if DEBUG:
#    print(train.columns)
#    print(test.columns)
    
scaler = RobustScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

print(train.shape)
print(test.shape)

train = train.reshape(-1, SAMPLE_SIZE, train.shape[-1])
test = test.reshape(-1, SAMPLE_SIZE, test.shape[-1])

print(train.shape)
print(test.shape)
gc.collect()

(6036000, 52)
(4024000, 52)
(75450, 80, 52)
(50300, 80, 52)


0

## Train

https://www.kaggle.com/marutama/eda-about-lstm-feature-importance

https://www.kaggle.com/marutama/finetune-of-tensorflow-bi-lstm-eda-about



In [6]:
def build_model( X, units = 64, optimizer='adam'):
    #shape = (X.shape[-2], X.shape[-1])
    model = Sequential([
        Input(shape=X.shape[-2:]),
        Bidirectional(LSTM(512, return_sequences=True)),
        Bidirectional(LSTM(256, return_sequences=True)),
        Bidirectional(LSTM(128, return_sequences=True)),
        Dense(128, activation='selu'),
        Dense(1),
    ])    

    model.compile(optimizer=optimizer, loss="mae")
    return model


def display_feature_importance( model, X_valid, cols):
    results = []
    print(' Computing LSTM feature importance...')

    for k in tqdm(range(len(cols))):
        if k>0: 
            save_col = X_valid[:,:,k-1].copy()
            np.random.shuffle(X_valid[:,:,k-1])

        oof_preds = model.predict(X_valid, verbose=0).squeeze() 
        mae = np.mean(np.abs( oof_preds-y_valid ))
        results.append({'feature':cols[k],'mae':mae})

        if k>0: 
            X_valid[:,:,k-1] = save_col

    # DISPLAY LSTM FEATURE IMPORTANCE
    print()
    df = pd.DataFrame(results)
    df = df.sort_values('mae')
    plt.figure(figsize=(10,20))
    plt.barh(np.arange(len(cols)),df.mae)
    plt.yticks(np.arange(len(cols)),df.feature.values)
    plt.title('LSTM Feature Importance',size=16)
    plt.ylim((-1,len(cols)))
    plt.show()


In [7]:
EPOCH = 100
BATCH_SIZE = 1024
TPU = False
TOTAL_SPLITS = 4
LEARNING_RATE = 0.00157

if DEBUG:    
    EPOCH = 20
    TOTAL_SPLITS = 2    

lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=10, verbose=1)
es = EarlyStopping(monitor="val_loss", patience=60, verbose=1, mode="min", restore_best_weights=True)
cyclical_learning_rate = CyclicalLearningRate( initial_learning_rate=3e-7, maximal_learning_rate=3e-5, step_size=2360, 
                                              scale_fn=lambda x: 1 / (2.0 ** (x - 1)), scale_mode='cycle')

if TPU:
    # detect and init the TPU
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    ## instantiate a distribution strategy
    xpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # GET GPU STRATEGY
    xpu_strategy = tf.distribute.get_strategy()

models = []
scores = []
with xpu_strategy.scope():

    folds = KFold(n_splits=TOTAL_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(train, targets)):
        print('-'*15, '>', f'Fold {fold_n+1}', '<', '-'*15)
        X_train, X_valid = train[train_index], train[valid_index]
        y_train, y_valid = targets[train_index], targets[valid_index]    

        model = build_model(X_train, optimizer=Adam(learning_rate=LEARNING_RATE))
        #tf.train.Checkpoint( model = model, optimizer = Adam(learning_rate=cyclical_learning_rate))
        history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=EPOCH, batch_size=BATCH_SIZE, 
                            callbacks=[lr, es], 
                            shuffle=False, workers=8, use_multiprocessing=True)   
        #display_feature_importance( model, X_valid, ALL_FEATURES)
        #break # only one fold
        
        oof_preds = model.predict(X_valid, verbose=0).squeeze() 
        mae = mean_absolute_error(y_valid, oof_preds)   
        scores.append(mae)
        models.append(model)
        print(f'MAE: {mae}')

print('Mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

--------------- > Fold 1 < ---------------


2021-11-01 22:52:52.855778: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-01 22:52:52.944132: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-01 22:52:52.944915: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-01 22:52:52.946650: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Epoch 1/100


2021-11-01 22:53:06.525501: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100

Epoch 00058: ReduceLROnPlateau reducing learning rate to 0.0007849999819882214.
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoc

2021-11-01 23:48:54.138226: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 941607680 exceeds 10% of free system memory.
2021-11-01 23:48:55.088738: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 941607680 exceeds 10% of free system memory.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100

Epoch 00070: ReduceLROnPlateau reducing learning rate to 0.0007849999819882214.
Epoch 71/100
Epoch

2021-11-02 00:43:50.715343: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 941624320 exceeds 10% of free system memory.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100

Epoch 0

In [8]:
predicted = []
for model in models:
    yhat = model.predict(test, verbose=0).squeeze()
    predicted.append(yhat)
    
mean_pred = np.median(predicted, axis=0)

## Submission

In [9]:
submission['pressure'] = mean_pred.ravel()
submission.to_csv('submission.csv', index=False, float_format='%.6f')
submission.head(20)

Unnamed: 0,id,pressure
0,1,6.283592
1,2,6.288374
2,3,7.507535
3,4,8.0513
4,5,9.46529
5,6,10.674587
6,7,11.902813
7,8,13.135768
8,9,14.349846
9,10,15.443434
