<a href="https://colab.research.google.com/github/zizilnam/Kaggle_Competition_Ventilator_Pressure_Prediction/blob/main/conv1d_kibae.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bi-LSTM with GRU (TPU) 
 - source: https://www.kaggle.com/hrshuvo/dnn-lstm-tpu 
 - The only change from the source: added memory-reduction code to save RAM

In [None]:
## base_path
# Directory prefix that the CSV reads in later cells are resolved against.
base_path = '../input/ventilator-pressure-prediction/'

## Import & base_path

In [None]:
import numpy as np
import pandas as pd
import gc
import os

from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.metrics import mean_absolute_error

import tensorflow as tf
print(tf.__version__)

from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Bidirectional, LSTM, Multiply, Dense, Dropout, Input, Concatenate, Add, GRU, BatchNormalization

from scipy.signal import butter, filtfilt
from pickle import dump

from IPython.display import display
from warnings import filterwarnings
filterwarnings('ignore')



In [None]:
# CPU count as reported by os.cpu_count(), displayed as the cell output.
# NOTE(review): `num_workers` is not referenced by any other visible cell.
num_workers = os.cpu_count()
num_workers

## Data 

In [None]:
## data
# Read the raw train/test tables from `base_path`.
train_org = pd.read_csv(base_path + 'train.csv')
test_org = pd.read_csv(base_path + 'test.csv')

## Save RAM

In [None]:
## save ram
# https://www.kaggle.com/dmitryuarov/tps-soft-voting-xgb-cb-lgbm#Basic-information
# https://www.kaggle.com/rinnqd/reduce-memory-usage

def save_ram(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns are converted in place on ``df`` and the same frame is returned.
    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the memory footprint after the conversion and the
        relative reduction. When False, nothing is printed.
    use_float16 : bool, default True
        When True, float columns whose range fits are stored as float16.
        float16 keeps only about three significant decimal digits, so pass
        False to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = ((np.float16,) if use_float16 else ()) + (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the dtype limit still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite extrema fail every comparison and fall back to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduced = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print(f"After Diet: {round(end_mem, 2)}MB")
        print(f"Percentage of How much Reduced: {round(reduced, 2)}%")

    return df

## add_features

In [None]:
## add_features

def add_features(df):
    """Return a copy of ``df`` extended with the engineered per-breath features.

    The input must contain the columns ``breath_id``, ``time_step``, ``u_in``,
    ``u_out``, ``R`` and ``C``. The caller's frame is not modified.

    Features are added in this order (downstream cells rely on the names):
      1. products and per-breath cumulative sums,
      2. per-breath lags/leads of ``u_in``/``u_out`` (1..4 steps), NaN -> 0,
      3. per-breath max/mean of ``u_in`` and the distance to them,
      4. differences to the lagged values,
      5. running mean of ``u_in`` plus helper columns built from row shifts,
      6. per-breath time delta, EWM and 15-step rolling statistics of ``u_in``,
      7. differences to the lead values,
      8. one-hot encoding of ``R``, ``C`` and their combination ``R__C``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. The index is assumed to be unique (the grouped rolling/EWM
        results are aligned back on it).

    Returns
    -------
    pandas.DataFrame
        New frame with the added columns; ``R``/``C``/``R__C`` are replaced by
        their dummy columns.
    """
    # Work on a copy so the caller's frame stays untouched.
    df = df.copy()

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['time_step_cumsum'] = df.groupby('breath_id')['time_step'].cumsum()
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()
    print("Step-1 Completed")

    # Lags (shift k) and leads (shift -k) never cross a breath boundary.
    for k in range(1, 5):
        for signal in ('u_in', 'u_out'):
            df[f'{signal}_lag{k}'] = df.groupby('breath_id')[signal].shift(k)
        for signal in ('u_in', 'u_out'):
            df[f'{signal}_lag_back{k}'] = df.groupby('breath_id')[signal].shift(-k)
    # Fills every NaN in the frame, i.e. the edges created by the shifts above.
    df = df.fillna(0)
    print("Step-2 Completed")

    u_in_max = df.groupby('breath_id')['u_in'].transform('max')
    u_in_mean = df.groupby('breath_id')['u_in'].transform('mean')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_in__mean'] = u_in_mean
    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_mean - df['u_in']
    print("Step-3 Completed")

    for k in range(1, 5):
        df[f'u_in_diff{k}'] = df['u_in'] - df[f'u_in_lag{k}']
        df[f'u_out_diff{k}'] = df['u_out'] - df[f'u_out_lag{k}']
    print("Step-4 Completed")

    # `one`/`count` give the 1-based position inside the breath.
    df['one'] = 1
    df['count'] = df.groupby('breath_id')['one'].cumsum()
    df['u_in_cummean'] = df['u_in_cumsum'] / df['count']

    # Row-wise shifts masked by "previous row belongs to the same breath".
    df['breath_id_lag'] = df['breath_id'].shift(1).fillna(0)
    df['breath_id_lag2'] = df['breath_id'].shift(2).fillna(0)
    df['breath_id_lagsame'] = np.where(df['breath_id_lag'] == df['breath_id'], 1, 0)
    df['breath_id_lag2same'] = np.where(df['breath_id_lag2'] == df['breath_id'], 1, 0)
    df['breath_id__u_in_lag'] = df['u_in'].shift(1).fillna(0)
    df['breath_id__u_in_lag'] = df['breath_id__u_in_lag'] * df['breath_id_lagsame']
    df['breath_id__u_in_lag2'] = df['u_in'].shift(2).fillna(0)
    df['breath_id__u_in_lag2'] = df['breath_id__u_in_lag2'] * df['breath_id_lag2same']
    print("Step-5 Completed")

    df['time_step_diff'] = df.groupby('breath_id')['time_step'].diff().fillna(0)

    # EWM is computed inside each breath; running it over the whole column
    # would let the previous breath bleed into the start of the next one.
    df['ewm_u_in_mean'] = (
        df.groupby('breath_id')['u_in']
        .ewm(halflife=9)
        .mean()
        .reset_index(level=0, drop=True)
    )

    # 15-step trailing window per breath; dropping the group level of the
    # result index lets the assignment align on the original row index.
    rolling_u_in = df.groupby('breath_id')['u_in'].rolling(window=15, min_periods=1)
    for stat in ('sum', 'min', 'max', 'mean'):
        df[f'15_in_{stat}'] = getattr(rolling_u_in, stat)().reset_index(level=0, drop=True)
    print("Step-6 Completed")

    for k in (1, 2):
        df[f'u_in_lagback_diff{k}'] = df['u_in'] - df[f'u_in_lag_back{k}']
        df[f'u_out_lagback_diff{k}'] = df['u_out'] - df[f'u_out_lag_back{k}']
    print("Step-7 Completed")

    # Treat R, C and their combination as categories and one-hot encode them.
    # NOTE(review): dummy columns depend on the values present in `df`; frames
    # encoded separately only line up if they contain the same R/C values.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    df = pd.get_dummies(df)
    print("All Completed")

    return df

In [None]:
# Build the engineered feature tables for both splits.
train = add_features(train_org)
test  = add_features(test_org)

print(train.shape, test.shape)
# Drop the names of the raw frames; re-running this cell afterwards requires reloading the CSVs.
del train_org, test_org

## data on diet

In [None]:
# Downcast the numeric columns of the training table.
# NOTE(review): `pressure` is still a column at this point, so the targets
# extracted in a later cell are taken from the downcast values.
train = save_ram(train, verbose=True)

In [None]:
train.head()

In [None]:
# Downcast the numeric columns of the test table the same way.
test = save_ram(test)

In [None]:
test.head()

## Drop useless features 

In [None]:
## target
# Pull `pressure` out as a 2-D array with 80 values per row
# (assumes every breath contributes exactly 80 consecutive rows — TODO confirm).
targets = train[['pressure']].to_numpy().reshape(-1, 80)
targets.shape

In [None]:
## train
# Remove the target, the identifier columns and the helper columns created in
# add_features so that only model inputs remain. Mutates `train` in place.
train.drop(['pressure', 'id', 'breath_id','one','count','breath_id_lag','breath_id_lag2','breath_id_lagsame',
            'breath_id_lag2same'], axis=1, inplace=True)

In [None]:
print(train.shape)
train.head(1)

In [None]:
## Test
# Same column removal as for train; the test table has no `pressure` column to drop.
test = test.drop(['id', 'breath_id','one','count','breath_id_lag','breath_id_lag2','breath_id_lagsame',
                  'breath_id_lag2same'], axis=1)

In [None]:
print(test.shape)
test.head(1)

In [None]:
## Source: https://www.kaggle.com/jmcslk/hybrid-cnn-enc-dec-sample-weights

## Sample Weights Code


In [None]:
### SAMPLE WEIGHTS ###
def get_sample_weight_param(train, targets, u_out_1_weight):
    """Build per-time-step loss weights from the ``u_out`` column.

    Every position starts with weight 1.0; positions whose ``u_out`` value
    equals 1 get ``u_out_1_weight`` instead (a value of 1 therefore disables
    the re-weighting).

    Parameters
    ----------
    train : pandas.DataFrame
        Flat feature table containing an unscaled ``u_out`` column, with rows
        ordered so they can be reshaped to the leading two dimensions of
        ``targets``.
    targets : array-like
        Target array, e.g. of shape (n_sequences, sequence_length). Only its
        shape is used.
    u_out_1_weight : float
        Weight assigned where ``u_out == 1``.

    Returns
    -------
    numpy.ndarray
        float32 array with the same shape as ``targets``.
    """
    targets = np.asarray(targets)
    # Take the sequence layout from `targets` instead of hard-coding its length.
    u_out_values = train["u_out"].to_numpy().reshape(targets.shape[:2])

    # Always build float weights: inheriting an integer target dtype would
    # silently truncate fractional weights such as 0.1 to 0.
    y_weight = np.ones(targets.shape, dtype=np.float32)
    y_weight[u_out_values == 1] = u_out_1_weight
    return y_weight

# Switch for per-time-step loss weighting during training.
do_sample_weights = True

# The weight matrix is only built when weighting is enabled;
# otherwise `y_weight` is left undefined.
if do_sample_weights:
    y_weight = get_sample_weight_param(train, targets, 0.1)

## RobustScaler 

In [None]:
# Fit the scaler on the training features only, then apply the same transform to test.
sc = RobustScaler()
train = sc.fit_transform(train)
test = sc.transform(test) ## np.ndarray

In [None]:
# Fold the flat (rows, features) matrix into (sequences, 80, features).
train = train.reshape(-1, 80, train.shape[-1])
train.shape

In [None]:
# Same layout for test; uses train's feature count, so both tables must have
# the same number of feature columns.
test = test.reshape(-1, 80, train.shape[-1])
test.shape

In [None]:
targets.shape

## pressure

In [None]:
# Flatten the targets into a single float32 column to inspect their value grid.
pressure = targets.squeeze().reshape(-1, 1).astype('float32')

P_MIN = np.min(pressure)
P_MAX = np.max(pressure)

# Derive the grid step from the two smallest distinct levels (np.unique returns
# them sorted). Using the first two samples instead depends on row order and
# yields 0, a negative value or a multiple of the step when they are not
# adjacent levels.
# NOTE(review): assumes the pressure values lie on a uniform grid — confirm.
pressure_levels = np.unique(pressure)
P_STEP = pressure_levels[1] - pressure_levels[0]

print(f"MIN Pressure: {P_MIN}")
print(f"MAX Pressure: {P_MAX}")
print(f"Pressure step: {P_STEP}")
print(f"Unique Values: {pressure_levels.shape[0]}")

In [None]:
# The flattened copy is no longer needed once P_MIN / P_MAX / P_STEP are known.
del pressure
gc.collect()

## TPU or NOT

In [None]:
# Pick the distribution strategy and batch size. `tpu_strategy` is the name the
# training cell enters via `.scope()`, so it is defined on BOTH paths.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    # 64 samples per replica.
    BATCH_SIZE = tpu_strategy.num_replicas_in_sync * 64
    print("Running on TPU:", tpu.master())
    print(f"Batch Size: {BATCH_SIZE}")
    
except ValueError:
    # The ValueError path is treated as "no TPU available": fall back to the
    # current default strategy and expose it under both names so that later
    # cells using `tpu_strategy` keep working.
    strategy = tf.distribute.get_strategy()
    tpu_strategy = strategy
    BATCH_SIZE = 512
    print(f"Running on {strategy.num_replicas_in_sync} replicas")
    print(f"Batch Size: {BATCH_SIZE}")

## Define Model

In [None]:
def blgtpumodel():
    """Build the (uncompiled) bidirectional LSTM + GRU Keras model.

    The input shape is read from the last two dimensions of the module-level
    ``train`` array, so ``train`` must already be reshaped to
    (sequences, steps, features) when this is called.

    Returns
    -------
    tensorflow.keras.Model
        Model named ``'BLG_Model'`` whose final ``Dense(1)`` layer is applied
        to the per-step features.
    """
    
    x_input = Input(shape=(train.shape[-2:]))
    ## input: 80 * train.shape[-1]

    # LSTM stack: five bidirectional LSTMs with shrinking width, each returning
    # the full sequence.
    x1 = Bidirectional(LSTM(units = 768, return_sequences = True))(x_input)
    x2 = Bidirectional(LSTM(units = 512, return_sequences = True))(x1)

    ## x3 is later multiplied with z2 (both use 384 units)
    x3 = Bidirectional(LSTM(units = 384, return_sequences = True))(x2)

    ## x4 is later multiplied with z3 (both use 256 units)
    x4 = Bidirectional(LSTM(units = 256, return_sequences = True))(x3)

    ## x5 is later multiplied with z4 (both use 128 units)
    x5 = Bidirectional(LSTM(units=128, return_sequences=True))(x4)
    
    # GRU branch: starts from x2; every following GRU consumes the batch-normalised
    # element-wise product of an LSTM output and the previous GRU output.
    z2 = Bidirectional(GRU(units=384, return_sequences=True))(x2)
    
    z31 = Multiply()([x3, z2])
    z31 = BatchNormalization()(z31)
    z3 = Bidirectional(GRU(units=256, return_sequences=True))(z31)
    
    z41 = Multiply()([x4, z3])
    z41 = BatchNormalization()(z41)
    z4 = Bidirectional(GRU(units=128, return_sequences=True))(z41)
    
    z51 = Multiply()([x5, z4])
    z51 = BatchNormalization()(z51)
    z5 = Bidirectional(GRU(units=64, return_sequences=True))(z51)
    
    # Merge the last LSTM output with all GRU outputs along the feature axis.
    x = Concatenate(axis=2)([x5, z2, z3, z4, z5])
    
    x = Dense(units=128, activation='selu')(x)
    
    # Regression head producing a single output unit.
    x_output = Dense(units=1)(x)

    model = Model(inputs=x_input, outputs=x_output, name='BLG_Model')

    return model

### Summary

In [None]:
# Build one instance only to print the layer summary; the training loop
# creates a fresh model for every fold.
model = blgtpumodel()
model.summary()

## Run

In [None]:
# 7-fold cross-validated training. For every fold a new model is built, trained
# and used to predict the test set; the per-fold test predictions are collected
# in `test_preds`.
# `tpu_strategy` must have been defined by the TPU-setup cell before this runs.
with tpu_strategy.scope():
    
    VERBOSE = 0
    test_preds = []
    kf = KFold(n_splits=7, shuffle=True, random_state=624)
    
    # NOTE: `test_idx` here are the validation indices of the fold, not the test set.
    for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
        X_train, X_valid = train[train_idx], train[test_idx]
        y_train, y_valid = targets[train_idx], targets[test_idx]

        if do_sample_weights == True:
            # Split the weights like the targets and add a trailing axis of size 1.
            y_w_train, y_w_valid = y_weight[train_idx], y_weight[test_idx]
            y_w_train = y_w_train.reshape(y_w_train.shape[0], y_w_train.shape[1], 1)
            y_w_valid = y_w_valid.reshape(y_w_valid.shape[0], y_w_valid.shape[1], 1)
            print(y_train.shape, y_w_train.shape)
        else:
                pass
        
        model = blgtpumodel()
        model.compile(optimizer="adam", loss="mae")

        # Multiply the learning rate by 0.85 after 7 epochs without val_loss improvement.
        lr = ReduceLROnPlateau(monitor="val_loss", factor=0.85, 
                               patience=7, verbose=VERBOSE)
        
        # Save options pinning checkpoint I/O to '/job:localhost'.
        save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
        # Keep only the best (lowest val_loss) model of this fold on disk.
        chk_point = ModelCheckpoint(f'./Bidirect_LSTM_model_{fold+1}C.h5', options=save_locally, 
                                    monitor='val_loss', verbose=VERBOSE, 
                                    save_best_only=True, mode='min')

        # Stop after 30 epochs without improvement and restore the best weights.
        es = EarlyStopping(monitor="val_loss", patience=30, 
                           verbose=VERBOSE, mode="min", 
                           restore_best_weights=True)
        
        # With weighting enabled the validation loss is weighted as well.
        if do_sample_weights == True:
            val_data = (X_valid, y_valid, y_w_valid)
            sw = y_w_train
        else:
            val_data = (X_valid, y_valid)
            sw = None

        model.fit(X_train, y_train, 
                  validation_data=val_data, 
                  sample_weight=sw,
                #   validation_data=(X_valid, y_valid), 
                  epochs=400,
                  verbose=VERBOSE,
                  batch_size=BATCH_SIZE, 
                  callbacks=[lr, chk_point, es])
        
        # NOTE(review): `load_locally` is only needed by the commented-out
        # load_model call below; the in-memory model is used for prediction instead.
        load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
        # model = load_model(f'./Bidirect_LSTM_model_{fold+1}C.h5', options=load_locally)
        
        # Fold score: unweighted MAE over every time step of the validation sequences.
        y_true = y_valid.squeeze().reshape(-1, 1)
        y_pred = model.predict(X_valid, batch_size=BATCH_SIZE).squeeze().reshape(-1, 1)
        score = mean_absolute_error(y_true, y_pred)
        print(f"Fold-{fold+1} - Score: {score}")
        
        # Flattened test predictions of this fold (one value per test row).
        test_preds.append(model.predict(test, batch_size=BATCH_SIZE).squeeze().reshape(-1, 1).squeeze())

        # Uncomment to stop after two folds for a quick run:
        # if fold >= 1:
            # break

In [None]:
#    

## Submit file

In [None]:
## Hopefully no error this time (probably worrying for nothing)
## Started at 17:10

In [None]:
# Load the submission template that the `pressure` column is written into.
sss = pd.read_csv(base_path + 'sample_submission.csv')

In [None]:
sss.head()

In [None]:
# Ensemble: element-wise median across the per-fold test predictions.
sss["pressure"] = np.median(np.vstack(test_preds),axis=0)
print(sss.shape)
sss.head()

In [None]:
# Snap every prediction to the nearest value of the form P_MIN + k * P_STEP.
sss["pressure"] = np.round((sss.pressure - P_MIN)/P_STEP) * P_STEP + P_MIN
print(sss.shape)
sss.head()

In [None]:
# Limit predictions to the pressure range observed in the training targets.
sss["pressure"] = np.clip(sss.pressure, P_MIN, P_MAX)
print(sss.shape)
sss.head()

In [None]:
# Write the same submission table twice under two file names in the working directory.
sss.to_csv('blg_tpu_median_submission.csv', index=False)
sss.to_csv('blg_tpu_sample_kb_submission.csv', index=False)

In [None]:
# Keep the raw per-fold test predictions: one column per fold, one row per test sample.
df_folds = pd.DataFrame(test_preds).T
# Name the columns after the folds that actually ran, so a shortened run
# (fewer than 7 folds) does not fail on a length mismatch.
df_folds.columns = [f'fold_{i + 1}' for i in range(df_folds.shape[1])]
# Written next to the other outputs in the working directory instead of the
# input data directory (`base_path`), which may not be writable.
df_folds.to_csv('./rh_7folds.csv', index=False)
df_folds.shape