In [1]:
# --- Third-party / stdlib imports ---
import pandas as pd
import os
import numpy as np
import sys
# Make the project's ../src directory importable so the `utils` and `models`
# packages below resolve when the notebook is run from its own folder.
module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)
# --- Project-local imports (resolved through the sys.path entry added above) ---
from utils.utils import seed_everything
from utils.scaler import DataScaler
from models.trainer import Trainer
from models.lstm import Model_LSTM
from sklearn.metrics import mean_absolute_error
import wandb
# Folder holding train_folds.csv and test.csv (joined with the file names in run_kfold).
DATA_PATH = '../01.Data'
# When True, run_kfold reads only the first 80*100 rows of each CSV.
DEBUG = False
# NOTE(review): presumably suppresses wandb console output - confirm against the wandb docs.
os.environ['WANDB_SILENT']="True"

In [2]:
# Disabled cell: the whole body is wrapped in a string literal, so nothing is
# loaded here (the cell only echoes the string). run_kfold() reads both CSVs itself.
'''
%%time
train_df = pd.read_csv(os.path.join(DATA_PATH,'train_folds.csv'),nrows=80*100)
if DEBUG:
    train_df = pd.read_csv(os.path.join(DATA_PATH,'train_folds.csv'),nrows=80*100)
else:
    train_df = pd.read_csv(os.path.join(DATA_PATH,'train_folds.csv'))    
test_df  = pd.read_csv(os.path.join(DATA_PATH,'test.csv'))
'''

"\n%%time\ntrain_df = pd.read_csv(os.path.join(DATA_PATH,'train_folds.csv'),nrows=80*100)\nif DEBUG:\n    train_df = pd.read_csv(os.path.join(DATA_PATH,'train_folds.csv'),nrows=80*100)\nelse:\n    train_df = pd.read_csv(os.path.join(DATA_PATH,'train_folds.csv'))    \ntest_df  = pd.read_csv(os.path.join(DATA_PATH,'test.csv'))\n"

In [3]:
def feat_eng(df):
    """Derive per-breath features and one-hot encode the R / C settings.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``time_step``, ``u_in``,
        ``u_out``, ``R`` and ``C``. Any other columns are carried through.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus, in this order:
        ``area``, ``u_in_cumsum``, the lag / lead columns for shifts 1-4,
        the per-breath maxima, the lag differences, the distance of ``u_in``
        from its per-breath max / mean, ``cross``, ``cross2`` and finally the
        dummy columns produced by ``pd.get_dummies`` for ``R``, ``C`` and
        ``R__C``.

    Notes
    -----
    * The caller's frame is left untouched; all work happens on a copy.
    * ``fillna(0)`` is applied to the whole frame, so missing values in the
      original columns are zero-filled too, not only the lag padding.
    * Column order is kept stable because downstream code derives the model's
      feature list from it.
    """
    # Work on a private copy so repeated calls on the same frame are idempotent
    # and the caller never sees half-engineered columns.
    df = df.copy()

    # Running integral of time_step * u_in and running sum of u_in per breath.
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()

    # Lag (shift > 0) and lead (shift < 0) copies of both valve signals.
    # Loop nesting reproduces the order: u_in_lagK, u_out_lagK,
    # u_in_lag_backK, u_out_lag_backK for K = 1..4.
    by_breath = df.groupby('breath_id')
    for step in range(1, 5):
        for tag, offset in ((f'lag{step}', step), (f'lag_back{step}', -step)):
            for signal in ('u_in', 'u_out'):
                df[f'{signal}_{tag}'] = by_breath[signal].shift(offset)

    # Shifts leave NaN at breath boundaries; zero-fill them (applies frame-wide).
    df = df.fillna(0)

    # Regroup on the filled frame before computing per-breath aggregates.
    by_breath = df.groupby('breath_id')
    u_in_max = by_breath['u_in'].transform('max')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = by_breath['u_out'].transform('max')

    for step in (1, 2):
        for signal in ('u_in', 'u_out'):
            df[f'{signal}_diff{step}'] = df[signal] - df[f'{signal}_lag{step}']

    # How far each sample sits below the breath's peak / away from its mean.
    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = by_breath['u_in'].transform('mean') - df['u_in']

    for step in (3, 4):
        for signal in ('u_in', 'u_out'):
            df[f'{signal}_diff{step}'] = df[signal] - df[f'{signal}_lag{step}']

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Treat R and C as categories: cast to string so get_dummies encodes them,
    # and add their combination as a third categorical.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    return pd.get_dummies(df)

In [4]:
def train_fn(train,valid,cfg):
    """Seed the run, build a fresh LSTM and fit it on one train/valid split.

    Parameters
    ----------
    train, valid :
        Training and validation data, forwarded unchanged to ``Trainer.fit``.
    cfg :
        Configuration object; ``cfg.seed`` is read here and the whole object
        is handed to both ``Model_LSTM`` and ``Trainer``.

    Returns
    -------
    Trainer
        The trainer after ``fit`` has returned, so the caller can use it for
        prediction and logging.
    """
    # Re-seed before the model is constructed so every call starts from the same seed.
    seed_everything(cfg.seed)
    trainer = Trainer(config = cfg, model = Model_LSTM(cfg))
    # fit() hands back a value that is reported as the best validation loss.
    best_val_loss = trainer.fit(train, valid)
    print(f'Best Val Loss: {best_val_loss}')
    return trainer

In [5]:
def compute_mae_filtered(df,target = 'pressure',preds = 'oof'):
    """Mean absolute error over the rows whose ``u_out`` is not 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``u_out`` column plus the ``target`` and ``preds`` columns.
    target : str
        Name of the ground-truth column.
    preds : str
        Name of the prediction column.

    Returns
    -------
    The value returned by ``mean_absolute_error`` for the kept rows.
    """
    # Rows with u_out == 1 are excluded from the score.
    scored_rows = df.loc[df['u_out'] != 1]
    y_true = scored_rows[target].values
    y_pred = scored_rows[preds].values
    return mean_absolute_error(y_true, y_pred)

In [6]:
def run_kfold():
    """Train one model per configured fold and save OOF and test predictions.

    Steps:
      1. Load ``train_folds.csv`` / ``test.csv`` from ``DATA_PATH`` (only the
         first 80*100 rows of each when ``DEBUG`` is set) and run ``feat_eng``
         on both.
      2. Derive the feature list from the training frame and scale both frames
         with ``DataScaler``.
      3. For every fold in ``cfg.run_folds``: fit via ``train_fn``, predict the
         held-out rows (stored in ``train_df['oof']``) and predict the test set.
      4. Write ``test_preds.csv`` and ``oof_preds.csv`` under
         ``cfg.output_dir/cfg.experiment_name`` and upload both through the
         last fold's trainer.

    Changes relative to the previous revision:
      * Test predictions are now the *mean* over folds. They used to be the raw
        sum, which made the saved values ``len(run_folds)`` times too large.
      * DEBUG runs go through ``feat_eng`` as well, so they exercise the same
        feature set as a full run.
      * ``oof`` is initialised as float so writing float predictions into it is
        not an incompatible-dtype assignment.
      * The overall OOF metric only scores rows whose fold was actually run, so
        the ``-1`` placeholder never leaks into it.

    Raises
    ------
    ValueError
        If ``cfg.run_folds`` is empty.
    """
    cfg       = Config()
    run_folds = list(cfg.run_folds)
    if not run_folds:
        raise ValueError('cfg.run_folds must contain at least one fold')
    last_fold      = run_folds[-1]
    experiment_dir = os.path.join(cfg.output_dir,cfg.experiment_name)

    # DEBUG only limits the number of rows read; the pipeline is otherwise identical.
    nrows    = 80*100 if DEBUG else None
    train_df = feat_eng(pd.read_csv(os.path.join(DATA_PATH,'train_folds.csv'),nrows=nrows))
    test_df  = feat_eng(pd.read_csv(os.path.join(DATA_PATH,'test.csv'),nrows=nrows))
    # Placeholder target so the test frame has the same columns as train.
    test_df['pressure'] = -1

    # Every column except identifiers, target, u_out and the fold label is a model input.
    cfg.cols       = train_df.drop(columns = ['id','breath_id','pressure','u_out','fold']).columns.to_list()
    cfg.input_size = len(cfg.cols)
    # A one-hot column can be absent from test when a category does not occur
    # there (likely on small DEBUG samples); add it as all-zero so both frames
    # expose the same feature columns.
    for col in cfg.cols:
        if col not in test_df.columns:
            test_df[col] = 0

    # Scaling Dataset
    scaler   = DataScaler(train_df,sc_name = cfg.sc_name,cols = cfg.cols)
    train_df = scaler.transform(train_df)
    test_df  = scaler.transform(test_df)

    # Accumulators: summed test predictions and the out-of-fold column.
    preds_kfold = 0
    train_df['oof'] = -1.0
    for fold in run_folds:
        print("***********************************************")
        print(f"**************** FOLD : {fold} *********************")
        print("***********************************************")
        is_last_fold    = fold == last_fold
        cfg.fold        = fold
        cfg.output_path = os.path.join(experiment_dir,f'fold_{fold}')
        model_path      = os.path.join(cfg.output_path,'model.pt')

        # Training Dataset
        train = train_df[train_df['fold']!=fold].reset_index(drop = True)
        # Valid Dataset: remember the original row labels so predictions can be
        # written back into train_df after the index is reset.
        valid       = train_df[train_df['fold']==fold]
        valid_index = valid.index.to_list()
        valid       = valid.reset_index(drop = True)

        # Trainer Part
        trainer = train_fn(train,valid,cfg)

        # Valid Preds
        _, valid_preds = trainer.predict(valid,model_path)
        valid['oof'] = valid_preds.reshape(-1)
        print('Valid Metric: ',compute_mae_filtered(valid[['pressure','u_out','oof']]))
        train_df.loc[valid_index,'oof'] = valid_preds.reshape(-1)
        if is_last_fold:
            # Score only rows that received a real prediction.
            scored     = train_df[train_df['fold'].isin(run_folds)]
            oof_metric = compute_mae_filtered(scored[['pressure','u_out','oof']])
            trainer._log({'oof_metric':oof_metric})

        # Test Preds
        _, test_preds = trainer.predict(test_df,model_path)
        preds_kfold   = preds_kfold + test_preds

        # Keep the last trainer alive: it is used below for uploads and _finish().
        if not is_last_fold:
            del trainer
        del train,valid

    # Saving test preds (mean over the folds that were run)
    test_df['preds'] = (preds_kfold / len(run_folds)).reshape(-1)
    test_df[['id','breath_id','preds']].to_csv(os.path.join(experiment_dir,'test_preds.csv'),index = False)
    trainer._upload_df(name = 'preds', data = test_df[['id','breath_id','preds']])
    # Saving oof predictions
    train_df[['id','breath_id','oof']].to_csv(os.path.join(experiment_dir,'oof_preds.csv'),index = False)
    trainer._upload_df(name = 'oof_train', data = train_df[['id','breath_id','oof']])
    trainer._finish()
    del trainer

In [7]:
class Config():
    """Settings for the LSTM baseline experiment.

    All values are class attributes. ``run_kfold`` instantiates the class and
    then sets ``cols``, ``input_size``, ``fold`` and ``output_path`` on the
    instance before training each fold.
    """
    # ---------------- General ----------------
    seed      = 42                 # passed to seed_everything in train_fn
    logging   = True               # NOTE(review): presumably toggles experiment logging in Trainer - confirm
    run_folds = list(range(5))     # folds iterated by run_kfold: 0..4

    # ---------------- Model ------------------
    input_size    = -1             # placeholder; run_kfold sets it to len(cfg.cols)
    hidden_size   = 300
    num_layers    = 4
    dropout       = 0.0
    bidirectional = True
    logit_dim     = 50

    # ---------------- Training ---------------
    epochs      = 100
    device      = 'cuda'
    lr          = 0.001
    batch_size  = 512
    num_workers = 64               # NOTE(review): presumably data-loader workers - confirm in Trainer
    sc_name     = 'Robust'         # scaler name handed to DataScaler in run_kfold

    # ---------------- Early stopping ---------
    early_stopping = 5
    mode           = 'min'

    # ---------------- Loss -------------------
    loss_params = dict(name='MAE_FILTERED')

    # ---------------- Optimizer --------------
    optimizer_params = dict(name='Adam', WD=0.0)

    # ---------------- Scheduler --------------
    # Original note: "Mode: ['batch','epoch']" - presumably the values
    # accepted by 'step_on'; verify against Trainer.
    scheduler_params = dict(name=None, step_on=None, patience=None, step_metric=None)

    # ---------------- Logging / saving -------
    project_name    = 'Ventilator-Kaggle'
    experiment_name = 'baseline_lstm'
    fold            = None         # set per fold by run_kfold
    output_dir      = '../03.SavedModels' # Relative to trainer path

In [8]:
# Entry point: runs the full k-fold training / prediction pipeline when the
# notebook (or an exported script) is executed as the main module.
if __name__ == '__main__':
    run_kfold()

***********************************************
**************** FOLD : 0 *********************
***********************************************
**** Training **** 


100%|██████████| 118/118 [00:42<00:00,  2.76it/s, LR=0.001, Train_Loss=5.98]


**** Validation ****


100%|██████████| 30/30 [00:12<00:00,  2.33it/s, Eval_Loss=2.46]


Validation score improved (inf --> 2.4609090089797974). Saving model!
**** Training **** 


100%|██████████| 118/118 [00:43<00:00,  2.71it/s, LR=0.001, Train_Loss=2.09]


**** Validation ****


100%|██████████| 30/30 [00:12<00:00,  2.33it/s, Eval_Loss=2.02]


Validation score improved (2.4609090089797974 --> 2.0151629368464152). Saving model!
**** Training **** 


100%|██████████| 118/118 [00:42<00:00,  2.75it/s, LR=0.001, Train_Loss=1.53]


**** Validation ****


100%|██████████| 30/30 [00:12<00:00,  2.33it/s, Eval_Loss=1.18]


Validation score improved (2.0151629368464152 --> 1.1764485637346904). Saving model!
**** Training **** 


100%|██████████| 118/118 [00:42<00:00,  2.76it/s, LR=0.001, Train_Loss=1.15]


**** Validation ****


100%|██████████| 30/30 [00:14<00:00,  2.09it/s, Eval_Loss=0.974]


Validation score improved (1.1764485637346904 --> 0.9741322894891103). Saving model!
**** Training **** 


100%|██████████| 118/118 [00:42<00:00,  2.76it/s, LR=0.001, Train_Loss=0.981]


**** Validation ****


100%|██████████| 30/30 [00:13<00:00,  2.27it/s, Eval_Loss=0.869]


Validation score improved (0.9741322894891103 --> 0.8691904664039611). Saving model!
Didn't meet early stopping
Best Val Loss: 0.8691904664039611


100%|██████████| 30/30 [00:09<00:00,  3.20it/s]


Valid Metric:  0.8737820470204125


100%|██████████| 99/99 [00:16<00:00,  5.94it/s]


***********************************************
**************** FOLD : 1 *********************
***********************************************
**** Training **** 


100%|██████████| 118/118 [00:45<00:00,  2.62it/s, LR=0.001, Train_Loss=5.95]


**** Validation ****


100%|██████████| 30/30 [00:14<00:00,  2.10it/s, Eval_Loss=2.47]


Validation score improved (inf --> 2.4658446391423543). Saving model!
**** Training **** 


100%|██████████| 118/118 [00:44<00:00,  2.66it/s, LR=0.001, Train_Loss=2.08]


**** Validation ****


100%|██████████| 30/30 [00:14<00:00,  2.11it/s, Eval_Loss=1.82]


Validation score improved (2.4658446391423543 --> 1.81582111120224). Saving model!
**** Training **** 


100%|██████████| 118/118 [00:44<00:00,  2.66it/s, LR=0.001, Train_Loss=1.5] 


**** Validation ****


100%|██████████| 30/30 [00:14<00:00,  2.10it/s, Eval_Loss=1.36]


Validation score improved (1.81582111120224 --> 1.359883721669515). Saving model!
**** Training **** 


100%|██████████| 118/118 [00:44<00:00,  2.65it/s, LR=0.001, Train_Loss=1.1] 


**** Validation ****


  0%|          | 0/30 [00:07<?, ?it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-5f2a284301f0>", line 2, in <module>
    run_kfold()
  File "<ipython-input-6-94e919434f59>", line 33, in run_kfold
    trainer   = train_fn(train,valid,cfg)
  File "<ipython-input-4-695a06277c97>", line 6, in train_fn
    best_val_loss = trainer.fit(train,valid)
  File "../src/models/trainer.py", line 107, in fit
    valid_loss = self.valid_fn(valid_loader)
  File "../src/models/trainer.py", line 70, in valid_fn
    for b_idx,data in enumerate(tk0):
  File "/usr/local/lib/python3.6/dist-packages/tqdm/std.py", line 1129, in __iter__
    for obj in iterable:
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 352, in __iter__
    return self._get_iterator()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line

KeyboardInterrupt: 