In [1]:
# --- Standard library ---
import os
import sys

# --- Third-party ---
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

# --- Project-local code ---
# The helpers below live under ../src, so that directory has to be on
# sys.path before the `utils.*` imports run.
module_path = "../src"
if module_path not in sys.path:
    sys.path.append(module_path)

from utils.utils import seed_everything
from utils.boosting import Training_Lightgbm
from utils.encoding import ClassicEncoding

# --- General parameters ---
# Directory holding one sub-folder per saved model.
MODEL_PATH = '../03.SavedModels'
# Sub-folders of MODEL_PATH whose `oof_preds.csv` files are merged in later.
MODEL_NAMES = [
    'baseline_lstm_v7',
    'baseline_lstm_v14',
    'baseline_lstm_v15',
]
# Seed forwarded to LightGBM as `bagging_seed`.
SEED = 42

In [2]:
def feat_eng(df):
    """Add per-breath engineered features and one-hot encode R / C.

    The input frame is copied first, so the caller's DataFrame is never
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``time_step``, ``u_in``,
        ``u_out``, ``R`` and ``C``. Other columns are carried along; note
        that NaNs in *every* column are replaced by 0 and that every
        object/category column is one-hot encoded by ``pd.get_dummies``.

    Returns
    -------
    pandas.DataFrame
        A new frame with, per ``breath_id`` group:

        * ``area`` - cumulative sum of ``time_step * u_in``
        * ``u_in_cumsum`` - cumulative sum of ``u_in``
        * ``u_in_lag{k}`` / ``u_out_lag{k}`` - value k rows earlier, k = 1..4
        * ``u_in_lag_back{k}`` / ``u_out_lag_back{k}`` - value k rows later
        * ``breath_id__u_in__max`` / ``breath_id__u_out__max`` - group maxima
        * ``u_in_diff{k}`` / ``u_out_diff{k}`` - current value minus lag k
        * ``breath_id__u_in__diffmax`` / ``breath_id__u_in__diffmean`` -
          group max / mean of ``u_in`` minus the current ``u_in``
        * ``cross`` (``u_in * u_out``) and ``cross2`` (``time_step * u_out``)
        * ``R_indx`` / ``C_indx`` - the original ``R`` / ``C`` values
        * dummy columns for ``R``, ``C`` and their combination ``R__C``
    """
    # Work on a copy: the column assignments below would otherwise leak
    # half-built (NaN-containing) features into the caller's frame.
    df = df.copy()
    breath = df['breath_id']

    # Cumulative quantities within each breath.
    df['area'] = (df['time_step'] * df['u_in']).groupby(breath).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(breath).cumsum()

    # Backward (lag) and forward (lag_back) shifted copies of the two
    # control signals; shifting per breath keeps values from crossing
    # breath boundaries.
    u_in_by_breath = df['u_in'].groupby(breath)
    u_out_by_breath = df['u_out'].groupby(breath)
    for step in (1, 2, 3, 4):
        df[f'u_in_lag{step}'] = u_in_by_breath.shift(step)
        df[f'u_out_lag{step}'] = u_out_by_breath.shift(step)
        df[f'u_in_lag_back{step}'] = u_in_by_breath.shift(-step)
        df[f'u_out_lag_back{step}'] = u_out_by_breath.shift(-step)

    # The shifts leave NaN at the breath edges; treat "no neighbour" as 0.
    # This fills NaNs in all columns, not only the shifted ones.
    df = df.fillna(0)

    # Re-derive the groupers from the filled frame.
    breath = df['breath_id']
    u_in = df['u_in']
    u_in_by_breath = u_in.groupby(breath)
    u_in_max = u_in_by_breath.transform('max')
    u_in_mean = u_in_by_breath.transform('mean')

    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = df['u_out'].groupby(breath).transform('max')

    # Differences are taken after the fill, so at the start of a breath
    # they equal the raw value (value - 0).
    for step in (1, 2):
        df[f'u_in_diff{step}'] = u_in - df[f'u_in_lag{step}']
        df[f'u_out_diff{step}'] = df['u_out'] - df[f'u_out_lag{step}']

    # Distance of the current u_in from the breath-level max / mean
    # (computed once; the grouped transforms are reused from above).
    df['breath_id__u_in__diffmax'] = u_in_max - u_in
    df['breath_id__u_in__diffmean'] = u_in_mean - u_in

    for step in (3, 4):
        df[f'u_in_diff{step}'] = u_in - df[f'u_in_lag{step}']
        df[f'u_out_diff{step}'] = df['u_out'] - df[f'u_out_lag{step}']

    # Interaction terms with u_out.
    df['cross'] = u_in * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Keep the raw R / C values, then turn R, C and their combination into
    # strings so that get_dummies one-hot encodes them.
    df['R_indx'] = df['R']
    df['C_indx'] = df['C']
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    df = pd.get_dummies(df)
    return df

In [3]:
%%time 
# Training:
train_df = pd.read_csv('../01.Data/train_folds.csv')
train_df = feat_eng(train_df)
train_df = train_df[train_df['u_out']==0].reset_index(drop = True)
for name in MODEL_NAMES:
    train_df = train_df.merge(pd.read_csv(os.path.join(MODEL_PATH,name,'oof_preds.csv')).rename(columns = {'oof':'preds'}),how = 'left',on = ['id','breath_id'])

CPU times: user 28.7 s, sys: 12 s, total: 40.7 s
Wall time: 41.3 s


In [None]:
# Identifier-like columns; only referenced by the commented-out
# "use every feature" alternative below.
drop_cols = ['id', 'breath_id', 'R_indx', 'C_indx']

# Columns handed to the trainer (features plus the target and fold columns).
# Alternative: train_df.drop(columns = drop_cols).columns.to_list()
columns_modeling = ['preds', 'pressure', 'u_in_cumsum', 'fold']

# Any column whose name contains 'label' is passed as a categorical variable.
cat_columns = [col for col in train_df.columns if 'label' in col]

# LightGBM parameters: Huber objective, reported with the MAE metric.
params = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "huber",
    "alpha": 0.2,
    "metric": "mae",
    "num_leaves": 64,
    "learning_rate": 0.1,
    # "min_child_samples": 300,
    "max_depth": 8,
    "feature_fraction": 0.75,
    "bagging_freq": 1,
    "bagging_fraction": 0.75,
    # "num_threads": 18,
    "bagging_seed": SEED,
    "lambda_l1": 1,
    "lambda_l2": 1,
    "verbose": 1,
}

# Training_Lightgbm is a project helper (utils.boosting); judging by its log
# output it trains per fold using `fold_column` - confirm against the helper.
results, models, importances, oof, feature_list = Training_Lightgbm(
    train_df[columns_modeling],
    params,
    fold_column = 'fold',
    target_column = 'pressure',
    cat_vars = cat_columns,
    metric = 'MAE',
    early_stopping = 200,
    max_boost_round = 2 * 8000,
)

Columns: ['preds', 'u_in_cumsum']
Cat index: []
---------- Training fold Nº 1 ----------
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1832862, number of used features: 2
[LightGBM] [Info] Start training from score 17.605154
Training until validation scores don't improve for 200 rounds
[50]	training's l1: 7.01293	valid_1's l1: 7.01286
[100]	training's l1: 6.44891	valid_1's l1: 6.45483
[150]	training's l1: 5.93404	valid_1's l1: 5.94517
[200]	training's l1: 5.46741	valid_1's l1: 5.48324
[250]	training's l1: 5.04505	valid_1's l1: 5.06512
[300]	training's l1: 4.66484	valid_1's l1: 4.68873
[350]	training's l1: 4.3256	valid_1's l1: 4.3523
[400]	training's l1: 4.03198	valid_1's l1: 4.06094
[450]	training's l1: 3.78687	valid_1's l1: 3.81753
[500]	training's l1: 3.58661	valid_1's l1: 3.61836
[550]	training's l1: 3.4324	valid_1's l

[1000]	training's l1: 3.13625	valid_1's l1: 3.17373
[1050]	training's l1: 3.12742	valid_1's l1: 3.16538
[1100]	training's l1: 3.11997	valid_1's l1: 3.15827
[1150]	training's l1: 3.11377	valid_1's l1: 3.15233
[1200]	training's l1: 3.10855	valid_1's l1: 3.14728
[1250]	training's l1: 3.10419	valid_1's l1: 3.14308
[1300]	training's l1: 3.1003	valid_1's l1: 3.13934
[1350]	training's l1: 3.09716	valid_1's l1: 3.13636
[1400]	training's l1: 3.09454	valid_1's l1: 3.1339
[1450]	training's l1: 3.0922	valid_1's l1: 3.13173
[1500]	training's l1: 3.09007	valid_1's l1: 3.12979
[1550]	training's l1: 3.08841	valid_1's l1: 3.12826
[1600]	training's l1: 3.08712	valid_1's l1: 3.12705
[1650]	training's l1: 3.08611	valid_1's l1: 3.12608
[1700]	training's l1: 3.08529	valid_1's l1: 3.12532
[1750]	training's l1: 3.08463	valid_1's l1: 3.12474
[1800]	training's l1: 3.08408	valid_1's l1: 3.12429
[1850]	training's l1: 3.08362	valid_1's l1: 3.1239
[1900]	training's l1: 3.08322	valid_1's l1: 3.12359
[1950]	training'

[900]	training's l1: 3.17227	valid_1's l1: 3.1541
[950]	training's l1: 3.15891	valid_1's l1: 3.14163
[1000]	training's l1: 3.14811	valid_1's l1: 3.13161
[1050]	training's l1: 3.13919	valid_1's l1: 3.12334
[1100]	training's l1: 3.13165	valid_1's l1: 3.11642
[1150]	training's l1: 3.12537	valid_1's l1: 3.11069
[1200]	training's l1: 3.12003	valid_1's l1: 3.10579
[1250]	training's l1: 3.11561	valid_1's l1: 3.1017
[1300]	training's l1: 3.11166	valid_1's l1: 3.09805
[1350]	training's l1: 3.10841	valid_1's l1: 3.09507
[1400]	training's l1: 3.10576	valid_1's l1: 3.09269
[1450]	training's l1: 3.10342	valid_1's l1: 3.09061
[1500]	training's l1: 3.10127	valid_1's l1: 3.08873
[1550]	training's l1: 3.09945	valid_1's l1: 3.08712
[1600]	training's l1: 3.09806	valid_1's l1: 3.08591
[1650]	training's l1: 3.09696	valid_1's l1: 3.08497
[1700]	training's l1: 3.0961	valid_1's l1: 3.0842
[1750]	training's l1: 3.09541	valid_1's l1: 3.08358
[1800]	training's l1: 3.09484	valid_1's l1: 3.08308
[1850]	training's 