In [37]:
import numpy as np
import pandas as pd
from pathlib import Path
import gc
import sys
import json
import warnings

import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import RobustScaler

sys.path.append(str(Path().resolve().parent))

from src.utils import (
    seed_every_thing,
    fetch_data,
    Config,
    plot_metric,
    reduce_tf_gpu_memory,
    reduce_mem_usage,
    fetch_custom_data,
    CustomL1Loss
)
warnings.simplefilter('ignore')
pd.options.display.max_columns = 200

In [38]:
def find_R(n):
    if n in [0, 1, 2]:
        return 0
    elif n in [3, 4, 5]:
        return 2
    elif n in [6, 7, 8]:
        return 1
    else:
        raise Exception()

def find_C(n):
    if n in [0, 3, 6]:
        return 0
    elif n in [1, 4, 7]:
        return 1
    elif n in [2, 5, 8]:
        return 2
    else:
        raise Exception()
        
def build_model(config: Config, n_features) -> keras.models.Sequential:
    model = keras.models.Sequential([keras.layers.Input(shape=(config.cut, n_features))])
    for n_unit in config.n_units:
        model.add(
            keras.layers.Bidirectional(
                keras.layers.LSTM(
                    n_unit,
                    return_sequences=True,
                )
            )
        )
    for n_unit in config.n_dense_units:
        model.add(keras.layers.Dense(n_unit, activation="selu"))
    model.add(keras.layers.Dense(1))

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=config.lr), loss="mae")
    return model


def compute_metric(df, merge=True):
    y_true, y_pred, u_out = df["pressure"].values, df["pred"].values, df["u_out"].values
    w = (1 - u_out)
    
    if merge:
        mae = w * np.abs(y_true - y_pred)
        mae = np.sum(mae) / np.sum(w)
        return mae
    else:
        y_true, y_pred, w = y_true.reshape(-1, 35), y_pred.reshape(-1, 35), w.reshape(-1, 35)
        maes = []
        for yt, yp, w_ in zip(y_true, y_pred, w):
            mae = w_ * np.abs(yt - yp)
            mae = np.sum(mae) / np.sum(w_)
            maes.append(mae)
        return np.array(maes)

In [39]:
basedir = Path().resolve().parent
logdir = basedir / "logs"
datadir = basedir / "data"
cachedir = basedir / "cache"

In [40]:
cfg_file_path = logdir / "lstm-less-addfeatures-cut-large-custom-mae" / "config.json"
with open(cfg_file_path, "rb") as f:
    config = json.load(f)
config = Config().update(config)

In [41]:
# rc_preds = pd.read_csv(logdir / "cnn-classify-rc-reliable" / "valid_preds.csv")
# rc_test_preds = pd.read_csv(logdir / "cnn-classify-rc-reliable" / "test_preds.csv")

rc_preds = pd.read_csv(logdir / "cnn-classify-rc-reliable_3" / "valid_preds.csv")
rc_test_preds = pd.read_csv(logdir / "cnn-classify-rc-reliable_3" / "test_preds.csv")

In [7]:
valid_preds = pd.read_csv(logdir / "lstm-less-addfeatures-cut-large-custom-mae" / "valid_preds.csv", index_col=0)

In [9]:
train = pd.read_csv(cachedir / "train_lstm-less-cut-addfeatures_debugFalse.csv")
train.head()

Unnamed: 0,time_step,u_in,u_out,pressure,kfold,count,corss,cross2,cross3,time_delta,time_step_cumsum,u_in_cumsum,u_in_cummean,u_in_lag_b1,u_out_lag_b1,u_in_lag_b2,u_out_lag_b2,u_in_lag_b3,u_out_lag_b3,u_in_lag_b4,u_out_lag_b4,u_in_lag_b5,u_out_lag_b5,u_in_lag_f1,u_out_lag_f1,u_in_lag_f2,u_out_lag_f2,u_in_lag_f3,u_out_lag_f3,u_in_lag_f4,u_out_lag_f4,u_in_lag_f5,u_out_lag_f5,u_in_diff_b1,u_out_diff_b1,u_in_diff_b2,u_out_diff_b2,u_in_diff_b3,u_out_diff_b3,u_in_diff_b4,u_out_diff_b4,u_in_diff_b5,u_out_diff_b5,u_in_diff_f1,u_out_diff_f1,u_in_diff_f2,u_out_diff_f2,u_in_diff_f3,u_out_diff_f3,u_in_diff_f4,u_out_diff_f4,u_in_diff_f5,u_out_diff_f5,u_in_bwindow_mean,u_in_bwindow_max,u_in_bwindow_min,u_in_bwindow_std,u_in_fwindow_mean,u_in_fwindow_max,u_in_fwindow_min,u_in_fwindow_std,u_in_cwindow_mean,u_in_cwindow_max,u_in_cwindow_min,u_in_cwindow_std,u_in_bwindow_ewm,u_in_fwindow_ewm,u_in_cwindow_ewm,u_in_bwindow_mean_diff,u_in_bwindow_max_diff,u_in_bwindow_min_diff,u_in_fwindow_mean_diff,u_in_fwindow_max_diff,u_in_fwindow_min_diff,u_in_cwindow_mean_diff,u_in_cwindow_max_diff,u_in_cwindow_min_diff,area,area_insp,area_insp_last,u_in_max,u_in_mean,u_in_std,area_max,area_insp_max,area_mean,area_insp_mean,vibs,RC_20_10,RC_20_20,RC_20_50,RC_50_10,RC_50_20,RC_50_50,RC_5_10,RC_5_20,RC_5_50,R_20,R_5,R_50,C_10,C_20,C_50,norm_time_step,u_in_max_diff,area_max_diff,area_insp_max_diff,u_in_mean_diff,area_mean_diff,area_insp_mean_diff
0,0.0,0.0833,0,5.836,4,1,0.0,0.0,0.0,0.0,0.0,0.0833,0.0833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.39,0.0,22.52,0.0,22.81,0.0,25.36,0.0,27.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-18.3,0.0,-22.42,0.0,-22.72,0.0,-25.27,0.0,-27.17,0.0,0.013885,0.0833,0.0,0.03105,19.4,27.27,0.0833,9.06,5.12,22.52,0.0,8.91,0.05957,26.45,20.69,0.06946,0.0,0.0833,-19.31,-27.17,0.0,-5.04,-22.42,0.0833,0.0,0.0,22.6,28.31,18.95,9.52,22.6,22.6,13.58,13.58,13,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0.0,-28.23,-22.6,-22.6,-18.88,-13.58,-13.58
1,0.03366,18.39,0,5.906,4,2,0.0,0.03366,0.0,0.03366,0.03366,18.47,9.234,0.0833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.52,0.0,22.81,0.0,25.36,0.0,27.27,0.0,27.12,0.0,18.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.125,0.0,-4.426,0.0,-6.973,0.0,-8.875,0.0,-8.74,0.0,3.078,18.39,0.0,6.844,23.9,27.27,18.39,3.092,7.973,22.81,0.0,10.34,13.16,26.94,22.34,15.305,0.0,18.39,-5.523,-8.875,0.0,10.41,-4.426,18.39,0.6187,0.6187,22.6,28.31,18.95,9.52,22.6,22.6,13.58,13.58,13,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0.02907,-9.93,-21.97,-21.97,-0.5767,-12.96,-12.96
2,0.0675,22.52,0,7.875,4,3,0.0,0.0675,0.0,0.03387,0.1012,40.97,13.66,18.39,0.0,0.0833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.81,0.0,25.36,0.0,27.27,0.0,27.12,0.0,26.81,0.0,4.125,0.0,22.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.2996,0.0,-2.846,0.0,-4.75,0.0,-4.617,0.0,-4.297,0.0,6.83,22.52,0.0,9.7,25.31,27.27,22.52,1.977,11.14,25.36,0.0,11.26,19.84,26.84,24.69,15.68,0.0,22.52,-2.803,-4.75,0.0,11.37,-2.846,22.52,1.381,1.381,22.6,28.31,18.95,9.52,22.6,22.6,13.58,13.58,13,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0.05832,-5.805,-21.22,-21.22,3.549,-12.195,-12.195
3,0.10156,22.81,0,11.74,4,4,0.0,0.10156,0.0,0.03403,0.2028,63.78,15.945,22.52,0.0,18.39,0.0,0.0833,0.0,0.0,0.0,0.0,0.0,25.36,0.0,27.27,0.0,27.12,0.0,26.81,0.0,27.86,0.0,0.2996,0.0,4.426,0.0,22.72,0.0,0.0,0.0,0.0,0.0,-2.547,0.0,-4.45,0.0,-4.32,0.0,-3.998,0.0,-5.055,0.0,10.63,22.81,0.0,10.695,26.2,27.86,22.81,1.7,14.55,27.27,0.0,11.49,21.97,27.58,26.69,12.18,0.0,22.81,-3.395,-5.055,0.0,8.26,-4.45,22.81,2.156,2.156,22.6,28.31,18.95,9.52,22.6,22.6,13.58,13.58,13,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0.0877,-5.504,-20.44,-20.44,3.85,-11.42,-11.42
4,0.1357,25.36,0,12.234,4,5,0.0,0.1357,0.0,0.0342,0.3384,89.1,17.83,22.81,0.0,22.52,0.0,18.39,0.0,0.0833,0.0,0.0,0.0,27.27,0.0,27.12,0.0,26.81,0.0,27.86,0.0,28.31,0.0,2.547,0.0,2.846,0.0,6.973,0.0,25.27,0.0,0.0,0.0,-1.904,0.0,-1.771,0.0,-1.452,0.0,-2.51,0.0,-2.957,0.0,14.86,25.36,0.0,10.67,27.12,28.31,25.36,0.9316,17.94,27.27,0.0,10.67,24.4,28.11,27.03,10.5,0.0,25.36,-1.766,-2.957,0.0,7.414,-1.904,25.36,3.025,3.025,22.6,28.31,18.95,9.52,22.6,22.6,13.58,13.58,13,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0.11725,-2.957,-19.56,-19.56,6.395,-10.555,-10.555


In [16]:
rc_cols = [c for c in train.columns if "RC_" in c]
df = train[["u_out", "pressure"] + rc_cols]
df["RC"] = df[rc_cols].values.argmax(axis=1)
df.drop(rc_cols, axis=1, inplace=True)
df["pred"] = valid_preds.values.reshape(-1)
df.head()

Unnamed: 0,u_out,pressure,RC,pred
0,0,5.836,2,5.797
1,0,5.906,2,5.88
2,0,7.875,2,7.906
3,0,11.74,2,11.87
4,0,12.234,2,12.37


In [42]:
df["RC_pred"] = np.tile(rc_preds, (1, 35)).reshape(-1, rc_preds.shape[1]).argmax(axis=1)

In [43]:
print(compute_metric(df))

print(compute_metric(df[df.RC == df.RC_pred]))

print(compute_metric(df[df.RC == df.RC_pred]))

0.16571056865473463
0.15291127172745936
0.15291127172745936


In [53]:
test = pd.read_csv(cachedir / "test_lstm-less-cut-addfeatures_debugFalse.csv")
test.head()

Unnamed: 0,time_step,u_in,u_out,count,pressure,corss,cross2,cross3,time_delta,time_step_cumsum,u_in_cumsum,u_in_cummean,u_in_lag_b1,u_out_lag_b1,u_in_lag_b2,u_out_lag_b2,u_in_lag_b3,u_out_lag_b3,u_in_lag_b4,u_out_lag_b4,u_in_lag_b5,u_out_lag_b5,u_in_lag_f1,u_out_lag_f1,u_in_lag_f2,u_out_lag_f2,u_in_lag_f3,u_out_lag_f3,u_in_lag_f4,u_out_lag_f4,u_in_lag_f5,u_out_lag_f5,u_in_diff_b1,u_out_diff_b1,u_in_diff_b2,u_out_diff_b2,u_in_diff_b3,u_out_diff_b3,u_in_diff_b4,u_out_diff_b4,u_in_diff_b5,u_out_diff_b5,u_in_diff_f1,u_out_diff_f1,u_in_diff_f2,u_out_diff_f2,u_in_diff_f3,u_out_diff_f3,u_in_diff_f4,u_out_diff_f4,u_in_diff_f5,u_out_diff_f5,u_in_bwindow_mean,u_in_bwindow_max,u_in_bwindow_min,u_in_bwindow_std,u_in_fwindow_mean,u_in_fwindow_max,u_in_fwindow_min,u_in_fwindow_std,u_in_cwindow_mean,u_in_cwindow_max,u_in_cwindow_min,u_in_cwindow_std,u_in_bwindow_ewm,u_in_fwindow_ewm,u_in_cwindow_ewm,u_in_bwindow_mean_diff,u_in_bwindow_max_diff,u_in_bwindow_min_diff,u_in_fwindow_mean_diff,u_in_fwindow_max_diff,u_in_fwindow_min_diff,u_in_cwindow_mean_diff,u_in_cwindow_max_diff,u_in_cwindow_min_diff,area,area_insp,area_insp_last,u_in_max,u_in_mean,u_in_std,area_max,area_insp_max,area_mean,area_insp_mean,vibs,RC_20_10,RC_20_20,RC_20_50,RC_50_10,RC_50_20,RC_50_50,RC_5_10,RC_5_20,RC_5_50,R_20,R_5,R_50,C_10,C_20,C_50,norm_time_step,u_in_max_diff,area_max_diff,area_insp_max_diff,u_in_mean_diff,area_mean_diff,area_insp_mean_diff
0,0.0,0.0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.516,0.0,14.65,0.0,21.23,0.0,26.33,0.0,30.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.516,0.0,-14.65,0.0,-21.23,0.0,-26.33,0.0,-30.48,0.0,0.0,0.0,0.0,0.0,16.7,30.48,0.0,10.57,2.771,14.65,0.0,5.12,0.0,28.69,12.695,0.0,0.0,0.0,-16.7,-30.48,0.0,-2.771,-14.65,0.0,0.0,0.0,19.48,37.53,17.45,14.77,19.48,19.48,13.09,13.09,3,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0.0,-37.53,-19.48,-19.48,-17.45,-13.09,-13.09
1,0.0319,7.516,0,2,0,0.0,0.0319,0.0,0.0319,0.0319,7.516,3.758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.65,0.0,21.23,0.0,26.33,0.0,30.48,0.0,33.53,0.0,7.516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.137,0.0,-13.72,0.0,-18.81,0.0,-22.97,0.0,-26.03,0.0,1.253,7.516,0.0,2.8,22.3,33.53,7.516,9.016,5.426,21.23,0.0,7.797,5.37,32.16,19.33,6.26,0.0,7.516,-14.77,-26.03,0.0,2.09,-13.72,7.516,0.2397,0.2397,19.48,37.53,17.45,14.77,19.48,19.48,13.09,13.09,3,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0.02942,-30.03,-19.25,-19.25,-9.94,-12.85,-12.85
2,0.06384,14.65,0,3,0,0.0,0.06384,0.0,0.03192,0.0957,22.17,7.39,7.516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.23,0.0,26.33,0.0,30.48,0.0,33.53,0.0,35.72,0.0,7.137,0.0,14.65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.58,0.0,-11.67,0.0,-15.836,0.0,-18.89,0.0,-21.06,0.0,3.695,14.65,0.0,5.617,27.0,35.72,14.65,7.27,8.71,26.33,0.0,10.05,12.01,34.72,24.77,10.96,0.0,14.65,-12.34,-21.06,0.0,5.938,-11.67,14.65,0.7075,0.7075,19.48,37.53,17.45,14.77,19.48,19.48,13.09,13.09,3,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0.05884,-22.89,-18.78,-18.78,-2.799,-12.38,-12.38
3,0.09576,21.23,0,4,0,0.0,0.09576,0.0,0.03192,0.1915,43.4,10.85,14.65,0.0,7.516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.33,0.0,30.48,0.0,33.53,0.0,35.72,0.0,36.97,0.0,6.58,0.0,13.72,0.0,21.23,0.0,0.0,0.0,0.0,0.0,-5.09,0.0,-9.26,0.0,-12.31,0.0,-14.484,0.0,-15.74,0.0,7.234,21.23,0.0,8.25,30.72,36.97,21.23,5.5,12.52,30.48,0.0,11.664,18.61,36.34,29.22,14.0,0.0,21.23,-9.484,-15.74,0.0,8.7,-9.26,21.23,1.385,1.385,19.48,37.53,17.45,14.77,19.48,19.48,13.09,13.09,3,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0.08826,-16.31,-18.11,-18.11,3.78,-11.71,-11.71
4,0.1277,26.33,0,5,0,0.0,0.1277,0.0,0.0319,0.319,69.7,13.945,21.23,0.0,14.65,0.0,7.516,0.0,0.0,0.0,0.0,0.0,30.48,0.0,33.53,0.0,35.72,0.0,36.97,0.0,37.53,0.0,5.09,0.0,11.67,0.0,18.81,0.0,26.33,0.0,0.0,0.0,-4.164,0.0,-7.227,0.0,-9.4,0.0,-10.65,0.0,-11.22,0.0,11.62,26.33,0.0,10.04,33.44,37.53,26.33,3.955,16.72,33.53,0.0,12.414,24.12,37.22,32.6,14.7,0.0,26.33,-7.11,-11.22,0.0,9.6,-7.227,26.33,2.225,2.225,19.48,37.53,17.45,14.77,19.48,19.48,13.09,13.09,3,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0.1177,-11.22,-17.27,-17.27,8.87,-10.87,-10.87


In [54]:
train_ = train.copy()

In [56]:
features = np.array(list(train.drop(["kfold", "pressure"], axis=1).columns))
cont_features = [
    f
    for f in features
    if ("RC_" not in f) and ("u_out" not in f) and ("R_" not in f) and ("C_" not in f) and ("u_out" not in f)
]

RS = RobustScaler()
train[cont_features] = RS.fit_transform(train[cont_features])
test[cont_features] = RS.transform(test[cont_features])

In [57]:
train_data, test_data = train[features].values, test[features].values
train_data = train_data.reshape(-1, config.cut, train_data.shape[-1])
targets = train[["pressure"]].to_numpy().reshape(-1, config.cut)
test_data = test_data.reshape(-1, config.cut, test_data.shape[-1])

In [58]:
kfolds = train.iloc[0 :: config.cut]["kfold"].values
# reliable = train.iloc[0 :: config.cut]["is_reliable"].values

In [59]:
rc_feature_idxs = np.array([i for i, f in enumerate(features) if "RC_" in f])
r_feature_idxs = np.array([i for i, f in enumerate(features) if f in ["R_20", "R_5", "R_50"]])
c_feature_idxs = np.array([i for i, f in enumerate(features) if f in ["C_10", "C_20", "C_50"]])

In [61]:
rc_preds_onehot = np.eye(rc_preds.shape[1])[rc_preds.values.argmax(axis=1)]
r_preds_onehot = np.eye(3)[np.array(list(map(find_R, rc_preds.values.argmax(axis=1))))]
c_preds_onehot = np.eye(3)[np.array(list(map(find_C, rc_preds.values.argmax(axis=1))))]

In [62]:
rc_true_onehot = train.iloc[0::35][features[rc_feature_idxs]].values

In [63]:
train_data_ = train_data.copy()

In [107]:
train_data_[:, :, rc_feature_idxs] = np.tile(rc_preds_onehot, (1, 35)).reshape(-1, config.cut, rc_preds_onehot.shape[1])
train_data_[:, :, r_feature_idxs] = np.tile(r_preds_onehot, (1, 35)).reshape(-1, config.cut, r_preds_onehot.shape[1])
train_data_[:, :, c_feature_idxs] = np.tile(c_preds_onehot, (1, 35)).reshape(-1, config.cut, c_preds_onehot.shape[1])

In [110]:
def inference(train_data):
    with tf.device(f"/GPU:2"):
        valid_preds = np.empty_like(targets)

        for fold in range(config.n_splits):
            train_idx, test_idx = (kfolds != fold), (kfolds == fold)
            # test_idx = np.logical_and(test_idx, ~reliable)
            print("-" * 15, ">", f"Fold {fold+1}", "<", "-" * 15)
            savedir = logdir / "lstm-less-addfeatures-cut-large-custom-mae" / f"fold{fold}"

            X_valid, y_valid = train_data[test_idx], targets[test_idx]

            model = build_model(config=config, n_features=len(features))
            model.load_weights(savedir / "weights_custom_best.h5")

            valid_preds[test_idx, :] = model.predict(X_valid).squeeze()

            del model, X_valid, y_valid
            keras.backend.clear_session()
            gc.collect()
    return valid_preds

In [69]:
plane_preds = inference(train_data)

--------------- > Fold 1 < ---------------
--------------- > Fold 2 < ---------------
--------------- > Fold 3 < ---------------
--------------- > Fold 4 < ---------------
--------------- > Fold 5 < ---------------
--------------- > Fold 6 < ---------------
--------------- > Fold 7 < ---------------
--------------- > Fold 8 < ---------------
--------------- > Fold 9 < ---------------
--------------- > Fold 10 < ---------------


In [112]:
df = train[["pressure", "u_out"]].copy()
df["pred"] = plane_preds.reshape(-1)
compute_metric(df)

0.1658453562466597

In [73]:
mod_preds = inference(train_data_)

--------------- > Fold 1 < ---------------
--------------- > Fold 2 < ---------------
--------------- > Fold 3 < ---------------
--------------- > Fold 4 < ---------------
--------------- > Fold 5 < ---------------
--------------- > Fold 6 < ---------------
--------------- > Fold 7 < ---------------
--------------- > Fold 8 < ---------------
--------------- > Fold 9 < ---------------
--------------- > Fold 10 < ---------------


In [111]:
df = train[["pressure", "u_out"]].copy()
df["pred"] = mod_preds.reshape(-1)
compute_metric(df)

0.23818726887940972

In [153]:
thr = 0.99999
insert_idx = (np.max(rc_preds.values, axis=1) > thr)
insert_idx = np.tile(insert_idx, (35, 1)).T.reshape(-1)
print(np.round(insert_idx.sum() / insert_idx.shape[0], 3))

0.244


In [154]:
df = train[["pressure", "u_out"]].copy()
df["pred"] = np.where(insert_idx, mod_preds.reshape(-1), plane_preds.reshape(-1))
compute_metric(df)

0.16603730190284685