In [1]:
from random import seed
import numpy as np
import pandas as pd
import json
import os
import sys
import gc
import shutil
from pprint import pprint
from pathlib import Path
from typing import *

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import categorical_accuracy

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler

import sys

from matplotlib import pyplot as plt

In [2]:
# Put the parent of the current working directory on the import path before importing
# from `src`; presumably that parent is the project root containing `src/` -- confirm.
sys.path.append(str(Path().resolve().parent))

from src.utils import seed_every_thing, Config, plot_metric, reduce_tf_gpu_memory, reduce_mem_usage, fetch_custom_data

In [3]:
# All three directories hang off the parent of the current working directory.
logdir = Path().resolve().parent.joinpath("logs", "cnn-classify-rc")
datadir = Path().resolve().parent.joinpath("data")
cachedir = Path().resolve().parent.joinpath("cache")

In [4]:
# Parse the JSON configuration stored in this run's log directory into a plain dict.
cfg_file_path = logdir / "config.json"
with cfg_file_path.open("rb") as f:
    config = json.loads(f.read())

In [48]:
def build_model(config: Config, n_features) -> keras.models.Sequential:
    """Assemble and compile the stacked bidirectional-LSTM regressor.

    Args:
        config: Object exposing `cut` (time steps per sequence), `n_units`
            (one LSTM width per recurrent layer), `n_dense_units` (one width per
            hidden dense layer) and `lr` (Adam learning rate).
        n_features: Number of features per time step.

    Returns:
        A compiled Sequential model taking `(config.cut, n_features)` inputs and
        ending in a single-unit dense layer, trained with the "mae" loss.
    """
    layers = keras.layers

    # Layers are created in exactly the order they are stacked.
    stack = [layers.Input(shape=(config.cut, n_features))]
    for width in config.n_units:
        recurrent = layers.LSTM(width, return_sequences=True)
        stack.append(layers.Bidirectional(recurrent))
    for width in config.n_dense_units:
        stack.append(layers.Dense(width, activation="selu"))
    stack.append(layers.Dense(1))

    model = keras.models.Sequential(stack)
    optimizer = keras.optimizers.Adam(learning_rate=config.lr)
    model.compile(optimizer=optimizer, loss="mae")
    return model

def find_R(n):
    """Map a combined RC class index (0..8) to the index of its R one-hot slot.

    The mapping is deliberately non-monotonic: classes 3-5 map to slot 2 and
    classes 6-8 map to slot 1. Presumably this mirrors the relative order of the
    `R_20`, `R_5`, `R_50` feature columns versus the `RC_*` columns -- confirm
    against the feature list before changing it.

    Args:
        n: RC class index; any value comparing equal to an integer in 0..8.

    Returns:
        0, 1 or 2.

    Raises:
        ValueError: If `n` is not one of 0..8.
    """
    if n in (0, 1, 2):
        return 0
    if n in (3, 4, 5):
        return 2
    if n in (6, 7, 8):
        return 1
    raise ValueError(f"RC class index must be in 0..8, got {n!r}")

def find_C(n):
    """Map a combined RC class index (0..8) to the index of its C one-hot slot.

    Classes {0, 3, 6} map to slot 0, {1, 4, 7} to slot 1 and {2, 5, 8} to slot 2,
    i.e. the C slot cycles fastest within the RC class index.

    Args:
        n: RC class index; any value comparing equal to an integer in 0..8.

    Returns:
        0, 1 or 2.

    Raises:
        ValueError: If `n` is not one of 0..8.
    """
    if n in (0, 3, 6):
        return 0
    if n in (1, 4, 7):
        return 1
    if n in (2, 5, 8):
        return 2
    raise ValueError(f"RC class index must be in 0..8, got {n!r}")

def compute_metric(df, merge=True, seq_len=35):
    """Weighted mean absolute error between `df["pressure"]` and `df["pred"]`.

    Each row is weighted by `1 - u_out`, so rows with `u_out == 1` contribute
    nothing (assumes `u_out` holds only 0/1 -- TODO confirm).

    Args:
        df: Frame with "pressure", "pred" and "u_out" columns.
        merge: If True return one score over all rows; otherwise return one
            score per consecutive group of `seq_len` rows.
        seq_len: Rows per group when `merge` is False. Defaults to 35, the value
            that was previously hard-coded; the row count must be a multiple of it.

    Returns:
        A scalar when `merge` is True, else a 1-D array with one score per group.
        A group (or the whole frame) whose weights sum to zero yields NaN.
    """
    y_true = df["pressure"].values
    y_pred = df["pred"].values
    w = 1 - df["u_out"].values
    weighted_err = w * np.abs(y_true - y_pred)

    if merge:
        return np.sum(weighted_err) / np.sum(w)

    # One row per group, then reduce along the row axis instead of looping in Python.
    weighted_err = weighted_err.reshape(-1, seq_len)
    w = w.reshape(-1, seq_len)
    return weighted_err.sum(axis=1) / w.sum(axis=1)

In [6]:
# Rebinds `config` from the raw JSON dict to whatever `Config().update(...)` returns
# (presumably a Config instance with attribute access -- confirm). Because the name is
# rebound, re-running this cell passes that object back into `update` instead of the dict.
config = Config().update(config)
# Configure TensorFlow GPU memory handling for the device selected in the config.
reduce_tf_gpu_memory(gpu_id=config.gpu_id)

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') memory growth: True
PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU') memory growth: True
PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU') memory growth: True


In [27]:
# Validation and test predictions written by the "cnn-classify-rc-reliable_4" run.
rc_preds = pd.read_csv(logdir.parent.joinpath("cnn-classify-rc-reliable_4", "valid_preds.csv"))
rc_test_preds = pd.read_csv(logdir.parent.joinpath("cnn-classify-rc-reliable_4", "test_preds.csv"))

In [7]:
train_df, test_df, _ = fetch_custom_data(datadir=datadir, n_splits=config.n_splits)

# Running position of each row modulo 80 (assumes rows arrive in consecutive
# groups of 80 -- TODO confirm against the data loader).
train_df["count"] = (np.arange(len(train_df)) % 80).astype(int)
test_df["count"] = (np.arange(len(test_df)) % 80).astype(int)

# Keep only the first `config.cut` positions of every group; the test mask is kept
# under its own name as well.
train_df = train_df.loc[train_df["count"] < config.cut].reset_index(drop=True)
test_preds_idx = test_df["count"] < config.cut
test_df = test_df.loc[test_preds_idx].reset_index(drop=True)
test_df["pressure"] = 0

fetching data ...


  mask |= (ar1 == a)


done.


In [8]:
# Fold id taken from every `config.cut`-th row, read before `train_df` is replaced below.
kfolds = train_df["kfold"].iloc[:: config.cut].values

# Swap both frames for the cached feature tables, passed through reduce_mem_usage.
train_df = reduce_mem_usage(pd.read_csv(cachedir.joinpath(f"train-reliable-debug{config.debug}.csv")))
test_df = reduce_mem_usage(pd.read_csv(cachedir.joinpath(f"test_lstm-less-cut-addfeatures_debug{config.debug}.csv")))

Memory usage of dataframe is 5071.37 MB
Memory usage after optimization is: 1168.54 MB
Decreased by 77.0%
Memory usage of dataframe is 1464.04 MB
Memory usage after optimization is: 334.11 MB
Decreased by 77.2%


In [21]:
# Keep positions 0 .. config.cut - 1 of every group, matching the `< config.cut` filter
# applied to the raw frame and the later reshapes to `config.cut` steps. The previous
# `<= 35` kept 36 positions whenever the cached table was not already truncated.
train_df = train_df[train_df["count"] < config.cut].reset_index(drop=True)

In [22]:
# Model inputs: every column except the fold id, the target and the reliability flag.
# Dropping from the column Index avoids building a copy of the whole frame just to
# read its column names; a missing label still raises KeyError.
features = train_df.columns.drop(["kfold", "pressure", "is_reliable"]).tolist()

# Features to be scaled: everything whose name contains none of these markers.
# (Any name containing "RC_" also contains "C_"; the marker is kept for readability.)
cont_features = [f for f in features if not any(tag in f for tag in ("RC_", "u_out", "R_", "C_"))]
# pprint(features)
# pprint(cont_features)

In [24]:
# Fit the scaler on the training columns only and apply the same transform to test.
# NOTE: this overwrites `train_df` / `test_df` in place, so the cell is not idempotent:
# running it a second time re-fits the scaler on already-scaled values.
RS = RobustScaler()
train_df[cont_features] = RS.fit_transform(train_df[cont_features])
test_df[cont_features] = RS.transform(test_df[cont_features])
train_data, test_data = train_df[features].values, test_df[features].values

# Fold the flat row tables into (n_groups, config.cut, n_features) arrays and the target
# into (n_groups, config.cut); requires the row count to be a multiple of config.cut.
train_data = train_data.reshape(-1, config.cut, train_data.shape[-1])
targets = train_df[["pressure"]].to_numpy().reshape(-1, config.cut)
test_data = test_data.reshape(-1, config.cut, test_data.shape[-1])

In [31]:
# One fold id per group: the "kfold" value of every `config.cut`-th row of the cached
# frame, so it lines up with the first axis of `train_data` / `targets`.
kfolds = train_df.iloc[0 :: config.cut]["kfold"].values

In [32]:
# Column positions (in `features` order) of the RC, R and C indicator features.
# The R / C positions follow the order of `features`, not the order of the name tuples.
rc_feature_idxs = np.array([pos for pos, name in enumerate(features) if "RC_" in name])
r_feature_idxs = np.array([pos for pos, name in enumerate(features) if name in ("R_20", "R_5", "R_50")])
c_feature_idxs = np.array([pos for pos, name in enumerate(features) if name in ("C_10", "C_20", "C_50")])

In [33]:
# Arg-max class per row of `rc_preds`, taken once and reused for all three encodings.
rc_pred_classes = rc_preds.values.argmax(axis=1)

# One-hot rows for the predicted RC class and for the R / C slots derived from it.
rc_preds_onehot = np.eye(rc_preds.shape[1])[rc_pred_classes]
r_preds_onehot = np.eye(3)[np.array([find_R(k) for k in rc_pred_classes])]
c_preds_onehot = np.eye(3)[np.array([find_C(k) for k in rc_pred_classes])]

In [34]:
# Work on a copy so the original features stay available for comparison.
train_data_ = train_data.copy()

# Overwrite the RC / R / C indicator columns with the predicted one-hot vectors.
# Each (n_groups, k) block is broadcast along the time axis, which puts the same vector
# on every step of a group without hard-coding the sequence length.
train_data_[:, :, rc_feature_idxs] = rc_preds_onehot[:, np.newaxis, :]
train_data_[:, :, r_feature_idxs] = r_preds_onehot[:, np.newaxis, :]
train_data_[:, :, c_feature_idxs] = c_preds_onehot[:, np.newaxis, :]

In [55]:
def inference(train_data, device="/GPU:1"):
    """Out-of-fold predictions from the saved "lstm-reliable" fold models.

    For every fold, the model is rebuilt from the "lstm-reliable" run's config, that
    fold's best weights are loaded, and the groups whose `kfolds` entry equals the fold
    are predicted.

    Reads the notebook-level names `logdir`, `targets`, `kfolds` and `features`.
    The `train_data` parameter shadows the notebook-level array of the same name.

    Args:
        train_data: Array indexed by group on its first axis, aligned with `kfolds`
            and `targets`.
        device: TensorFlow device string for the inference; defaults to the
            previously hard-coded "/GPU:1".

    Returns:
        float32 array with the shape of `targets`. Groups whose fold id is outside
        `range(config.n_splits)` stay NaN instead of holding uninitialised memory.
    """
    run_dir = logdir.parent / "lstm-reliable"
    with open(run_dir / "config.json", "rb") as f:
        config = Config().update(json.load(f))

    valid_preds = np.full(targets.shape, np.nan, dtype=np.float32)

    with tf.device(device):
        for fold in range(config.n_splits):
            valid_mask = kfolds == fold
            print("-" * 15, ">", f"Fold {fold+1}", "<", "-" * 15)

            model = build_model(config=config, n_features=len(features))
            model.load_weights(str(run_dir / f"fold{fold}" / "weights_best.h5"))

            # Only the held-out groups are sliced; the training split is never needed.
            # The model ends in Dense(1), so only the trailing unit axis is dropped.
            valid_preds[valid_mask, :] = model.predict(train_data[valid_mask]).squeeze(axis=-1)

            # Release the model and the Keras session state before the next fold.
            del model
            keras.backend.clear_session()
            gc.collect()
    return valid_preds

In [40]:
# Stored validation predictions of the "lstm-reliable" run on the original inputs.
plane_preds = pd.read_csv(logdir.parent.joinpath("lstm-reliable", "valid_preds.csv"), index_col=0).to_numpy()

# Score them against the true pressures.
df = train_df[["pressure", "u_out"]].assign(pred=plane_preds.reshape(-1))
compute_metric(df)

0.16726234596954298

In [56]:
# Predictions of the same fold models when the RC / R / C indicator columns are
# replaced by the classifier's predicted one-hot vectors.
mod_preds = inference(train_data_)

--------------- > Fold 1 < ---------------
--------------- > Fold 2 < ---------------
--------------- > Fold 3 < ---------------
--------------- > Fold 4 < ---------------
--------------- > Fold 5 < ---------------
--------------- > Fold 6 < ---------------
--------------- > Fold 7 < ---------------
--------------- > Fold 8 < ---------------
--------------- > Fold 9 < ---------------
--------------- > Fold 10 < ---------------


In [57]:
# Score the predictions obtained with the predicted RC / R / C indicators.
df = train_df[["pressure", "u_out"]].assign(pred=mod_preds.reshape(-1))
compute_metric(df)

1.001877044987097

In [67]:
# Scratch check: tiling, transposing and flattening repeats each element consecutively
# (here 3 times).
np.tile(np.arange(5), (3, 1)).T.reshape(-1)

array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4])

In [68]:
# Use the modified-input prediction only for groups whose top class probability
# exceeds `thr`. The comparison previously hard-coded 0.9 and ignored `thr`.
thr = 0.9
is_apply = rc_preds.values.max(axis=1) > thr
# Expand the per-group flag to one flag per row: each value repeated config.cut times.
is_apply = np.repeat(is_apply, config.cut)

In [70]:
# Row-wise choice: modified-input prediction where the flag is set, baseline otherwise.
comb_preds = np.where(is_apply, mod_preds.ravel(), plane_preds.ravel())

df = train_df[["pressure", "u_out"]].assign(pred=comb_preds)
compute_metric(df)

0.79719897994778

In [71]:
# Eyeball the one-hot encoded class predictions (numpy truncates the display).
rc_preds_onehot

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [80]:
# Indicator columns from the first row of every group, read from `train_df`. The stride
# follows `config.cut`, like the fold ids, instead of a hard-coded 35.
a = train_df.iloc[0 :: config.cut][np.array(features)[rc_feature_idxs]].values

from sklearn.metrics import accuracy_score


# Share of groups whose predicted RC class matches the arg-max of those columns.
accuracy_score(a.argmax(axis=1), rc_preds_onehot.argmax(axis=1))

0.9456196156394964