In [47]:
from random import seed
import numpy as np
import pandas as pd
import json
import os
import sys
import gc
import shutil
from pprint import pprint
from pathlib import Path
from typing import *

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import categorical_accuracy

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler

import sys

from matplotlib import pyplot as plt

In [5]:
# Put the parent of the current working directory on the import path so the
# project-local `src` package resolves. This must run before the import below.
sys.path.append(str(Path().resolve().parent))

from src.utils import seed_every_thing, Config, plot_metric, reduce_tf_gpu_memory, reduce_mem_usage, fetch_custom_data

In [18]:
# Project directories, all anchored at the parent of the current working directory.
# logdir: run folder that holds config.json, the per-fold checkpoints and valid_preds.csv.
logdir = Path().resolve().parent / "logs" / "cnn-classify-rc"
# datadir: handed to fetch_custom_data as the data location.
datadir = Path().resolve().parent / "data"
# cachedir: location of the cached train/test CSV files.
cachedir = Path().resolve().parent / "cache"

In [7]:
# Read the run configuration stored next to the checkpoints as a plain dict.
# json.load accepts a binary file object, so the file is opened in "rb" mode.
cfg_file_path = logdir / "config.json"
with cfg_file_path.open("rb") as f:
    config = json.load(f)

In [9]:
def build_model(config: Config, n_features, n_classes) -> keras.models.Sequential:
    """Assemble and compile the stacked Conv1D classifier.

    The network takes inputs of shape ``(config.cut, n_features)`` and applies
    one Conv1D -> BatchNormalization -> ReLU stage per entry of the parallel
    lists in ``config.conv1d``. It then applies global average pooling and a
    softmax layer with ``n_classes`` units.

    Args:
        config: Settings object. Reads ``cut``, ``lr`` and the ``conv1d``
            mapping with keys ``"filters"``, ``"kernel_sizes"`` and
            ``"dilation_rates"``.
        n_features: Number of input channels per time step.
        n_classes: Number of units in the softmax output layer.

    Returns:
        The model compiled with Adam, categorical cross-entropy loss and the
        categorical accuracy metric.
    """
    layers = keras.layers
    conv_cfg = config.conv1d

    model = keras.models.Sequential([layers.Input(shape=(config.cut, n_features))])

    # The three lists are walked in lockstep; zip stops at the shortest one.
    stage_specs = zip(conv_cfg["filters"], conv_cfg["kernel_sizes"], conv_cfg["dilation_rates"])
    for n_filters, width, dilation in stage_specs:
        stage = (
            layers.Conv1D(filters=n_filters, kernel_size=width, dilation_rate=dilation, padding="same"),
            layers.BatchNormalization(),
            layers.ReLU(),
        )
        for layer in stage:
            model.add(layer)

    # Classification head.
    model.add(layers.GlobalAveragePooling1D())
    model.add(layers.Dense(n_classes, activation="softmax"))

    optimizer = keras.optimizers.Adam(learning_rate=config.lr)
    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=[categorical_accuracy])
    return model

In [10]:
# Wrap the raw dict loaded from config.json in the project's Config object.
# NOTE(review): this rebinds `config` from a dict to whatever Config.update returns,
# so re-running this cell without re-running the JSON load passes a Config back
# into update() — confirm that is supported.
config = Config().update(config)
# Project helper from src.utils; the captured output below suggests it enables
# memory growth on the GPUs — TODO confirm against its implementation.
reduce_tf_gpu_memory(gpu_id=config.gpu_id)

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') memory growth: True
PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU') memory growth: True
PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU') memory growth: True


In [13]:
# Load the raw frames; the third return value of fetch_custom_data is not needed here.
train_df, test_df, _ = fetch_custom_data(datadir=datadir, n_splits=config.n_splits)

# "count" is the row position modulo 80.
# NOTE(review): presumably every sequence has exactly 80 consecutive rows — confirm.
train_df["count"] = (np.arange(train_df.shape[0]) % 80).astype(int)
test_df["count"] = (np.arange(test_df.shape[0]) % 80).astype(int)

# Keep only the rows whose "count" is below config.cut.
train_df = train_df[train_df["count"] < config.cut].reset_index(drop=True)

# The test mask is kept under its own name, separate from the filtered frame.
test_preds_idx = test_df["count"] < config.cut
test_df = test_df[test_preds_idx].reset_index(drop=True)
test_df["pressure"] = 0

fetching data ...


  mask |= (ar1 == a)


done.


In [15]:
# Fold label taken from every config.cut-th row, starting at row 0.
# This must be read before train_df is replaced by the cached frame below.
kfolds = train_df["kfold"].iloc[:: config.cut].values

# Rebind both frames to the cached CSVs, passed through reduce_mem_usage.
train_df = reduce_mem_usage(
    pd.read_csv(cachedir / f"train-classify-rc-debug{config.debug}.csv")
)
test_df = reduce_mem_usage(
    pd.read_csv(cachedir / f"test-classify-rc-debug{config.debug}.csv")
)

Memory usage of dataframe is 2377.38 MB
Memory usage after optimization is: 546.50 MB
Decreased by 77.0%
Memory usage of dataframe is 1571.49 MB
Memory usage after optimization is: 360.97 MB
Decreased by 77.0%


In [16]:
# Columns are selected purely by substring match on their names.
target_cols = [col for col in train_df.columns if "RC_" in col]
features = [col for col in train_df.columns if "u_in" in col]
# Subset of the features that gets scaled: everything without "u_out" in its name.
cont_features = [col for col in features if not ("u_out" in col)]

pprint(target_cols)
pprint(features)
pprint(cont_features)

['RC_20_10',
 'RC_20_20',
 'RC_20_50',
 'RC_50_10',
 'RC_50_20',
 'RC_50_50',
 'RC_5_10',
 'RC_5_20',
 'RC_5_50']
['u_in',
 'u_in_cumsum',
 'u_in_cummean',
 'u_in_lag_b1',
 'u_in_lag_b2',
 'u_in_lag_b3',
 'u_in_lag_b4',
 'u_in_lag_b5',
 'u_in_lag_f1',
 'u_in_lag_f2',
 'u_in_lag_f3',
 'u_in_lag_f4',
 'u_in_lag_f5',
 'u_in_diff_b1',
 'u_in_diff_b2',
 'u_in_diff_b3',
 'u_in_diff_b4',
 'u_in_diff_b5',
 'u_in_diff_f1',
 'u_in_diff_f2',
 'u_in_diff_f3',
 'u_in_diff_f4',
 'u_in_diff_f5',
 'u_in_bwindow_mean',
 'u_in_bwindow_max',
 'u_in_bwindow_min',
 'u_in_bwindow_std',
 'u_in_fwindow_mean',
 'u_in_fwindow_max',
 'u_in_fwindow_min',
 'u_in_fwindow_std',
 'u_in_cwindow_mean',
 'u_in_cwindow_max',
 'u_in_cwindow_min',
 'u_in_cwindow_std',
 'u_in_bwindow_ewm',
 'u_in_fwindow_ewm',
 'u_in_cwindow_ewm',
 'u_in_bwindow_mean_diff',
 'u_in_bwindow_max_diff',
 'u_in_bwindow_min_diff',
 'u_in_fwindow_mean_diff',
 'u_in_fwindow_max_diff',
 'u_in_fwindow_min_diff',
 'u_in_cwindow_mean_diff',
 'u_in_cwin

In [17]:
# Fit the scaler on the training columns only and reuse it for the test set.
# NOTE: the scaled values are written back into train_df / test_df, so running
# this cell twice would fit and apply the scaler to already-scaled columns.
RS = RobustScaler()
train_df[cont_features] = RS.fit_transform(train_df[cont_features])
test_df[cont_features] = RS.transform(test_df[cont_features])

# Target row taken from every config.cut-th row, starting at row 0.
targets = train_df.iloc[0 :: config.cut][target_cols].to_numpy()

# Fold the flat (rows, features) tables into (blocks, config.cut, features).
train_data = train_df[features].values
train_data = train_data.reshape(-1, config.cut, train_data.shape[-1])

test_data = test_df[features].values
test_data = test_data.reshape(-1, config.cut, test_data.shape[-1])

In [41]:
# Inference only: for every fold, rebuild the network, restore that fold's best
# checkpoint, then predict on the held-out rows and on the full test set.
with tf.device(f"/GPU:{config.gpu_id}"):
    # Out-of-fold buffer, allocated directly as float32 and pre-filled with NaN.
    # A row that no fold covers then shows up as NaN in the saved CSV instead of
    # holding uninitialised memory.
    valid_preds = np.full(targets.shape, np.nan, dtype=np.float32)
    test_preds = []  # one (n_test_sequences, n_classes) array per fold

    for fold in range(config.n_splits):
        # train_idx is not used for inference; it is still bound so that the
        # notebook namespace after this cell is unchanged.
        train_idx, test_idx = (kfolds != fold), (kfolds == fold)
        print("-" * 15, ">", f"Fold {fold+1}", "<", "-" * 15)
        savedir = logdir / f"fold{fold}"

        # Only the held-out rows are needed here. Boolean-mask indexing copies
        # the selected data, so the training split and the target slices are
        # deliberately not materialised.
        X_valid = train_data[test_idx]

        model = build_model(config=config, n_features=len(features), n_classes=len(target_cols))
        model.load_weights(savedir / "weights_best.h5")

        valid_preds[test_idx, :] = model.predict(X_valid)
        test_preds.append(model.predict(test_data).reshape(-1, len(target_cols)))

        # Release the model and its graph state before the next fold.
        del model, X_valid
        keras.backend.clear_session()
        gc.collect()

--------------- > Fold 1 < ---------------
--------------- > Fold 2 < ---------------
--------------- > Fold 3 < ---------------
--------------- > Fold 4 < ---------------
--------------- > Fold 5 < ---------------
--------------- > Fold 6 < ---------------
--------------- > Fold 7 < ---------------
--------------- > Fold 8 < ---------------
--------------- > Fold 9 < ---------------
--------------- > Fold 10 < ---------------


In [43]:
# Persist the out-of-fold predictions in the run folder. The frame is built from
# a bare array, so the CSV header is the default integer column labels.
pd.DataFrame(valid_preds).to_csv(logdir / "valid_preds.csv", index=False)

In [57]:
# Scratch check: shape of valid_preds repeated 35 times along the row axis.
# NOTE(review): 35 is hard-coded — presumably it should be config.cut; confirm.
np.tile(valid_preds, (35, 1)).shape

(2640750, 9)