In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
import sys
# Make the project package (`src`) importable from this notebook.
# NOTE(review): hard-coded absolute checkout path — adjust if the repository lives elsewhere.
repo_root = '/workspaces/ventilator-pressure-prediction/'
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if repo_root not in sys.path:
    sys.path.append(repo_root)

In [None]:
from src.datamodules.datamodule import VPPDataModule

In [None]:
# Build the project data module and pull out the train / test frames it exposes.
dataset_cfg = dict(_target_='src.datamodules.dataset.VPPDataset')
splitter_cfg = dict(
    _target_='sklearn.model_selection.KFold',
    n_splits=11,
    shuffle=True,
    random_state=1774,
)
dm = VPPDataModule(
    batch_size=32,
    dataset=dataset_cfg,
    splitter=splitter_cfg,
    save_df=True,
)
train, test = dm.train, dm.test

In [None]:
# Keep only the columns needed for scoring and index both frames by row id,
# so `train` can later be joined onto the id-indexed prediction frames.
# Note: this overwrites `train` / `test`, so the cell cannot be re-run on its own
# ('id' is no longer a column after the first run).
train_cols = ['id', 'pressure', 'fold', 'u_out']
train = train.loc[:, train_cols].set_index('id')
test = test.loc[:, ['id']].set_index('id')

In [None]:
# Distinct training pressure values in ascending order; the grid bounds are the
# extremes and the grid step is the gap between the two smallest values
# (assumes the values are evenly spaced — TODO confirm).
all_pressure = np.sort(train.pressure.unique())
min_pressure = all_pressure[0]
max_pressure = all_pressure[-1]
step_pressure = all_pressure[1] - all_pressure[0]


def round_pressure(pressure):
    """Snap pressure value(s) to the nearest point of the training pressure grid.

    Parameters
    ----------
    pressure : scalar, numpy array or pandas Series
        Raw (e.g. model-predicted) pressure value(s).

    Returns
    -------
    Same kind of object as ``pressure``, with every value replaced by
    ``min_pressure + k * step_pressure`` for the nearest integer ``k``.

    Notes
    -----
    Values are not clipped to ``[min_pressure, max_pressure]``.
    """
    grid_index = np.round((pressure - min_pressure) / step_pressure)
    return grid_index * step_pressure + min_pressure


# Display the recovered grid parameters as the cell output.
min_pressure, max_pressure, step_pressure

In [None]:
# Experiment name(s) whose per-fold outputs are aggregated below.
names = ['cloudy']
# Fold sub-directories to read for every experiment.
n_folds = 11
folds = range(n_folds)

In [None]:
# Root of the experiment logs, relative to the notebook's working directory.
working_path = Path('../')
log_path = working_path / 'logs/experiments'

# Per experiment, collect:
#   * the rows of every fold's 'oof_df.csv' stacked into one column named after the experiment
#   * every fold's 'pred_df.csv' as one column per fold
# All pressures are snapped to the training pressure grid on load.
oofs, pred = [], []
for name in tqdm(names):
    fold_oofs, fold_preds = [], []
    for fold in folds:
        fold_dir = log_path / name / str(fold)

        fold_oof = pd.read_csv(fold_dir / 'oof_df.csv', index_col='id')
        fold_oof['pressure'] = round_pressure(fold_oof['pressure'])
        fold_oofs.append(fold_oof)

        fold_pred = pd.read_csv(fold_dir / 'pred_df.csv', index_col='id')
        fold_pred['pressure'] = round_pressure(fold_pred['pressure'])
        # Label the prediction column with the fold it came from.
        fold_preds.append(fold_pred.rename(columns={'pressure': fold}))

    # Stack the folds' rows, order them by id and name the column after the experiment.
    name_oofs = pd.concat(fold_oofs).sort_values('id').rename(columns={'pressure': name})
    oofs.append(name_oofs)

    # Side-by-side fold columns, keyed as (experiment name, fold).
    name_preds = pd.concat(fold_preds, axis=1)
    name_preds.columns = pd.MultiIndex.from_product([[name], name_preds.columns])
    pred.append(name_preds)

# One wide frame each across all experiments.
oofs = pd.concat(oofs, axis=1)
pred = pd.concat(pred, axis=1)



In [None]:
# Drop the experiment-name level so the columns are just the fold labels.
# Guarded so that re-running this cell (columns already flat) is a no-op instead
# of failing while trying to unpack non-tuple labels.
# NOTE(review): with more than one entry in `names` the flattened fold labels repeat.
if isinstance(pred.columns, pd.MultiIndex):
    pred.columns = pred.columns.get_level_values(1)

In [None]:
# Attach the `train` columns (pressure, fold, u_out) to the loaded predictions via the
# shared id index, then keep only the rows where u_out == 0 for scoring.
joined = oofs.join(train)
cv_df = joined[joined['u_out'] == 0]

In [None]:
# MAE of the first experiment's predictions against the true pressure, computed
# separately for each fold; the headline score is the unweighted mean over folds.
target_col = names[0]
cv_by_fold = {
    fold_id: mean_absolute_error(group['pressure'], group[target_col])
    for fold_id, group in cv_df.groupby('fold')
}
mae_ = np.mean(list(cv_by_fold.values()))
mae_

In [None]:
# Reduce the frame to the first experiment's predictions plus the fold label from `train`.
selected_cols = [names[0], 'fold']
oofs = oofs.join(train).loc[:, selected_cols]

In [None]:
# Relabel the two columns positionally for export.
oofs = oofs.set_axis(['pressure', 'fold'], axis=1)

In [None]:
# Shared file-name suffix: experiment name, folds used, and the first 6 characters
# of the mean per-fold MAE.
# Uses names[0] — the experiment that was actually scored and selected into `oofs` —
# rather than the `name` variable leaked from the loading loop, which holds the
# *last* entry of `names` and only coincides with names[0] for a single experiment.
suffix = f'{names[0]}_{str(folds).replace(" ", "")}_{str(mae_)[:6]}'

oofs.to_csv(f'./oofs_{suffix}.csv')
pred.to_csv(f'./pred_by_folds_{suffix}.csv')
# Final prediction: per-row median across the fold columns.
pred.median(axis=1).to_frame().rename(columns={0: 'pressure'}).to_csv(f'./pred_{suffix}.csv')