In [11]:
%cd /kaggle/working

import os

from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config from `config.yaml` under the given config_path,
# forcing `debug=True` via an override, and keep the result in `cfg` for the
# cells below (they read `cfg.exp.*`).
with initialize(version_base=None, config_path="../preprocess/split_pred"):
    cfg = compose(
        config_name="config.yaml", overrides=["debug=True"], return_hydra_config=True
    )
    # Only the `exp` subtree is printed; the hydra section requested by
    # return_hydra_config=True is part of `cfg` but is not shown here.
    print(OmegaConf.to_yaml(cfg.exp))

/kaggle/working
seed: 7
kami_pred_paths:
- input/predict/kami-leap-pred2/kami_experiments_201_unet_multi_all_384_n2_submission.parquet
- input/predict/kami-leap-pred2/kami_experiments_201_unet_multi_all_512_n3_submission.parquet
- input/predict/kami-leap-pred2/kami_experiments_201_unet_multi_all_n3_restart2_submission.parquet
- input/predict/kami-leap-pred2/kami_experiments_201_unet_multi_all_submission.parquet
- input/predict/kami-leap-pred2/kami_experiments_204_diff_last_all_lr_submission.parquet
- input/predict/kami-leap-pred2/kami_experiments_211_simple_split_head_all_cos_submission.parquet
- input/predict/kami-leap-pred2/kami_experiments_217_fix_transformer_leak_all_cos_head64_n4_submission.parquet
- input/predict/kami-leap-pred2/kami_experiments_217_fix_transformer_leak_all_cos_head64_submission.parquet
- input/predict/kami-leap-pred2/kami_experiments_222_wo_transformer_all_submission.parquet
kurupical_pred_paths:
- input/predict/kurupical-leap-pred2/20240621215730_exp031_20files

In [3]:
from glob import glob
from pathlib import Path

import numpy as np
import polars as pl

In [8]:
# valid
# Write the labels of the evaluation rows (the last `n_data_for_eval` rows of
# the validation frame, first column excluded) as fixed-size .npy chunks.
chunk_size = 384
valid_output_dir = Path(cfg.exp.output_dir) / "label" / "valid"
valid_output_dir.mkdir(parents=True, exist_ok=True)
df = pl.read_parquet(cfg.exp.valid_path)
# Take the evaluation tail once so the chunk loop can use plain non-negative
# offsets. Slicing with `-n + offset` bounds breaks as soon as a bound reaches
# zero or turns positive.
eval_df = df[-cfg.exp.n_data_for_eval :, 1:]
for i in range(0, len(eval_df), chunk_size):
    # `i` is already a row offset (range steps by chunk_size), so it must not
    # be multiplied by the chunk size again. The file name keeps the offset.
    sub_array = eval_df[i : i + chunk_size].to_numpy()
    if i == 0:
        print(sub_array.shape)
    np.save(valid_output_dir / f"{i:06d}.npy", sub_array)

(384, 924)


In [14]:
# test
# Write every row of the test frame (first column excluded) as fixed-size
# .npy chunks named after the row offset of the chunk.
chunk_size = 384
output_dir = Path(cfg.exp.output_dir) / "label" / "test"
output_dir.mkdir(parents=True, exist_ok=True)
df = pl.read_parquet(cfg.exp.test_path)
for i in range(0, len(df), chunk_size):
    # `i` is already a row offset (range steps by chunk_size); multiplying it
    # by the chunk size again would skip almost all rows.
    sub_array = df[i : i + chunk_size, 1:].to_numpy()
    if i == 0:
        print(sub_array.shape)
    np.save(output_dir / f"{i:06d}.npy", sub_array)

(384, 556)


In [37]:
# Only the header of the sample submission is needed, so a single data row is
# read; the first column is skipped and the rest are the target column names.
sample_submission_head = pl.read_csv(
    "input/leap-atmospheric-physics-ai-climsim/sample_submission.csv", n_rows=1
)
TARGET_COLUMNS = sample_submission_head.columns[1:]

In [38]:
# kami
# For every kami submission parquet, split its validation and test predictions
# (TARGET_COLUMNS only) into fixed-size .npy chunks under
# <output_dir>/<base_name>/{valid,test}/, named after the chunk's row offset.

chunk_size = 384
for path in cfg.exp.kami_pred_paths:
    base_name = path.replace("_submission.parquet", "").split("/")[-1]
    print(base_name)

    # valid: the validation predictions sit next to the submission file,
    # with the "_valid_pred.parquet" suffix; keep the last n_data_for_eval rows.
    valid_output_dir = Path(cfg.exp.output_dir) / base_name / "valid"
    valid_output_dir.mkdir(parents=True, exist_ok=True)
    valid_path = path.replace("_submission.parquet", "_valid_pred.parquet")
    df = pl.read_parquet(valid_path)
    array = df.select(TARGET_COLUMNS)[-cfg.exp.n_data_for_eval :].to_numpy()
    print(array.shape)
    for i in range(0, len(array), chunk_size):
        # `i` is already a row offset (range steps by chunk_size), so slice
        # [i, i + chunk_size) directly instead of multiplying again.
        sub_array = array[i : i + chunk_size]
        np.save(valid_output_dir / f"{i:06d}.npy", sub_array)

    # test: the submission file itself; keep the first n_data_for_eval rows.
    test_output_dir = Path(cfg.exp.output_dir) / base_name / "test"
    test_output_dir.mkdir(parents=True, exist_ok=True)
    sub_path = path
    df = pl.read_parquet(sub_path)
    array = df.select(TARGET_COLUMNS)[: cfg.exp.n_data_for_eval].to_numpy()
    print(array.shape)
    # Bound the loop by the rows actually kept in `array`, not by len(df),
    # so no empty chunk files are written when the frame was truncated.
    for i in range(0, len(array), chunk_size):
        sub_array = array[i : i + chunk_size]
        np.save(test_output_dir / f"{i:06d}.npy", sub_array)

kami_experiments_201_unet_multi_all_384_n2
(636640, 368)
(625000, 368)
kami_experiments_201_unet_multi_all_512_n3
(636640, 368)
(625000, 368)
kami_experiments_201_unet_multi_all_n3_restart2
(636640, 368)
(625000, 368)
kami_experiments_201_unet_multi_all
(636640, 368)
(625000, 368)
kami_experiments_204_diff_last_all_lr
(636640, 368)
(625000, 368)
kami_experiments_211_simple_split_head_all_cos
(636640, 368)
(625000, 368)
kami_experiments_217_fix_transformer_leak_all_cos_head64_n4
(636640, 368)
(625000, 368)
kami_experiments_217_fix_transformer_leak_all_cos_head64
(636640, 368)
(625000, 368)
kami_experiments_222_wo_transformer_all
(636640, 368)
(625000, 368)


In [39]:
# kurupical
# For every kurupical prediction directory, split pred_valid.parquet and
# submission.parquet (TARGET_COLUMNS only) into fixed-size .npy chunks under
# <output_dir>/<base_name>/{valid,test}/, named after the chunk's row offset.

chunk_size = 384
for path in cfg.exp.kurupical_pred_paths:
    base_name = path.split("/")[-1]
    print(base_name)

    # valid: keep the last n_data_for_eval rows of the validation predictions.
    valid_output_dir = Path(cfg.exp.output_dir) / base_name / "valid"
    valid_output_dir.mkdir(parents=True, exist_ok=True)
    valid_path = Path(path) / "pred_valid.parquet"
    df = pl.read_parquet(valid_path)
    array = df.select(TARGET_COLUMNS)[-cfg.exp.n_data_for_eval :].to_numpy()
    print(array.shape)
    for i in range(0, len(array), chunk_size):
        # `i` is already a row offset (range steps by chunk_size), so slice
        # [i, i + chunk_size) directly instead of multiplying again.
        sub_array = array[i : i + chunk_size]
        np.save(valid_output_dir / f"{i:06d}.npy", sub_array)

    # test
    test_output_dir = Path(cfg.exp.output_dir) / base_name / "test"
    test_output_dir.mkdir(parents=True, exist_ok=True)
    sub_path = Path(path) / "submission.parquet"
    df = pl.read_parquet(sub_path)
    # NOTE(review): this keeps the *last* n_data_for_eval rows, while the kami
    # cell keeps the first n and the label/test cell writes every row. The
    # chunks only line up when n_data_for_eval >= the number of test rows -
    # confirm that this is always the case.
    array = df.select(TARGET_COLUMNS)[-cfg.exp.n_data_for_eval :].to_numpy()
    print(array.shape)
    # Bound the loop by the rows actually present in `array`; iterating up to
    # n_data_for_eval writes empty chunk files when the frame is shorter.
    for i in range(0, len(array), chunk_size):
        sub_array = array[i : i + chunk_size]
        np.save(test_output_dir / f"{i:06d}.npy", sub_array)

20240621215730_exp031_20files_1d_dims(64, 128, 256, 512)_7epochs_power2.5e-3
(636640, 368)
(625000, 368)
20240626042548_exp031_20files_transformer_512x6_head32_lr1e-3
(636640, 368)
(625000, 368)
20240626120414_exp031_20files_transformer_384x12_head32_lr1e-3
(636640, 368)
(625000, 368)
20240703230157_exp042_70m_transformer_512x4_lr0.001_beta1
(636640, 368)
(625000, 368)
20240705215850_exp042_70m_transformer_768x4_lr0.001_beta1
(636640, 368)
(625000, 368)
20240706022848_exp042_70m_cnn64_smoothl1beta1_lr2.5e-3_beta0.01_wd0.05
(636640, 368)
(625000, 368)


In [41]:
# takoi
# For every takoi submission parquet, split its validation predictions (.npy)
# and test predictions (parquet, TARGET_COLUMNS only) into fixed-size .npy
# chunks under <output_dir>/<base_name>/{valid,test}/, named after the chunk's
# row offset.

chunk_size = 384
for path in cfg.exp.takoi_pred_dir:
    base_name = path.replace("_pp.parquet", "").split("/")[-1]
    print(base_name)

    # valid
    valid_output_dir = Path(cfg.exp.output_dir) / base_name / "valid"
    valid_output_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): .replace("ex", "exp") rewrites *every* "ex" in the path,
    # including directory components, not just the file-name prefix - confirm
    # no configured path contains another "ex".
    valid_path = path.replace("_pp.parquet", "_val_preds.npy").replace("ex", "exp")
    array = np.load(valid_path)
    array = array[-cfg.exp.n_data_for_eval :]
    print(array.shape)
    for i in range(0, len(array), chunk_size):
        # `i` is already a row offset (range steps by chunk_size), so slice
        # [i, i + chunk_size) directly instead of multiplying again.
        sub_array = array[i : i + chunk_size]
        np.save(valid_output_dir / f"{i:06d}.npy", sub_array)

    # test
    test_output_dir = Path(cfg.exp.output_dir) / base_name / "test"
    test_output_dir.mkdir(parents=True, exist_ok=True)
    sub_path = path
    df = pl.read_parquet(sub_path)
    # NOTE(review): this keeps the *last* n_data_for_eval rows, while the
    # label/test cell writes every row from the start. The chunks only line up
    # when n_data_for_eval >= the number of test rows - confirm.
    array = df.select(TARGET_COLUMNS)[-cfg.exp.n_data_for_eval :].to_numpy()
    print(array.shape)
    # Bound the loop by the rows actually present in `array`; iterating up to
    # n_data_for_eval writes empty chunk files when the frame is shorter.
    for i in range(0, len(array), chunk_size):
        sub_array = array[i : i + chunk_size]
        np.save(test_output_dir / f"{i:06d}.npy", sub_array)

ex123
(636640, 368)
(625000, 368)
ex124
(636640, 368)
(625000, 368)
ex130
(636640, 368)
(625000, 368)
ex131
(636640, 368)
(625000, 368)
ex133
(636640, 368)
(625000, 368)
ex134
(636640, 368)
(625000, 368)
ex135
(636640, 368)
(625000, 368)
ex136
(636640, 368)
(625000, 368)
ex138
(636640, 368)
(625000, 368)
ex139
(636640, 368)
(625000, 368)
ex141
(636640, 368)
(625000, 368)


In [17]:
# Sanity check: load the first test-label chunk and look at its shape/values.
# The chunk is a plain numeric array, so pickle support stays at numpy's safe
# default (allow_pickle=False); unpickling can execute arbitrary code.
array = np.load("input/predict_split/label/test/000000.npy")
print(array.shape)
array

(384, 556)


array([[2.14197182e+02, 2.38947380e+02, 2.50907070e+02, ...,
        4.90858386e-07, 4.90858386e-07, 4.90858386e-07],
       [2.18198634e+02, 2.33129742e+02, 2.36723381e+02, ...,
        4.90858386e-07, 4.90858386e-07, 4.90858386e-07],
       [2.09452196e+02, 2.33517061e+02, 2.27908495e+02, ...,
        4.90858386e-07, 4.90858386e-07, 4.90858386e-07],
       ...,
       [2.01760727e+02, 2.17223779e+02, 2.28078219e+02, ...,
        4.90858386e-07, 4.90858386e-07, 4.90858386e-07],
       [2.17295289e+02, 2.17804358e+02, 2.30761110e+02, ...,
        4.90858386e-07, 4.90858386e-07, 4.90858386e-07],
       [2.14073740e+02, 2.18428334e+02, 2.31001217e+02, ...,
        4.90858386e-07, 4.90858386e-07, 4.90858386e-07]])

In [16]:
# Sanity check: load the first test-prediction chunk of one kurupical model.
# The chunk is a plain numeric array, so pickle support stays at numpy's safe
# default (allow_pickle=False); unpickling can execute arbitrary code.
array = np.load(
    "input/predict_split/20240706022848_exp042_70m_cnn64_smoothl1beta1_lr2.5e-3_beta0.01_wd0.05/test/000000.npy"
)
print(array.shape)
array

(384, 368)


array([[ 3.04885907e-06, -6.41175648e-05, -8.41216461e-05, ...,
         5.85786067e-02, -8.90148804e-02, -3.96178253e-02],
       [-1.45590175e-05, -4.92031140e-05, -3.33135758e-05, ...,
         1.40480161e-01, -1.18626244e-02,  5.07695191e-02],
       [-7.37343453e-06, -7.73725042e-05, -3.20664512e-05, ...,
         4.48914729e-02,  2.08979957e-02,  1.81443207e-02],
       ...,
       [-6.40458950e-07, -2.37953955e-05, -3.23130043e-05, ...,
        -1.45362735e-01,  3.94450538e-02,  8.22350457e-02],
       [-3.13781129e-05, -1.62495344e-05, -3.81992068e-05, ...,
         2.49305844e-01,  3.76292579e-02, -1.92835815e-02],
       [-1.56760270e-05, -9.72990620e-06, -2.16443259e-05, ...,
         5.52216731e-02, -4.70493920e-02, -1.32143982e-02]])

In [18]:
# Sanity check: load the first test-prediction chunk of the ex123 model.
array = np.load("input/predict_split/ex123/test/000000.npy")
# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly; a bare `array.shape` here is silently discarded.
print(array.shape)
array

array([[ 2.70680607e-06, -6.42092200e-05, -8.30491263e-05, ...,
         7.09261969e-02,  1.42842159e-01,  1.91618025e-01],
       [-1.50063715e-05, -4.95341737e-05, -3.35606201e-05, ...,
         2.85222143e-01,  1.78699449e-01,  8.45400915e-02],
       [-6.58305044e-06, -7.82479256e-05, -3.24354551e-05, ...,
         7.00854436e-02,  1.32392541e-01,  8.05990025e-03],
       ...,
       [-1.32964215e-06, -2.37605345e-05, -3.21361476e-05, ...,
         4.41102773e-01, -7.23679736e-02, -2.53604859e-01],
       [-3.12274606e-05, -1.70018011e-05, -3.90337591e-05, ...,
        -4.79047447e-01,  5.00441492e-01,  3.25025618e-01],
       [-1.55575945e-05, -9.56874283e-06, -2.18745954e-05, ...,
        -1.18612550e-01, -4.51425254e-01, -3.14911991e-01]])