In [1]:
# Directory that holds the Hydra experiment configs; combined with the first
# part of `exp_name` to form the `config_path` passed to `initialize` below.
config_dir = "../experiments"
# Experiment identifier in "<config sub-directory>/<exp override>" form.
# The part before "/" selects the config directory, the part after it is used
# as the `exp=` override, and the full string names the output / GCS folders.
exp_name = "208_subtask/small"

In [2]:
# Run from the working directory: the relative "output/..." and "input/..."
# paths used later in this notebook are resolved against it.
%cd /kaggle/working

import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config for this experiment. The config directory comes from
# the part of `exp_name` before "/", the `exp=` override from the part after it.
with initialize(
    version_base=None, config_path=f"{config_dir}/{exp_name.split('/')[0]}"
):
    cfg = compose(
        config_name="config.yaml",
        overrides=[f"exp={exp_name.split('/')[-1]}"],
        return_hydra_config=True,
    )

# Constants: local output folder and the matching GCS prefix for this experiment.
output_dir = Path(f"output/experiments/{exp_name}")
gcs_path = f"gs://{cfg.dir.gcs_bucket}/{cfg.dir.gcs_base_dir}/experiments/{exp_name}/"

# Load the stored per-target R2 scores and report their mean.
# The file is opened in a context manager so the handle is closed after loading.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by our own experiment runs.
with open(output_dir / "val2_r2_score_dict.pkl", "rb") as score_file:
    r2_score_dict = pickle.load(score_file)
print("r2: ", np.mean(list(r2_score_dict.values())))

/kaggle/working
r2:  0.7294915497703501


## データの読み込み

In [3]:
# Validation artefacts of this experiment, read from its GCS folder:
# original inputs, sub-task predictions, main-model predictions and labels.
original_xs_df = pl.read_parquet(f"{gcs_path}val2_original_xs.parquet", retries=5)
sub_df = pl.read_parquet(f"{gcs_path}val2_sub_predict.parquet", retries=5)
predict_df = pl.read_parquet(f"{gcs_path}val2_predict.parquet", retries=5)
label_df = pl.read_parquet(f"{gcs_path}val2_label.parquet", retries=5)

# Only the first row of the sample submission is needed: it supplies the
# target column names and the per-target weights.
ss_df = pl.read_csv(
    "input/leap-atmospheric-physics-ai-climsim/sample_submission.csv", n_rows=1
)
# Every column except "sample_id", taken from that single row as a 1-D array.
weight_array = ss_df.select(
    [col for col in ss_df.columns if col != "sample_id"]
).to_numpy()[0]

In [4]:
# Rename columns positionally: input-like frames take their names from
# test.csv, prediction/label frames from the sample submission.
test_df = pl.read_csv("input/leap-atmospheric-physics-ai-climsim/test.csv", n_rows=1)
original_xs_df = original_xs_df.rename(
    {old: new for old, new in zip(original_xs_df.columns, test_df.columns)}
)
# Only the first 361 test.csv names are offered to the sub-task frame
# (presumably sample_id plus the profile columns; confirm against the sub-task output).
sub_df = sub_df.rename(
    {old: new for old, new in zip(sub_df.columns, test_df.columns[:361])}
)

predict_df = predict_df.rename(
    {old: new for old, new in zip(predict_df.columns, ss_df.columns)}
)
label_df = label_df.rename(
    {old: new for old, new in zip(label_df.columns, ss_df.columns)}
)

original_xs_df.head()

sample_id,state_t_0,state_t_1,state_t_2,state_t_3,state_t_4,state_t_5,state_t_6,state_t_7,state_t_8,state_t_9,state_t_10,state_t_11,state_t_12,state_t_13,state_t_14,state_t_15,state_t_16,state_t_17,state_t_18,state_t_19,state_t_20,state_t_21,state_t_22,state_t_23,state_t_24,state_t_25,state_t_26,state_t_27,state_t_28,state_t_29,state_t_30,state_t_31,state_t_32,state_t_33,state_t_34,state_t_35,…,pbuf_N2O_23,pbuf_N2O_24,pbuf_N2O_25,pbuf_N2O_26,pbuf_N2O_27,pbuf_N2O_28,pbuf_N2O_29,pbuf_N2O_30,pbuf_N2O_31,pbuf_N2O_32,pbuf_N2O_33,pbuf_N2O_34,pbuf_N2O_35,pbuf_N2O_36,pbuf_N2O_37,pbuf_N2O_38,pbuf_N2O_39,pbuf_N2O_40,pbuf_N2O_41,pbuf_N2O_42,pbuf_N2O_43,pbuf_N2O_44,pbuf_N2O_45,pbuf_N2O_46,pbuf_N2O_47,pbuf_N2O_48,pbuf_N2O_49,pbuf_N2O_50,pbuf_N2O_51,pbuf_N2O_52,pbuf_N2O_53,pbuf_N2O_54,pbuf_N2O_55,pbuf_N2O_56,pbuf_N2O_57,pbuf_N2O_58,pbuf_N2O_59
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,209.658019,230.387641,238.343944,246.544958,252.213235,249.866705,242.083703,235.691332,229.256637,225.81596,222.108325,218.657213,215.671462,213.425364,211.642811,210.319557,208.473457,206.429217,205.34601,205.778874,206.590721,206.708699,209.229532,211.685916,214.965148,218.5392,222.454816,226.339222,230.188813,233.977203,237.595957,241.143295,244.594272,247.9316,251.194144,254.212187,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
1,210.26488,229.761805,240.358826,249.42127,252.208494,248.718307,239.976667,233.147371,226.749996,223.194696,219.760581,216.698621,214.28257,212.698314,211.701495,211.02319,210.150563,209.172477,208.321153,208.955834,209.585178,210.150989,211.695567,213.702417,216.053159,218.595077,221.25905,223.952223,226.681248,229.457353,232.267315,235.174321,238.095657,241.120012,244.055982,246.866171,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
2,208.350949,223.604588,231.951835,242.871903,254.993682,256.389548,251.950028,243.998553,234.600467,229.91955,226.333564,223.040641,219.295277,215.761284,212.024627,208.665319,205.783639,203.589675,202.843125,205.056636,207.365564,207.514836,210.496801,213.334081,216.510841,220.129664,224.694799,229.165065,233.726607,238.194542,242.456779,246.386108,250.124195,253.573648,256.853075,259.902213,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
3,208.787141,223.873269,232.505078,244.098158,253.420492,253.766085,248.784052,241.253276,232.967552,229.012059,225.753068,222.79063,219.312547,216.089257,212.967267,210.098076,207.801654,205.934621,205.219532,206.796799,209.779504,210.28622,212.570652,215.399614,218.569742,221.771176,225.321101,229.067855,232.874967,236.637148,240.29244,243.708948,247.098648,250.234183,253.356945,256.212624,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07
4,210.893272,227.831347,239.65502,249.779979,253.028225,250.165577,240.769124,232.650694,225.435323,221.296389,218.051828,215.618932,213.968104,212.965384,212.391519,212.054552,211.770174,211.537142,210.804785,211.6821,212.005317,212.470521,213.285443,214.522345,216.063838,217.819724,219.720225,221.836046,224.003413,226.275044,228.703103,231.241533,233.914734,236.570089,239.201914,241.842729,…,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07,4.9086e-07


## 後処理
サブタスクの出力から変化後の q2+q3 の合計値を算出し、温度に応じて q2 と q3 に分配して最終的な予測値を算出する。
全ての h に対して行うと大きくズレる可能性があるので、変動が大きそうなものに絞って試してみる。

In [35]:
from sklearn.metrics import r2_score

# Column-suffix index examined in this and the following cells
# (presumably the vertical level; confirm).
h = 31

# Labels first, then the main model's predictions, for the two targets.
q2_label = label_df.get_column(f"ptend_q0002_{h}").to_numpy()
q3_label = label_df.get_column(f"ptend_q0003_{h}").to_numpy()
q2_pred = predict_df.get_column(f"ptend_q0002_{h}").to_numpy()
q3_pred = predict_df.get_column(f"ptend_q0003_{h}").to_numpy()

# Baseline: R2 of the main model on these two targets.
print("model score")
print("q2:", r2_score(q2_label, q2_pred))
print("q3:", r2_score(q3_label, q3_pred))

model score
q2: 0.4306495422752499
q3: 0.5406903934814681


In [36]:
# Try the rule-based post-processing:
# take the post-change q2+q3 total from the sub-task output and split it by temperature.
t_min = 253.15
t_max = 273.28


# Current q2 / q3 state values from the original inputs.
q2_state = original_xs_df.get_column(f"state_q0002_{h}").to_numpy()
q3_state = original_xs_df.get_column(f"state_q0003_{h}").to_numpy()
# Post-change total (q2 + q3) according to the sub-task output.
cloud_new_state = (
    sub_df.get_column(f"state_q0002_{h}").to_numpy()
    + sub_df.get_column(f"state_q0003_{h}").to_numpy()
)

# Share assigned to q2: 0 at or below t_min, 1 at or above t_max, linear in between.
omn = (
    (original_xs_df.get_column(f"state_t_{h}").to_numpy() - t_min)
    * (1 / (t_max - t_min))
).clip(0.0, 1.0)

# Tendency = (split new state - current state) / 1200
# (1200 is presumably the time step in seconds; confirm).
q2_pred_rule = (omn * cloud_new_state - q2_state) / 1200
q3_pred_rule = ((1 - omn) * cloud_new_state - q3_state) / 1200
print("subtask score")
print("q2:", r2_score(q2_label, q2_pred_rule))
print("q3:", r2_score(q3_label, q3_pred_rule))

subtask score
q2: 0.17540631710578924
q3: 0.5142788994904051


In [37]:
# Ensemble of the model prediction and the rule-based estimate;
# `rate` is the weight given to the rule-based value.
rate = 0.2
q2_pp = (1 - rate) * q2_pred + rate * q2_pred_rule
q3_pp = (1 - rate) * q3_pred + rate * q3_pred_rule

print("ensemble:", rate)
print("q2:", r2_score(q2_label, q2_pp))
print("q3:", r2_score(q3_label, q3_pp))

ensemble: 0.2
q2: 0.4310047379321702
q3: 0.5416875298969993


## all

In [13]:
import pandas as pd

from utils.metric import score

# All columns except the first one (presumably sample_id) as plain arrays.
preds = predict_df[:, 1:].to_numpy()
labels = label_df[:, 1:].to_numpy()

# Scale every target by its weight, then add an "index" column via
# reset_index(); that column name is what is handed to `score` as the id column.
_predict_df = pd.DataFrame(
    preds * weight_array, columns=list(range(preds.shape[1]))
).reset_index()
_label_df = pd.DataFrame(
    labels * weight_array, columns=list(range(labels.shape[1]))
).reset_index()
r2_scores = score(_label_df, _predict_df, "index", multioutput="raw_values")

# Kept under `mean_r2` instead of `r2_score`: rebinding `r2_score` to a float
# would shadow sklearn's `r2_score` function used by the per-level cells, which
# then fail with "'float' object is not callable" when re-run.
# The printed text is unchanged.
mean_r2 = float(np.array(r2_scores).mean())
print(f"r2_score={mean_r2!r}")

r2_score=0.7294915497703501


In [42]:
def ensemble_subtask(
    original_xs_df,
    sub_df,
    predict_df,
    h_range=(20, 50),
    rate=0.2,
    t_min=253.15,
    t_max=273.28,
    dt=1200,
) -> pl.DataFrame:
    """Blend the model's q0002/q0003 tendencies with a rule-based estimate.

    For every index ``h`` in ``range(h_range[0], h_range[1])`` the sub-task
    output gives a post-change total ``state_q0002_h + state_q0003_h``. That
    total is split between the two variables with a temperature-dependent
    share, converted to a tendency, and mixed into the model prediction.

    All frames are expected to already carry their column names.

    Args:
        original_xs_df: Frame with ``state_t_{h}``, ``state_q0002_{h}`` and
            ``state_q0003_{h}`` columns (the current state).
        sub_df: Frame with ``state_q0002_{h}`` and ``state_q0003_{h}`` columns
            (the sub-task output, used as the post-change state).
        predict_df: Frame with ``ptend_q0002_{h}`` and ``ptend_q0003_{h}``
            columns (the model prediction). It is not modified.
        h_range: ``(start, stop)`` pair of column-suffix indices; ``stop`` is
            exclusive. Any indexable pair (list or tuple) works.
        rate: Weight of the rule-based estimate; the model prediction gets
            ``1 - rate``.
        t_min: Temperature at or below which the q0002 share is 0.
        t_max: Temperature at or above which the q0002 share is 1.
        dt: Divisor that turns a state difference into a tendency
            (presumably the time step in seconds; confirm).

    Returns:
        A new frame equal to ``predict_df`` except that the selected
        ``ptend_q0002_{h}`` / ``ptend_q0003_{h}`` columns hold blended values.
    """
    # Loop-invariant reciprocal of the temperature span; it is multiplied
    # (not divided) below so results match the original formula exactly.
    inv_t_span = 1 / (t_max - t_min)

    blended_columns = []
    for h in range(h_range[0], h_range[1]):
        q2_state = original_xs_df[f"state_q0002_{h}"].to_numpy()
        q3_state = original_xs_df[f"state_q0003_{h}"].to_numpy()
        cloud_new_state = (
            sub_df[f"state_q0002_{h}"].to_numpy()
            + sub_df[f"state_q0003_{h}"].to_numpy()
        )

        # Share given to q0002: linear in temperature, clamped to [0, 1].
        omn = np.clip(
            (original_xs_df[f"state_t_{h}"].to_numpy() - t_min) * inv_t_span,
            0.0,
            1.0,
        )
        q2_pred_rule = (cloud_new_state * omn - q2_state) / dt
        q3_pred_rule = (cloud_new_state * (1 - omn) - q3_state) / dt

        q2_pp = (
            predict_df[f"ptend_q0002_{h}"].to_numpy() * (1 - rate) + q2_pred_rule * rate
        )
        q3_pp = (
            predict_df[f"ptend_q0003_{h}"].to_numpy() * (1 - rate) + q3_pred_rule * rate
        )
        blended_columns.append(pl.Series(name=f"ptend_q0002_{h}", values=q2_pp))
        blended_columns.append(pl.Series(name=f"ptend_q0003_{h}", values=q3_pp))

    if not blended_columns:
        # Empty range: hand back an unchanged copy, as before.
        return predict_df.clone()

    # One with_columns call replaces all same-named columns in place (position
    # kept) and returns a new frame, so no per-iteration rebuild or clone is needed.
    return predict_df.with_columns(blended_columns)

In [43]:
# Blend h = 30..49 (the end of h_range is exclusive) with rule weight 0.2.
new_pred_df = ensemble_subtask(
    original_xs_df, sub_df, predict_df, h_range=[30, 50], rate=0.2
)

In [44]:
# Score the blended predictions. The first column (presumably sample_id) is
# skipped and every target is scaled by its weight before scoring.
preds = new_pred_df[:, 1:].to_numpy()
labels = label_df[:, 1:].to_numpy()

# reset_index() adds the "index" column that is handed to `score` as the id column.
_predict_df = pd.DataFrame(
    preds * weight_array, columns=list(range(preds.shape[1]))
).reset_index()
_label_df = pd.DataFrame(
    labels * weight_array, columns=list(range(labels.shape[1]))
).reset_index()
r2_scores = score(_label_df, _predict_df, "index", multioutput="raw_values")

# Kept under `mean_r2` instead of `r2_score` so sklearn's `r2_score` function
# (used by the per-level cells) is not shadowed by a float.
# The printed text is unchanged.
mean_r2 = float(np.array(r2_scores).mean())
print(f"r2_score={mean_r2!r}")

r2_score=0.7296450675929265


In [45]:
# Blend h = 30..59 (the end of h_range is exclusive) with rule weight 0.2, then score.
new_pred_df = ensemble_subtask(
    original_xs_df, sub_df, predict_df, h_range=[30, 60], rate=0.2
)
# The first column (presumably sample_id) is skipped for scoring.
preds = new_pred_df[:, 1:].to_numpy()
labels = label_df[:, 1:].to_numpy()

# Scale by the per-target weights; reset_index() adds the "index" column
# that is handed to `score` as the id column.
_predict_df = pd.DataFrame(
    preds * weight_array, columns=list(range(preds.shape[1]))
).reset_index()
_label_df = pd.DataFrame(
    labels * weight_array, columns=list(range(labels.shape[1]))
).reset_index()
r2_scores = score(_label_df, _predict_df, "index", multioutput="raw_values")

# Kept under `mean_r2` instead of `r2_score` so sklearn's `r2_score` function
# (used by the per-level cells) is not shadowed by a float.
# The printed text is unchanged.
mean_r2 = float(np.array(r2_scores).mean())
print(f"r2_score={mean_r2!r}")

r2_score=0.7297166438120948


In [48]:
# Blend h = 30..59 (the end of h_range is exclusive) with rule weight 0.1, then score.
new_pred_df = ensemble_subtask(
    original_xs_df, sub_df, predict_df, h_range=[30, 60], rate=0.1
)
# The first column (presumably sample_id) is skipped for scoring.
preds = new_pred_df[:, 1:].to_numpy()
labels = label_df[:, 1:].to_numpy()

# Scale by the per-target weights; reset_index() adds the "index" column
# that is handed to `score` as the id column.
_predict_df = pd.DataFrame(
    preds * weight_array, columns=list(range(preds.shape[1]))
).reset_index()
_label_df = pd.DataFrame(
    labels * weight_array, columns=list(range(labels.shape[1]))
).reset_index()
r2_scores = score(_label_df, _predict_df, "index", multioutput="raw_values")

# Kept under `mean_r2` instead of `r2_score` so sklearn's `r2_score` function
# (used by the per-level cells) is not shadowed by a float.
# The printed text is unchanged.
mean_r2 = float(np.array(r2_scores).mean())
print(f"r2_score={mean_r2!r}")

r2_score=0.7297224920109082
