In [1]:
%cd /kaggle/working
import json
import os
import pickle
import shutil
import sys
import time
from glob import glob
from pathlib import Path
from typing import Literal

import hydra
import numpy as np
import polars as pl
import torch
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from torch.nn.functional import normalize
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm
from tqdm.notebook import tqdm

/kaggle/working


In [2]:
import os

from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config used by the rest of the notebook, selecting the
# "base" experiment and switching debug mode on via overrides.
with initialize(version_base=None, config_path="../preprocess/make_test_sim6"):
    cfg = compose(
        config_name="config.yaml",
        overrides=["exp=base", "debug=True"],
        return_hydra_config=True,
    )
    # Dump the experiment sub-config as YAML for reference.
    print(OmegaConf.to_yaml(cfg.exp))

seed: 7
topk: 3
chunk_size: 100
output_dir: input/sim_data
scale_dir: output/preprocess/normalize_009_rate_feat/bolton
test_data_path: input/test.parquet
test_old_data_path: input/test_old.parquet
data_dir: input/ClimSim_low-res



In [3]:
def compute_cosine_similarity(tensor1, tensor2):
    """Return all pairwise dot products between the rows of two 2-D tensors.

    No normalisation happens here: the result is a cosine similarity only if
    the rows of both inputs have already been scaled to unit L2 norm.
    """
    transposed = tensor2.t()
    return tensor1.mm(transposed)


def get_top_k_similar_rows(matrix, target_matrix, k=5, chunk_size=1000, device="cpu"):
    """Find, for each row of ``matrix``, the ``k`` most cosine-similar rows of ``target_matrix``.

    Both inputs are moved to ``device`` and L2-normalised along dim 1, so the
    dot products computed per chunk are cosine similarities. ``matrix`` is
    processed ``chunk_size`` rows at a time so that only a
    ``(chunk_size, len(target_matrix))`` similarity block exists at once.

    Args:
        matrix: 2-D tensor of query rows.
        target_matrix: 2-D tensor of candidate rows (same number of columns).
        k: number of neighbours to keep per query row.
        chunk_size: number of query rows scored per step.
        device: device on which the similarities are computed.

    Returns:
        CPU ``torch.long`` tensor of shape ``(matrix.size(0), k)`` holding row
        indices into ``target_matrix``, most similar first.
    """
    # Index search only: no autograd graph is needed.
    with torch.no_grad():
        normalized_matrix = normalize(matrix.to(device), p=2, dim=1)
        normalized_target_matrix = normalize(target_matrix.to(device), p=2, dim=1)

        n_rows = normalized_matrix.size(0)
        top_k_indices = torch.empty((n_rows, k), dtype=torch.long, device="cpu")

        # Slice the tensor directly instead of wrapping it in a DataLoader,
        # which would rebuild every chunk by stacking individually indexed rows.
        for start in tqdm(range(0, n_rows, chunk_size)):
            stop = min(start + chunk_size, n_rows)
            cosine_similarities = compute_cosine_similarity(
                normalized_matrix[start:stop], normalized_target_matrix
            )
            top_k = torch.topk(cosine_similarities, k, dim=1)
            top_k_indices[start:stop] = top_k.indices.cpu()

    return top_k_indices

In [4]:
# Load the test set and the old test set; in debug mode only the first
# 500 / 5000 rows are read.
df = pl.read_parquet(cfg.exp.test_data_path, n_rows=500 if cfg.debug else None)
old_df = pl.read_parquet(cfg.exp.test_old_data_path, n_rows=5000 if cfg.debug else None)

# From old_df, keep only the entries at 30 + 120n (file positions 1 + 4k), the only ones for which a "+6" counterpart can exist

In [73]:
# Keep only the old_df rows whose 384-row group sits at offset 30 + 120n.
# Group j (rows 384*j .. 384*j + 383) is assigned offset 30*j, so
# time_mod = (30*j) % 120 only takes the values 0 / 30 / 60 / 90.
# Comparing with 30 directly selects the same rows as
# `(time_mod - 30) % 120 == 0`, but avoids subtracting from the unsigned
# row-index column, where `0 - 30` can wrap around instead of going negative.
old_filter_df = (
    old_df.with_row_index()
    .with_columns(
        [
            ((pl.col("index") // 384) * 30 % 120).alias("time_mod"),
        ]
    )
    .filter(pl.col("time_mod") == 30)
    .drop(["index", "time_mod"])
)

print(old_df.shape, old_filter_df.shape)

(5000, 557) (1160, 557)


In [74]:
# Drop the first column of each frame and convert the rest to NumPy arrays.
base_array = df[:, 1:].to_numpy()
old_array = old_filter_df[:, 1:].to_numpy()

In [75]:
# scaling
# Load the mean / std dictionaries used for standardisation. The context
# managers close the file handles (the bare `open(...)` calls never did).
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open(Path(cfg.exp.scale_dir) / "x_mean_feat_dict.pkl", "rb") as f:
    feat_mean_dict = pickle.load(f)
with open(Path(cfg.exp.scale_dir) / "x_std_feat_dict.pkl", "rb") as f:
    feat_std_dict = pickle.load(f)

data = torch.tensor(base_array[:, :556])
data_old = torch.tensor(old_array[:, :556])
# Standardise both sets with the same "base" statistics; the tiny epsilon
# keeps the division defined when a std entry is exactly zero.
data = (data - feat_mean_dict["base"]) / (feat_std_dict["base"] + 1e-60)
data_old = (data_old - feat_mean_dict["base"]) / (feat_std_dict["base"] + 1e-60)

# float32
data = data.float()
data_old = data_old.float()

In [76]:
# similar
# Debug runs always use the CPU; otherwise use the GPU when one is available.
device = "cpu" if cfg.debug else ("cuda" if torch.cuda.is_available() else "cpu")
# For every row of `data`, look up the indices of its nearest rows in `data_old`.
top_k_similar = get_top_k_similar_rows(
    matrix=data,
    target_matrix=data_old,
    k=cfg.exp.topk,
    chunk_size=cfg.exp.chunk_size,
    device=device,
)

  0%|          | 0/5 [00:00<?, ?it/s]

## Run the same procedure on the pseudo-validation data

In [77]:
def get_two_years_month_dirs(year_id):
    """List the 24 monthly ``train/YYYY-MM`` directories of a two-year window.

    The window starts in February of year ``1 + 2 * year_id`` and ends in
    January two years later, e.g. ``year_id=0`` gives ``train/0001-02`` through
    ``train/0003-01``.

    Args:
        year_id: zero-based index of the two-year window.

    Returns:
        List of 24 relative directory names in chronological order.
    """
    start = 1 + year_id * 2

    def month_dir(year, month):
        # Zero-pad the year to four digits so years >= 10 stay "YYYY"
        # (a literal "000" prefix would produce "00010").
        return f"train/{year:04d}-{month:02d}"

    return (
        [month_dir(start, m) for m in range(2, 13)]
        + [month_dir(start + 1, m) for m in range(1, 13)]
        + [month_dir(start + 2, 1)]
    )

In [78]:
# NOTE(review): `valid_year_id` is not among the exp keys printed when the
# config was composed — confirm the config actually provides it.
month_dirs = get_two_years_month_dirs(cfg.exp.valid_year_id)

# Gather every entry found under the selected month directories, sorted by path.
path_list = sorted(
    path
    for dir_path in month_dirs
    for path in glob(str(Path(cfg.exp.data_dir) / dir_path / "*"))
)

In [79]:
# In debug mode, only keep the first 100 paths so later cells stay fast.
if cfg.debug:
    path_list = path_list[:100]

In [95]:
# Build the base frame from every 24th file and the "old" frame from every
# 30th file, both counted from start_id, stacking the per-file arrays row-wise.
start_id = 0
base_paths, old_paths = path_list[start_id::24], path_list[start_id::30]
df = pl.DataFrame(np.vstack(list(map(np.load, base_paths)))).with_row_index(
    "sample_id"
)
old_df = pl.DataFrame(np.vstack(list(map(np.load, old_paths)))).with_row_index(
    "sample_id"
)

In [96]:
# Check the (rows, columns) of the stacked base frame.
df.shape

(1920, 925)

In [97]:
# Keep only the old_df rows whose 384-row group sits at offset 30 + 120n.
# Group j (rows 384*j .. 384*j + 383) is assigned offset 30*j, so
# time_mod = (30*j) % 120 only takes the values 0 / 30 / 60 / 90.
# Comparing with 30 directly selects the same rows as
# `(time_mod - 30) % 120 == 0`, but avoids subtracting from the unsigned
# row-index column, where `0 - 30` can wrap around instead of going negative.
old_filter_df = (
    old_df.with_row_index()
    .with_columns(
        [
            ((pl.col("index") // 384) * 30 % 120).alias("time_mod"),
        ]
    )
    .filter(pl.col("time_mod") == 30)
    .drop(["index", "time_mod"])
)

print(old_df.shape, old_filter_df.shape)

(1536, 925) (384, 925)


In [98]:
# Columns 1..556 (everything after the leading sample_id column) as NumPy arrays.
base_array = df[:, 1:557].to_numpy()
old_array = old_filter_df[:, 1:557].to_numpy()

# Per-row bookkeeping for the base rows, derived purely from the row position:
# rows are treated as groups of 384, and group g is assigned file index 24 * g.
index_expr = pl.col("index")
location_expr = index_expr % 384
file_index_expr = (index_expr // 384) * 24
stats_df = (
    pl.DataFrame(base_array[:, :1])  # only used to get one row per sample
    .with_row_index()
    .with_columns(
        location_expr.alias("location"),
        file_index_expr.alias("file_index"),
        (file_index_expr % 120).alias("time_mod"),
        # Position of the row if all files were laid out back to back.
        (file_index_expr * 384 + location_expr).alias("row_index"),
    )
    .drop("column_0")
)
base_array.shape

(1920, 556)

In [99]:
# scaling
# Load the mean / std dictionaries used for standardisation. The context
# managers close the file handles (the bare `open(...)` calls never did).
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open(Path(cfg.exp.scale_dir) / "x_mean_feat_dict.pkl", "rb") as f:
    feat_mean_dict = pickle.load(f)
with open(Path(cfg.exp.scale_dir) / "x_std_feat_dict.pkl", "rb") as f:
    feat_std_dict = pickle.load(f)

data = torch.tensor(base_array[:, :556])
data_old = torch.tensor(old_array[:, :556])
# Standardise both sets with the same "base" statistics; the tiny epsilon
# keeps the division defined when a std entry is exactly zero.
data = (data - feat_mean_dict["base"]) / (feat_std_dict["base"] + 1e-60)
data_old = (data_old - feat_mean_dict["base"]) / (feat_std_dict["base"] + 1e-60)

# float32
data = data.float()
data_old = data_old.float()

In [100]:
# similar
# Debug runs always use the CPU; otherwise use the GPU when one is available.
device = "cpu" if cfg.debug else ("cuda" if torch.cuda.is_available() else "cpu")
# Only the single nearest row of `data_old` is needed for each row of `data`.
top_k_similar = get_top_k_similar_rows(
    matrix=data,
    target_matrix=data_old,
    k=1,
    chunk_size=cfg.exp.chunk_size,
    device=device,
)

  0%|          | 0/20 [00:00<?, ?it/s]

In [101]:
# One top-1 index per base row is expected: (n_base_rows, 1).
top_k_similar.shape

torch.Size([1920, 1])

In [102]:
# Inspect the bookkeeping columns around the boundary between the first two 384-row groups.
stats_df[382:]

index,location,file_index,time_mod,row_index
u32,u32,u32,u32,u32
382,382,0,0,382
383,383,0,0,383
384,0,24,24,9216
385,1,24,24,9217
386,2,24,24,9218
…,…,…,…,…
1915,379,96,96,37243
1916,380,96,96,37244
1917,381,96,96,37245
1918,382,96,96,37246


In [103]:
# Translate each matched position in the filtered old array back into a row
# index over the full file sequence: group b of the filtered rows is mapped to
# file 30 + 120 * b, with 384 rows per file.
matched = top_k_similar.numpy()
old_row_index = ((matched // 384) * 4 * 30 + 30) * 384 + matched % 384

# A base row counts as a hit when its top-1 match is the row at the same
# location 6 files later.
expected_next = pl.col("row_index") + 384 * 6
df_similar = stats_df.with_columns(
    pl.Series("old_row_index", values=old_row_index)
).with_columns(
    pl.col("old_row_index")
    .list.slice(0, 1)
    .list.contains(expected_next)
    .alias("is_top1_next")
)

In [106]:
# Show the first 384-row group plus the start of the second one.
df_similar.head(386)

index,location,file_index,time_mod,row_index,old_row_index,is_top1_next
u32,u32,u32,u32,u32,list[i64],bool
0,0,0,0,0,[11520],false
1,1,0,0,1,[11521],false
2,2,0,0,2,[11522],false
3,3,0,0,3,[11654],false
4,4,0,0,4,[11524],false
…,…,…,…,…,…,…
381,381,0,0,381,[11901],false
382,382,0,0,382,[11902],false
383,383,0,0,383,[11903],false
384,0,24,24,9216,[11520],true


recall=shape: (1, 1)
┌──────────────┐
│ is_top1_next │
│ ---          │
│ f64          │
╞══════════════╡
│ 0.994792     │
└──────────────┘


384