In [1]:
# Switch to the Kaggle working directory; the relative "input/..." and
# "output/..." paths used later in this notebook are resolved against it.
%cd /kaggle/working
import glob

import numpy as np
import numpy as np
import polars as pl

/kaggle/working


In [78]:
# Flip to True to work on a small subset of the files while iterating.
debug = False

# Every training .npy file, in lexicographic path order.
path_list = sorted(glob.glob("input/ClimSim_low-res/train/*/*.npy"))
# Debug runs keep only the first 1000 files; otherwise the list is left as is.
path_list = path_list[:1000] if debug else path_list

# Number of files that remain when taking every 24th one.
len(path_list[0::24])

2190

In [79]:
# Bookkeeping constants shared by both index frames below.
ROWS_PER_FILE = 384  # rows contributed by each loaded file (assumed; checked in _build_index_frame)
TIME_MOD_PERIOD = 120  # modulus applied to file_index to obtain "time_mod"
BASE_STRIDE = 24  # the base sample takes every 24th file ...
BASE_MAX_FILES = 100  # ... and keeps only the first 100 of those
OLD_STRIDE = 30  # the comparison ("old") sample takes every 30th file


def _stack_files(paths):
    """Load every .npy file in ``paths`` and stack the arrays row-wise."""
    return np.vstack([np.load(path) for path in paths])


def _build_index_frame(array, stride):
    """Build the per-row bookkeeping frame for an array stacked from strided files.

    Args:
        array: 2-D array produced by stacking files picked with
            ``path_list[::stride]``; each file must contribute
            ``ROWS_PER_FILE`` rows.
        stride: step that was used when slicing ``path_list``.

    Returns:
        A polars DataFrame with one row per row of ``array`` and the columns
        ``index`` (position in ``array``), ``location`` (position inside its
        file), ``file_index`` (position of its file in ``path_list``),
        ``time_mod`` (``file_index`` modulo ``TIME_MOD_PERIOD``) and
        ``row_index`` (the row's position if *every* file of ``path_list``
        had been stacked).

    Raises:
        ValueError: if the row count is not a multiple of ``ROWS_PER_FILE``,
            in which case the index arithmetic below would be meaningless.
    """
    if array.shape[0] % ROWS_PER_FILE != 0:
        raise ValueError(
            f"expected a multiple of {ROWS_PER_FILE} rows, got {array.shape[0]}"
        )

    chunk = pl.col("index") // ROWS_PER_FILE  # which stacked file the row came from
    location = pl.col("index") % ROWS_PER_FILE  # offset of the row inside that file
    return (
        # A single column is enough: it is only used to get the right row count
        # for with_row_index() and is dropped again at the end.
        pl.DataFrame(array[:, :1])
        .with_row_index()
        .with_columns(
            [
                location.alias("location"),
                (chunk * stride).alias("file_index"),
                (chunk * stride % TIME_MOD_PERIOD).alias("time_mod"),
                (chunk * stride * ROWS_PER_FILE + location).alias("row_index"),
            ]
        )
        .drop(["column_0"])
    )


base_array = _stack_files(path_list[::BASE_STRIDE][:BASE_MAX_FILES])
old_array = _stack_files(path_list[::OLD_STRIDE])

# Create a DataFrame using Polars
df = _build_index_frame(base_array, BASE_STRIDE)
df_old = _build_index_frame(old_array, OLD_STRIDE)

df

index,location,file_index,time_mod,row_index
u32,u32,u32,u32,u32
0,0,0,0,0
1,1,0,0,1
2,2,0,0,2
3,3,0,0,3
4,4,0,0,4
…,…,…,…,…
38395,379,2376,96,912763
38396,380,2376,96,912764
38397,381,2376,96,912765
38398,382,2376,96,912766


In [80]:
import pickle
from pathlib import Path

import torch
from torch.nn.functional import normalize
from torch.utils.data import DataLoader, TensorDataset
from tqdm.notebook import tqdm


def compute_cosine_similarity(tensor1, tensor2):
    """Return the matrix of pairwise dot products between rows of two 2-D tensors.

    No normalisation happens here: the result equals the cosine similarity
    only when the rows of both inputs are already L2-normalised.

    Args:
        tensor1: tensor of shape (n, d).
        tensor2: tensor of shape (m, d).

    Returns:
        Tensor of shape (n, m) whose entry (i, j) is the dot product of
        ``tensor1[i]`` and ``tensor2[j]``.
    """
    transposed = torch.t(tensor2)
    return torch.mm(tensor1, transposed)


def get_top_k_similar_rows(matrix, target_matrix, k=5, chunk_size=1000, device="cpu"):
    """For each row of ``matrix``, find the ``k`` most cosine-similar rows of ``target_matrix``.

    Both inputs are moved to ``device`` and L2-normalised row-wise, so the
    dot products computed per chunk are cosine similarities. ``matrix`` is
    processed ``chunk_size`` rows at a time so that only a
    ``(chunk_size, target_rows)`` similarity block is alive at once.

    Args:
        matrix: 2-D tensor of query rows, shape (n, d).
        target_matrix: 2-D tensor of candidate rows, shape (m, d).
        k: number of neighbours to keep per query row; must not exceed ``m``.
        chunk_size: number of query rows scored per step.
        device: device on which normalisation and the matrix products run.

    Returns:
        CPU ``torch.long`` tensor of shape (n, k). Row ``i`` holds the indices
        into ``target_matrix`` of the rows most similar to ``matrix[i]``, in
        the order returned by ``torch.topk`` (most similar first).

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # Only indices are returned, so no autograd graph is needed.
    with torch.no_grad():
        queries = normalize(matrix.to(device), p=2, dim=1)
        targets = normalize(target_matrix.to(device), p=2, dim=1)

        top_k_indices = torch.empty(
            (queries.size(0), k), dtype=torch.long, device="cpu"
        )

        # torch.split hands out contiguous views of the query matrix; going
        # through a DataLoader would index and re-stack every row one by one.
        start = 0
        for chunk in tqdm(torch.split(queries, chunk_size)):
            if chunk.size(0) == 0:
                # An empty query matrix is split into one empty chunk.
                continue
            similarities = compute_cosine_similarity(chunk, targets)
            stop = start + chunk.size(0)
            top_k_indices[start:stop] = torch.topk(similarities, k, dim=1).indices.cpu()
            start = stop

    return top_k_indices


# scale
# Directory holding the pickled per-feature mean / std dictionaries.
scale_dir = "output/preprocess/normalize_009_rate_feat/bolton"


def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by this project's own preprocessing.
    with open(path, "rb") as fh:
        return pickle.load(fh)


feat_mean_dict = _load_pickle(Path(scale_dir) / "x_mean_feat_dict.pkl")
feat_std_dict = _load_pickle(Path(scale_dir) / "x_std_feat_dict.pkl")

# Sample data
# Only the first 556 columns take part in the similarity search
# (presumably the input features; confirm against the dataset layout).
data = torch.tensor(base_array[:, :556])
data_old = torch.tensor(old_array[:, :556])

# Standardise both samples with the same "base" statistics.
# NOTE(review): 1e-60 is representable in float64 only; if the statistics are
# float32 it underflows to 0 and no longer guards against a zero std. Confirm
# the dtype of feat_std_dict["base"].
data = (data - feat_mean_dict["base"]) / (feat_std_dict["base"] + 1e-60)
data_old = (data_old - feat_mean_dict["base"]) / (feat_std_dict["base"] + 1e-60)


k = 5
chunk_size = 100

# Use CPU or GPU
device = "cpu"  # "cuda" if torch.cuda.is_available() else "cpu"

# For every base row: indices of its k nearest rows in the "old" sample.
top_k_similar_rows = get_top_k_similar_rows(
    data, data_old, k=k, chunk_size=chunk_size, device=device
)
top_k_similar_rows

  0%|          | 0/384 [00:00<?, ?it/s]

tensor([[    0,  4608,  9997,  6208,  5440],
        [    1, 10048,  6532,  7300,  6536],
        [    2,  4610,   207,   770,  2690],
        ...,
        [30717, 31101, 29949, 30333, 29565],
        [30718, 31102, 29950, 29566, 30334],
        [30719, 30335, 29951, 31103, 31487]])

In [90]:
# Translate each neighbour's position in the stride-30 sample back into
# file / row coordinates of the full file list (384 rows per file).
neighbor_idx = top_k_similar_rows.numpy()
neighbor_chunk = neighbor_idx // 384
neighbor_location = neighbor_idx % 384

neighbor_columns = [
    pl.Series("old_index", values=neighbor_idx),
    pl.Series("old_file_index", values=neighbor_chunk * 30),
    pl.Series("old_row_index", values=neighbor_chunk * 30 * 384 + neighbor_location),
]

# is_in_next{n}: is the row sitting n files after this one (same location)
# among the two best matches?
best_two_old_rows = pl.col("old_row_index").list.head(2)
next_flags = [
    best_two_old_rows.list.contains(pl.col("row_index") + 384 * offset).alias(
        f"is_in_next{offset}"
    )
    for offset in [0, 6, 12, 24]
]

df_similar = df.with_columns(neighbor_columns).with_columns(next_flags)

In [91]:
# Among base rows with time_mod == 24, count those flagged is_in_next6.
(
    df_similar
    .filter(pl.col("time_mod") == 24)
    .get_column("is_in_next6")
    .sum()
)

7375

In [89]:
# Inspect the base rows whose time_mod equals 24.
df_similar.filter(pl.col("time_mod").eq(24))

index,location,file_index,time_mod,row_index,old_index,old_file_index,old_row_index,is_in_next0,is_in_next6,is_in_next12,is_in_next24
u32,u32,u32,u32,u32,list[i64],list[i64],list[i64],bool,bool,bool,bool
384,0,24,24,9216,"[384, 12289, … 1152]","[30, 960, … 90]","[11520, 368641, … 34560]",false,true,false,false
385,1,24,24,9217,"[385, 4993, … 6148]","[30, 390, … 480]","[11521, 149761, … 184324]",false,true,false,false
386,2,24,24,9218,"[386, 591, … 4994]","[30, 30, … 390]","[11522, 11727, … 149762]",false,true,false,false
387,3,24,24,9219,"[387, 7683, … 12294]","[30, 600, … 960]","[11523, 230403, … 368646]",false,true,false,false
388,4,24,24,9220,"[388, 308033, … 4997]","[30, 24060, … 390]","[11524, 9239105, … 149765]",false,true,false,false
…,…,…,…,…,…,…,…,…,…,…,…
37243,379,2304,24,885115,"[29947, 29563, … 18043]","[2310, 2280, … 1380]","[887419, 875899, … 530299]",false,true,false,false
37244,380,2304,24,885116,"[29948, 29564, … 29180]","[2310, 2280, … 2250]","[887420, 875900, … 864380]",false,true,false,false
37245,381,2304,24,885117,"[29949, 30717, … 29181]","[2310, 2370, … 2250]","[887421, 910461, … 864381]",false,true,false,false
37246,382,2304,24,885118,"[29950, 30718, … 31102]","[2310, 2370, … 2400]","[887422, 910462, … 921982]",false,true,false,false


In [84]:
# (rows, columns) of the bookkeeping frame built from every 30th file.
df_old.shape

(672768, 5)