In [1]:
# Change the working directory to the parent folder (the recorded output shows /kaggle/working).
# NOTE(review): this is not idempotent — re-running the cell moves up one more level each time.
%cd ..

/kaggle/working


In [3]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config for this experiment and echo it as YAML so the
# effective settings are recorded in the notebook output.
CONFIG_PATH = "../cand_unsupervised/personalized_pagerank"

with initialize(version_base=None, config_path=CONFIG_PATH):
    cfg = compose(
        config_name="config.yaml",
        overrides=["exp=base"],
    )
    cfg_yaml = OmegaConf.to_yaml(cfg)
    print(cfg_yaml)

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  num_candidate: 100
  k:
  - 1
  - 5
  - 10
  - 50
  - 100



In [26]:
import os
import sys
from pathlib import Path

import hydra
import igraph as ig
import numpy as np
import polars as pl
import scipy.sparse as sparse
from hydra.core.hydra_config import HydraConfig
from igraph import Graph
from omegaconf import DictConfig, OmegaConf
from scipy.sparse import csr_matrix, eye
from tqdm.auto import tqdm

import utils
from utils.load import load_label_data, load_log_data, load_yad_data
from utils.metrics import calculate_metrics

In [5]:
# Read the train and test session logs and stack them into one frame
# (train rows first, then test rows).
data_dir = Path(cfg.dir.data_dir)
with utils.timer("load data"):
    train_log_df, test_log_df = (
        load_log_data(data_dir, split) for split in ("train", "test")
    )
    all_log_df = pl.concat([train_log_df, test_log_df])

"""
グラフ作成
"""
with utils.trace("making graph"):
    # Convert every yad_no into a compact integer id ("yid") by casting to a
    # Categorical and taking its physical code; yids are used as vertex ids.
    all_log_cast_df = all_log_df.with_columns(
        pl.col("yad_no").cast(str).cast(pl.Categorical).to_physical().alias("yid"),
    )

    # Lookup table yid -> original yad_no, used to translate results back.
    unique_df = all_log_cast_df.unique(["yad_no", "yid"])
    unique_yids = unique_df["yid"].to_numpy()
    unique_yad_nos = unique_df["yad_no"].to_list()
    yid2yad_no = dict(zip(unique_yids, unique_yad_nos))

    # Size the graph from the id space rather than from the observed edges, so
    # that every yid is a valid vertex even if it never appears in a transition.
    num_nodes = int(unique_yids.max()) + 1 if len(unique_yids) else 0

    # Build the transitions between adjacent log rows of the same session.
    #   rti = -1 -> shift(+1): to_id is the previous row of the session
    #   rti = +1 -> shift(-1): to_id is the next row of the session
    # Both directions are added, so each adjacent pair yields two directed edges.
    # NOTE(review): assumes rows are already ordered by seq_no within a session — confirm.
    transition_dfs = []
    for rti in (-1, 1):
        df = (
            all_log_cast_df.with_columns(
                pl.col("yid").alias("from_id"),
                pl.col("yid").shift(-(rti)).over("session_id").alias("to_id"),
            )
            .filter(~pl.col("to_id").is_null())  # first/last row of a session has no neighbour
            .filter(pl.col("from_id") != pl.col("to_id"))  # no self-transitions
            .select(["from_id", "to_id"])
        )
        transition_dfs.append(df)
    transition_df = pl.concat(transition_dfs)

    # Transition-count matrix, kept sparse and with an explicit square shape.
    # Previously the shape was inferred from the largest index on each axis
    # (which can yield a non-square matrix) and the matrix was densified with
    # .toarray(), allocating num_nodes**2 float64 values.
    matrix = sparse.csr_matrix(
        (
            np.ones(len(transition_df)),
            (
                transition_df["from_id"].to_numpy(),
                transition_df["to_id"].to_numpy(),
            ),
        ),
        shape=(num_nodes, num_nodes),
    )

    # Build the directed multigraph straight from the edge list: a transition
    # observed c times becomes c parallel edges, which is what an adjacency
    # matrix holding the count c describes, without the dense intermediate.
    edges = transition_df.select(["from_id", "to_id"]).rows()
    graph = Graph(n=num_nodes, edges=edges, directed=True)

[load data] done in 0.1 s


[2.4GB(+1.8GB):15.3sec] making graph 


13561

In [27]:
# For every vertex, run personalized PageRank with that vertex as the only
# reset vertex and keep the K highest-scoring vertices as its candidates.
# The three lists are filled in lock-step: entry i of each list belongs to
# the same source vertex.
from_yad_no = []
to_yad_nos = []
scores = []

# Candidates kept per source; read from the experiment config instead of a
# hard-coded literal so it follows `exp.num_candidate`.
K = int(cfg.exp.num_candidate)
for yid in tqdm(range(graph.vcount())):
    # One score per vertex of the graph, indexed by vertex id.
    ppr = np.array(graph.personalized_pagerank(reset_vertices=yid))
    # Sort by descending score and keep the first K vertex ids.
    top_k_indices = np.argsort(-ppr)[:K]
    top_k_values = ppr[top_k_indices]
    # Translate vertex ids back to the original yad_no values.
    from_yad_no.append(yid2yad_no[yid])
    to_yad_nos.append([yid2yad_no[y] for y in top_k_indices])
    scores.append(top_k_values)

  0%|          | 0/13561 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [15]:
# Flatten the per-source candidate lists into one (from_yad_no, to_yad_no, score)
# row per candidate, dropping zero-score candidates and the source itself.
yad2yad_df = (
    pl.DataFrame(
        {
            # one source yad_no per row, aligned index-for-index with the two lists below
            "from_yad_no": from_yad_no,
            "to_yad_nos": to_yad_nos,
            "score": scores,
        }
    )
    .explode(["to_yad_nos", "score"])
    .rename({"to_yad_nos": "to_yad_no"})
    .filter(pl.col("score") > 0)
    .filter(pl.col("from_yad_no") != pl.col("to_yad_no"))
)

In [25]:
# Preview the first 10 candidate rows (from_yad_no, to_yad_no, score).
yad2yad_df.head(10)

from_yad_no,to_yad_no,score
i64,i64,f64
2395,11882,0.197548
2395,2808,0.17292
2395,4101,0.043366
2395,3324,0.039428
2395,5821,0.037559
2395,12837,0.030706
2395,5289,0.019935
2395,7281,0.014974
2395,8668,0.006999
2395,11134,0.006979


In [23]:
# Ground-truth labels, with the target column renamed so it does not clash
# with the log's own yad_no column.
label_df = load_label_data(Path(cfg.dir.data_dir)).rename({"yad_no": "label"})

# For each training session keep only the row with the largest seq_no
# (the last logged yad_no), ordered by session_id.
last_log_df = (
    train_log_df.group_by("session_id")
    .agg(pl.all().sort_by("seq_no").last())
    .sort(by="session_id")
)

# Attach the label to each session's last log row.
train_label_df = last_log_df.join(label_df, on="session_id")

In [24]:
# Spot check: list the training sessions whose last logged yad_no is 2395 together
# with their labels, for comparison with the candidates generated for yad_no 2395.
train_label_df.filter(pl.col("yad_no") == 2395)

session_id,seq_no,yad_no,label
str,i64,i64,i64
"""000007603d533d…",0,2395,4101
"""3801fd3f98a4a6…",0,2395,5289
"""abfef2e8d37839…",0,2395,3324
"""c216d6876152be…",1,2395,11882
"""c7a8f1743fd7eb…",0,2395,11882
"""e0327742930846…",1,2395,2808
"""f36d727e727476…",0,2395,2808
