In [1]:
# Move the working directory up one level so that the relative imports/paths used below resolve.
# NOTE(review): the path is relative, so re-running this cell in the same kernel moves up one more level.
%cd ..

/kaggle/working


In [2]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config found under cand_unsupervised/feat_graph (with the
# "exp=base" override) and echo the resulting config as YAML.
with initialize(
    version_base=None,
    config_path="../cand_unsupervised/feat_graph",
):
    cfg = compose(
        config_name="config.yaml",
        overrides=["exp=base"],
    )
    cfg_yaml = OmegaConf.to_yaml(cfg)
    print(cfg_yaml)

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp: {}



In [11]:
import os
import sys
from pathlib import Path

import hydra
import igraph as ig
import numpy as np
import polars as pl
import scipy.sparse as sparse
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from scipy.sparse import csr_matrix, eye

import utils
from utils.load import load_log_data, load_yad_data
from utils.metrics import calculate_metrics

In [5]:
# Load the train/test logs and the yad table from the configured data directory,
# then stack both logs (train first, test second) into a single frame.
with utils.timer("load data"):
    data_dir = Path(cfg.dir.data_dir)
    train_log_df = load_log_data(data_dir, "train")
    test_log_df = load_log_data(data_dir, "test")
    yad_df = load_yad_data(data_dir)
    log_frames = [train_log_df, test_log_df]
    all_log_df = pl.concat(log_frames)

[load data] done in 0.1 s


In [6]:
# Map every yad_no to an integer id ("yid") by casting it to a categorical and
# taking the physical (integer) representation.
all_log_cast_df = all_log_df.with_columns(
    pl.col("yad_no").cast(str).cast(pl.Categorical).to_physical().alias("yid"),
)

# Reverse lookup: yid -> original yad_no (one row per distinct pair).
unique_df = all_log_cast_df.unique(["yad_no", "yid"])
unique_yids = unique_df["yid"].to_numpy()
unique_yad_nos = unique_df["yad_no"].to_list()
yid2yad_no = dict(zip(unique_yids, unique_yad_nos))


# Build the transition pairs between neighbouring rows of the same session.
# rti = -1 pairs each row with the previous row of its session, rti = 1 with the
# next one, so both directions of every adjacent pair are emitted.
# NOTE(review): this relies on rows already being in visit order within each
# session_id — confirm against the loader.
transition_dfs = []

for rti in [-1, 1]:
    df = (
        all_log_cast_df.with_columns(
            pl.col("yid").alias("from_id"),
            pl.col("yid").shift(-rti).over("session_id").alias("to_id"),
        )
        # Rows at the session boundary have no neighbour in this direction.
        .filter(pl.col("to_id").is_not_null())
        # Self-transitions are not kept.
        .filter(pl.col("from_id") != pl.col("to_id"))
        .select(["from_id", "to_id"])
    )
    transition_dfs.append(df)
transition_df = pl.concat(transition_dfs)
transition_df.head()

from_id,to_id
u32,u32
5,4
13,12
17,16
20,19
26,25


In [12]:
# Build the (from_id, to_id) transition-count matrix.
# The shape is given explicitly instead of being inferred from the largest index
# that happens to occur in transition_df: this guarantees a square matrix and
# keeps row/column i aligned with yid i even for ids that never take part in a
# transition (they become all-zero rows/columns).
n_nodes = int(all_log_cast_df["yid"].max()) + 1
from_ids = transition_df["from_id"].to_numpy()
to_ids = transition_df["to_id"].to_numpy()

# Duplicate (from_id, to_id) pairs are summed by the sparse constructor, so each
# entry holds the number of observed transitions.
# NOTE: .toarray() materialises a dense n_nodes x n_nodes float array, which is
# memory-hungry for large id ranges.
matrix = sparse.csr_matrix(
    (np.ones(len(transition_df)), (from_ids, to_ids)),
    shape=(n_nodes, n_nodes),
).toarray()

In [13]:
from igraph import Graph

# Build an igraph graph from the transition-count adjacency matrix.
# NOTE(review): counts greater than 1 presumably become parallel edges rather
# than edge weights — confirm against the igraph documentation.
graph = Graph.Adjacency(matrix)

In [15]:
# Compact one-line description of the graph (type flags, vertex count, edge count).
graph.summary(verbosity=0)

'IGRAPH D--- 13561 412354 -- '

In [17]:
# Sanity check: PageRank yields one score per vertex, so this should equal the vertex count.
len(graph.pagerank())

13561

In [27]:
# Out-degree of every vertex, in vertex-index order.
# NOTE(review): this displays the full list; consider slicing (e.g. [:10]) to keep the output short.
graph.outdegree()

[24,
 37,
 2,
 54,
 115,
 120,
 2,
 127,
 318,
 65,
 157,
 44,
 10,
 6,
 26,
 109,
 185,
 28,
 13,
 62,
 118,
 77,
 29,
 48,
 195,
 129,
 86,
 98,
 30,
 102,
 188,
 17,
 113,
 26,
 11,
 74,
 93,
 23,
 32,
 72,
 66,
 146,
 473,
 99,
 775,
 991,
 0,
 72,
 85,
 70,
 6,
 8,
 183,
 163,
 38,
 17,
 53,
 332,
 101,
 48,
 60,
 120,
 56,
 71,
 179,
 365,
 16,
 114,
 116,
 40,
 26,
 15,
 38,
 172,
 78,
 229,
 76,
 30,
 161,
 144,
 36,
 73,
 66,
 375,
 69,
 39,
 42,
 64,
 46,
 5,
 7,
 66,
 18,
 23,
 1218,
 430,
 2,
 25,
 30,
 174,
 11,
 1895,
 976,
 1159,
 1623,
 17,
 9,
 89,
 27,
 63,
 19,
 197,
 21,
 16,
 43,
 171,
 28,
 10,
 92,
 56,
 16,
 47,
 1,
 34,
 87,
 168,
 33,
 33,
 108,
 179,
 16,
 85,
 19,
 39,
 93,
 15,
 49,
 76,
 60,
 84,
 85,
 23,
 36,
 88,
 322,
 61,
 39,
 6,
 12,
 176,
 43,
 9,
 56,
 93,
 7,
 34,
 39,
 29,
 152,
 93,
 56,
 50,
 39,
 87,
 1,
 115,
 70,
 83,
 57,
 111,
 120,
 96,
 54,
 54,
 70,
 143,
 894,
 78,
 114,
 41,
 62,
 40,
 158,
 210,
 136,
 33,
 105,
 396,
 36,
 27,
 137

In [26]:
# Local (undirected) transitivity per vertex. Computed once and reused for both
# the 10-value preview and the length check instead of running it twice.
local_transitivity = graph.transitivity_local_undirected()
local_transitivity[:10], len(local_transitivity)

([0.5,
  0.7857142857142857,
  0.0,
  0.3717948717948718,
  0.5627705627705628,
  0.7543859649122807,
  0.0,
  0.3613445378151261,
  0.2502194907813872,
  0.625],
 13561)

In [24]:
# Idea: loop with enumerate and process the ~10k vertices in batches of 100, or similar.
# Preview: first 10 personalized PageRank scores when vertex 2 is the reset vertex.
graph.personalized_pagerank(reset_vertices=2)[:10]

[6.239840384214559e-11,
 7.286884748596017e-09,
 0.18612518373664153,
 2.1702977745708375e-06,
 8.05220883065554e-10,
 7.436100091690683e-10,
 2.7158022696140015e-13,
 1.528073839631305e-10,
 2.3136232163932345e-08,
 1.4536256633993815e-08]