In [1]:
# Change the working directory to the parent directory (the recorded output
# below shows the resulting directory).
# NOTE(review): this cell is not idempotent; running it a second time moves up
# one more level, so later relative paths would resolve differently.
%cd ..

/kaggle/working


In [8]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config named "config.yaml" with the "exp=base" override and
# print the composed config as YAML for inspection.
# NOTE(review): config_path is relative; presumably it is resolved relative to
# the location of this notebook rather than the working directory — confirm.
with initialize(version_base=None, config_path="../cand_unsupervised/prob_matrix"):
    cfg = compose(config_name="config.yaml", overrides=["exp=base"])
    print(OmegaConf.to_yaml(cfg))

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  num_candidate: 100
  k:
  - 1
  - 5
  - 10
  - 50
  - 100
  range_transitions:
  - -1
  - 1
  transition_times: 1
  self_loop_prob: 0.5



In [3]:
import logging
import os
import pickle
import sys
import time
from pathlib import Path

import hydra
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import scipy.sparse as sparse
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from sklearn.preprocessing import normalize
from tqdm.auto import tqdm

import utils
import wandb
from utils.load import load_label_data, load_log_data, load_session_data, load_yad_data
from utils.logger import get_logger
from utils.metrics import calculate_metrics

In [4]:
with utils.timer("load data"):
    # Read the train and test logs with the same loader (train first, then
    # test), then stack both into a single frame.
    train_log_df, test_log_df = (
        load_log_data(Path(cfg.dir.data_dir), split) for split in ("train", "test")
    )
    all_log_df = pl.concat([train_log_df, test_log_df])

[load data] done in 0.1 s


In [5]:
# Convert yad_no to sequential integer ids ("yid"): cast to string, then to
# Categorical, and take the categorical's physical (integer) representation.
all_log_cast_df = all_log_df.with_columns(
    yid=pl.col("yad_no").cast(str).cast(pl.Categorical).to_physical(),
)

# Keep one row per distinct (yad_no, yid) pair and build the reverse lookup
# yid -> yad_no from it.
unique_df = all_log_cast_df.unique(subset=["yad_no", "yid"])
unique_yids, unique_yad_nos = (
    unique_df["yid"].to_numpy(),
    unique_df["yad_no"].to_list(),
)
yid2yad_no = {yid: yad_no for yid, yad_no in zip(unique_yids, unique_yad_nos)}

In [6]:
# Build the transitions.
# For every configured non-zero offset, pair each log row's yid (from_id) with
# the yid found `rti` rows ahead of it inside the same session (behind it when
# `rti` is negative). Rows without a partner and pairs that point to the same id
# are dropped.
# NOTE(review): this presumably relies on the log rows of a session being in
# viewing order — confirm.
transition_dfs = [
    all_log_cast_df.with_columns(
        from_id=pl.col("yid"),
        to_id=pl.col("yid").shift(-rti).over("session_id"),
    )
    .filter(pl.col("to_id").is_not_null())
    .filter(pl.col("from_id") != pl.col("to_id"))  # no transition to the same item
    .select("from_id", "to_id")
    for rti in cfg.exp.range_transitions
    if rti != 0
]
transition_df = pl.concat(transition_dfs)
transition_df.head()

from_id,to_id
u32,u32
5,4
13,12
17,16
20,19
26,25


In [9]:
from scipy.sparse import csr_matrix, eye

# Build the sparse transition-count matrix (duplicate (from, to) pairs are summed).
# The shape is passed explicitly. When it is omitted, csr_matrix infers it from
# the largest row/column index found in transition_df, so the matrix is not
# guaranteed to be square and ids above the largest transition id are cut off.
# A square matrix is required by the identity addition below and by any later
# matrix product.
# NOTE(review): assumes the yid codes are small non-negative integers; the
# matrix is densified later, so sparse or large codes would be costly.
num_yids = int(all_log_cast_df["yid"].max()) + 1
sparse_matrix = sparse.csr_matrix(
    (
        np.ones(len(transition_df)),
        (
            transition_df["from_id"].to_numpy(),
            transition_df["to_id"].to_numpy(),
        ),
    ),
    shape=(num_yids, num_yids),
)


# Turn it into a right stochastic matrix: every non-empty row is scaled to sum to 1.
sparse_matrix_normalized = normalize(sparse_matrix, norm="l1", axis=1)


if cfg.exp.self_loop_prob is not None:
    # Mix in a self loop: scale the transition rows by (1 - p) and add p on the
    # diagonal, where p = cfg.exp.self_loop_prob.
    sparse_matrix_normalized = (
        sparse_matrix_normalized * (1 - cfg.exp.self_loop_prob)
        + eye(sparse_matrix_normalized.shape[0]) * cfg.exp.self_loop_prob
    )
    # Rows without any outgoing transition now hold only the self-loop mass p;
    # normalizing again scales those rows back to sum to 1.
    sparse_matrix_normalized = normalize(sparse_matrix_normalized, norm="l1", axis=1)

In [10]:
# Return the indices and values of the top-K elements of every row.
def top_k_indices_per_row(matrix, K):
    """Return the column indices and values of the K largest entries per row.

    Args:
        matrix: 2-D dense NumPy array; every row is ranked independently.
        K: Maximum number of entries kept per row. When the matrix has fewer
            than K columns, all columns are returned.

    Returns:
        A tuple ``(top_k_indices, top_k_values)``. Both arrays have shape
        ``(n_rows, min(K, n_cols))`` and are ordered by descending value
        within each row.
    """
    # Negate so that an ascending argsort puts the largest values first.
    top_k_indices = np.argsort(-matrix, axis=1)[:, :K]
    # Vectorized gather of the selected values. Unlike a per-row Python loop it
    # also keeps the (n_rows, k) shape when the matrix has zero rows.
    top_k_values = np.take_along_axis(matrix, top_k_indices, axis=1)
    return top_k_indices, top_k_values

In [17]:
# Matrix product of the normalized transition matrix with itself.
# `@` is used to make explicit that this is a matrix product, not an
# element-wise one.
matrix = sparse_matrix_normalized @ sparse_matrix_normalized

In [18]:
# Number of top entries kept per row. Read from the config instead of repeating
# the literal 100 (cfg.exp.num_candidate is 100 in the config printed earlier).
# NOTE(review): assumes num_candidate is the intended per-row cutoff — confirm.
K = cfg.exp.num_candidate
# top_k_indices_per_row works on a dense array, so the sparse matrix is
# densified for the call.
indices, values = top_k_indices_per_row(matrix.toarray(), K)

In [19]:
# Translate matrix positions back to yad_no: row number i is looked up as the
# source id, and the entries of indices[i] are looked up as destination ids.
from_yad_no = [yid2yad_no[row_id] for row_id in range(len(indices))]
to_yad_nos = [[yid2yad_no[col_id] for col_id in row] for row in indices]

# Start with one row per source (destinations and probabilities as parallel
# lists), explode to one row per (from, to) pair, and drop pairs whose
# probability is not positive.
yad2yad_df = (
    pl.DataFrame(
        {
            "from_yad_no": from_yad_no,
            "to_yad_nos": to_yad_nos,
            "transition_prob": values,
        }
    )
    .explode(["to_yad_nos", "transition_prob"])
    .filter(pl.col("transition_prob") > 0)
    .rename({"to_yad_nos": "to_yad_no"})
)

In [20]:
# Save as yad2yad_feature.parquet
# NOTE(review): despite the comment above, this cell only previews the first
# rows of yad2yad_df; no file is written here.
yad2yad_df.head()

from_yad_no,to_yad_no,transition_prob
i64,i64,f64
2395,2395,0.407052
2395,11882,0.303571
2395,2808,0.199346
2395,4101,0.02877
2395,5289,0.024802


In [21]:
with utils.timer("load session data"):
    # Same loader for both splits: train first, then test.
    train_session_df, test_session_df = (
        load_session_data(Path(cfg.dir.data_dir), split)
        for split in ("train", "test")
    )


def make_candidates(log_df, session_df, transition_df):
    """Build a ranked candidate list per session from per-item transition scores.

    Args:
        log_df: Log frame; its "session_id" and "yad_no" columns are used.
        session_df: Frame with a "session_id" column; the result has one row
            per row of this frame.
        transition_df: Frame with the columns "from_yad_no", "to_yad_no" and
            "transition_prob".

    Returns:
        session_df with an added "candidates" list column. It holds the
        to_yad_no values reachable from the session's logged items, ordered by
        summed transition_prob (descending). Sessions without any match get an
        empty list.
    """
    # Every logged item of a session acts as a transition source.
    source_df = (
        log_df.sort(by="session_id")
        .with_columns(pl.col("yad_no").alias("from_yad_no"))
        .select(["session_id", "from_yad_no"])
    )

    # Sum the scores of each destination over all sources of the session.
    scored_df = (
        source_df.join(transition_df, on="from_yad_no")
        .group_by(["session_id", "to_yad_no"])
        .agg(pl.sum("transition_prob").alias("transition_prob"))
    )

    # Sort by score before collecting, so each session's list is in descending
    # score order.
    ranked_df = (
        scored_df.sort(by=["session_id", "transition_prob"], descending=True)
        .group_by(["session_id"])
        .agg(pl.col("to_yad_no").alias("candidates"))
    )

    # The left join keeps every session; a missing candidate list (null) is
    # replaced by an empty list.
    empty_candidates = pl.Series("empty", [[]])
    candidates_or_empty = (
        pl.when(pl.col("candidates").is_null())
        .then(empty_candidates)
        .otherwise(pl.col("candidates"))
        .alias("candidates")
    )
    return session_df.join(ranked_df, on="session_id", how="left").with_columns(
        candidates_or_empty
    )


# Generate candidates for both splits from the same yad-to-yad transition table
# (train first, then test).
train_candidate_df, test_candidate_df = (
    make_candidates(log_df, session_df, yad2yad_df)
    for log_df, session_df in (
        (train_log_df, train_session_df),
        (test_log_df, test_session_df),
    )
)

[load session data] done in 0.0 s


In [22]:
"""
スコア計算
"""
# Score calculation: attach the training labels to the candidate frame and
# evaluate the candidates for every k listed in cfg.exp.k.
with utils.timer("calculate metrics"):
    train_label_df = load_label_data(Path(cfg.dir.data_dir), "train")
    # The label column is added without a join key.
    # NOTE(review): this presumably relies on train_label_df having the same row
    # order as train_candidate_df — confirm.
    train_candidate_df = train_candidate_df.with_columns(
        train_label_df.select("yad_no")
    )
    if cfg.debug:
        # Debug runs evaluate only the first 10000 rows.
        train_candidate_df = train_candidate_df.head(10000)
    metrics_list = calculate_metrics(
        train_candidate_df,
        candidates_col="candidates",
        label_col="yad_no",
        k=cfg.exp.k,
    )
    for metrics in metrics_list:
        print(metrics)

k: 1
avg_num_candidates: 1.0
recall: 0.14913854616242578
precision: 0.14913854616242578
map@k: 0.14913854616242578

k: 5
avg_num_candidates: 4.939369167780864
recall: 0.46584319946795616
precision: 0.09316863989359124
map@k: 0.2743412377871223

k: 10
avg_num_candidates: 9.797573935392695
recall: 0.5855115033703039
precision: 0.0585511503370304
map@k: 0.2901870653652793

k: 50
avg_num_candidates: 42.81390241705866
recall: 0.8700025632321665
precision: 0.01740005126464332
map@k: 0.3045469892824873

k: 100
avg_num_candidates: 68.20561971333366
recall: 0.9243915787431849
precision: 0.009243915787431849
map@k: 0.3053654885993106

{'k': 1, 'avg_num_candidates': 1.0, 'recall': 0.14913854616242578, 'precision': 0.14913854616242578, 'map@k': 0.14913854616242578}
{'k': 5, 'avg_num_candidates': 4.939369167780864, 'recall': 0.46584319946795616, 'precision': 0.09316863989359124, 'map@k': 0.2743412377871223}
{'k': 10, 'avg_num_candidates': 9.797573935392695, 'recall': 0.5855115033703039, 'precision'

In [16]:
"""
スコア計算
"""
# NOTE(review): this cell repeats the code of the preceding metrics cell, but it
# carries an older execution count (16, while the preceding cell is 22) and its
# recorded output differs. The output presumably comes from an earlier state of
# the notebook; consider removing the cell or labelling which experiment it
# documents.
# Score calculation: attach the training labels to the candidate frame and
# evaluate the candidates for every k listed in cfg.exp.k.
with utils.timer("calculate metrics"):
    train_label_df = load_label_data(Path(cfg.dir.data_dir), "train")
    # The label column is added without a join key.
    # NOTE(review): this presumably relies on train_label_df having the same row
    # order as train_candidate_df — confirm.
    train_candidate_df = train_candidate_df.with_columns(
        train_label_df.select("yad_no")
    )
    if cfg.debug:
        # Debug runs evaluate only the first 10000 rows.
        train_candidate_df = train_candidate_df.head(10000)
    metrics_list = calculate_metrics(
        train_candidate_df,
        candidates_col="candidates",
        label_col="yad_no",
        k=cfg.exp.k,
    )
    for metrics in metrics_list:
        print(metrics)

k: 1
avg_num_candidates: 1.0
recall: 0.14507547679582125
precision: 0.14507547679582125
map@k: 0.14507547679582125

k: 5
avg_num_candidates: 4.836881447048473
recall: 0.45757504381741476
precision: 0.09151500876348294
map@k: 0.2738156597320845

k: 10
avg_num_candidates: 9.078247857622845
recall: 0.5537793819146651
precision: 0.05537793819146652
map@k: 0.2866876871497617

k: 50
avg_num_candidates: 22.437242377848133
recall: 0.6730874477828042
precision: 0.013461748955656085
map@k: 0.2937965795758874

k: 100
avg_num_candidates: 24.10269555036751
recall: 0.6760905860102945
precision: 0.006760905860102946
map@k: 0.2938453238223996

{'k': 1, 'avg_num_candidates': 1.0, 'recall': 0.14507547679582125, 'precision': 0.14507547679582125, 'map@k': 0.14507547679582125}
{'k': 5, 'avg_num_candidates': 4.836881447048473, 'recall': 0.45757504381741476, 'precision': 0.09151500876348294, 'map@k': 0.2738156597320845}
{'k': 10, 'avg_num_candidates': 9.078247857622845, 'recall': 0.5537793819146651, 'precisi

In [23]:
"""
seq_lenごとに求める
"""

'\nseq_lenごとに求める\n'

In [28]:
# Session length = largest seq_no of the session + 1
# (presumably seq_no starts at 0 — confirm).
seq_len_df = train_log_df.group_by("session_id").agg(
    seq_len=pl.col("seq_no").max() + 1
)
# Inner join (the default join type) attaches seq_len to the candidate rows.
train_candidate_df = train_candidate_df.join(
    seq_len_df, on="session_id", how="inner"
)

In [30]:
# Report the k=10 metrics separately for sessions of length 1 through 9.
for i in range(1, 10):
    print(i)
    sessions_of_len_i = train_candidate_df.filter(pl.col("seq_len") == i)
    metrics_list = calculate_metrics(
        sessions_of_len_i, candidates_col="candidates", label_col="yad_no", k=10
    )
    for metrics in metrics_list:
        print(metrics)

1
k: 10
avg_num_candidates: 9.665314532920501
recall: 0.3343995770985943
precision: 0.03343995770985943
map@k: 0.09624404010576644

{'k': 10, 'avg_num_candidates': 9.665314532920501, 'recall': 0.3343995770985943, 'precision': 0.03343995770985943, 'map@k': 0.09624404010576644}
2
k: 10
avg_num_candidates: 9.944753783532425
recall: 0.9074801009747201
precision: 0.09074801009747202
map@k: 0.4588309551512962

{'k': 10, 'avg_num_candidates': 9.944753783532425, 'recall': 0.9074801009747201, 'precision': 0.09074801009747202, 'map@k': 0.4588309551512962}
3
k: 10
avg_num_candidates: 9.958827361563518
recall: 0.9663843648208469
precision: 0.0966384364820847
map@k: 0.49656307843441394

{'k': 10, 'avg_num_candidates': 9.958827361563518, 'recall': 0.9663843648208469, 'precision': 0.0966384364820847, 'map@k': 0.49656307843441394}
4
k: 10
avg_num_candidates: 9.965962732919255
recall: 0.9932919254658386
precision: 0.09932919254658387
map@k: 0.6388825791186039

{'k': 10, 'avg_num_candidates': 9.96596273