In [1]:
%cd ..

/kaggle/working


In [2]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config named "config.yaml" from the given config_path and
# print it as YAML so the settings used for this run appear in the cell output.
with initialize(version_base=None, config_path="../cand_unsupervised/ranking"):
    cfg = compose(config_name="config.yaml")
    print(OmegaConf.to_yaml(cfg))

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
exp:
  num_candidate: 100
  k:
  - 1
  - 5
  - 10
  - 50
  - 100



In [3]:
import utils
from utils.load import load_label_data, load_log_data
from utils.metrics import calculate_metrics

In [8]:
from pathlib import Path

import polars as pl

# Load the train and test logs from cfg.dir.data_dir and concatenate them into
# one frame. Per the recorded output, utils.timer prints the elapsed time of
# the wrapped block under the given label.
with utils.timer("load data"):
    train_log_df = load_log_data(Path(cfg.dir.data_dir), "train")
    test_log_df = load_log_data(Path(cfg.dir.data_dir), "test")
    all_log_df = pl.concat([train_log_df, test_log_df])

[load data] done in 0.1 s


In [9]:
# Popularity ranking: number of log rows per yad_no over train + test logs,
# sorted so the most frequent yad_no comes first.
count_df = (
    all_log_df.get_column("yad_no").value_counts().sort(by="counts", descending=True)
)
# Preview the top of the ranking and the number of distinct yad_no values.
print(count_df.head())
print(count_df.shape)

shape: (5, 2)
┌────────┬────────┐
│ yad_no ┆ counts │
│ ---    ┆ ---    │
│ i64    ┆ u32    │
╞════════╪════════╡
│ 12350  ┆ 1606   │
│ 719    ┆ 1520   │
│ 3338   ┆ 1492   │
│ 13468  ┆ 1373   │
│ 10095  ┆ 1313   │
└────────┴────────┘
(13562, 2)


In [10]:
# Load the training labels (one target yad_no per session_id in the output below).
with utils.timer("load label data"):
    train_label_df = load_label_data(Path(cfg.dir.data_dir), "train")

with utils.timer("make candidates"):
    # Take the yad_no values of the top `num_candidate` entries of the popularity ranking.
    yad_list = count_df.get_column("yad_no").to_list()[: cfg.exp.num_candidate]
    # Attach the same popularity-ranked candidate list to every labelled row.
    # NOTE(review): this step took 6.3 s in the recorded output; building a
    # Python list with one entry per row is the likely cause — confirm.
    train_candidate_df = train_label_df.with_columns(
        pl.Series(
            name="candidates", values=[yad_list for _ in range(len(train_label_df))]
        )
    )
print(train_candidate_df.head(10))

[load label data] done in 0.0 s
[make candidates] done in 6.3 s
shape: (10, 3)
┌──────────────────────────────────┬────────┬───────────────────────┐
│ session_id                       ┆ yad_no ┆ candidates            │
│ ---                              ┆ ---    ┆ ---                   │
│ str                              ┆ i64    ┆ list[i64]             │
╞══════════════════════════════════╪════════╪═══════════════════════╡
│ 000007603d533d30453cc45d0f3d119f ┆ 4101   ┆ [12350, 719, … 11321] │
│ 0000ca043ed437a1472c9d1d154eb49b ┆ 8253   ┆ [12350, 719, … 11321] │
│ 0000d4835cf113316fe447e2f80ba1c8 ┆ 4863   ┆ [12350, 719, … 11321] │
│ 0000fcda1ae1b2f431e55a7075d1f500 ┆ 1652   ┆ [12350, 719, … 11321] │
│ …                                ┆ …      ┆ …                     │
│ 000125c737df1802b6e365f93c96d3c8 ┆ 10378  ┆ [12350, 719, … 11321] │
│ 0001763050a10b21062a1304fb743fd4 ┆ 10362  ┆ [12350, 719, … 11321] │
│ 000178c4d4d567d4715331dd0cdab76c ┆ 1227   ┆ [12350, 719, … 11321] │
│ 0001e6a40

In [42]:
def calculate_precision(recommended_items: list[int], actual_item: int, k: int):
    """Return precision@k for one row that has a single ground-truth item.

    The result is ``1 / k`` when ``actual_item`` appears among the first ``k``
    entries of ``recommended_items`` and ``0.0`` otherwise.
    """
    top_k = recommended_items[:k]
    hit_count = 1 if actual_item in top_k else 0
    return hit_count / k


def calculate_recall(recommended_items: list[int], actual_item: int, k: int):
    """Return recall@k for one row that has a single ground-truth item.

    The result is ``1`` when ``actual_item`` appears among the first ``k``
    entries of ``recommended_items`` and ``0`` otherwise.
    """
    if actual_item in recommended_items[:k]:
        return 1
    return 0


def calculate_average_precision_at_k(
    recommended_items: list[int], actual_item: int, k: int
):
    """Calculate average precision at k (AP@k) for a single row.

    Each row has exactly one relevant item, so AP@k reduces to the reciprocal
    of the 1-based rank of the first occurrence of ``actual_item`` within the
    first ``k`` recommendations. Averaging this value over all rows gives
    MAP@k.

    Args:
        recommended_items: Candidate items ordered from best to worst.
        actual_item: The single ground-truth item for the row.
        k: Number of leading recommendations to consider.

    Returns:
        ``1.0 / rank`` of the first match inside the top ``k``, or ``0.0``
        when ``actual_item`` is not among the first ``k`` recommendations.
    """
    # Only the first occurrence inside the top-k window can score, so a single
    # pass over that window with an early exit is sufficient.
    for rank, item in enumerate(recommended_items[:k], start=1):
        if item == actual_item:
            return 1.0 / rank
    return 0.0

In [43]:
# Score every training row with AP@k_: select the (candidates, label) pair so
# that row[0] is the candidate list and row[1] is the ground-truth yad_no,
# then apply the per-row metric.
candidates_col = "candidates"
label_col = "yad_no"
k_ = 5
map_at_k = train_candidate_df.select(candidates_col, label_col).map_rows(
    lambda row: calculate_average_precision_at_k(row[0], row[1], k_)
)

In [44]:
# Spot check: 10095 is the fifth entry of the popularity ranking, so AP@5
# should be 1/5.
calculate_average_precision_at_k(yad_list, 10095, 5)

0.2

In [47]:
# Show the popularity ranking with an explicit 0-based row index (row_nr) so
# each yad_no's position can be read off directly.
count_df.with_row_count()

row_nr,yad_no,counts
u32,i64,u32
0,12350,1606
1,719,1520
2,3338,1492
3,13468,1373
4,10095,1313
5,8567,1264
6,532,1206
7,8553,1197
8,2201,1173
9,915,1097


## 地域が近いものを選択する

セッション内の最頻値を見て最も近いものを選択し、同一地域のランキングをもとに候補を生成する

In [48]:
# Inspect the training log (session_id, seq_no, yad_no columns per the output).
train_log_df

session_id,seq_no,yad_no
str,i64,i64
"""000007603d533d…",0,2395
"""0000ca043ed437…",0,13535
"""0000d4835cf113…",0,123
"""0000fcda1ae1b2…",0,8475
"""000104bdffaaad…",0,96
"""000104bdffaaad…",1,898
"""00011afe25c343…",0,6868
"""000125c737df18…",0,8602
"""0001763050a10b…",0,13106
"""000178c4d4d567…",0,12062
