In [1]:
%cd ..

/kaggle/working


In [2]:
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(
    version_base=None, config_path="../cand_unsupervised/feat_session2location"
):
    cfg = compose(config_name="config.yaml", overrides=["exp=bpr_sml"])
    print(OmegaConf.to_yaml(cfg))

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  num_candidate: 10
  k:
  - 1
  - 5
  - 10
  location: sml_cd
  implicit:
    model: bpr
    params:
      factors: 16
      learning_rate: 0.01
      regularization: 0.01
      iterations: 100
      verify_negative_samples: true
      random_state: ${seed}
      num_threads: 0



In [3]:
import os
import sys
from pathlib import Path

import hydra
import implicit
import numpy as np
import polars as pl
import scipy.sparse as sparse
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf

import utils
from utils.load import load_label_data, load_log_data, load_session_data, load_yad_data
from utils.metrics import calculate_metrics

In [4]:
yad_df = load_yad_data(Path(cfg.dir.data_dir))
train_log_df = load_log_data(Path(cfg.dir.data_dir), "train")
test_log_df = load_log_data(Path(cfg.dir.data_dir), "test")
all_log_df = pl.concat([train_log_df, test_log_df])
all_log_df = all_log_df.join(yad_df, on="yad_no", how="left")

In [23]:
all_log_df = all_log_df.with_columns(
    # session_id を連番に変換
    pl.col("session_id").cast(pl.Categorical).to_physical().alias("sid"),
    # location を連番に変換
    pl.col(cfg.exp.location)
    .cast(pl.Categorical)
    .to_physical()
    .alias(cfg.exp.location + "_id"),
)

unique_df = all_log_df.unique(["sid", "session_id"])
unique_sids = unique_df["sid"].to_numpy()
unique_session_ids = unique_df["session_id"].to_list()

unique_df = all_log_df.unique([cfg.exp.location, cfg.exp.location + "_id"])
unique_location_ids = unique_df[cfg.exp.location + "_id"].to_numpy()
unique_locations = unique_df[cfg.exp.location].to_list()
loc_id2loc = dict(zip(unique_location_ids,unique_locations))

In [10]:
sparse_item_user = sparse.csr_matrix(
    (
        np.ones(len(all_log_df)),
        (
            all_log_df["sid"].to_numpy(),
            all_log_df[cfg.exp.location + "_id"].to_numpy(),
        ),
    )
)

In [11]:
if cfg.exp.implicit.model == "bpr":
    from implicit.cpu.bpr import BayesianPersonalizedRanking

    model = BayesianPersonalizedRanking(
        **OmegaConf.to_container(cfg.exp.implicit.params, resolve=True)
    )
elif cfg.exp.implicit.model == "als":
    from implicit.cpu.als import AlternatingLeastSquares

    model = AlternatingLeastSquares(
        **OmegaConf.to_container(cfg.exp.implicit.params, resolve=True)
    )

model.fit(sparse_item_user)

  0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
session_ids = unique_session_ids
session_vectors = model.user_factors[unique_sids]
session_factor_df = pl.DataFrame({"session_id": session_ids}).with_columns(
    pl.Series(name=f"session_factor_{i}", values=session_vectors[:, i])
    for i in range(session_vectors.shape[1])
)
print(session_factor_df.head())

locations = unique_locations
location_vectors = model.item_factors[unique_location_ids]
location_factor_df = pl.DataFrame({cfg.exp.location: locations}).with_columns(
    pl.Series(name=f"{cfg.exp.location}_factor_{i}", values=location_vectors[:, i])
    for i in range(location_vectors.shape[1])
)
print(location_factor_df.head())

shape: (5, 18)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ session_i ┆ session_f ┆ session_f ┆ session_f ┆ … ┆ session_f ┆ session_f ┆ session_f ┆ session_ │
│ d         ┆ actor_0   ┆ actor_1   ┆ actor_2   ┆   ┆ actor_13  ┆ actor_14  ┆ actor_15  ┆ factor_1 │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ 6        │
│ str       ┆ f32       ┆ f32       ┆ f32       ┆   ┆ f32       ┆ f32       ┆ f32       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f32      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 0016277eb ┆ 0.046707  ┆ -0.067843 ┆ -0.125507 ┆ … ┆ 0.057333  ┆ -0.085104 ┆ 0.185233  ┆ 1.0      │
│ 9b0e69f7b ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ 72aa0c0ff ┆           ┆           ┆           ┆   ┆           ┆           

In [13]:
train_session_df = load_session_data(Path(cfg.dir.data_dir), "train")
test_session_df = load_session_data(Path(cfg.dir.data_dir), "test")

In [16]:
unique_sids

array([    92,     94,    148, ..., 463384, 463391, 463393], dtype=uint32)

In [25]:

# 少し時間がかかる
candidates, scores = model.recommend(
    unique_sids[[0,1]],
    sparse_item_user[unique_sids[[0,1]]],
    N=cfg.exp.num_candidate,
    filter_already_liked_items=False,
)
print(candidates)

[[ 10 413 442 334 518 390 337 456 522 350]
 [ 84  75 163 181 411 166 106 268 132 294]]


In [26]:
unique_sids

array([    10,    116,    124, ..., 463065, 463181, 463360], dtype=uint32)

In [29]:
all_log_df.filter(pl.col('sml_cd_id')==10)

session_id,seq_no,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,sid,sml_cd_id
str,i64,i64,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str,u32,u32
"""0002499cf2713a…",0,2043,0,143.0,1.0,0,1.0,,,1.0,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",10,10
"""00063c614aca49…",0,5583,0,55.0,,1,,,,,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",28,10
"""00063c614aca49…",1,6001,0,,1.0,1,,,,1.0,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",28,10
"""000ded3703abf4…",0,966,0,78.0,1.0,0,1.0,,,1.0,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",62,10
"""000ded3703abf4…",1,10293,0,86.0,1.0,0,1.0,,,1.0,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",62,10
"""000ded3703abf4…",2,966,0,78.0,1.0,0,1.0,,,1.0,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",62,10
"""003504344b08e4…",0,3783,0,65.0,,0,,,,1.0,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",258,10
"""004eb4cdd8e765…",0,3042,0,74.0,1.0,0,1.0,,,1.0,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",382,10
"""004eb4cdd8e765…",1,10417,0,104.0,1.0,0,1.0,,,1.0,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",382,10
"""008aaae5680075…",0,6134,0,108.0,1.0,0,,,,1.0,"""dc414a17890cfc…","""223938a74a6099…","""63083678169ddd…","""33bfe292401fc7…",657,10


In [28]:
candidates = [[loc_id2loc[c] for c in cs] for cs in candidates]
candidates

[['33bfe292401fc7f99b8b9831a71f61ee',
  '808efa4c8737bf0963e46418b950e7a7',
  '5fb4c282c7d5072b95af4e8b0990b4a2',
  'f6a32c8fba39224a42dcf944a825bb74',
  'a06dc7e0b1ce7bf4b2a82795544d9fee',
  '45ce8679946956887d43548f184374b1',
  '5e59cb1cb29ddcdf9bc1a8b56a4407a2',
  '1372439b363adca1c9e5d212aa373c39',
  '5edfaad7acb9578b63dd8a1ad31bb504',
  '2df67a9182e1805e4fca214292d1166c'],
 ['8cb854e17cd42e2b44f0c603da4608d4',
  '9dff180c5e5089dfb26be591000b90f1',
  'a47fc5d7a9e908347293ab6ec859c711',
  '206c6d79ad3864ef1e490c5ad25ce3bf',
  '30e4ee82595c5d3fe2a691efaf23e1e3',
  '53990961d5141c807d742c6b3ca398ad',
  '4cb261fbd1d92af8d4c789301ded1811',
  'e896511a999c6508414df4dcee407926',
  '69916a1fb9c5b4db92add1f92454711f',
  '677e82825c47001f72241e1d12d3f4ee']]