# 確率行列の作り方において、location情報も加味する

- 方法１：合計確率が一定値になるように、同じlocationに対して均等に遷移確率を与える
  - 均等なはずがないので微妙
- 方法２：合計確率が一定値になるように、同じlocationの宿に対して出現回数に比例した重みで遷移確率を与える
- 方法３：既存の遷移確率について、違うlocation同士の場合は確率を減衰させる
  - 実績値データを意図的に歪めることになるので微妙。ただし、外れ値の影響を減衰させるという意味では良いかも
 
まずは方法２を試す。

In [1]:
# Change the notebook's working directory to the parent directory.
%cd ..

/kaggle/working


In [7]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config for this experiment (config.yaml with the
# "exp=base" override) and dump it as YAML for inspection.
config_dir = "../cand_unsupervised/prob_matrix_with_location"
with initialize(version_base=None, config_path=config_dir):
    cfg = compose(
        config_name="config.yaml",
        overrides=["exp=base"],
    )
    print(OmegaConf.to_yaml(cfg))

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  num_candidate: 100
  k:
  - 1
  - 5
  - 10
  - 50
  - 100
  range_transitions:
  - -1
  - 1
  transition_times: 2
  self_loop_prob: 0.5
  location_prob: 0.2



In [5]:
import logging
import os
import pickle
import sys
import time
from pathlib import Path

import hydra
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import scipy.sparse as sparse
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from scipy.sparse import csr_matrix, eye
from sklearn.preprocessing import normalize
from tqdm.auto import tqdm

import utils
import wandb
from utils.load import load_label_data, load_log_data, load_session_data, load_yad_data
from utils.logger import get_logger
from utils.metrics import calculate_metrics

In [8]:
# Load the train and test logs plus the yad table, then stack both logs
# into a single frame. The whole load is wrapped in the "load data" timer.
with utils.timer("load data"):
    data_dir = Path(cfg.dir.data_dir)
    train_log_df, test_log_df = (
        load_log_data(data_dir, split) for split in ("train", "test")
    )
    yad_df = load_yad_data(data_dir)
    all_log_df = pl.concat([train_log_df, test_log_df])

[load data] done in 0.1 s


In [34]:
# Attach an integer id ("yid") to every log row: the physical code of the
# yad_no string after it has been cast to a Categorical.
all_log_cast_df = all_log_df.with_columns(
    yid=pl.col("yad_no").cast(str).cast(pl.Categorical).to_physical(),
)

# One row per distinct (yad_no, yid) pair, used to build lookups in both
# directions between the original yad_no and the integer yid.
unique_df = all_log_cast_df.unique(["yad_no", "yid"])
unique_yids = unique_df["yid"].to_numpy()
unique_yad_nos = unique_df["yad_no"].to_list()
yid2yad_no = {yid: yad_no for yid, yad_no in zip(unique_yids, unique_yad_nos)}
yad_no2yid = {yad_no: yid for yid, yad_no in zip(unique_yids, unique_yad_nos)}

In [54]:
# Convert yad_no to the integer yid used for the logs.
# yad_nos that are missing from yad_no2yid presumably become null here
# (map_dict with no default) and are removed by drop_nulls() further down.
yad_id_df = yad_df.with_columns(
    yid=pl.col("yad_no").map_dict(yad_no2yid).cast(pl.UInt32),
).drop("yad_no")

# Count, for every yid, the number of distinct sessions it appears in, and
# attach that count to the yad's location code (sml_cd).
session_yid_counts = all_log_cast_df.unique(["session_id", "yid"])[
    "yid"
].value_counts()
yad_with_counts = yad_id_df.select(["yid", "sml_cd"]).join(
    session_yid_counts, on="yid"
)

# Pair every yad with the yads that share its sml_cd. The weight of a pair
# is the destination's count divided by the total count over all
# destinations of the same from_id, so weights sum to 1 per from_id.
# NOTE(review): the pairs include from_id == to_id (see the first row of the
# output below), whereas the log-based transitions drop self pairs —
# confirm this is intended.
location_weight = pl.col("counts") / pl.col("counts").sum().over("from_id")
yad2yad_location_df = (
    yad_id_df.select(["yid", "sml_cd"])
    .join(yad_with_counts, on="sml_cd", how="outer")
    .rename({"yid": "from_id", "yid_right": "to_id"})
    .with_columns(location_weight.alias("weight"))
    .drop(["counts", "sml_cd"])
    .drop_nulls()
)


yad2yad_location_df.head()

from_id,to_id,weight
u32,u32,f64
13398,13398,0.005892
13398,13469,0.000196
13398,11721,0.000786
13398,8216,0.003535
13398,12404,0.000393


In [30]:
# Build the transition pairs: for each configured offset, pair every log row
# with the row that many positions away inside the same session.
transition_dfs = []
for rti in cfg.exp.range_transitions:
    if rti == 0:
        continue
    to_id_expr = pl.col("yid").shift(-rti).over("session_id")
    df = (
        all_log_cast_df.with_columns(
            from_id=pl.col("yid"),
            to_id=to_id_expr,
        )
        # Rows shifted past the session boundary have no partner.
        .filter(pl.col("to_id").is_not_null())
        # No transition from a yad to itself.
        .filter(pl.col("from_id") != pl.col("to_id"))
        .select(["session_id", "from_id", "to_id"])
    )
    transition_dfs.append(df)

# A pair is counted at most once per session; the session id is only needed
# for that de-duplication.
transition_df = (
    pl.concat(transition_dfs)
    .unique(["session_id", "from_id", "to_id"])
    .drop("session_id")
)

In [31]:
# Count how often each (from_id, to_id) pair occurs, then turn the counts
# into weights that sum to 1 over all destinations of a given from_id.
pair_counts = transition_df.group_by(["from_id", "to_id"]).agg(
    counts=pl.col("from_id").count(),
)
transition_df = pair_counts.with_columns(
    weight=pl.col("counts") / pl.col("counts").sum().over("from_id"),
).drop("counts")

transition_df.head()

from_id,to_id,weight
u32,u32,f64
196,2199,0.080745
7497,5897,0.428571
275,7807,0.121951
2324,2283,0.119048
4572,369,0.045045


In [55]:
# Blend the two weight sources: log-based transitions are scaled by
# (1 - location_prob) and same-location pairs by location_prob, then both
# edge lists are stacked into one frame.
# NOTE(review): a from_id that has no row in transition_df only receives the
# location part, so its weights sum to location_prob rather than 1 —
# confirm whether such rows should be renormalised.
location_prob = cfg.exp.location_prob
log_part = transition_df.with_columns(pl.col("weight") * (1 - location_prob))
location_part = yad2yad_location_df.with_columns(
    pl.col("weight") * location_prob
)
transition_with_location_df = pl.concat([log_part, location_part])
transition_with_location_df.head()

from_id,to_id,weight
u32,u32,f64
196,2199,0.064596
7497,5897,0.342857
275,7807,0.097561
2324,2283,0.095238
4572,369,0.036036


In [56]:
# Build the sparse transition matrix: rows are from_id, columns are to_id.
# (from_id, to_id) pairs that occur more than once in the edge list are
# summed by the COO-style constructor.
edge_weights = transition_with_location_df["weight"].to_numpy()
from_ids = transition_with_location_df["from_id"].to_numpy()
to_ids = transition_with_location_df["to_id"].to_numpy()

# Pin the shape explicitly. Without `shape`, scipy infers the row count from
# the largest from_id and the column count from the largest to_id
# independently, so the result is only square when both maxima happen to
# coincide. An explicit (n, n) shape keeps the matrix square regardless.
n_ids = int(max(from_ids.max(), to_ids.max())) + 1
sparse_matrix = sparse.csr_matrix(
    (edge_weights, (from_ids, to_ids)),
    shape=(n_ids, n_ids),
)

In [57]:
# Sanity check: total outgoing weight per from_id. Summing on the sparse
# matrix avoids materialising a dense n x n array just to reduce it to n
# values; asarray(...).ravel() flattens the (n, 1) result to a 1-D array.
np.asarray(sparse_matrix.sum(axis=1)).ravel()

array([1. , 1. , 1. , ..., 0.2, 1. , 0.2])