In [1]:
import os
import pickle
import random
import sys
import uuid
from pathlib import Path

import implicit
import lightgbm as lgb
import numpy as np
import pandas as pd
# NOTE(review): importing `random` from scipy.sparse rebinds the name `random`,
# so the stdlib `random` module imported above is no longer reachable under that
# name after this line. Alias one of the two if both are needed.
from scipy.sparse import csr_matrix, random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)

# Put the parent directory on the import path (presumably where the project-local
# `utils` package lives — confirm against the repository layout).
sys.path.append(os.pardir)
from hydra import compose, initialize

from utils import load_datasets
from utils.embedding import TextEmbedder

with initialize(config_path="../yamls", version_base=None):
    config = compose(config_name="config.yaml")


# Every input CSV is read from the directory named by the Hydra config.
input_dir = Path(config.input_path)
train_df, test_df, sample_submission_df, anime_df = (
    pd.read_csv(input_dir / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv", "anime.csv")
)

# Clean-up: remove the spaces inside the genre strings.
anime_df["genres"] = anime_df["genres"].str.replace(" ", "")

# Stack the train and test rows, then attach the anime metadata to each row.
all_df = pd.concat([train_df, test_df]).merge(anime_df, on="anime_id", how="left")

In [2]:
# Columns that are rendered as text and merged into a single string per anime.
concat_feature = ["japanese_name", "genres", "producers", "licensors", "studios", "rating"]
# Cast every selected column to str so each cell can be passed to str.join.
text_df = anime_df[concat_feature].astype(str)
# Join the selected columns of each row with a single space.
text_df["combined_features"] = text_df.agg(" ".join, axis=1)
embedder = TextEmbedder()
combined_texts = text_df["combined_features"].values.tolist()
anime_embeddings = embedder.get_embeddings(combined_texts)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


cuda


  0%|          | 0/125 [00:00<?, ?it/s]

In [3]:
df = all_df[["user_id", "anime_id"]].copy()

# Map each anime_id to its row *position* in anime_df. The embeddings were built
# from anime_df's rows in order, so they have to be addressed by position; using
# np.arange makes that explicit instead of relying on anime_df still carrying its
# default 0..n-1 index labels.
anime_row_lookup = pd.Series(np.arange(len(anime_df)), index=anime_df["anime_id"])
df["row_number"] = df["anime_id"].map(anime_row_lookup)

# An anime_id that is absent from anime_df would map to NaN, silently turn the
# column into floats and only fail later with an opaque indexing error.
unmapped = df["row_number"].isna()
if unmapped.any():
    missing_ids = df.loc[unmapped, "anime_id"].unique().tolist()
    raise KeyError(f"anime_id values missing from anime_df: {missing_ids[:10]}")

# Index with a plain integer array rather than a pandas Series.
# assumes anime_embeddings is array-like with one row per anime_df row — TODO confirm
embeddings = anime_embeddings[df["row_number"].to_numpy()]

In [4]:
import cupy as cp
from cuml.metrics import pairwise_distances

# Copy the embeddings into a CuPy array so the pairwise computation goes through cuML.
embeddings_cp = cp.array(anime_embeddings)
# NOTE(review): `pairwise_distances(..., metric="cosine")` yields cosine *distance*
# (presumably 1 - cosine similarity — confirm against the cuML docs). The matrix
# displayed below has ~0 on its diagonal, so despite the `cosine_sim` name,
# smaller values mean more similar rows.
cosine_sim_matrix = pairwise_distances(embeddings_cp, embeddings_cp, metric="cosine")

In [20]:
# Convert the CuPy result to a NumPy array (this rebinds `cosine_sim_matrix`) and
# wrap it in a DataFrame with default integer row/column labels.
cosine_sim_matrix = cp.asnumpy(cosine_sim_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim_matrix)
# Preview the first rows only; the full matrix is too large to display.
cosine_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,...,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,-4.768372e-07,0.07573247,0.07782,0.1483079,0.0807299,0.122721,0.142194,0.131773,0.125792,0.166743,0.118254,0.10687,0.087455,0.105032,0.110744,0.144638,0.113836,0.135775,0.09987,0.115507,0.111474,0.111028,0.112678,0.116159,0.144496,...,0.078268,0.09887,0.098379,0.129045,0.099309,0.097455,0.090272,0.155578,0.08778,0.101,0.150021,0.145553,0.109585,0.096127,0.151873,0.147162,0.110571,0.115632,0.157318,0.163243,0.159308,0.092853,0.148412,0.163778,0.116992
1,0.07573247,2.384186e-07,0.034509,0.1350629,0.07217371,0.108237,0.154446,0.138049,0.114584,0.13093,0.116862,0.09384,0.096828,0.088386,0.109291,0.142455,0.121685,0.153723,0.121759,0.095568,0.130341,0.098221,0.103235,0.108737,0.125271,...,0.085632,0.100752,0.104951,0.090187,0.0928,0.097915,0.094476,0.148975,0.080706,0.08983,0.135806,0.142094,0.115468,0.095069,0.148786,0.128809,0.097236,0.104361,0.140249,0.144731,0.140047,0.097965,0.143746,0.14775,0.105372
2,0.07782018,0.03450871,0.0,0.1503061,0.08856702,0.116681,0.13915,0.136937,0.120177,0.141459,0.113619,0.101302,0.095589,0.098952,0.093583,0.145021,0.113412,0.144447,0.116628,0.098136,0.122895,0.104566,0.108641,0.115779,0.135703,...,0.086716,0.100072,0.102886,0.112495,0.101633,0.105006,0.09871,0.14437,0.080618,0.091641,0.147899,0.149887,0.101953,0.09936,0.150925,0.140039,0.094877,0.113727,0.151138,0.14963,0.156745,0.097243,0.135273,0.155095,0.117249
3,0.1483079,0.1350629,0.150306,7.152557e-07,0.1228979,0.147557,0.138848,0.152949,0.111838,0.121464,0.164629,0.153505,0.157257,0.124585,0.170409,0.138384,0.141066,0.126736,0.145212,0.14251,0.159835,0.136008,0.134839,0.16994,0.111908,...,0.152963,0.144059,0.155753,0.162839,0.142507,0.14155,0.136451,0.134189,0.167814,0.144596,0.118141,0.077845,0.150693,0.142451,0.114889,0.114342,0.14851,0.156801,0.082627,0.147594,0.091099,0.13991,0.142727,0.12319,0.137901
4,0.0807299,0.07217371,0.088567,0.1228979,5.960464e-08,0.102056,0.139229,0.12855,0.110009,0.131044,0.114632,0.101905,0.113143,0.079647,0.108835,0.124803,0.115403,0.139773,0.12477,0.08661,0.12138,0.11222,0.105373,0.105738,0.125731,...,0.091713,0.105885,0.114634,0.102259,0.098189,0.106038,0.084832,0.143696,0.089973,0.08876,0.148819,0.142324,0.114415,0.090577,0.129551,0.137147,0.098768,0.115693,0.131649,0.147498,0.122649,0.112077,0.135486,0.148771,0.104469


In [26]:
%%time


def calculate_cosine_sim(row_numbers, aggway="sum", sim_matrix=None):
    """
    Aggregate pairwise cosine values among one group's items.

    Intended for ``groupby(...).transform``: for the items selected by
    ``row_numbers`` it takes the square sub-matrix of pairwise values, zeroes
    the self-pairs on the diagonal, and reduces each row to a single number,
    so the result has one value per input item.

    Note: in this notebook the matrix is built with
    ``pairwise_distances(metric="cosine")``, so the values being aggregated are
    cosine distances even though the names say "sim".

    Parameters
    ----------
    row_numbers : pandas.Series
        Integer positions into the pairwise matrix for the items of one group.
    aggway : {"sum", "mean", "var", "max"}, default "sum"
        Row-wise reduction applied to the sub-matrix. The zeroed diagonal entry
        is included in the reduction (e.g. "mean" divides by the group size).
    sim_matrix : array-like or pandas.DataFrame, optional
        Square pairwise matrix to read from. Defaults to the notebook-level
        ``cosine_sim_df``.

    Returns
    -------
    numpy.ndarray
        1-D array with one aggregated value per entry of ``row_numbers``.

    Raises
    ------
    ValueError
        If ``aggway`` is not one of the supported reductions.
    """
    reducers = {"sum": np.sum, "mean": np.mean, "var": np.var, "max": np.max}
    if aggway not in reducers:
        # Previously an unknown name fell through and returned the 2-D matrix.
        raise ValueError(f"unsupported aggway {aggway!r}; expected one of {sorted(reducers)}")

    matrix = np.asarray(cosine_sim_df if sim_matrix is None else sim_matrix)
    positions = row_numbers.to_numpy()

    # Fancy indexing through np.ix_ always produces a fresh, writable copy, so
    # zeroing the diagonal can neither modify the shared matrix nor fail on a
    # read-only view (which DataFrame.to_numpy() returns under Copy-on-Write).
    pairwise = matrix[np.ix_(positions, positions)]
    np.fill_diagonal(pairwise, 0.0)

    return reducers[aggway](pairwise, axis=1)


# One feature column per reduction of each user's pairwise matrix.
aggways = ("sum", "mean", "var", "max")
use_cols = [f"cosine_sim_{aggway}" for aggway in aggways]
for aggway, col in zip(aggways, use_cols):
    print(col)
    # transform passes `aggway` through as the second positional argument.
    per_user_rows = df.groupby("user_id")["row_number"]
    df[col] = per_user_rows.transform(calculate_cosine_sim, aggway)

cosine_sim_sum
cosine_sim_mean
cosine_sim_var
cosine_sim_max
CPU times: user 5.93 s, sys: 14.9 ms, total: 5.94 s
Wall time: 5.93 s


In [27]:
df

Unnamed: 0,user_id,anime_id,row_number,cosine_sim_sum,cosine_sim_mean,cosine_sim_var,cosine_sim_max
0,0008e10fb39e55447333,0669cc0219d468761195,49,8.469231,0.124548,0.001035,0.183304
1,0008e10fb39e55447333,111adb8835b8a1a2cf54,122,10.252946,0.150779,0.000751,0.197469
2,0008e10fb39e55447333,1fc8683c393432a2f9c7,234,10.936378,0.160829,0.000572,0.185150
3,0008e10fb39e55447333,2290175205d55e81b197,263,8.539645,0.125583,0.001000,0.168756
4,0008e10fb39e55447333,28f173b60331d5cabb0d,318,8.201374,0.120608,0.001168,0.175911
...,...,...,...,...,...,...,...
254072,ffe85a36cd20500faa58,f508b02efeac8ecb8cc0,1908,7.339842,0.128769,0.000947,0.170199
254073,ffe85a36cd20500faa58,f5b8ecea3beea4b82d79,1910,6.339636,0.111222,0.000558,0.152204
254074,ffe85a36cd20500faa58,f6c208226b6b69948053,1915,7.003919,0.122876,0.000800,0.167487
254075,ffe85a36cd20500faa58,fe67592c312fc1e17745,1986,6.506927,0.114157,0.000770,0.153200
