In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
import sys
sys.path.append('../')

from myutils.metrics import mapk

In [3]:
# Prefix shared by every artifact this notebook writes at the end:
# the candidate pickles and the embedding-pickle directory and file names.
OUTPUT_NAME = 'prone_directed'

In [4]:
# Session logs for both splits.
train_log_df = pd.read_csv('../datasets/atmaCup16_Dataset/train_log.csv')
test_log_df = pd.read_csv('../datasets/atmaCup16_Dataset/test_log.csv')

# Train labels (session_id, yad_no) and the yado table.
train_df = pd.read_csv('../datasets/atmaCup16_Dataset/train_label.csv')
yado_df = pd.read_csv('../datasets/atmaCup16_Dataset/yado.csv')

# Frames holding the session ids of each split; candidates are merged onto these later.
train_session_only_df = train_df[['session_id']]
test_session_only_df = pd.read_csv('../datasets/atmaCup16_Dataset/test_session.csv')

# Train and test logs stacked row-wise under a fresh 0..n-1 index.
train_test_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

In [5]:
# Sanity check on the id range: the smallest yad_no is 1, so ids are 1-based.
# The graph code below subtracts 1 to turn a yad_no into a 0-based node index.
yado_df['yad_no'].min()

1

In [6]:
from csrgraph import csrgraph
import nodevectors as nv
from scipy.sparse import csr_matrix
from nodevectors.prone import ProNE
from nodevectors import prone

"""
https://www.guruguru.science/competitions/21/discussions/3455f45d-9a42-4f91-abee-2fec1666fcf0/
"""

def create_adjacency_matrix(edges, n_nodes=None):
    """Build the adjacency matrix of a directed, unweighted graph in CSR form.

    Each row ``(src, dst)`` of ``edges`` contributes 1 to entry ``[src, dst]``.
    The reverse entry ``[dst, src]`` is NOT added, so the matrix is only
    symmetric if the caller passes both directions. Repeated edges are summed
    by ``csr_matrix``.

    Parameters
    ----------
    edges : array-like of shape (num_edges, 2)
        0-based integer node indices, one ``(src, dst)`` pair per row.
    n_nodes : int, optional
        Side length of the matrix. Defaults to ``max(edges) + 1``, which drops
        trailing nodes that never appear in an edge; pass the real node count
        when those nodes still need a row.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(n_nodes, n_nodes)``.

    Raises
    ------
    ValueError
        If ``edges`` is empty and ``n_nodes`` is not given, because the node
        count cannot be inferred.
    """
    edges = np.asarray(edges, dtype=np.int64)
    if edges.size == 0:
        # Keep the 2-column layout so the column slicing below still works.
        edges = edges.reshape(0, 2)

    if n_nodes is None:
        if len(edges) == 0:
            raise ValueError(
                "cannot infer the number of nodes from an empty edge list; pass n_nodes"
            )
        # Node ids are assumed to start at 0, so the count is the largest id + 1.
        n_nodes = int(edges.max()) + 1

    weights = np.ones(len(edges))  # one unit weight per edge
    adjacency_matrix = csr_matrix(
        (weights, (edges[:, 0], edges[:, 1])), shape=(n_nodes, n_nodes)
    )

    return adjacency_matrix


def calc_prone_embs(edge, n_components):
    """Fit ProNE on the graph given by ``edge`` and return row-normalised node embeddings.

    ``edge`` is turned into an adjacency matrix by ``create_adjacency_matrix``
    and wrapped in a ``csrgraph`` before fitting. ``n_components`` is the
    embedding size handed to ProNE. The result has one row per graph node.
    """
    # Edge list -> sparse adjacency matrix -> graph object.
    graph = csrgraph(create_adjacency_matrix(edge))

    # Train the embedding model.
    embedder = prone.ProNE(
        n_components=n_components,
        step=10,
        mu=0.2,
        theta=0.5,
        exponent=0.75,
        verbose=True,
    )
    raw_embs = embedder.fit_transform(graph)

    # Scale every row by its L2 norm; the small epsilon keeps all-zero rows
    # from dividing by zero (they stay all-zero).
    row_norms = np.linalg.norm(raw_embs, axis=1, keepdims=True)
    return raw_embs / (row_norms + 1e-8)


def make_prone_vec(train_test_log_df: pd.DataFrame, n_components: int):
    """Embed yado with ProNE on a directed session-transition graph.

    For every session, each pair of consecutive ``yad_no`` values (in the row
    order of ``train_test_log_df``) becomes a directed edge
    ``yad_no[i] -> yad_no[i + 1]``. Ids are shifted by -1 to give 0-based node
    indices, and identical transitions collapse into a single edge.

    Returns the embedding matrix produced by ``calc_prone_embs``; row ``k``
    belongs to ``yad_no == k + 1``.
    """
    # One list of yad_no per session.
    sessions = train_test_log_df.groupby('session_id')['yad_no'].apply(list).tolist()

    # Unique directed transitions; a single-item session yields no pairs.
    edges = {
        (src - 1, dst - 1)
        for visited in sessions
        for src, dst in zip(visited, visited[1:])
    }
    edges = np.array(list(edges))
    print(edges.shape) # (num_edge, 2)

    node_embs = calc_prone_embs(edges, n_components)

    return node_embs

In [7]:
# Embedding size: passed to ProNE as n_components and used to generate the
# item_factor_* / user_factor_* column names.
dimensions = 1024

In [8]:
# Fit ProNE on the combined train + test logs. Row k of item_vecs is the
# embedding of yad_no k + 1; the printed tuple is the edge-array shape.
item_vecs = make_prone_vec(train_test_log_df, dimensions)

(83690, 2)


In [9]:
# (number of graph nodes, embedding size). The node count is the largest
# 0-based node index seen in an edge, plus one.
item_vecs.shape

(13806, 1024)

In [10]:
# One embedding per yado listed in yado_df; row (yad_no - 1) of item_vecs belongs to yad_no.
item_ids = yado_df['yad_no'].to_list()
item_factors = {yad_no: item_vecs[yad_no - 1, :] for yad_no in item_ids}

item_cols = [f"item_factor_{i}" for i in range(dimensions)]

# Stack the vectors into one row per yado, then put the id in the first column.
item_factors_df = pd.DataFrame(np.stack(list(item_factors.values())), columns=item_cols)
item_factors_df.insert(0, "yad_no", list(item_factors.keys()))

In [11]:
# session_id -> list of yad_no in that session, in log row order.
user_item_list_dict = {
    session_id: yad_nos.tolist()
    for session_id, yad_nos in train_test_log_df.groupby('session_id')['yad_no']
}
user_cols = [f"user_factor_{i}" for i in range(dimensions)]

# Session vector, variant 1: mean of the embeddings of every yado in the session.
user_factors = {
    session_id: np.mean([item_factors[yad_no] for yad_no in visited], axis=0)
    for session_id, visited in user_item_list_dict.items()
}
user_factors_df = pd.DataFrame(np.stack(list(user_factors.values())), columns=user_cols)
user_factors_df.insert(0, "session_id", list(user_factors.keys()))

# Session vector, variant 2: embedding of the last yado in the session.
user_factors_last_item = {
    session_id: item_factors[visited[-1]]
    for session_id, visited in user_item_list_dict.items()
}
user_factors_last_item_df = pd.DataFrame(
    np.stack(list(user_factors_last_item.values())), columns=user_cols
)
user_factors_last_item_df.insert(0, "session_id", list(user_factors_last_item.keys()))

In [12]:
# Preview of the per-yado embedding table. The output contains all-zero rows
# (e.g. yad_no 13802 and 13805) — presumably yado that never occur in any
# session transition; TODO confirm.
item_factors_df

Unnamed: 0,yad_no,item_factor_0,item_factor_1,item_factor_2,item_factor_3,item_factor_4,item_factor_5,item_factor_6,item_factor_7,item_factor_8,...,item_factor_1014,item_factor_1015,item_factor_1016,item_factor_1017,item_factor_1018,item_factor_1019,item_factor_1020,item_factor_1021,item_factor_1022,item_factor_1023
0,1,-0.000135,0.000073,0.000320,0.000062,-0.000302,-0.003891,0.000475,-0.089975,-0.000049,...,-0.007796,0.020071,0.129385,0.028034,0.019760,0.032484,-0.052124,0.015137,0.014552,-0.026435
1,2,-0.000007,0.000001,-0.000008,-0.000034,0.000042,0.000056,0.000016,-0.000117,0.000032,...,0.001766,-0.000140,-0.000671,0.000958,0.000273,0.000285,-0.000015,-0.000310,-0.001350,-0.000460
2,3,-0.348397,-0.080655,-0.000602,0.000056,0.000187,0.001434,-0.000114,0.004593,-0.000193,...,-0.006441,0.006431,-0.000649,-0.013176,-0.010163,0.011783,-0.006353,0.006057,0.007161,0.005335
3,4,-0.028498,-0.006675,-0.000245,-0.000048,-0.000007,0.000255,-0.000056,-0.000208,0.000128,...,0.005489,0.005304,-0.005155,-0.000281,-0.005397,0.008192,0.004700,-0.000645,-0.004394,-0.000798
4,5,-0.000024,0.000201,0.000007,0.000041,0.000149,-0.000158,-0.000050,0.000253,-0.000029,...,0.000475,-0.001994,0.002723,0.002899,-0.000614,-0.000387,0.000715,0.001016,0.001765,0.001636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
13802,13803,-0.000080,-0.000044,0.094033,0.000396,-0.000056,0.000135,-0.000179,0.000021,-0.000184,...,0.021810,0.010336,0.039482,0.023411,0.010327,-0.001849,-0.014774,-0.047337,-0.005479,-0.000442
13803,13804,-0.000066,0.000014,-0.000135,-0.000043,0.000113,-0.000075,0.000009,-0.000753,0.000065,...,-0.000303,-0.002057,-0.010176,0.001444,-0.002782,0.006842,-0.005268,0.000836,0.000989,0.000390
13804,13805,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [13]:
# Preview of the per-session table: each row is the mean of the embeddings of
# the yado in that session.
user_factors_df

Unnamed: 0,session_id,user_factor_0,user_factor_1,user_factor_2,user_factor_3,user_factor_4,user_factor_5,user_factor_6,user_factor_7,user_factor_8,...,user_factor_1014,user_factor_1015,user_factor_1016,user_factor_1017,user_factor_1018,user_factor_1019,user_factor_1020,user_factor_1021,user_factor_1022,user_factor_1023
0,000007603d533d30453cc45d0f3d119f,-0.000169,-0.000367,0.001150,0.000171,-0.000843,-0.000660,-0.000599,0.000493,0.000179,...,-0.000996,-0.003026,-0.000046,-0.012297,0.010535,-0.003686,-0.001161,0.003393,0.000655,-0.014920
1,00001149e9c73985425197104712478c,-0.002312,0.000452,0.000368,-0.000115,-0.000008,-0.000132,0.000141,-0.002465,-0.000029,...,-0.004785,0.006586,0.011291,-0.001319,0.004788,0.000618,-0.012510,0.003285,-0.000973,-0.000692
2,0000ca043ed437a1472c9d1d154eb49b,-0.000125,-0.000085,0.000070,0.000196,-0.000059,0.000125,-0.000065,-0.000511,-0.000003,...,0.015965,0.057653,-0.044917,0.024435,0.026031,-0.006656,-0.034838,-0.004816,0.032289,-0.014750
3,0000d4835cf113316fe447e2f80ba1c8,0.000133,-0.000262,0.000485,0.001044,-0.000031,0.000595,0.000960,-0.000590,-0.000262,...,-0.034305,-0.001747,0.071191,0.031117,-0.045767,0.044961,0.039333,-0.014086,-0.069350,-0.028096
4,0000e02747d749a52b7736dfa751e258,-0.078975,-0.015721,0.000585,0.000112,0.000066,0.000234,0.000062,0.001000,0.000115,...,-0.007115,0.005673,-0.011879,0.018039,-0.003130,-0.001179,-0.005156,0.003694,-0.009877,0.021499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463393,ffff9a7dcc892875c7a8b821fa436228,-0.038324,0.191712,-0.000204,-0.000145,-0.000133,-0.000152,-0.000206,0.000390,0.000055,...,-0.013729,-0.016949,-0.010476,-0.020281,0.015450,-0.017085,-0.001407,-0.001347,0.043756,-0.014720
463394,ffffb1d30300fe17f661941fd085b04b,-0.014511,0.000085,-0.000033,-0.000291,0.000244,-0.000203,0.002335,-0.003538,0.000078,...,0.007884,-0.011010,-0.000700,0.000638,-0.009061,-0.017822,-0.006490,-0.007355,-0.004321,0.010335
463395,ffffcd5bc19d62cad5a3815c87818d83,-0.000004,0.000126,-0.000107,-0.000094,-0.000185,-0.000161,0.000118,-0.000020,0.000137,...,0.007801,0.008257,0.006906,0.008614,0.003980,0.009911,-0.000680,-0.000447,-0.006149,-0.002165
463396,ffffe984aafd6127ce8e43e3ca40c79d,-0.041831,-0.002552,0.000488,0.000109,0.000109,0.000125,-0.000024,-0.000310,0.000054,...,0.007262,0.009056,-0.000716,0.013384,0.024563,-0.001416,-0.004036,-0.002442,-0.019865,0.000546


In [14]:
def cos_sim(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    If either vector has zero norm the similarity is undefined. In that case
    0.0 is returned instead of dividing by zero, which would produce NaN and a
    RuntimeWarning.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

In [15]:
from sklearn.neighbors import NearestNeighbors


# Query matrix: one row per session, in user_factors_df row order (mean-of-items vectors).
# To query with the last yado of each session instead, take the same columns
# from user_factors_last_item_df.
user_vecs = user_factors_df.loc[:, [f'user_factor_{i}' for i in range(dimensions)]].to_numpy()

# Search matrix: one row per yado, in item_factors_df row order.
# This rebinds the name item_vecs that earlier held the raw ProNE output.
item_vecs = item_factors_df.loc[:, [f'item_factor_{i}' for i in range(dimensions)]].to_numpy()

# For every session vector, find the 20 closest yado by cosine distance.
nn = NearestNeighbors(n_neighbors=20, metric='cosine').fit(item_vecs)
dists, indices = nn.kneighbors(user_vecs)

In [16]:
# session_id -> last element of that session's yad_no list; used below to drop
# that yado from the session's own candidates.
user_last_item_dict = {user_id: user_item_list[-1] for user_id, user_item_list in user_item_list_dict.items()}

In [17]:
# Flatten the nearest-neighbour result into one (session, yado, similarity) row
# per pair. This is done with array operations rather than a Python double loop,
# which would run once per (session, neighbour) pair.
#
# Row i of `indices` / `dists` belongs to the i-th key of user_item_list_dict:
# user_factors_df (the query matrix) was built by iterating that dict in order.
session_ids = list(user_item_list_dict.keys())
assert len(session_ids) == indices.shape[0], "kNN rows must line up with the sessions"

# Neighbours per session, read from the result instead of repeating the literal 20.
n_neighbors = indices.shape[1]

# Neighbour positions refer to rows of item_factors_df (the matrix the index was
# fitted on), so read the ids from that frame. This equals `indices + 1` when
# yad_no runs 1..N in row order, without depending on it.
row_to_yad_no = item_factors_df['yad_no'].to_numpy()
neighbor_yad_nos = row_to_yad_no[indices]

# Drop pairs whose candidate is the session's own last yado.
last_yad_nos = np.array([user_last_item_dict[session_id] for session_id in session_ids])
keep = neighbor_yad_nos != last_yad_nos[:, None]

# Boolean masking walks the arrays row by row, i.e. session by session with
# neighbours in nearest-first order — the same order the nested loop produced.
word2vec_sim_df = pd.DataFrame({
    'session_id': np.repeat(np.array(session_ids, dtype=object), n_neighbors)[keep.ravel()],
    'yad_no': neighbor_yad_nos[keep],
    # cosine distance -> cosine similarity
    'word2vec_sim': (1 - dists)[keep],
})

In [18]:
# Preview of the candidate table: one row per (session, neighbouring yado) with
# its cosine similarity. The column is called word2vec_sim, but the vectors
# come from the ProNE embeddings computed above.
word2vec_sim_df

Unnamed: 0,session_id,yad_no,word2vec_sim
0,000007603d533d30453cc45d0f3d119f,4101,0.998068
1,000007603d533d30453cc45d0f3d119f,11882,0.996103
2,000007603d533d30453cc45d0f3d119f,3324,0.994980
3,000007603d533d30453cc45d0f3d119f,2808,0.958968
4,000007603d533d30453cc45d0f3d119f,8668,0.916899
...,...,...,...
8820112,fffffa7baf370083ebcdd98f26a7e31a,11273,0.514680
8820113,fffffa7baf370083ebcdd98f26a7e31a,10095,0.376571
8820114,fffffa7baf370083ebcdd98f26a7e31a,13202,0.375520
8820115,fffffa7baf370083ebcdd98f26a7e31a,13524,0.271971


In [19]:
def make_candidate_word2vec(session_only_df: pd.DataFrame, word2vec_sim_df: pd.DataFrame):
    """Attach similarity candidates to a set of sessions.

    Parameters
    ----------
    session_only_df : pd.DataFrame
        Frame with a ``session_id`` column; its row order defines the row
        order of the returned prediction frame.
    word2vec_sim_df : pd.DataFrame
        Long candidate table with columns ``session_id``, ``yad_no`` and
        ``word2vec_sim``.

    Returns
    -------
    session_df : pd.DataFrame
        One row per (session, candidate) for the sessions that have
        candidates, with an added ``rank`` column (1 = highest similarity
        within the session; ties share the smallest rank). Row order follows
        the merge, not the rank.
    pred_df : pd.DataFrame
        One row per row of ``session_only_df`` with ``yad_no`` holding the
        session's candidates as a list ordered by descending similarity, or an
        empty list when the session has none.
    """
    session_df = session_only_df.merge(word2vec_sim_df, on=['session_id'], how='left')

    # Sessions without any candidate come out of the left merge with NaN; drop them.
    session_df = session_df[session_df['yad_no'].notnull()].reset_index(drop=True)
    # Those NaNs can upcast yad_no to float, so restore the integer dtype.
    session_df['yad_no'] = session_df['yad_no'].astype(int)

    session_df['rank'] = session_df.groupby('session_id')['word2vec_sim'].rank(ascending=False, method='min')

    # Order the candidates explicitly instead of relying on the caller passing
    # rows already sorted by similarity. The stable sort keeps the incoming
    # order for ties, so pre-sorted input produces the same lists as before.
    ordered = session_df.sort_values('word2vec_sim', ascending=False, kind='stable')
    pred_df = ordered.groupby('session_id', sort=False)['yad_no'].apply(list).reset_index()
    pred_df = pd.merge(session_only_df, pred_df, on='session_id', how='left')

    # Sessions with no candidates get an empty list rather than NaN.
    pred_df['yad_no'] = pred_df['yad_no'].apply(lambda d: d if isinstance(d, list) else [])

    return session_df, pred_df

In [20]:
# Build candidates for both splits from the same similarity table. The first
# return value is the long per-(session, yado) frame, the second holds one
# yad_no list per session.
train_session_df, train_pred_df = make_candidate_word2vec(train_session_only_df, word2vec_sim_df)
test_session_df, test_pred_df = make_candidate_word2vec(test_session_only_df, word2vec_sim_df)

In [21]:
# Preview of the train candidates; `rank` is the per-session rank of
# word2vec_sim (1 = most similar).
train_session_df

Unnamed: 0,session_id,yad_no,word2vec_sim,rank
0,000007603d533d30453cc45d0f3d119f,4101,0.998068,1.0
1,000007603d533d30453cc45d0f3d119f,11882,0.996103,2.0
2,000007603d533d30453cc45d0f3d119f,3324,0.994980,3.0
3,000007603d533d30453cc45d0f3d119f,2808,0.958968,4.0
4,000007603d533d30453cc45d0f3d119f,8668,0.916899,5.0
...,...,...,...,...
5495894,fffffa7baf370083ebcdd98f26a7e31a,11273,0.514680,15.0
5495895,fffffa7baf370083ebcdd98f26a7e31a,10095,0.376571,16.0
5495896,fffffa7baf370083ebcdd98f26a7e31a,13202,0.375520,17.0
5495897,fffffa7baf370083ebcdd98f26a7e31a,13524,0.271971,18.0


In [22]:
"""
cos_sims = []
for session_id, yad_no in tqdm(zip(train_session_df['session_id'].values, train_session_df['yad_no'].values)):
    session_last_yad_no = user_last_item_dict[session_id]

    sim = cos_sim(item_factors[session_last_yad_no], item_factors[yad_no])
    cos_sims.append(sim)
"""

"\ncos_sims = []\nfor session_id, yad_no in tqdm(zip(train_session_df['session_id'].values, train_session_df['yad_no'].values)):\n    session_last_yad_no = user_last_item_dict[session_id]\n\n    sim = cos_sim(item_factors[session_last_yad_no], item_factors[yad_no])\n    cos_sims.append(sim)\n"

In [23]:
# Preview of the per-session candidate lists (one list of yad_no per train session).
train_pred_df

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,"[4101, 11882, 3324, 2808, 8668, 5821, 538, 107..."
1,0000ca043ed437a1472c9d1d154eb49b,"[8253, 4488, 8747, 12593, 9864, 4909, 540, 225..."
2,0000d4835cf113316fe447e2f80ba1c8,"[5238, 587, 415, 774, 515, 406, 402, 798, 668,..."
3,0000fcda1ae1b2f431e55a7075d1f500,"[2272, 626, 9790, 13296, 11715, 851, 1341, 111..."
4,000104bdffaaad1a1e0a9ebacf585f33,"[7240, 96, 1284, 254, 5490, 3894, 12338, 8696,..."
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,"[1678, 1092, 13682, 13210, 13079, 5719, 5179, ..."
288694,ffff2360540745117193ecadcdc06538,"[513, 6654, 2900, 4767, 8703, 5299, 681, 8465,..."
288695,ffff7fb4617164b2604aaf51c40bf82d,"[7308, 7820, 10364, 4040, 2097, 3566, 5037, 12..."
288696,ffffcd5bc19d62cad5a3815c87818d83,"[11316, 570, 12500, 9671, 12781, 12829, 11091,..."


## CV score by embedding dimension
dim=256 0.20479048683487233  
dim=512 0.24105351563621108  
dim=1024 0.284614876566241

In [24]:
# Score the nearest-neighbour candidates against the train labels with mapk at
# k=10 (presumably mean average precision at k — see myutils.metrics).
# Both lists are in train_df row order: train_pred_df is built by left-merging
# onto train_df's session_id column.
mapk(train_df['yad_no'].to_list(), train_pred_df['yad_no'].to_list(), k=10)

0.29134931823632476

In [25]:
# Preview of the ground-truth labels (session_id, yad_no), for comparison with
# the candidate lists shown above.
train_df

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,4101
1,0000ca043ed437a1472c9d1d154eb49b,8253
2,0000d4835cf113316fe447e2f80ba1c8,4863
3,0000fcda1ae1b2f431e55a7075d1f500,1652
4,000104bdffaaad1a1e0a9ebacf585f33,96
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,2259
288694,ffff2360540745117193ecadcdc06538,963
288695,ffff7fb4617164b2604aaf51c40bf82d,13719
288696,ffffcd5bc19d62cad5a3815c87818d83,10619


In [26]:
import os

# Create the target directory first, so the cell also works on a fresh checkout
# where it does not exist yet (the embedding pickle directory is handled the same way).
os.makedirs('../datasets/candidate_df/', exist_ok=True)

# Persist the long candidate frames (session_id, yad_no, word2vec_sim, rank) for both splits.
train_session_df.to_pickle(f'../datasets/candidate_df/{OUTPUT_NAME}_train.pkl')
test_session_df.to_pickle(f'../datasets/candidate_df/{OUTPUT_NAME}_test.pkl')

In [28]:
import os
# Create the output directory for the embedding pickles; exist_ok=True makes
# re-running the cell safe.
os.makedirs(f'../datasets/{OUTPUT_NAME}_pkl/', exist_ok=True)

In [29]:
# Persist the three embedding tables: session vectors (mean of items), session
# vectors (last item only) and the per-yado vectors.
user_factors_df.to_pickle(f'../datasets/{OUTPUT_NAME}_pkl/{OUTPUT_NAME}_user_factors_df.pkl')
user_factors_last_item_df.to_pickle(f'../datasets/{OUTPUT_NAME}_pkl/{OUTPUT_NAME}_user_factors_last_item_df.pkl')
item_factors_df.to_pickle(f'../datasets/{OUTPUT_NAME}_pkl/{OUTPUT_NAME}_item_factors_df.pkl')