In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
import sys
sys.path.append('../')

from myutils.metrics import mapk

In [3]:
# Tag embedded in the names of the candidate and factor pickle files (and the factor directory) written at the end of the notebook.
OUTPUT_NAME = 'prone'

In [4]:
# Raw competition tables.
yado_df = pd.read_csv('../datasets/atmaCup16_Dataset/yado.csv')
train_df = pd.read_csv('../datasets/atmaCup16_Dataset/train_label.csv')
train_log_df = pd.read_csv('../datasets/atmaCup16_Dataset/train_log.csv')
test_log_df = pd.read_csv('../datasets/atmaCup16_Dataset/test_log.csv')

# Session-id-only frames that the candidate tables are later joined onto.
test_session_only_df = pd.read_csv('../datasets/atmaCup16_Dataset/test_session.csv')
train_session_only_df = train_df.loc[:, ['session_id']]

# Train and test logs stacked (train rows first) under a fresh 0..n-1 index.
train_test_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

In [5]:
yado_df['yad_no'].min()

1

In [6]:
from csrgraph import csrgraph
import nodevectors as nv
from scipy.sparse import csr_matrix
from nodevectors.prone import ProNE
from nodevectors import prone

def create_adjacency_matrix(edges, n_nodes=None):
    """
    Build a sparse (CSR) adjacency matrix from an edge list.

    The matrix is filled exactly as the edges are given: entry (src, dst)
    receives weight 1 for every listed (src, dst) pair, and repeated pairs
    are summed. Nothing is symmetrised here, so for an undirected graph the
    caller must list both (a, b) and (b, a).

    Parameters
    ----------
    edges : array-like of shape (num_edges, 2)
        Zero-based integer node ids, one (src, dst) pair per row.
    n_nodes : int, optional
        Total number of nodes. Defaults to ``max(edges) + 1``. Pass it
        explicitly when the highest-numbered nodes may have no edges, so
        that they still get a row/column in the matrix.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_nodes, n_nodes)

    Raises
    ------
    ValueError
        If `edges` is empty and `n_nodes` is not given (the size cannot be
        inferred), or if an id does not fit into `n_nodes`.
    """
    # Accept plain lists/tuples as well as arrays; an empty input becomes shape (0, 2).
    edges = np.asarray(edges, dtype=np.int64).reshape(-1, 2)

    if n_nodes is None:
        if len(edges) == 0:
            raise ValueError("cannot infer the number of nodes from an empty edge list; pass n_nodes")
        n_nodes = int(edges.max()) + 1  # node ids are assumed to start at 0

    weights = np.ones(len(edges))  # one unit of weight per listed edge
    return csr_matrix((weights, (edges[:, 0], edges[:, 1])), shape=(n_nodes, n_nodes))


def calc_prone_embs(edge, n_components):
    """
    Fit ProNE node embeddings on the graph described by `edge`.

    `edge` is turned into an adjacency matrix by create_adjacency_matrix and
    wrapped in a csrgraph; `n_components` is forwarded to ProNE as the
    embedding size. Each embedding row is divided by its L2 norm before it
    is returned.
    """
    graph = csrgraph(create_adjacency_matrix(edge))

    embedder = prone.ProNE(
        n_components=n_components,
        step=10,
        mu=0.2,
        theta=0.5,
        exponent=0.75,
        verbose=True,
    )
    raw_embs = embedder.fit_transform(graph)

    # The small epsilon keeps all-zero rows from dividing by zero; they stay all-zero.
    row_norms = np.linalg.norm(raw_embs, axis=1, keepdims=True)
    return raw_embs / (row_norms + 1e-8)


def make_prone_vec(train_test_log_df: pd.DataFrame, n_components:int):
    """
    Embed accommodations with ProNE on a session transition graph.

    Two accommodations are linked when they sit next to each other in the
    same session's `yad_no` sequence (row order of `train_test_log_df`).
    Both directions of every link are stored and ids are shifted by one,
    so node k stands for yad_no k+1. The unique edge list is passed to
    calc_prone_embs, whose result is returned unchanged.
    """
    session_items = train_test_log_df.groupby('session_id')['yad_no'].apply(list).tolist()

    edge_set = set()
    for items in session_items:
        # Pairing the sequence with itself shifted by one yields nothing for single-item sessions.
        for src, dst in zip(items, items[1:]):
            edge_set.add((src - 1, dst - 1))
            edge_set.add((dst - 1, src - 1))

    edges = np.array(list(edge_set))
    print(edges.shape) # (num_edge, 2)

    return calc_prone_embs(edges, n_components)

In [7]:
# Embedding size: passed to make_prone_vec as n_components and used to name the factor columns.
dimensions = 1024

In [8]:
# Fit the ProNE embeddings on the combined train+test logs.
# Row k of the result is the embedding of yad_no k+1 (ids are shifted by one when the edges are built).
item_vecs = make_prone_vec(train_test_log_df, dimensions)

(108750, 2)


In [9]:
item_vecs.shape

(13806, 1024)

In [10]:
# Every accommodation listed in yado.csv gets a row; row k of item_vecs holds
# the embedding of yad_no k+1, hence the `- 1` when indexing.
item_ids = yado_df['yad_no'].to_list()
item_cols = [f"item_factor_{i}" for i in range(dimensions)]

# yad_no -> embedding lookup.
item_factors = {yad_no: item_vecs[yad_no - 1, :] for yad_no in item_ids}

# One row per yad_no: the id followed by its `dimensions` factor columns.
item_factors_df = (
    pd.DataFrame(item_factors)
    .T
    .reset_index()
    .rename(columns={"index": "yad_no"})
)
item_factors_df.columns = ["yad_no"] + item_cols

In [11]:
user_cols = [f"user_factor_{i}" for i in range(dimensions)]

# session_id -> yad_no values of that session, in the row order of the combined log.
user_item_list_dict = {
    session_id: yad_nos.tolist()
    for session_id, yad_nos in train_test_log_df.groupby('session_id')['yad_no']
}

# Session vector, variant 1: mean of the embeddings of every item in the session.
user_factors = {
    session_id: np.mean([item_factors[yad_no] for yad_no in session_items], axis=0)
    for session_id, session_items in user_item_list_dict.items()
}
user_factors_df = (
    pd.DataFrame(user_factors)
    .T
    .reset_index()
    .rename(columns={"index": "session_id"})
)
user_factors_df.columns = ["session_id"] + user_cols

# Session vector, variant 2: embedding of the session's last item only.
user_factors_last_item = {
    session_id: item_factors[session_items[-1]]
    for session_id, session_items in user_item_list_dict.items()
}
user_factors_last_item_df = (
    pd.DataFrame(user_factors_last_item)
    .T
    .reset_index()
    .rename(columns={"index": "session_id"})
)
user_factors_last_item_df.columns = ["session_id"] + user_cols

In [12]:
item_factors_df

Unnamed: 0,yad_no,item_factor_0,item_factor_1,item_factor_2,item_factor_3,item_factor_4,item_factor_5,item_factor_6,item_factor_7,item_factor_8,...,item_factor_1014,item_factor_1015,item_factor_1016,item_factor_1017,item_factor_1018,item_factor_1019,item_factor_1020,item_factor_1021,item_factor_1022,item_factor_1023
0,1,4.339596e-04,0.001153,-0.000096,-0.000189,0.000466,-3.624129e-03,-0.000039,-0.133191,-0.000064,...,0.018481,0.055519,0.019735,-0.013640,0.053192,-0.019903,0.013264,0.022620,-0.016727,0.015887
1,2,-3.018322e-05,0.000007,-0.000005,0.000008,0.000023,7.961751e-07,0.000002,-0.000018,-0.000023,...,0.000018,0.001235,-0.000069,-0.001269,0.000057,-0.000760,-0.000374,-0.000307,-0.000569,-0.000183
2,3,1.623927e-01,0.451548,-0.006571,0.000779,-0.000621,2.644931e-03,-0.000027,0.008070,-0.000471,...,-0.032164,-0.009038,0.001779,0.000259,-0.002131,0.007045,-0.006681,-0.004972,0.008576,-0.026692
3,4,2.698097e-02,0.074359,-0.001056,0.000143,-0.000076,3.375589e-04,0.000017,-0.000610,0.000077,...,0.028297,-0.007105,-0.019591,0.026887,-0.021098,0.007170,0.005355,-0.004192,-0.030193,0.008505
4,5,-1.875272e-05,0.000023,0.000010,0.000043,-0.000024,2.848194e-05,0.000011,0.000036,-0.000044,...,0.001354,0.000870,-0.000483,0.001466,-0.000708,-0.000109,0.000334,-0.000911,0.000838,0.000311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
13802,13803,3.748445e-04,0.001268,0.104295,-0.000010,0.000007,2.986860e-04,0.000007,0.000023,0.000044,...,0.000683,-0.001523,-0.012741,0.017605,0.008456,0.000891,-0.006548,-0.000467,-0.009185,-0.005669
13803,13804,1.853780e-07,0.000024,-0.000021,-0.000053,-0.000014,-5.131127e-05,-0.000021,-0.001898,-0.000024,...,-0.010744,0.021710,0.005953,-0.011940,-0.023650,0.012451,0.024981,0.006651,0.010849,-0.011222
13804,13805,5.102978e-05,-0.000036,-0.000003,0.000061,-0.000013,5.176288e-05,0.000030,-0.000019,0.000023,...,0.000214,-0.000729,-0.002647,-0.005917,0.002019,0.002427,0.011418,0.009031,-0.009470,0.004187


In [13]:
user_factors_df

Unnamed: 0,session_id,user_factor_0,user_factor_1,user_factor_2,user_factor_3,user_factor_4,user_factor_5,user_factor_6,user_factor_7,user_factor_8,...,user_factor_1014,user_factor_1015,user_factor_1016,user_factor_1017,user_factor_1018,user_factor_1019,user_factor_1020,user_factor_1021,user_factor_1022,user_factor_1023
0,000007603d533d30453cc45d0f3d119f,-8.024747e-07,0.000166,-0.000167,-0.000045,-0.000215,-0.000041,-0.000137,-0.000197,-0.000019,...,0.001639,-0.010238,0.011061,-0.000057,0.008932,-0.008776,-0.003428,0.009920,-0.007384,-0.010178
1,00001149e9c73985425197104712478c,2.544950e-03,0.004256,0.000480,-0.000200,0.000159,-0.000143,0.000008,-0.002052,0.000270,...,-0.021500,-0.003498,0.004997,-0.001432,0.012118,-0.003018,-0.016710,-0.003545,-0.010714,0.001139
2,0000ca043ed437a1472c9d1d154eb49b,2.321482e-05,0.000123,0.000006,-0.000024,0.000025,0.000006,-0.000034,-0.000300,0.276702,...,0.027475,-0.021602,-0.021677,-0.079188,-0.012514,-0.026368,0.074574,-0.031171,-0.026621,0.011148
3,0000d4835cf113316fe447e2f80ba1c8,1.087184e-04,-0.000018,0.000051,0.000077,0.000028,-0.000069,0.000174,0.000041,0.000002,...,-0.031901,0.000724,0.012089,0.003654,-0.017301,-0.017577,-0.012103,-0.008473,0.009060,0.129765
4,0000e02747d749a52b7736dfa751e258,4.031227e-02,0.105464,-0.000581,0.000189,-0.000126,0.000660,0.000002,0.001988,-0.000057,...,-0.017067,0.008965,-0.009559,0.007329,0.008155,0.005248,0.005540,-0.000127,0.005361,0.000330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463393,ffff9a7dcc892875c7a8b821fa436228,5.368026e-01,-0.204658,0.000635,0.000254,-0.000331,0.000067,-0.000015,0.000190,-0.000103,...,-0.016052,0.033099,-0.022016,-0.004299,0.026159,0.009928,-0.004665,0.018782,-0.018789,-0.006199
463394,ffffb1d30300fe17f661941fd085b04b,1.094235e-02,0.021107,-0.000124,-0.002541,0.000350,-0.000187,-0.000016,-0.003317,0.002879,...,0.000716,0.007096,0.000992,-0.019691,0.001078,0.013932,0.000830,0.046647,-0.004324,-0.001941
463395,ffffcd5bc19d62cad5a3815c87818d83,-2.502608e-05,-0.000001,-0.000058,-0.000002,0.000006,-0.000015,-0.000003,-0.000149,0.000022,...,0.003659,0.004068,-0.013442,-0.006783,0.002409,-0.005846,-0.005145,0.001794,0.000641,-0.006856
463396,ffffe984aafd6127ce8e43e3ca40c79d,2.051850e-02,0.042521,-0.000010,0.000011,0.000048,0.000098,0.000018,0.000053,0.000146,...,-0.005013,0.037460,0.010080,-0.000939,0.002054,-0.009534,0.010280,-0.013284,-0.000621,0.001127


In [14]:
def cos_sim(v1, v2):
    """
    Cosine similarity between two 1-D vectors.

    Returns 0.0 when either vector has zero norm. The normalised embeddings
    can contain all-zero rows, and dividing by a zero norm would otherwise
    yield NaN (plus a runtime warning).
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

In [15]:
from sklearn.neighbors import NearestNeighbors


# Session vectors are the queries; item vectors are the indexed points.
user_vecs = user_factors_df.loc[:, [f'user_factor_{i}' for i in range(dimensions)]].to_numpy()
# user_vecs = user_factors_last_item_df[[f'user_factor_{i}' for i in range(dimensions)]].values
item_vecs = item_factors_df.loc[:, [f'item_factor_{i}' for i in range(dimensions)]].to_numpy()

# For every session, look up 20 neighbouring items under cosine distance.
# Both outputs have one row per session; `indices` are row positions in item_vecs.
nn = NearestNeighbors(metric='cosine', n_neighbors=20).fit(item_vecs)
dists, indices = nn.kneighbors(user_vecs)

In [16]:
# session_id -> the final yad_no of that session's item list.
user_last_item_dict = {
    session_id: session_items[-1]
    for session_id, session_items in user_item_list_dict.items()
}

In [17]:
# Flatten the kNN output into one (session, candidate, similarity) row per neighbour.
#
# Row i of `indices`/`dists` belongs to the i-th key of user_item_list_dict, because
# the query vectors were built by iterating that dict. The assertion makes a stale or
# partially re-run kernel fail loudly instead of mis-assigning candidates.
assert len(indices) == len(dists) == len(user_item_list_dict), \
    "kNN output rows must align one-to-one with the sessions in user_item_list_dict"

# `indices` are row positions in the matrix the kNN index was fitted on (the rows of
# item_factors_df), so translate them through the yad_no column instead of assuming
# that row position == yad_no - 1.
yad_no_by_row = item_factors_df['yad_no'].to_numpy()

results = []
# Iterating the rows directly takes however many neighbours were actually returned,
# so this cell no longer hard-codes the n_neighbors value used when querying.
for user_id, neighbor_rows, neighbor_dists in zip(user_item_list_dict.keys(), indices, dists):
    last_yad_no = user_last_item_dict[user_id]
    for row_pos, dist in zip(neighbor_rows, neighbor_dists):
        yad_no = yad_no_by_row[row_pos]
        # The session's last item is not offered as a candidate.
        if yad_no == last_yad_no:
            continue

        # Cosine distance -> cosine similarity.
        results.append([user_id, yad_no, 1 - dist])
word2vec_sim_df = pd.DataFrame(results, columns=['session_id', 'yad_no', 'word2vec_sim'])

In [18]:
word2vec_sim_df

Unnamed: 0,session_id,yad_no,word2vec_sim
0,000007603d533d30453cc45d0f3d119f,11882,0.993255
1,000007603d533d30453cc45d0f3d119f,4101,0.991203
2,000007603d533d30453cc45d0f3d119f,3324,0.985492
3,000007603d533d30453cc45d0f3d119f,2808,0.969227
4,000007603d533d30453cc45d0f3d119f,8668,0.961310
...,...,...,...
8808861,fffffa7baf370083ebcdd98f26a7e31a,846,0.753347
8808862,fffffa7baf370083ebcdd98f26a7e31a,3,0.647567
8808863,fffffa7baf370083ebcdd98f26a7e31a,5800,0.623340
8808864,fffffa7baf370083ebcdd98f26a7e31a,13524,0.448887


In [19]:
def make_candidate_word2vec(session_only_df: pd.DataFrame, word2vec_sim_df: pd.DataFrame):
    """
    Attach similarity-based candidates to a set of sessions.

    Parameters
    ----------
    session_only_df : DataFrame with a `session_id` column.
    word2vec_sim_df : DataFrame with `session_id`, `yad_no` and `word2vec_sim`
        columns, one row per (session, candidate) pair.

    Returns
    -------
    session_df : DataFrame
        One row per (session, candidate) for the sessions in
        `session_only_df`, in merge order, with an added `rank` column
        (1 = highest similarity within the session, ties share the minimum
        rank). Sessions without any candidate are dropped.
    pred_df : DataFrame
        One row per row of `session_only_df`, in the same order, whose
        `yad_no` column holds the session's candidates as a list ordered by
        descending similarity (empty list when there is none).
    """
    session_df = session_only_df.merge(word2vec_sim_df, on=['session_id'], how='left')

    # The left join leaves NaN for sessions without candidates; drop those rows and
    # restore the integer dtype that the NaNs forced to float.
    session_df = session_df[session_df['yad_no'].notnull()].reset_index(drop=True)
    session_df['yad_no'] = session_df['yad_no'].astype(int)

    session_df['rank'] = session_df.groupby('session_id')['word2vec_sim'].rank(ascending=False, method='min')

    # Order the prediction lists by rank explicitly rather than trusting the row order of
    # the input. The stable sort keeps tied candidates in their incoming order, so input
    # that is already sorted by similarity produces exactly the same lists as before.
    ranked_df = session_df.sort_values('rank', kind='stable')
    pred_df = ranked_df.groupby('session_id')['yad_no'].apply(lambda x: x.tolist()).reset_index()
    pred_df = pd.merge(session_only_df, pred_df, on='session_id', how='left')

    # Sessions without candidates come back as NaN from the join; give them an empty list.
    pred_df['yad_no'] = pred_df['yad_no'].apply(lambda d: d if isinstance(d, list) else [])

    return session_df, pred_df

In [20]:
# Same candidate table, joined once onto the labelled train sessions and once onto the test sessions.
train_session_df, train_pred_df = make_candidate_word2vec(
    session_only_df=train_session_only_df,
    word2vec_sim_df=word2vec_sim_df,
)
test_session_df, test_pred_df = make_candidate_word2vec(
    session_only_df=test_session_only_df,
    word2vec_sim_df=word2vec_sim_df,
)

In [21]:
train_session_df

Unnamed: 0,session_id,yad_no,word2vec_sim,rank
0,000007603d533d30453cc45d0f3d119f,11882,0.993255,1.0
1,000007603d533d30453cc45d0f3d119f,4101,0.991203,2.0
2,000007603d533d30453cc45d0f3d119f,3324,0.985492,3.0
3,000007603d533d30453cc45d0f3d119f,2808,0.969227,4.0
4,000007603d533d30453cc45d0f3d119f,8668,0.961310,5.0
...,...,...,...,...
5488477,fffffa7baf370083ebcdd98f26a7e31a,846,0.753347,15.0
5488478,fffffa7baf370083ebcdd98f26a7e31a,3,0.647567,16.0
5488479,fffffa7baf370083ebcdd98f26a7e31a,5800,0.623340,17.0
5488480,fffffa7baf370083ebcdd98f26a7e31a,13524,0.448887,18.0


In [22]:
"""
cos_sims = []
for session_id, yad_no in tqdm(zip(train_session_df['session_id'].values, train_session_df['yad_no'].values)):
    session_last_yad_no = user_last_item_dict[session_id]

    sim = cos_sim(item_factors[session_last_yad_no], item_factors[yad_no])
    cos_sims.append(sim)
"""

"\ncos_sims = []\nfor session_id, yad_no in tqdm(zip(train_session_df['session_id'].values, train_session_df['yad_no'].values)):\n    session_last_yad_no = user_last_item_dict[session_id]\n\n    sim = cos_sim(item_factors[session_last_yad_no], item_factors[yad_no])\n    cos_sims.append(sim)\n"

In [23]:
train_pred_df

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,"[11882, 4101, 3324, 2808, 8668, 5289, 5821, 10..."
1,0000ca043ed437a1472c9d1d154eb49b,"[8253, 4909, 4488, 6516, 12609, 7675, 9024, 14..."
2,0000d4835cf113316fe447e2f80ba1c8,"[5238, 2290, 2615, 4863, 4355, 6722, 9039, 117..."
3,0000fcda1ae1b2f431e55a7075d1f500,"[2272, 13296, 773, 9790, 1341, 626, 7872, 1314..."
4,000104bdffaaad1a1e0a9ebacf585f33,"[1284, 96, 7240, 254, 5490, 8696, 902, 1490, 1..."
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,"[13682, 13079, 1092, 5719, 600, 13210, 5179, 1..."
288694,ffff2360540745117193ecadcdc06538,"[6654, 3940, 2900, 5299, 399, 8465, 963, 513, ..."
288695,ffff7fb4617164b2604aaf51c40bf82d,"[7308, 7820, 4398, 10364, 3566, 12240, 4040, 9..."
288696,ffffcd5bc19d62cad5a3815c87818d83,"[9671, 12829, 570, 12500, 11091, 3238, 1820, 1..."


## CV: MAP@10 on the train sessions, by embedding dimension
dim=256 0.20479048683487233  
dim=512 0.24105351563621108  
dim=1024 0.284614876566241

In [24]:
# MAP@10 of the candidate lists against the train labels.
# train_pred_df keeps the row order of train_df, so the two lists line up positionally.
mapk(
    train_df['yad_no'].tolist(),
    train_pred_df['yad_no'].tolist(),
    k=10,
)

0.284614876566241

In [25]:
train_df

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,4101
1,0000ca043ed437a1472c9d1d154eb49b,8253
2,0000d4835cf113316fe447e2f80ba1c8,4863
3,0000fcda1ae1b2f431e55a7075d1f500,1652
4,000104bdffaaad1a1e0a9ebacf585f33,96
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,2259
288694,ffff2360540745117193ecadcdc06538,963
288695,ffff7fb4617164b2604aaf51c40bf82d,13719
288696,ffffcd5bc19d62cad5a3815c87818d83,10619


In [26]:
import os

# to_pickle does not create missing directories, so make sure the target exists
# first (the factor pickles further down get the same treatment).
os.makedirs('../datasets/candidate_df/', exist_ok=True)

# Per-(session, candidate) tables with similarity and rank, for train and test.
train_session_df.to_pickle(f'../datasets/candidate_df/{OUTPUT_NAME}_train.pkl')
test_session_df.to_pickle(f'../datasets/candidate_df/{OUTPUT_NAME}_test.pkl')

In [None]:
import os
# Create the directory the factor pickles are written to; exist_ok makes re-running the cell harmless.
os.makedirs(f'../datasets/{OUTPUT_NAME}_pkl/', exist_ok=True)

In [27]:
# Persist the three factor tables under ../datasets/{OUTPUT_NAME}_pkl/:
# mean-of-session vectors, last-item session vectors, and the per-item embeddings.
user_factors_df.to_pickle(f'../datasets/{OUTPUT_NAME}_pkl/{OUTPUT_NAME}_user_factors_df.pkl')
user_factors_last_item_df.to_pickle(f'../datasets/{OUTPUT_NAME}_pkl/{OUTPUT_NAME}_user_factors_last_item_df.pkl')
item_factors_df.to_pickle(f'../datasets/{OUTPUT_NAME}_pkl/{OUTPUT_NAME}_item_factors_df.pkl')