In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
import sys
# Add the parent directory to the import path so the project-local `myutils` package resolves.
sys.path.append('../')

from myutils.metrics import mapk

In [3]:
# Prefix of the candidate pickle file names written near the end of the notebook.
OUTPUT_NAME = 'node2vec'

In [4]:
# Load the competition CSV files.
yado_df = pd.read_csv('../datasets/atmaCup16_Dataset/yado.csv')
train_df = pd.read_csv('../datasets/atmaCup16_Dataset/train_label.csv')
train_log_df = pd.read_csv('../datasets/atmaCup16_Dataset/train_log.csv')
test_log_df = pd.read_csv('../datasets/atmaCup16_Dataset/test_log.csv')
test_session_only_df = pd.read_csv('../datasets/atmaCup16_Dataset/test_session.csv')

# Frame holding only the session_id column of the training labels.
train_session_only_df = train_df[['session_id']]

# Train and test logs stacked row-wise with a fresh 0..n-1 index.
train_test_log_df = pd.concat([train_log_df, test_log_df], axis=0, ignore_index=True)

In [5]:
# Sanity check: smallest yad_no present in yado_df.
yado_df['yad_no'].min()

1

In [6]:
# One list of yad_no values per session (train + test logs), sessions in groupby order.
sentence_list = [
    list(session_yad_nos)
    for _, session_yad_nos in train_test_log_df.groupby('session_id')['yad_no']
]

In [7]:
import networkx as nx

# Undirected graph with one node per yad_no in yado_df and one edge for every pair of
# yad_no values that appear next to each other inside a session's item list.
G = nx.Graph()
G.add_nodes_from(yado_df['yad_no'].tolist())

# A set de-duplicates repeated transitions; sessions with a single item contribute no pair.
edges = list({
    (src, dst)
    for items in sentence_list
    for src, dst in zip(items, items[1:])
})
G.add_edges_from(edges)

In [8]:
from node2vec import Node2Vec
import os
import pickle
dimensions = 256

node2vec = Node2Vec(
    G,  # input graph
    dimensions=dimensions,  # embedding dimensionality (default: 128)
    walk_length=80,  # number of nodes visited per walk (default: 80)
    num_walks=10,  # number of walks per node (default: 10)
    p=1,  # p value: the larger it is, the closer the search is to DFS (default: 1)
    q=1,  # q value: the larger it is, the closer the search is to BFS (default: 1)
    seed=42,
    workers=1
)


# Persist the walk generator. The target directory is created first because open()
# does not create missing directories and would raise FileNotFoundError on a fresh run.
output_path = f'../datasets/node2vec_pkl/node2vec_dim_{dimensions}.pkl'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as fout:
    pickle.dump(node2vec, fout)

print('node2vec fit')
node2vec_model = node2vec.fit(window=10, min_count=1,)

# Persist the fitted embedding model; `with` guarantees the handle is flushed and closed.
output_path = f'../datasets/node2vec_pkl/node2vec_model_dim_{dimensions}.pkl'
with open(output_path, 'wb') as fout:
    pickle.dump(node2vec_model, fout)

# Round-trip through the file that was just written, so later cells use exactly what is
# stored on disk. NOTE: pickle.load executes arbitrary code; only load files you produced.
with open(output_path, 'rb') as fin:
    node2vec_model = pickle.load(fin)

  from .autonotebook import tqdm as notebook_tqdm
Computing transition probabilities: 100%|██████████| 13806/13806 [00:08<00:00, 1657.30it/s]
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:50<00:00,  5.07s/it]


node2vec fit


In [9]:
# Embed every yad_no listed in yado_df (alternative: only the yad_no values seen in the logs).
item_ids = yado_df['yad_no'].to_list()

# The key must be a str: otherwise the lookup is attempted by index instead of by key and fails.
item_factors = {yad_no: node2vec_model.wv[str(yad_no)] for yad_no in item_ids}

# One row per yad_no; first column is the id, the remaining ones are the embedding values.
item_cols = [f"item_factor_{i}" for i in range(dimensions)]
item_factors_df = pd.DataFrame(item_factors).T.reset_index()
item_factors_df.columns = ["yad_no"] + item_cols

In [10]:
# session_id -> yad_no values of that session, in the row order of train_test_log_df.
user_item_list_dict = {
    session_id: yad_nos.tolist()
    for session_id, yad_nos in train_test_log_df.groupby('session_id')['yad_no']
}
user_cols = [f"user_factor_{i}" for i in range(dimensions)]

# Session vector, variant 1: element-wise mean of the embeddings of all items in the session.
user_factors = {
    session_id: np.mean([node2vec_model.wv[str(yad_no)] for yad_no in session_items], axis=0)
    for session_id, session_items in user_item_list_dict.items()
}
user_factors_df = pd.DataFrame(user_factors).T.reset_index()
user_factors_df.columns = ["session_id"] + user_cols

# Session vector, variant 2: embedding of the last item in the session only.
user_factors_last_item = {
    session_id: node2vec_model.wv[str(session_items[-1])]
    for session_id, session_items in user_item_list_dict.items()
}
user_factors_last_item_df = pd.DataFrame(user_factors_last_item).T.reset_index()
user_factors_last_item_df.columns = ["session_id"] + user_cols

In [11]:
# Inspect the item embedding table (a yad_no column followed by item_factor_* columns).
item_factors_df

Unnamed: 0,yad_no,item_factor_0,item_factor_1,item_factor_2,item_factor_3,item_factor_4,item_factor_5,item_factor_6,item_factor_7,item_factor_8,...,item_factor_246,item_factor_247,item_factor_248,item_factor_249,item_factor_250,item_factor_251,item_factor_252,item_factor_253,item_factor_254,item_factor_255
0,1,-0.465011,-0.194455,-0.342215,-0.092711,-0.045938,-0.293998,0.038662,0.040464,0.480109,...,0.550159,-0.369464,-0.113164,-0.129572,0.723206,0.353853,0.732190,-0.615832,-0.040559,-0.163511
1,2,-0.196631,0.105356,0.030539,-0.118949,-0.480339,-0.030145,0.369313,0.577021,0.070234,...,0.030216,0.071987,-0.308143,-0.158368,0.395769,-0.102580,0.452296,-0.678276,0.308369,-0.195898
2,3,-0.073875,0.028553,0.379010,-0.407424,-0.067514,0.086802,0.126333,0.002414,-0.147998,...,0.264591,-0.027001,0.158245,-0.373954,0.222846,0.321430,0.498740,-0.440987,-0.114747,-0.068319
3,4,-0.141257,-0.058802,0.239744,-0.166719,0.081533,0.179132,0.171799,-0.188074,-0.339638,...,-0.112137,-0.114088,0.236210,-0.331756,0.223249,0.314776,0.415830,-0.454482,-0.410732,-0.382376
4,5,-0.276601,0.014311,0.132917,-0.229465,-0.215634,-0.259618,-0.125262,0.020865,-0.081804,...,0.207035,-0.361392,-0.189437,0.081159,0.138989,0.298327,0.530347,-0.393964,0.046485,-0.462267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13801,13802,0.002669,-0.001907,0.003557,0.002115,0.001977,0.002623,-0.002685,-0.002443,0.003627,...,0.003442,0.001627,-0.003164,-0.001899,-0.002215,0.002331,0.002709,-0.001052,0.003014,0.000895
13802,13803,-0.465944,0.020414,0.155174,-0.381585,-0.167423,-0.195857,0.089163,0.100445,-0.275950,...,0.116011,-0.192916,0.202523,0.053809,0.312431,0.124450,0.472563,-0.129923,0.211707,-0.155131
13803,13804,-0.561459,0.031070,0.096828,-0.115477,-0.063864,-0.393154,0.025283,0.382135,0.313645,...,0.482758,-0.412895,0.097509,-0.063310,-0.263685,-0.149204,0.334513,-0.427987,-0.185923,-0.249481
13804,13805,-0.288549,0.255652,0.138934,-0.193849,-0.188288,-0.259211,0.147652,0.297840,0.307060,...,-0.495622,-0.607045,0.401228,-0.121980,-0.196196,0.340264,0.333809,-0.724671,-0.220870,-0.186820


In [12]:
# Inspect the session embedding table (a session_id column followed by user_factor_* columns).
user_factors_df

Unnamed: 0,session_id,user_factor_0,user_factor_1,user_factor_2,user_factor_3,user_factor_4,user_factor_5,user_factor_6,user_factor_7,user_factor_8,...,user_factor_246,user_factor_247,user_factor_248,user_factor_249,user_factor_250,user_factor_251,user_factor_252,user_factor_253,user_factor_254,user_factor_255
0,000007603d533d30453cc45d0f3d119f,-0.237018,0.711067,0.309764,-0.266563,-0.312340,-0.129179,-0.048322,0.029635,-0.273560,...,0.026674,-0.244198,-0.030228,0.053457,0.079082,0.351946,0.665704,0.087710,-0.039126,-0.123282
1,00001149e9c73985425197104712478c,-0.284068,-0.010262,0.250098,-0.234949,-0.098894,0.132130,0.272573,-0.259737,0.194923,...,0.127696,0.055437,0.259300,0.078732,0.311373,0.243826,0.604472,-0.500194,-0.304222,-0.327431
2,0000ca043ed437a1472c9d1d154eb49b,-0.091434,0.330426,0.103902,-0.156354,-0.214806,-0.020675,0.280597,-0.120378,-0.248688,...,-0.109473,0.169042,-0.091253,-0.236972,0.135504,0.153860,0.356767,-0.122315,0.071387,-0.520533
3,0000d4835cf113316fe447e2f80ba1c8,-0.266545,-0.065985,0.180724,0.656032,-0.408095,0.191066,0.274075,0.045336,0.080646,...,0.719053,-0.371494,-0.168375,-0.221903,0.211329,-0.114065,0.402491,-0.257150,0.325383,-0.072049
4,0000e02747d749a52b7736dfa751e258,-0.003070,0.148226,0.456831,-0.273833,-0.216581,-0.188670,0.111053,0.031686,-0.440005,...,0.156530,0.026880,0.228770,0.034937,0.156484,0.262868,-0.053502,-0.559101,0.007979,-0.369569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463393,ffff9a7dcc892875c7a8b821fa436228,-0.454488,-0.205471,0.355939,-0.250358,-0.162521,-0.123031,-0.323941,0.194349,-0.000047,...,0.262715,-0.000411,0.126588,-0.258264,0.175444,0.131940,0.097128,-0.507345,0.000289,-0.191661
463394,ffffb1d30300fe17f661941fd085b04b,-0.483678,-0.054752,0.239757,-0.267505,-0.167443,0.247603,-0.062070,-0.242097,0.190960,...,-0.233535,0.269121,0.001065,-0.042897,0.135888,0.085969,-0.005077,-0.396297,-0.324921,-0.236160
463395,ffffcd5bc19d62cad5a3815c87818d83,0.056297,-0.270552,0.432280,0.112578,0.086015,-0.547916,-0.120533,0.071180,0.449325,...,-0.027515,-0.324551,0.000791,-0.182184,-0.169202,0.531953,0.594763,-0.142899,0.024322,0.064197
463396,ffffe984aafd6127ce8e43e3ca40c79d,-0.224717,0.237469,0.586991,-0.190262,0.051134,-0.188636,0.177109,-0.117233,-0.061950,...,-0.067229,0.014129,0.266524,-0.230457,0.230463,0.285927,0.399649,-0.354046,-0.268699,-0.215031


In [13]:
def cos_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as dot(v1, v2) / (||v1|| * ||v2||). There is no guard against
    zero-norm input.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / norm_product

In [14]:
from sklearn.neighbors import NearestNeighbors


# Query vectors are the mean-of-session embeddings; user_factors_last_item_df holds the
# last-item variant and could be swapped in here instead.
item_vecs = item_factors_df[[f'item_factor_{i}' for i in range(dimensions)]].to_numpy()
user_vecs = user_factors_df[[f'user_factor_{i}' for i in range(dimensions)]].to_numpy()

# For each session vector, look up the 20 item vectors with the smallest cosine distance.
nn = NearestNeighbors(n_neighbors=20, metric='cosine').fit(item_vecs)
dists, indices = nn.kneighbors(user_vecs)

In [15]:
# session_id -> last yad_no in that session's item list.
user_last_item_dict = {
    session_id: session_items[-1]
    for session_id, session_items in user_item_list_dict.items()
}

In [16]:
# Row position in the fitted item matrix -> yad_no. Reading the id from item_factors_df
# (instead of assuming yad_no == row position + 1) keeps the mapping correct even when
# the ids are not exactly 1..N in table order.
item_yad_nos = item_factors_df['yad_no'].to_numpy()

# Neighbour rows are paired with sessions purely by position, so fail loudly if the
# number of sessions and the number of query rows ever disagree.
assert len(indices) == len(dists) == len(user_item_list_dict)

results = []
for user_id, neighbor_rows, neighbor_dists in zip(user_item_list_dict.keys(), indices, dists):
    last_yad_no = user_last_item_dict[user_id]
    # Iterate over however many neighbours were requested rather than a hard-coded 20.
    for row, dist in zip(neighbor_rows, neighbor_dists):
        yad_no = item_yad_nos[row]
        # The session's last item is never emitted as a candidate.
        if yad_no == last_yad_no:
            continue

        # Cosine similarity = 1 - cosine distance.
        results.append([user_id, yad_no, 1 - dist])
word2vec_sim_df = pd.DataFrame(results, columns=['session_id', 'yad_no', 'word2vec_sim'])

In [17]:
# Inspect the long-format candidate table (session_id, candidate yad_no, similarity).
word2vec_sim_df

Unnamed: 0,session_id,yad_no,word2vec_sim
0,000007603d533d30453cc45d0f3d119f,2808,0.989724
1,000007603d533d30453cc45d0f3d119f,4101,0.989375
2,000007603d533d30453cc45d0f3d119f,11882,0.989308
3,000007603d533d30453cc45d0f3d119f,3324,0.983550
4,000007603d533d30453cc45d0f3d119f,8668,0.980623
...,...,...,...
8804649,fffffa7baf370083ebcdd98f26a7e31a,6247,0.901572
8804650,fffffa7baf370083ebcdd98f26a7e31a,5800,0.896078
8804651,fffffa7baf370083ebcdd98f26a7e31a,8462,0.894492
8804652,fffffa7baf370083ebcdd98f26a7e31a,846,0.892161


In [18]:
def make_candidate_word2vec(session_only_df: pd.DataFrame, word2vec_sim_df: pd.DataFrame):
    """Attach similarity candidates to sessions and build ranked prediction lists.

    Parameters
    ----------
    session_only_df : pd.DataFrame
        Frame with a ``session_id`` column; its row order defines the order of ``pred_df``.
    word2vec_sim_df : pd.DataFrame
        Long-format candidates with ``session_id``, ``yad_no`` and ``word2vec_sim`` columns.

    Returns
    -------
    session_df : pd.DataFrame
        Candidate rows for the sessions in ``session_only_df`` (sessions without candidates
        are dropped), with an added ``rank`` column: 1 is the highest similarity within the
        session and ties share the lowest rank. Row order follows the merge and is not
        re-sorted.
    pred_df : pd.DataFrame
        One row per row of ``session_only_df``; ``yad_no`` holds the session's candidates
        as a list ordered by descending similarity, or ``[]`` when there are none.
    """
    session_df = session_only_df.merge(word2vec_sim_df, on=['session_id'], how='left')

    # Sessions without candidates leave the left merge with a null yad_no: drop them, then
    # cast yad_no back to int (the missing values would otherwise keep it non-integer).
    session_df = session_df[session_df['yad_no'].notnull()].reset_index(drop=True)
    session_df['yad_no'] = session_df['yad_no'].astype(int)

    session_df['rank'] = session_df.groupby('session_id')['word2vec_sim'].rank(ascending=False, method='min')

    # Order the candidates by rank explicitly instead of relying on the caller having
    # pre-sorted word2vec_sim_df. mergesort is stable, so tied candidates keep their input
    # order and already-sorted input produces exactly the same lists as before.
    pred_df = (
        session_df.sort_values('rank', kind='mergesort')
        .groupby('session_id')['yad_no']
        .apply(lambda x: x.tolist())
        .reset_index()
    )
    pred_df = pd.merge(session_only_df, pred_df, on='session_id', how='left')

    # Sessions without candidates come back as NaN from the merge; normalise to an empty list.
    pred_df['yad_no'] = pred_df['yad_no'].apply(lambda d: d if isinstance(d, list) else [])

    return session_df, pred_df

In [19]:
# Build the candidate frame and the per-session prediction lists for the train sessions
# and for the test sessions, both from the same similarity table.
train_session_df, train_pred_df = make_candidate_word2vec(train_session_only_df, word2vec_sim_df)
test_session_df, test_pred_df = make_candidate_word2vec(test_session_only_df, word2vec_sim_df)

In [20]:
# Inspect the train candidate frame, including the per-session rank column.
train_session_df

Unnamed: 0,session_id,yad_no,word2vec_sim,rank
0,000007603d533d30453cc45d0f3d119f,2808,0.989724,1.0
1,000007603d533d30453cc45d0f3d119f,4101,0.989375,2.0
2,000007603d533d30453cc45d0f3d119f,11882,0.989308,3.0
3,000007603d533d30453cc45d0f3d119f,3324,0.983550,4.0
4,000007603d533d30453cc45d0f3d119f,8668,0.980623,5.0
...,...,...,...,...
5485326,fffffa7baf370083ebcdd98f26a7e31a,6247,0.901572,15.0
5485327,fffffa7baf370083ebcdd98f26a7e31a,5800,0.896078,16.0
5485328,fffffa7baf370083ebcdd98f26a7e31a,8462,0.894492,17.0
5485329,fffffa7baf370083ebcdd98f26a7e31a,846,0.892161,18.0


In [21]:
# Disabled experiment kept as a bare string literal, so none of it executes: it would compute
# the cosine similarity between each candidate and the last item of its session.
# Being the only expression in the cell, the string itself is echoed as the cell output.
"""
cos_sims = []
for session_id, yad_no in tqdm(zip(train_session_df['session_id'].values, train_session_df['yad_no'].values)):
    session_last_yad_no = user_last_item_dict[session_id]

    sim = cos_sim(item_factors[session_last_yad_no], item_factors[yad_no])
    cos_sims.append(sim)
"""

"\ncos_sims = []\nfor session_id, yad_no in tqdm(zip(train_session_df['session_id'].values, train_session_df['yad_no'].values)):\n    session_last_yad_no = user_last_item_dict[session_id]\n\n    sim = cos_sim(item_factors[session_last_yad_no], item_factors[yad_no])\n    cos_sims.append(sim)\n"

In [22]:
# Inspect the per-session candidate lists for the train sessions.
train_pred_df

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,"[2808, 4101, 11882, 3324, 8668, 5289, 12837, 5..."
1,0000ca043ed437a1472c9d1d154eb49b,"[8253, 7675, 4488, 6053, 5749, 7180, 4909, 822..."
2,0000d4835cf113316fe447e2f80ba1c8,"[5238, 5778, 9039, 2290, 2615, 1650, 12379, 87..."
3,0000fcda1ae1b2f431e55a7075d1f500,"[7872, 13145, 13296, 2272, 626, 1341, 11111, 1..."
4,000104bdffaaad1a1e0a9ebacf585f33,"[96, 7240, 1284, 1490, 7749, 12338, 10915, 113..."
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,"[11924, 11677, 3364, 7609, 929, 600, 9459, 111..."
288694,ffff2360540745117193ecadcdc06538,"[3940, 963, 6654, 513, 399, 5299, 12625, 2900,..."
288695,ffff7fb4617164b2604aaf51c40bf82d,"[10364, 5037, 4398, 4040, 13420, 9002, 3566, 2..."
288696,ffffcd5bc19d62cad5a3815c87818d83,"[10619, 6933, 6595, 12781, 12500, 11091, 12398..."


In [23]:
# Score the train candidate lists against the labels with k=10.
# NOTE(review): `mapk` comes from myutils.metrics and is not shown here; presumably mean
# average precision at k. Confirm that it accepts a flat list of labels as `actual`.
# The two lists are paired by position: train_pred_df follows the row order of train_df
# because it is left-merged onto the session_id column taken from train_df.
mapk(train_df['yad_no'].to_list(), train_pred_df['yad_no'].to_list(), k=10)

0.34761723460787874

In [24]:
# Inspect the training labels (session_id and its yad_no label).
train_df

Unnamed: 0,session_id,yad_no
0,000007603d533d30453cc45d0f3d119f,4101
1,0000ca043ed437a1472c9d1d154eb49b,8253
2,0000d4835cf113316fe447e2f80ba1c8,4863
3,0000fcda1ae1b2f431e55a7075d1f500,1652
4,000104bdffaaad1a1e0a9ebacf585f33,96
...,...,...
288693,ffff2262d38abdeb247ebd591835dcc9,2259
288694,ffff2360540745117193ecadcdc06538,963
288695,ffff7fb4617164b2604aaf51c40bf82d,13719
288696,ffffcd5bc19d62cad5a3815c87818d83,10619


In [25]:
import os

# to_pickle does not create missing directories, so make sure the target exists first.
os.makedirs('../datasets/candidate_df', exist_ok=True)

# Persist the long-format candidate frames (with similarity and rank) for train and test.
train_session_df.to_pickle(f'../datasets/candidate_df/{OUTPUT_NAME}_train.pkl')
test_session_df.to_pickle(f'../datasets/candidate_df/{OUTPUT_NAME}_test.pkl')

In [None]:
import os
# Ensure the directory that receives the node2vec pickles exists (no error if it already does).
os.makedirs(f'../datasets/node2vec_pkl', exist_ok=True)

In [26]:
# Persist the embedding tables as pickles: session vectors (mean of items), session vectors
# (last item only) and item vectors.
user_factors_df.to_pickle(f'../datasets/node2vec_pkl/node2vec_user_factors_df.pkl')
user_factors_last_item_df.to_pickle(f'../datasets/node2vec_pkl/node2vec_user_factors_last_item_df.pkl')
item_factors_df.to_pickle(f'../datasets/node2vec_pkl/node2vec_item_factors_df.pkl')
