# word2vec

In [1]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv
# Load variables from a .env file into the process environment. The call
# parentheses are required: the bare name `load_dotenv` is a no-op expression
# and leaves the environment untouched.
load_dotenv()
# Extend the import path with UTILS_PATH (presumably the project's utility
# modules). Skipped when the variable is unset so None never lands on sys.path.
_utils_path = os.getenv('UTILS_PATH')
if _utils_path:
    sys.path.append(_utils_path)
from tqdm import tqdm
import multiprocessing
import random
from collections import defaultdict

import pickle
import pandas as pd
import numpy as np
import cudf
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim.similarities.annoy import AnnoyIndexer

In [2]:
# Reproducibility and parallelism settings for the notebook.
SEED = 42  # seed for Python's built-in random module
cores = multiprocessing.cpu_count()  # CPU count as reported by multiprocessing
random.seed(SEED)

In [3]:
# Directory settings, each read from the environment variable of the same
# name; a value is None when its variable is not set.
INPUT_DIR, OUTPUT_DIR, PREP_DIR = map(
    os.getenv, ("INPUT_DIR", "OUTPUT_DIR", "PREP_DIR")
)

In [4]:
# Number of nearest-neighbour aids requested per session; passed as `topn`
# to most_similar in make_similar_aid_df.
k = 20

In [5]:
# Load the pickled model (presumably a gensim Word2Vec instance, given the
# `.wv` accesses below) from the preprocessing directory.
# The path is built by plain concatenation, so PREP_DIR must end with a path
# separator.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load artifacts produced by a trusted pipeline.
with open(PREP_DIR + "word2vec.pkl", "rb") as f:
    w2v = pickle.load(f)
aid_vecs = w2v.wv.vectors  # embedding matrix, indexed by the values of key2idx
key2idx = w2v.wv.key_to_index  # mapping: key (aid) -> row index
idx2key = w2v.wv.index_to_key  # inverse lookup: row index -> key (aid)
# Approximate nearest-neighbour index over the model's vectors; per the gensim
# API the second argument is the number of Annoy trees.
annoy_index = AnnoyIndexer(w2v, 50)

In [6]:
def make_similar_aid_df(args):
    """Find the aids closest to one session's vector.

    Takes a single packed argument so it can be mapped directly over
    ``dict.items()`` by ``multiprocessing.Pool.imap_unordered``. Reads the
    module-level ``w2v``, ``annoy_index`` and ``k``.

    Args:
        args: Sequence whose first item is the session id and whose second
            item is the query vector handed to ``most_similar``.

    Returns:
        pandas.DataFrame with columns ``session``, ``aid`` and ``sim``, one
        row per neighbour returned by the search (at most ``k`` rows).
    """
    session = args[0]
    session_vec = args[1]
    neighbours = w2v.wv.most_similar(session_vec, topn=k, indexer=annoy_index)
    # Size the session column from the actual result rather than from k: the
    # search can return fewer than k hits (e.g. a vocabulary smaller than k),
    # and mismatched column lengths make the DataFrame constructor raise.
    return pd.DataFrame({
        "session": [session] * len(neighbours),
        "aid": [aid for aid, _ in neighbours],
        "sim": [sim for _, sim in neighbours],
    })

In [7]:
# Dataset suffixes to process. "" selects test_sessions.pkl; any other value
# selects train_sessions{suffix}.pkl. An earlier revision of this cell also
# listed "_week4" and ""; only "_week3" is active here.
week_suffixes = ["_week3"]

# Embedding width taken from the loaded vectors instead of a hard-coded 50, so
# the cell keeps working if the model is trained with another vector size.
vec_dim = aid_vecs.shape[1]
# Worker count for the neighbour search; loop-invariant, so computed once.
processes = multiprocessing.cpu_count()

for suffix in week_suffixes:
    # Load the session data for this split.
    if suffix == "":
        sessions_df = pd.read_pickle(PREP_DIR + "test_sessions.pkl")
    else:
        sessions_df = pd.read_pickle(PREP_DIR + f"train_sessions{suffix}.pkl")
    # Order by session, then by ts descending, and collapse to one list of
    # aids per session.
    sessions_df = sessions_df.sort_values(["session", "ts"], ascending=(True, False))
    sessions_df = sessions_df.groupby("session")["aid"].apply(list).reset_index()

    # Mean of the aid vectors over each whole session.
    session_avg_vecs = {}
    session_aid_pairs = zip(sessions_df["session"].values, sessions_df["aid"].values)
    for session, aid_seq in tqdm(session_aid_pairs, total=len(sessions_df)):
        aid_seq_len = len(aid_seq)
        session_avg_vec = np.zeros(vec_dim)
        for aid in aid_seq:
            # NOTE: raises KeyError if an aid is not in the model vocabulary.
            idx = key2idx[aid]
            session_avg_vec += aid_vecs[idx] / aid_seq_len
        session_avg_vecs[session] = session_avg_vec
    with open(PREP_DIR + f"w2v_session_total_avg_vec{suffix}.pkl", "wb") as f:
        pickle.dump(session_avg_vecs, f)

    # Nearest-neighbour aid search, fanned out over worker processes.
    with multiprocessing.Pool(processes=processes) as pool:
        # items() is consumed lazily (no throwaway list of every pair), and a
        # chunksize above the default of 1 cuts per-task IPC overhead on a
        # very long iterable. Result order is unspecified either way.
        dfs = pool.imap_unordered(
            make_similar_aid_df, session_avg_vecs.items(), chunksize=1024
        )
        dfs = tqdm(dfs, total=len(session_avg_vecs))
        # Drain the iterator before the pool is torn down on exit.
        dfs = list(dfs)
    df = pd.concat(dfs)
    df.to_pickle(PREP_DIR + f"w2v_session_total_avg_similar_aids{suffix}.pkl")

100%|██████████| 4642744/4642744 [01:08<00:00, 67759.21it/s] 
100%|██████████| 4642744/4642744 [26:51<00:00, 2880.62it/s] 
