# word2vec

In [1]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv
# Actually invoke load_dotenv(): a bare `load_dotenv` expression only names the
# function and never reads the .env file, so the os.getenv() lookups in this
# notebook would see nothing but the already-exported process environment.
load_dotenv()
# Make the project's utility modules importable, but only when UTILS_PATH is
# configured (avoids appending None to sys.path).
_utils_path = os.getenv('UTILS_PATH')
if _utils_path:
    sys.path.append(_utils_path)
from tqdm import tqdm
import multiprocessing
import random
from collections import defaultdict
import pickle

import pandas as pd
import numpy as np
import cudf
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim.similarities.annoy import AnnoyIndexer

In [2]:
# Single seed reused below for Python's `random`, Word2Vec and TruncatedSVD.
SEED = 42
# CPU count reported by the machine; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()
random.seed(SEED)

In [3]:
# Directory locations are taken from the environment; each one is None when
# the corresponding variable is not set.
INPUT_DIR, OUTPUT_DIR, PREP_DIR = (
    os.environ.get(var_name)
    for var_name in ("INPUT_DIR", "OUTPUT_DIR", "PREP_DIR")
)

In [4]:
# Load data: read the preprocessed train/test session logs and drop the
# "type" and "ts" columns, which are not used in this notebook.
train_sessions, test_sessions = (
    pd.read_pickle(PREP_DIR + file_name).drop(columns=["type", "ts"])
    for file_name in ("train_sessions.pkl", "test_sessions.pkl")
)

# Stack both splits, then build one list of aids per session (row order is
# kept inside each session); these lists are the word2vec "sentences".
sessions = pd.concat([train_sessions, test_sessions])
sentences = list(sessions.groupby("session")["aid"].apply(list))
# Every distinct aid, in ascending order.
aids = sessions["aid"].unique().tolist()
aids.sort()

# The per-split frames are no longer needed once merged; free their memory.
del train_sessions, test_sessions
gc.collect()

0

In [5]:
# Train a skip-gram (sg=1) word2vec model in which every session is a sentence
# and every aid is a word. min_count=1 keeps aids that occur only once.
# workers is clamped to at least 1: `cores - 1` is 0 on a single-CPU machine,
# which would leave gensim without any training thread.
# NOTE(review): gensim documents that `seed` alone yields a fully reproducible
# model only with workers=1 (and a fixed PYTHONHASHSEED); with several workers
# the vectors can differ slightly between runs.
w2v = Word2Vec(
    sentences=sentences,
    vector_size=50,
    min_count=1,
    window=20,
    workers=max(1, cores - 1),
    seed=SEED,
    sg=1,
    epochs=4,
    sample=0,
)
# Approximate nearest-neighbour index over the trained vectors. The second
# argument is the number of Annoy trees (num_trees); it is unrelated to
# vector_size even though both happen to be 50.
annoy_index = AnnoyIndexer(w2v, 50)

In [6]:
# Persist the trained Word2Vec model as a pickle inside PREP_DIR.
with open(PREP_DIR + "word2vec.pkl", mode="wb") as model_file:
    pickle.dump(w2v, model_file)

In [6]:
# Build a table of the nearest neighbours of every aid in embedding space:
# one row per (aid_x, aid_y) pair with its similarity score `sim`.
N_NEIGHBORS = 20  # neighbours kept per aid

aid_xs = []
aid_ys = []
sims = []

for aid in tqdm(aids):
    # Request one extra hit because the query aid itself is expected among the
    # results. It is removed by key rather than by position: an approximate
    # index may not rank it first, and may return fewer hits than requested.
    mss = w2v.wv.most_similar(positive=[aid], topn=N_NEIGHBORS + 1, indexer=annoy_index)
    neighbors = [(key, score) for key, score in mss if key != aid][:N_NEIGHBORS]
    # Repeat aid once per neighbour actually kept so the three lists always
    # have the same length.
    aid_xs.extend([aid] * len(neighbors))
    aid_ys.extend(key for key, _ in neighbors)
    sims.extend(score for _, score in neighbors)
results = pd.DataFrame({"aid_x": aid_xs, "aid_y": aid_ys, "sim": sims})
results.to_pickle(PREP_DIR + "word2vec_similar.pkl")
results.head(20)

100%|██████████| 1855603/1855603 [08:39<00:00, 3572.00it/s]


Unnamed: 0,aid_x,aid_y,sim
0,0,466458,0.817219
1,0,1423226,0.800505
2,0,556806,0.795435
3,0,1781001,0.793806
4,0,1098090,0.792751
5,0,467359,0.792213
6,0,170003,0.790647
7,0,1636583,0.789078
8,0,22351,0.78888
9,0,1696009,0.788624


# SVD

In [7]:
# keys: the model's vocabulary entries (aids) in key_to_index iteration order.
# vecs: the embedding matrix of the trained model.
# NOTE(review): later cells attach `keys` to the rows of `vecs`, which assumes
# both are in the same order -- confirm against gensim's KeyedVectors.
keys = list(w2v.wv.key_to_index)
vecs = w2v.wv.vectors

In [11]:
# Save the full-width embeddings: one "w2v_<i>" column per dimension plus the
# aid each row belongs to.
vecs_df = pd.DataFrame(vecs, columns=[f"w2v_{i}" for i in range(vecs.shape[1])])
vecs_df["aid"] = keys
# The width in the file name is derived from the matrix itself (50 for the
# model trained above) so it cannot go stale if vector_size is changed.
vecs_df.to_pickle(PREP_DIR + f"w2v_vector_n{vecs.shape[1]}.pkl")

In [86]:
from sklearn.decomposition import TruncatedSVD

# Compress the embeddings to several smaller widths with truncated SVD and
# write each reduced table to its own "w2v_vector_n<width>.pkl" file.
for n_comp in (5, 10, 15, 20):
    svd = TruncatedSVD(n_components=n_comp, random_state=SEED)
    vecs_svd = svd.fit_transform(vecs)
    # Same layout as the full-width table: "w2v_<i>" columns followed by "aid".
    component_names = [f"w2v_{i}" for i in range(vecs_svd.shape[1])]
    vecs_df = pd.DataFrame(vecs_svd, columns=component_names).assign(aid=keys)
    vecs_df.to_pickle(PREP_DIR + f"w2v_vector_n{n_comp}.pkl")