# w2v_cluster_freq
w2vのベクトルのsessionごとのtotal平均でクラスタリングし、クラスタごとの最頻値を抽出

In [1]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv

# Actually call load_dotenv(): a bare `load_dotenv` expression is a no-op, so
# the variables defined in the .env file would never reach os.getenv().
load_dotenv()

# Make the project's utility modules importable; skip the entry when the
# variable is unset rather than pushing None onto sys.path.
utils_path = os.getenv('UTILS_PATH')
if utils_path:
    sys.path.append(utils_path)
from tqdm import tqdm
import multiprocessing
import random
from collections import defaultdict

import pickle
import pandas as pd
import numpy as np
import cudf
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim.similarities.annoy import AnnoyIndexer
from cuml import KMeans

In [2]:
# Notebook-wide settings.
n_cluster = 100_000  # number of clusters; also embedded in the output file names
cores = multiprocessing.cpu_count()  # CPU count reported for this machine

# Seed Python's built-in random module so repeated runs draw the same values.
SEED = 42
random.seed(SEED)

In [3]:
# Directory locations are read from environment variables of the same name;
# each one is None when its variable is not set.
INPUT_DIR, OUTPUT_DIR, PREP_DIR = (
    os.environ.get(var_name)
    for var_name in ("INPUT_DIR", "OUTPUT_DIR", "PREP_DIR")
)

In [4]:
# Data-set selector: None selects the test-session files; a week label
# selects that week's training-session files.
week = None

# Suffix appended to week-specific artifact names (empty for the test set).
week_suffix = "" if week is None else f"_{week}"
sessions_file = (
    "test_sessions.parquet" if week is None else f"train_sessions{week_suffix}.parquet"
)

# os.path.join (rather than `PREP_DIR + name`) keeps the paths valid whether or
# not PREP_DIR ends with a path separator.
avg_vecs_path = os.path.join(PREP_DIR, f"w2v_session_total_avg_vec{week_suffix}.pkl")
sessions_path = os.path.join(PREP_DIR, sessions_file)
session_cluster_path = os.path.join(
    PREP_DIR, f"w2v_session_cluster_c{n_cluster}{week_suffix}.parquet"
)
cluster_freq_path = os.path.join(
    PREP_DIR, f"w2v_cluster_freq_c{n_cluster}{week_suffix}.parquet"
)

In [5]:
# Data-set selector: None selects the test-session files; a week label
# selects that week's training-session files.
week = "week4"

# Suffix appended to week-specific artifact names (empty for the test set).
week_suffix = "" if week is None else f"_{week}"
sessions_file = (
    "test_sessions.parquet" if week is None else f"train_sessions{week_suffix}.parquet"
)

# os.path.join (rather than `PREP_DIR + name`) keeps the paths valid whether or
# not PREP_DIR ends with a path separator.
avg_vecs_path = os.path.join(PREP_DIR, f"w2v_session_total_avg_vec{week_suffix}.pkl")
sessions_path = os.path.join(PREP_DIR, sessions_file)
session_cluster_path = os.path.join(
    PREP_DIR, f"w2v_session_cluster_c{n_cluster}{week_suffix}.parquet"
)
cluster_freq_path = os.path.join(
    PREP_DIR, f"w2v_cluster_freq_c{n_cluster}{week_suffix}.parquet"
)


# Load data
# NOTE(review): pickle.load can execute code embedded in the file; this
# presumably reads a pickle produced by this project's own preprocessing —
# confirm before pointing it at files from elsewhere.
with open(avg_vecs_path, "rb") as f:
    avg_vecs = pickle.load(f)
avg_vecs_df = cudf.DataFrame(
    list(avg_vecs.values()),
    index=avg_vecs.keys(),
)
sessions = cudf.read_parquet(sessions_path)

# Clustering: fit k-means on the per-session vectors and store the label
# assigned to every session.
kmeans = KMeans(n_clusters=n_cluster, random_state=SEED).fit(avg_vecs_df)
session_clstr = cudf.DataFrame(
    {
        "session": avg_vecs.keys(),
        "clstr": kmeans.labels_,
    }
)
session_clstr.to_parquet(session_cluster_path)

# Compute the most frequent values per cluster: attach each session's cluster
# to its rows, then count rows (non-null "ts") for every (clstr, aid) pair.
sessions = sessions.merge(session_clstr, on="session", how="left")
pair_counts = sessions.groupby(["clstr", "aid"])["ts"].count()
clstr_freq = pair_counts.reset_index().rename(columns={"ts": "cnt"})

# Order by cluster, highest count first; "rank" is the 0-based position of
# each row within its cluster in that order.
clstr_freq = clstr_freq.sort_values(
    by=["clstr", "cnt"],
    ascending=[True, False],
    ignore_index=True,
)
clstr_freq["rank"] = clstr_freq.groupby("clstr").cumcount()
clstr_freq.to_parquet(cluster_freq_path)