このディスカッションではword2vecを用いて商品のembeddingを行います。  
[word2vecを利用した埋め込み分析とSWEMを用いた比較実験](https://speakerdeck.com/takapy/word2vecwoli-yong-sitamai-meip-mifen-xi-toswemwoyong-itabi-jiao-shi-yan)を非常に参考にさせていただきました。

# word2vecによる分散表現の獲得

In [14]:
import os
import multiprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from glob import glob

from gensim.models import word2vec, KeyedVectors
# import umap

In [15]:
# Input files are resolved relative to the parent of the working directory.
BASE_PATH = Path('../')
INPUT_DIR = BASE_PATH / 'input'

# Load the cart log and keep only the rows whose action_name is '買物'.
cartlog_df = pd.read_csv(INPUT_DIR / 'cart_log.csv')
# .copy() detaches the filtered rows from the original frame, so the column
# assignment below is not a chained assignment on a slice
# (avoids pandas' SettingWithCopyWarning).
cartlog_df = cartlog_df[cartlog_df['action_name'] == '買物'].copy()
# Rows without an item name get the placeholder token '欠損商品' so that every
# row still contributes a token to the word2vec sentences built later.
cartlog_df['item_detail_add_1'] = cartlog_df['item_detail_add_1'].fillna('欠損商品')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [16]:
# Learn distributed representations with word2vec
def word2vec_vectorizer(input_df, col, replace=True):
    """Train (or reuse) word2vec vectors for ``col`` and average them per row.

    Each entry of ``input_df[col]`` is treated as one "sentence": a list of
    tokens. The learned vectors are cached on disk as
    ``word2vec_{col}.wordvectors`` in the current working directory.

    Args:
        input_df: DataFrame holding the token lists.
        col: Name of the column containing one list of tokens per row.
        replace: If True, always retrain and overwrite the cached vectors.
            If False, retrain only when no cached file exists yet.

    Returns:
        The result of ``input_df[col].apply(...)``: for every row, the
        element-wise mean of the vectors of its tokens.

    Raises:
        KeyError: If a token has no vector. This can happen when
            ``replace=False`` reuses a cache that was trained on other data.
    """
    path_name = f'word2vec_{col}.wordvectors'
    # Train when explicitly requested OR when there is nothing cached yet.
    # (The previous condition lacked the `not`, so replace=False retrained
    # whenever a cache existed and failed on load whenever it did not.)
    if replace or not os.path.exists(path_name):
        model = word2vec.Word2Vec(
            sentences=input_df[col].tolist(),
            vector_size=32,
            min_count=1,  # keep every token, even those seen only once
            window=5,
            sg=1,  # training algorithm: 1 = skip-gram, 0 = CBOW
            hs=1,  # 1 = use hierarchical softmax during training
            epochs=5,
            workers=multiprocessing.cpu_count(),
        )
        model.wv.save(path_name)
    # Always read the vectors back from disk so the trained and cached paths
    # go through the same code.
    keyed_vectors = KeyedVectors.load(path_name)
    vectors = input_df[col].apply(
        lambda tokens: np.mean([keyed_vectors[token] for token in tokens], axis=0)
    )
    return vectors

# Collect the items of every session into one list per session_id; each list
# is one "sentence" for word2vec.
grp_df = cartlog_df.groupby("session_id")["item_detail_add_1"]
grp_df = grp_df.apply(list).to_frame()

# Train the vectors and get one averaged vector per session.
train_vectors = word2vec_vectorizer(grp_df, col='item_detail_add_1')

In [24]:
# Embed every item using the vectors learned above
model = KeyedVectors.load('word2vec_item_detail_add_1.wordvectors')

# Map each distinct item name to its learned vector.
item2vec = {
    item_name: model[item_name]
    for item_name in cartlog_df['item_detail_add_1'].unique()
}

# The dict becomes one column per item; transpose so each item is a row.
item2vec_df = pd.DataFrame(item2vec).transpose()

In [25]:
# Prefix the vector dimensions with 'WE_' and move the item name out of the
# index into a regular 'item_detail_add_1' column.
item2vec_df = (
    item2vec_df
    .add_prefix('WE_')
    .reset_index()
    .rename(columns={'index': 'item_detail_add_1'})
)

In [26]:
# Make sure the output directory exists first; to_pickle does not create
# missing parent directories and would raise otherwise.
Path('../save').mkdir(parents=True, exist_ok=True)
# Persist the item-embedding table.
item2vec_df.to_pickle('../save/item2vec.pkl')

In [27]:
# Preview the first rows of the item-embedding table.
item2vec_df.head()

Unnamed: 0,item_detail_add_1,WE_0,WE_1,WE_2,WE_3,WE_4,WE_5,WE_6,WE_7,WE_8,...,WE_22,WE_23,WE_24,WE_25,WE_26,WE_27,WE_28,WE_29,WE_30,WE_31
0,みかんM 大箱,-0.514487,-0.033075,0.550641,0.069548,0.278677,-0.801782,0.516729,1.235958,-0.127477,...,-0.230008,0.340092,0.502318,-0.100363,0.487873,0.868714,-0.08724,-0.306108,-0.3688,0.743355
1,大盛__白糸こんにゃく５０,-0.146149,-0.353589,1.018491,0.477366,0.239085,0.010357,0.601837,0.058288,0.203697,...,-0.104798,1.103203,0.301417,-0.272717,-0.094404,-0.068904,0.44507,-0.065786,-0.17418,-0.02046
2,ベビーシュークリーム,-0.330114,-0.397534,0.169174,-0.101737,0.000875,0.160388,0.716118,-0.086521,-0.032288,...,0.168622,0.244968,0.015846,-0.092296,-0.22309,-0.028382,0.588407,-0.037854,-0.332171,-0.086298
3,システマハブラシ　超コン,-0.420249,0.099647,0.745661,-0.006738,0.412956,0.757717,-0.292513,-0.339556,0.18196,...,-0.129993,-0.494572,-0.0192,-0.352631,-0.000842,0.53542,0.383507,-0.184542,-0.284896,0.40574
4,アーモンド効果_砂糖不使用,-0.214873,-0.62491,0.746134,0.518793,0.26344,0.253628,0.068422,-0.74454,0.333025,...,0.159708,-0.671398,0.332945,0.456905,-0.068264,0.580323,-0.099976,-0.829716,-0.574757,0.140301


# word2vecを用いた類似したアイテムの出力

ベビーチーズに近いものを見てみたけど、なんだか良さそう🙌

In [20]:
# Query item whose nearest neighbours in the embedding space we want to see.
name = 'ベビーチーズ'
# Show the 10 most similar items together with their similarity scores.
model.most_similar(positive=[name], topn=10)

[('カマンベール入りベビーチ', 0.9093077182769775),
 ('アーモンド入りベビーチー', 0.8766981959342957),
 ('スモーク味ベビーチーズ', 0.8647148609161377),
 ('モッツァレラベビーチーズ', 0.8643010258674622),
 ('ブラックペッパー入りベビ', 0.858326256275177),
 ('おいしく健康プラスベビー', 0.8482185006141663),
 ('十勝のむヨーグルト糖質ｏ', 0.8290136456489563),
 ('リアロ\u3000風呂いす\u300030ｃｍ', 0.828390896320343),
 ('おうちDE居酒屋ベビーチー', 0.8272545337677002),
 ('毎日骨太ベビーチーズ', 0.8247004151344299)]

# UMAPを用いた可視化

In [21]:
# um = umap.UMAP(random_state=42)
# um.fit(item2vec_df)
# um_vec = um.transform(item2vec_df)

In [22]:
# plt.figure(figsize=(8, 6))
# plt.scatter(um_vec[:, 0], um_vec[:, 1], s=1)
# plt.show()