In [1]:
import os
# Every artifact directory lives two levels above the notebook's working
# directory; build the four locations from that shared root.
_artifact_root = '../..'
train_path = _artifact_root + '/raw_train_artifact'
test_path = _artifact_root + '/raw_test_artifact'
embedding_path = _artifact_root + '/embedding_artifact'
input_path = _artifact_root + '/input_artifact'

In [2]:
import sys
import time
import numpy as np

import pandas as pd
# Display settings for wide/long frames and 5-decimal float rendering.
# Use fully-qualified option keys: pandas resolves abbreviated keys by pattern
# match and raises OptionError when the pattern matches more than one option
# (e.g. 'precision' also matches 'styler.format.precision' on newer releases).
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', '{:.5f}'.format)

import tqdm
import joblib
import json

from gensim.models import Word2Vec
import tempfile
import gc
gc.enable()

In [3]:
import logging

# Log file is written to the notebook's working directory.
log_path = 'Product ID Embedding.log'

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.getLogger returns the same object every time this cell runs, so
# without this cleanup each re-run would stack another pair of handlers and
# every message would be emitted multiple times.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s %(levelname)-s: %(message)s', datefmt='%H:%M:%S')

# File handler: persists the run history in log_path.
fh = logging.FileHandler(log_path)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

# Stream handler: mirrors the same messages to stdout (the cell output).
sh = logging.StreamHandler(sys.stdout)
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
logger.addHandler(sh)

logger.info(f'Restart notebook\n==========================\n{time.ctime()}\n==========================')

10:46:47 INFO: Restart notebook
Mon Jun  1 10:46:47 2020


## Read File

In [4]:
# Stack the train and test click logs (train rows first) into one frame.
cl = pd.concat([
    pd.read_csv(os.path.join(root, 'click_log.csv'))
    for root in (train_path, test_path)
])

In [5]:
# Stack the train and test ad tables, then keep the first row for each
# (creative_id, product_id) pair.
ad = pd.concat([
    pd.read_csv(os.path.join(root, 'ad.csv'))
    for root in (train_path, test_path)
]).drop_duplicates(subset=['creative_id', 'product_id'])

In [9]:
# Attach ad attributes to every click via an inner join on creative_id.
merge = cl.merge(ad, on='creative_id')

# The source frames are no longer needed once joined; release them before sorting.
del cl
del ad
gc.collect()

# Order each user's clicks by the time column.
merge = merge.sort_values(by=['user_id', 'time'])

In [13]:
# Build, for every user, the sequence of clicked product ids in row order of
# `merge`. Keys and items are stringified so the result can be dumped to JSON
# and fed to Word2Vec as tokens.
product_agg_user = {}

for user, product in tqdm.tqdm(merge[['user_id', 'product_id']].values):
    product_agg_user.setdefault(str(user), []).append(str(product))

# Deduplicated variant: each product appears once per user, at the position of
# its first occurrence. dict.fromkeys preserves insertion order and drops
# repeats in linear time, replacing the per-click `product not in <list>` scan
# that was quadratic in the number of distinct products per user.
product_agg_user_dedup = {
    user: list(dict.fromkeys(products))
    for user, products in product_agg_user.items()
}

100%|██████████████████████████████████████████████████████████████████| 63668283/63668283 [03:04<00:00, 344572.43it/s]


In [16]:
def _write_json(payload, filename):
    """Serialize `payload` as JSON into `filename` under `embedding_path`."""
    with open(os.path.join(embedding_path, filename), 'w') as handle:
        json.dump(payload, handle)


# Persist both per-user sequences (full and deduplicated).
_write_json(product_agg_user, 'product_agg_user.json')
_write_json(product_agg_user_dedup, 'product_agg_user_dedup.json')

In [18]:
# Both mappings are on disk now; drop the in-memory copies and force a collection.
del product_agg_user
del product_agg_user_dedup
gc.collect()

139

## Train Word2Vec Model

In [20]:
# Reload the deduplicated per-user product sequences written earlier.
dedup_json_path = os.path.join(embedding_path, 'product_agg_user_dedup.json')
with open(dedup_json_path, 'r') as handle:
    product_agg_user_dedup = json.loads(handle.read())

In [21]:
# Hyperparameters for the product-id embedding, passed to Word2Vec unchanged.
w2v_params = dict(
    size=128,
    window=64,
    sg=0,
    hs=0,
    negative=20,
    cbow_mean=1,
    min_count=1,
    workers=16,
)

# Each user's deduplicated product list is one training sentence.
start = time.time()
model = Word2Vec(sentences=product_agg_user_dedup.values(), **w2v_params)
elapsed = time.time() - start
logger.info(f'Model training is done after {elapsed:.2f}s')

11:00:36 INFO: Model training is done after 103.70s


In [22]:
# Path prefix of the saved model inside the shared embedding directory.
# Derived from `embedding_path` rather than by splitting os.getcwd() on a
# hard-coded backslash, so it works on any OS and stays consistent with the
# directory the JSON artifacts are written to.
model_path = os.path.join(os.path.abspath(embedding_path), 'product_id_embed_s128_w64_cbow_')

# Reserve a uniquely named file (random suffix after the prefix) so repeated
# runs never overwrite an earlier model. delete=False keeps the file after the
# handle is closed.
with tempfile.NamedTemporaryFile(
    dir=os.path.dirname(model_path),
    prefix=os.path.basename(model_path),
    delete=False,
) as tmp:
    tmp_file_path = tmp.name

# Save only after the placeholder handle is closed, so the model writer is not
# competing with an open handle on the same file.
start = time.time()
model.save(tmp_file_path)
logger.info(f'Model is saved to {tmp_file_path} after {time.time()-start:.2f}s')

11:00:37 INFO: Model is saved to C:\JupyterNotebook\Tencent-Ads-Algo-Comp-2020\embedding_artifact\product_id_embed_s128_w64_cbow_8yemmp45 after 0.47s
