In [1]:
import os
# Artifact directories. All are relative paths, so they resolve against the
# notebook's current working directory.

# Directory the training click_log.csv is read from.
train_path = '../../raw_train_artifact'
# Directory the test click_log.csv is read from.
test_path = '../../raw_test_artifact'
# Directory where the per-user creative sequences (JSON) are written and reloaded.
embedding_path = '../../embedding_artifact'
# Not referenced by the cells visible here -- presumably used by later stages; TODO confirm.
input_path = '../../input_artifact'

In [2]:
import sys
import time
import numpy as np

import pandas as pd
# Display settings for DataFrame reprs in this notebook.
# Always use the fully qualified 'display.*' keys: pandas resolves abbreviated
# option names by regex, and short names such as 'precision' or 'max_rows'
# also match 'styler.*' options on newer pandas, which raises OptionError.
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', '{:.5f}'.format)

import tqdm
import joblib
import json

from gensim.models import Word2Vec
import tempfile
import gc
# Explicitly switch on automatic garbage collection (it is already on by
# default in CPython, so this is a safeguard rather than a behaviour change).
gc.enable()

In [3]:
import logging

# Log file is created in the notebook's current working directory.
log_path = 'Creative ID Embedding.log'

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.getLogger() hands back the same logger object on every call, so
# re-running this cell in a live kernel would otherwise stack another pair of
# handlers on it and every message would be written twice. Detach and close
# whatever is already attached before installing fresh handlers.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s %(levelname)-s: %(message)s', datefmt='%H:%M:%S')

# Persist messages to the log file ...
fh = logging.FileHandler(log_path)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

# ... and echo them to stdout so they appear in the cell output.
sh = logging.StreamHandler(sys.stdout)
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
logger.addHandler(sh)

# Marker entry that separates kernel sessions in the log file.
logger.info(f'Restart notebook\n==========================\n{time.ctime()}\n==========================')

11:28:08 INFO: Restart notebook
Mon Jun  1 11:28:08 2020


## Read File

In [4]:
# Stack the train and test click logs into a single frame (original row
# indices are kept), then sort the rows by user_id and, within a user, by time.
cl = pd.concat(
    [
        pd.read_csv(os.path.join(split_dir, 'click_log.csv'))
        for split_dir in (train_path, test_path)
    ]
).sort_values(['user_id', 'time'])

In [8]:
# creative_agg_user: str(user_id) -> list of str(creative_id) in the row order
# of `cl`, repeats included.
creative_agg_user = {}

for user, creative in tqdm.tqdm(cl[['user_id', 'creative_id']].values):
    # Keys and items are stringified so the mappings can be dumped as JSON and
    # fed to Word2Vec as tokens.
    user, creative = str(user), str(creative)
    creative_agg_user.setdefault(user, []).append(creative)

# creative_agg_user_dedup: same keys, but each creative appears only once, at
# the position of its first occurrence.
# dict.fromkeys() keeps insertion order and drops repeats in one linear pass;
# this replaces an `x not in list` membership test inside the main loop, which
# was quadratic in the number of distinct creatives per user.
creative_agg_user_dedup = {
    user_key: list(dict.fromkeys(creative_seq))
    for user_key, creative_seq in creative_agg_user.items()
}

100%|██████████████████████████████████████████████████████████████████| 63668283/63668283 [05:48<00:00, 182803.77it/s]


In [9]:
# Write both per-user mappings to the embedding artifact directory as JSON,
# full sequences first and the de-duplicated ones second.
for json_name, user_sequences in (
    ('creative_agg_user.json', creative_agg_user),
    ('creative_agg_user_dedup.json', creative_agg_user_dedup),
):
    with open(os.path.join(embedding_path, json_name), 'w') as f:
        json.dump(user_sequences, f)

In [10]:
# Both mappings have been written to disk; drop the in-memory copies and run
# a collection pass (its return value is shown as the cell output).
del creative_agg_user
del creative_agg_user_dedup
gc.collect()

108

## Train Word2Vec Model

In [4]:
# Reload the de-duplicated per-user creative sequences from the JSON file in
# the embedding artifact directory.
dedup_json_path = os.path.join(embedding_path, 'creative_agg_user_dedup.json')
with open(dedup_json_path) as f:
    creative_agg_user_dedup = json.load(f)

In [5]:
# Word2Vec hyperparameters: CBOW (sg=0) with averaged context vectors
# (cbow_mean=1), negative sampling only (hs=0, negative=20), 160-dimensional
# vectors, a 64-token window, and no frequency cut-off (min_count=1).
w2v_params = dict(
    size=160,
    window=64,
    sg=0,
    hs=0,
    negative=20,
    cbow_mean=1,
    min_count=1,
    workers=16,
)

# Each user's de-duplicated creative list is treated as one "sentence".
start = time.time()
model = Word2Vec(sentences=creative_agg_user_dedup.values(), **w2v_params)
logger.info(f'Model training is done after {time.time()-start:.2f}s')

12:02:34 INFO: Model training is done after 2038.46s


In [6]:
# Directory + file-name prefix for the saved model. Derived from the shared
# `embedding_path` setting rather than by splitting os.getcwd() on '\\', which
# only works on Windows (on POSIX the split yields a single element and the
# path collapses to a bare relative directory).
model_path = os.path.join(os.path.abspath(embedding_path), 'creative_id_embed_s160_w64_cbow_')

# NamedTemporaryFile is used only to reserve a unique file name (prefix plus a
# random suffix) inside the artifact directory. delete=False keeps the file on
# disk, and leaving the `with` block closes the handle before the model is
# written to that path.
with tempfile.NamedTemporaryFile(
    dir=os.path.dirname(model_path),
    prefix=os.path.basename(model_path),
    delete=False,
) as tmp:
    tmp_file_path = tmp.name

start = time.time()
model.save(tmp_file_path)
logger.info(f'Model is saved to {tmp_file_path} after {time.time()-start:.2f}s')

12:02:47 INFO: Model is saved to C:\JupyterNotebook\Tencent-Ads-Algo-Comp-2020\embedding_artifact\creative_id_embed_s160_w64_cbow_38168zon after 13.57s
