In [1]:
import os
# Artifact directories, all given relative to the notebook's working directory.
# Directory holding the training 'click_log.csv' and 'ad.csv'.
train_path = '../../raw_train_artifact'
# Directory holding the test 'click_log.csv' and 'ad.csv'.
test_path = '../../raw_test_artifact'
# Directory where the per-user sequence JSON files are written and read back.
embedding_path = '../../embedding_artifact'
# Not referenced by any cell visible in this notebook.
input_path = '../../input_artifact'

In [2]:
import sys
import time
import numpy as np

import pandas as pd
# Display settings for DataFrame output in this notebook.
# Every key is spelled out in full ('display.<name>'): abbreviated keys are
# resolved by pattern matching and raise OptionError as soon as more than one
# registered option matches (e.g. 'precision' on newer pandas releases).
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', '{:.5f}'.format)

import tqdm
import joblib
import json

from gensim.models import Word2Vec
import tempfile
import gc
# Make sure automatic garbage collection is switched on; later cells also call
# gc.collect() explicitly right after deleting large objects.
gc.enable()

In [3]:
import logging

log_path = 'Advertiser ID Embedding.log'


def configure_logger(name, path):
    """Return the logger called `name`, wired to a log file and to stdout.

    The function is idempotent: handlers left on the logger by an earlier call
    (for example when this cell is executed again in the same kernel) are
    removed and closed before the new ones are attached, so each message is
    written exactly once per destination.

    Parameters
    ----------
    name : str
        Logger name passed to ``logging.getLogger``.
    path : str
        File that receives the log records.

    Returns
    -------
    logging.Logger
        Logger at INFO level whose handlers are, in order, a ``FileHandler``
        on `path` and a ``StreamHandler`` on ``sys.stdout``.
    """
    configured = logging.getLogger(name)
    configured.setLevel(logging.INFO)

    # Drop whatever a previous run attached; otherwise every re-run of the
    # cell would duplicate each log line and keep another file handle open.
    for stale in list(configured.handlers):
        configured.removeHandler(stale)
        stale.close()

    line_format = logging.Formatter('%(asctime)s %(levelname)-s: %(message)s', datefmt='%H:%M:%S')

    file_handler = logging.FileHandler(path)
    console_handler = logging.StreamHandler(sys.stdout)
    for handler in (file_handler, console_handler):
        handler.setLevel(logging.INFO)
        handler.setFormatter(line_format)
        configured.addHandler(handler)

    return configured


logger = configure_logger(__name__, log_path)

# Keep the module-level names the original cell exposed.
fh, sh = logger.handlers
formatter = fh.formatter

logger.info(f'Restart notebook\n==========================\n{time.ctime()}\n==========================')

10:25:45 INFO: Restart notebook
Mon Jun  1 10:25:45 2020


## Read File

In [4]:
# Stack the train and test click logs (train rows first) into one frame.
cl = pd.concat(
    [
        pd.read_csv(os.path.join(folder, 'click_log.csv'))
        for folder in (train_path, test_path)
    ]
)

In [5]:
# Stack the train and test ad tables, then keep only the first row seen for
# each (creative_id, advertiser_id) pair.
ad = (
    pd.concat(
        [
            pd.read_csv(os.path.join(folder, 'ad.csv'))
            for folder in (train_path, test_path)
        ]
    )
    .drop_duplicates(subset=['creative_id', 'advertiser_id'])
)

In [6]:
# Attach the ad attributes to every click through the shared creative_id.
merge = cl.merge(ad, on='creative_id')

# The source frames are no longer needed once joined; release them before sorting.
del cl
del ad
gc.collect()

# Order the clicks per user by time so the sequences built later follow click order.
merge = merge.sort_values(by=['user_id', 'time'])

In [9]:
# Build, for every user, the sequence of advertiser ids they clicked, in the
# row order of `merge`. Keys and items are stored as strings.
advertiser_agg_user = {}

for user, advertiser in tqdm.tqdm(merge[['user_id', 'advertiser_id']].values):
    # setdefault creates the user's list on first sight and appends afterwards,
    # so users appear in the dict in order of their first row.
    advertiser_agg_user.setdefault(str(user), []).append(str(advertiser))

# De-duplicated variant: each advertiser is kept once per user, at the position
# of its first occurrence. dict.fromkeys does this in a single linear pass,
# replacing the per-row `in list` scan that was quadratic in sequence length.
advertiser_agg_user_dedup = {
    user_key: list(dict.fromkeys(sequence))
    for user_key, sequence in advertiser_agg_user.items()
}

100%|██████████████████████████████████████████████████████████████████| 63668283/63668283 [04:45<00:00, 223198.86it/s]


In [13]:
# Persist both per-user sequence dictionaries as JSON in the embedding folder.
full_sequence_file = os.path.join(embedding_path, 'advertiser_agg_user.json')
dedup_sequence_file = os.path.join(embedding_path, 'advertiser_agg_user_dedup.json')

with open(full_sequence_file, 'w') as f:
    json.dump(advertiser_agg_user, f)

with open(dedup_sequence_file, 'w') as f:
    json.dump(advertiser_agg_user_dedup, f)

In [15]:
# Both dictionaries have been written to JSON above, so drop the in-memory
# copies; the de-duplicated one is read back from disk in the next section.
del advertiser_agg_user, advertiser_agg_user_dedup
gc.collect()

1902849

## Train Word2Vec Model

In [14]:
# Reload the de-duplicated per-user advertiser sequences saved earlier.
dedup_sequence_file = os.path.join(embedding_path, 'advertiser_agg_user_dedup.json')
with open(dedup_sequence_file, 'r') as f:
    advertiser_agg_user_dedup = json.load(f)

In [16]:
# Each user's de-duplicated advertiser list is one training "sentence".
# The size / window / sg values match the 's128_w64_cbow' tag used in the
# saved model's file name.
start = time.time()
model = Word2Vec(
    sentences=advertiser_agg_user_dedup.values(),
    size=128,
    window=64,
    sg=0,
    hs=0,
    negative=20,
    cbow_mean=1,
    min_count=1,
    workers=16,
)
elapsed = time.time() - start
logger.info(f'Model training is done after {elapsed:.2f}s')

10:43:42 INFO: Model training is done after 298.91s


In [17]:
# Resolve the output folder from the shared `embedding_path` setting instead of
# splitting os.getcwd() on a hard-coded backslash, which only works on Windows.
model_dir = os.path.abspath(embedding_path)
model_prefix = 'advertiser_id_embed_s128_w64_cbow_'
# Kept for compatibility: folder + file-name prefix, as the original cell defined it.
model_path = os.path.join(model_dir, model_prefix)

# NamedTemporaryFile is used only to reserve a unique file name (the prefix
# plus a random suffix) inside model_dir; delete=False leaves the file on disk
# after the handle is closed.
with tempfile.NamedTemporaryFile(prefix=model_prefix, dir=model_dir, delete=False) as tmp:
    tmp_file_path = tmp.name

# Save after the placeholder handle is closed so the model writer is the only
# one holding the file.
start = time.time()
model.save(tmp_file_path)
logger.info(f'Model is saved to {tmp_file_path} after {time.time()-start:.2f}s')

10:43:43 INFO: Model is saved to C:\JupyterNotebook\Tencent-Ads-Algo-Comp-2020\embedding_artifact\advertiser_id_embed_s128_w64_cbow_n4re8tds after 0.65s
