In [1]:
import os
# Directories holding the raw train/test CSVs (click_log.csv, user.csv),
# given relative to the notebook's working directory.
train_path = '../../raw_train_artifact'
test_path = '../../raw_test_artifact'

In [2]:
import sys
import time
import numpy as np

import pandas as pd
# Notebook-wide pandas display settings.
# Always use the fully qualified 'display.*' keys: abbreviated keys are resolved by
# pattern matching, and a bare 'precision' is ambiguous on recent pandas versions
# (it also matches 'styler.format.precision'), which raises OptionError.
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', '{:.5f}'.format)

import tqdm
import joblib
import json

from gensim.models import Word2Vec
import tempfile

In [3]:
import logging

log_path = 'Creative Embedding.log'


def setup_logger(log_path):
    """Return this notebook's logger, writing to a fresh log file and to stdout.

    The function is idempotent: handlers left over from a previous execution of
    the cell are removed and closed first. Without that, every re-run would add
    another pair of handlers (each message would be emitted several times), and
    on Windows the old FileHandler would keep the log file locked.

    Parameters
    ----------
    log_path : str
        File that receives the log records. It is truncated on every call.

    Returns
    -------
    logging.Logger
        Logger at INFO level with exactly two handlers, in this order:
        the file handler, then the stdout stream handler.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # Drop handlers from earlier runs of this cell before attaching new ones.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    formatter = logging.Formatter('%(asctime)s %(levelname)-s: %(message)s', datefmt='%H:%M:%S')

    # mode='w' starts from an empty file, replacing the former "delete if it exists" step.
    fh = logging.FileHandler(log_path, mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    return logger


logger = setup_logger(log_path)
# Keep the handler/formatter names the original cell exposed at notebook level.
fh, sh = logger.handlers
formatter = fh.formatter

## Set up Artifact Directory

In [4]:
# Directory for embedding artifacts (aggregated click sequences, Word2Vec model).
embedding_path = '../../embedding_artifact'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race;
# makedirs also creates missing parent directories.
os.makedirs(embedding_path, exist_ok=True)

In [5]:
# Directory for model-ready inputs (targets and embedded features).
input_path = '../../input_artifact'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race;
# makedirs also creates missing parent directories.
os.makedirs(input_path, exist_ok=True)

## Read File

In [8]:
# Stack the train and test click logs (train rows first) into a single frame.
cl = pd.concat([
    pd.read_csv(os.path.join(split_dir, 'click_log.csv'))
    for split_dir in (train_path, test_path)
])

In [9]:
# Group the click log into one creative-id sequence per user, preserving row order.
# Positional columns 1 and 2 of the click log are taken to be the user id and the
# creative id respectively.
#
# Keys are stored as strings: that is what the JSON dump/reload round-trip produces
# and what the lookups further down (`cl_agg_user[str(user)]`) expect, so the dict
# is usable directly without first reloading it from disk.
cl_agg_user = {}

for row in tqdm.tqdm(cl.values):
    user, creative = str(int(row[1])), str(row[2])
    cl_agg_user.setdefault(user, []).append(creative)

100%|██████████████████████████████████████████████████████████████████| 63668283/63668283 [02:35<00:00, 409710.34it/s]


In [10]:
# Persist the per-user creative sequences. JSON object keys are always strings,
# so user ids come back as str when this file is reloaded.
agg_json_path = os.path.join(embedding_path, 'cl_agg_user.json')
with open(agg_json_path, 'w') as out_file:
    json.dump(cl_agg_user, out_file)

## Train Word2Vec Model

In [5]:
# Reload the per-user creative sequences; keys are user ids as strings.
agg_json_path = os.path.join(embedding_path, 'cl_agg_user.json')
with open(agg_json_path) as in_file:
    cl_agg_user = json.load(in_file)

In [7]:
# Word2Vec hyper-parameters (gensim 3.x keyword names): 128-d vectors, context
# window of 64, sg=0 with hs=0 and negative=20, every creative kept (min_count=1).
w2v_kwargs = dict(
    size=128,
    window=64,
    sg=0,
    hs=0,
    negative=20,
    cbow_mean=1,
    min_count=1,
    workers=16,
)

# Each user's creative sequence is one training "sentence".
start = time.time()
model = Word2Vec(sentences=cl_agg_user.values(), **w2v_kwargs)
logger.info(f'Model training is done after {time.time()-start:.2f}s')

23:44:56 INFO: Model training is done after 2077.70s


In [27]:
# Resolve the artifact directory from the configured relative path instead of
# splitting os.getcwd() on '\\', which only yields the right directory on Windows.
model_dir = os.path.abspath(embedding_path)
model_prefix = 'creative_embed_s128_w64_cbow_'
model_path = os.path.join(model_dir, model_prefix)

# NamedTemporaryFile is used only to reserve a unique file name next to the other
# artifacts; delete=False keeps the file once the handle is closed.
with tempfile.NamedTemporaryFile(prefix=model_prefix, dir=model_dir, delete=False) as tmp:
    tmp_file_path = tmp.name

# Save after the placeholder handle is closed so the model writer is the only
# one holding the file open.
start = time.time()
model.save(tmp_file_path)
logger.info(f'Model is saved to {tmp_file_path} after {time.time()-start:.2f}s')

10:56:07 INFO: Model is saved to C:\JupyterNotebook\Tencent-Ads-Algo-Comp-2020\embedding_artifact\creative_embed_s128_w64_cbow_h74txu7m after 40.22s


## Generate Training Data

In [45]:
# Restore the embedding model written by the save cell; `tmp_file_path` only
# exists if that cell has been executed in the current kernel.
model = Word2Vec.load(tmp_file_path)

# User table of the training split ('user_id', 'age' and 'gender' are read below).
train_user_csv = os.path.join(train_path, 'user.csv')
train_user = pd.read_csv(train_user_csv)

In [97]:
# Age target, shifted down by one (presumably turning 1-based labels into
# 0-based class indices -- confirm against the data description).
age_path = os.path.join(input_path, 'tag_age.npy')
age = train_user['age'].values - 1
with open(age_path, 'wb') as out_file:
    np.save(out_file, age)
logger.info(f'Target age is saved to {age_path}')

13:00:12 INFO: Target age is saved to ../../input_artifact\tag_age.npy


In [98]:
# Gender target, shifted down by one (presumably turning 1-based labels into
# 0-based class indices -- confirm against the data description).
gender_path = os.path.join(input_path, 'tag_gender.npy')
gender = train_user['gender'].values - 1
with open(gender_path, 'wb') as out_file:
    np.save(out_file, gender)
logger.info(f'Target gender is saved to {gender_path}')

13:00:50 INFO: Target gender is saved to ../../input_artifact\tag_gender.npy


In [62]:
# For every training user, embed the clicked creatives (one row per click) and
# also keep the mean over the sequence as a fixed-size summary.
# `cl_agg_user` is keyed by the user id as a string, hence str(uid).
input_seq, input_avg_pool = [], []
for uid in tqdm.tqdm(train_user['user_id'].values):
    clicked = cl_agg_user[str(uid)]
    embedded = np.stack([model.wv[token] for token in clicked], axis=0)
    input_seq.append(embedded)
    input_avg_pool.append(embedded.mean(axis=0))

100%|████████████████████████████████████████████████████████████████████████| 900000/900000 [04:23<00:00, 3415.90it/s]


In [79]:
# Turn the list of per-user mean vectors into one 2-D array (one row per user).
input_avg_pool = np.stack(input_avg_pool)

In [92]:
# Write the training avg-pooled embeddings next to the target arrays.
avg_pool_file_path = os.path.join(input_path, 'input_creative_avg_pool.npy')
with open(avg_pool_file_path, 'wb') as out_file:
    np.save(out_file, input_avg_pool)
logger.info(f'Avg-pooled creative embedding is saved to {avg_pool_file_path}')

12:58:12 INFO: Avg-pooled creative embedding is saved to ../../input_artifact\input_creative_avg_pool.npy


In [100]:
# Resolve the artifact directory from the configured relative path instead of
# splitting os.getcwd() on '\\', which only yields the right directory on Windows.
seq_dir = os.path.abspath(input_path)
seq_prefix = 'input_creative_seq_'
seq_path = os.path.join(seq_dir, seq_prefix)

# The sequences have different lengths, so they are stored as separate arrays in
# one archive (positional arrays are named arr_0, arr_1, ... by np.savez).
# delete=False keeps the uniquely named file after the handle is closed.
with tempfile.NamedTemporaryFile(prefix=seq_prefix, dir=seq_dir, delete=False) as tmp:
    start = time.time()
    np.savez(tmp, *input_seq)
    logger.info(f'Sequence creative embedding is saved to {tmp.name} after {time.time()-start:.2f}s')

13:11:59 INFO: Sequence creative embedding is saved to C:\JupyterNotebook\Tencent-Ads-Algo-Comp-2020\input_artifact\input_creative_seq_0xa5h_2v after 423.20s


## Generate Testing Data

In [11]:
# Reload the per-user creative sequences; keys are user ids as strings.
agg_json_path = os.path.join(embedding_path, 'cl_agg_user.json')
with open(agg_json_path) as in_file:
    cl_agg_user = json.load(in_file)

In [14]:
# Test-split click log; only its user ids are used below.
test_cl = pd.read_csv(os.path.join(test_path, 'click_log.csv'))

# Load the trained embedding model through `embedding_path` rather than a
# machine-specific absolute path. The random suffix is the one reported by the
# save cell's log line; update it here if the model is retrained and re-saved.
model_file_name = 'creative_embed_s128_w64_cbow_h74txu7m'
model = Word2Vec.load(os.path.join(embedding_path, model_file_name))

In [19]:
# For every distinct test user, average the embeddings of the clicked creatives.
# `user_id` records the users in the same order as the rows of `input_avg_pool`.
input_avg_pool, user_id = [], []
for uid in tqdm.tqdm(test_cl['user_id'].unique()):
    clicked = cl_agg_user[str(uid)]
    embedded = np.stack([model.wv[token] for token in clicked], axis=0)
    input_avg_pool.append(embedded.mean(axis=0))
    user_id.append(uid)

100%|██████████████████████████████████████████████████████████████████████| 1000000/1000000 [02:17<00:00, 7261.43it/s]


In [20]:
# Turn the list of per-user mean vectors into one 2-D array (one row per user).
input_avg_pool = np.stack(input_avg_pool)

In [29]:
# Write the test-population avg-pooled embeddings.
avg_pool_file_path = os.path.join(input_path, 'test_input_creative_avg_pool.npy')
with open(avg_pool_file_path, 'wb') as out_file:
    np.save(out_file, input_avg_pool)
logger.info(f'Avg-pooled creative embedding for testing population is saved to {avg_pool_file_path}')

16:11:54 INFO: Avg-pooled creative embedding for testing population is saved to ../../input_artifact\test_input_creative_avg_pool.npy


In [23]:
# Collect the test user ids into an array, in the same order as the embedding rows.
user_id = np.asarray(user_id)

In [30]:
# Write the "head" file: the user id belonging to each row of the test embedding array.
test_head_file_path = os.path.join(input_path, 'test_head_creative_avg_pool.npy')
with open(test_head_file_path, 'wb') as out_file:
    np.save(out_file, user_id)
logger.info(f'Head file for testing population is saved to {test_head_file_path}')

16:11:55 INFO: Head file for testing population is saved to ../../input_artifact\test_head_creative_avg_pool.npy
