In [1]:
import os
# Directories (relative to the notebook's working directory) from which the
# 'click_log.csv' files are read further down in this notebook.
train_path = '../../raw_train_artifact'
test_path = '../../raw_test_artifact'

In [2]:
import sys
import time
import numpy as np

import pandas as pd
# Display settings for inspecting wide / long frames in the notebook.
# Every key is spelled out in full ('display.<name>'): pandas resolves
# abbreviated keys by pattern matching, and a short key such as 'precision'
# raises OptionError once more than one registered option matches it
# (e.g. 'styler.format.precision' in newer pandas releases).
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', '{:.5f}'.format)

import tqdm
import joblib
import json

from gensim.models import Word2Vec

In [3]:
import logging

# Log file written next to the notebook; it is recreated empty on every run.
log_path = 'Creative Embedding.log'

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.getLogger returns the same object every time this cell is executed.
# Detach and close handlers left over from a previous execution so that each
# message is emitted once, and so the old file handle is released before the
# log file is reopened.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s %(levelname)-s: %(message)s', datefmt='%H:%M:%S')

# mode='w' truncates an existing log file, giving a fresh log per run without
# having to delete the file first.
fh = logging.FileHandler(log_path, mode='w')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

# Mirror the same records to the cell output.
sh = logging.StreamHandler(sys.stdout)
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
logger.addHandler(sh)

## Set up Artifact Directory

In [4]:
# Output directory for the artifacts written by this notebook.
embedding_path = '../../embedding_artifact'
# exist_ok=True makes the cell safe to re-run and avoids the gap between
# checking for the directory and creating it; missing parents are created too.
os.makedirs(embedding_path, exist_ok=True)

## Read File

In [6]:
# Read 'click_log.csv' from the train and test directories (in that order)
# and stack the two frames into a single click log.
cl = pd.concat([
    pd.read_csv(os.path.join(split_dir, 'click_log.csv'))
    for split_dir in (train_path, test_path)
])

In [7]:
# Build {user: [creative, creative, ...]} from the click log.
# Columns are addressed by position: index 1 becomes the (int) user key and
# index 2 the (str) creative token -- presumably the user and creative id
# columns of click_log.csv; TODO confirm against the CSV header.
# Creatives are appended in the row order of `cl`; no sorting is done here.
cl_agg_user = {}

for row in tqdm.tqdm(cl.values):
    user, creative = int(row[1]), str(row[2])
    # Look up and store under the same normalised key. Testing membership
    # with the raw cell value can miss an existing entry when the cell is
    # not an int-like scalar, which would silently reset the user's list.
    cl_agg_user.setdefault(user, []).append(creative)

100%|██████████████████████████████████████████████████████████████████| 63668283/63668283 [02:38<00:00, 402574.27it/s]


In [8]:
# Persist the per-user creative lists so the aggregation does not have to be
# repeated. JSON object keys are always strings, so the int user keys are
# written out as strings.
user_sequences_file = os.path.join(embedding_path, 'cl_agg_user.json')
with open(user_sequences_file, 'w') as out_file:
    json.dump(cl_agg_user, out_file)

## Train Word2Vec Model

In [5]:
# Reload the per-user creative lists saved as JSON in the artifact directory.
# Keys come back as strings because JSON object keys are always strings.
user_sequences_file = os.path.join(embedding_path, 'cl_agg_user.json')
with open(user_sequences_file, 'r') as in_file:
    cl_agg_user = json.load(in_file)

In [None]:
# Word2Vec hyper-parameters, gathered in one place so they are easy to scan.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0
# (later renamed `vector_size`) -- confirm the installed gensim version.
w2v_params = dict(
    size=128,
    window=64,
    sg=0,
    hs=0,
    negative=20,
    cbow_mean=1,
    min_count=1,
    workers=16,
)

# Each user's list of creative tokens is passed to Word2Vec as one sentence;
# the wall-clock training time is logged afterwards.
start = time.time()
model = Word2Vec(sentences=cl_agg_user.values(), **w2v_params)
logger.info(f'Model training is done after {time.time()-start:.2f}s')