In [1]:
import os
# Artifact directories, all expressed relative to the notebook's working directory.
train_path = '../../raw_train_artifact'
test_path = '../../raw_test_artifact'
embedding_path = '../../embedding_artifact'
input_path = '../../input_artifact'
input_split_path = '../../input_artifact/input_split'
# makedirs(exist_ok=True) also creates a missing parent directory and is a
# no-op when the directory already exists, so the cell is safe to re-run.
os.makedirs(input_split_path, exist_ok=True)
model_path = '../../model_artifact'
output_path = '../../output_artifact'

In [2]:
import sys
import gc
gc.enable()
import time
import re

import numpy as np
import pandas as pd
# Use fully qualified option keys: abbreviated keys are resolved by regex match
# and pandas raises OptionError as soon as a pattern matches more than one
# registered option (on recent pandas 'precision' matches both
# 'display.precision' and 'styler.format.precision').
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', '{:.5f}'.format)

import tqdm
import joblib
import json

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score
from gensim.models import Word2Vec
import torch
from torch import nn
import torch.nn.functional as F

In [3]:
import logging

# Notebook logger: every message goes both to a log file and to stdout.
log_path = '[1.0]Sequence Data Preparation.log'

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise attach a second pair of handlers and every message would
# be emitted twice. Detach and close whatever is already attached first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s %(levelname)-s: %(message)s', datefmt='%H:%M:%S')

# File handler: persistent record of the run.
fh = logging.FileHandler(log_path)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

# Stream handler: mirrors the same records into the cell output.
sh = logging.StreamHandler(sys.stdout)
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
logger.addHandler(sh)

logger.info(f'Restart notebook\n==========================\n{time.ctime()}\n==========================')

13:48:24 INFO: Restart notebook
Wed Jun  3 13:48:24 2020


In [4]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info('Device in Use: {}'.format(DEVICE))
if DEVICE.type == 'cuda':
    torch.cuda.empty_cache()
    # torch.cuda.memory_cached is the deprecated name of memory_reserved;
    # prefer the new name and fall back to the old one on older torch builds.
    _reserved_memory = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    # All three figures are converted from bytes to GiB.
    t = torch.cuda.get_device_properties(DEVICE).total_memory/1024**3
    c = _reserved_memory(DEVICE)/1024**3
    a = torch.cuda.memory_allocated(DEVICE)/1024**3
    logger.info('CUDA Memory: Total {:.2f} GB, Cached {:.2f} GB, Allocated {:.2f} GB'.format(t,c,a))
else:
    # The CUDA property/memory queries above raise on a CPU device, so the
    # memory report is skipped instead of crashing the notebook.
    logger.info('CUDA is not available; skipping GPU memory report')

13:48:24 INFO: Device in Use: cuda
13:48:24 INFO: CUDA Memory: Total 8.00 GB, Cached 0.00 GB, Allocated 0.00 GB


## Prepare Data

### Split Data to Fit in Memory

In [5]:
# Build a reproducible (seed 1898) random ordering of the training indices
# 1..900000 and persist it next to the other split artifacts.
np.random.seed(1898)
train_idx = np.arange(1, 900001)
np.random.shuffle(train_idx)
save_path = os.path.join(input_split_path, 'train_idx_shuffle.npy')
np.save(save_path, train_idx)
logger.info(f'Shuffled index for training set is saved to {save_path}')

14:50:14 INFO: Shuffled index for training set is saved to ../../input_artifact/input_split\train_idx_shuffle.npy


In [5]:
# Build a reproducible (seed 1898) random ordering of the test indices
# 3000001..4000000 and persist it next to the other split artifacts.
np.random.seed(1898)
test_idx = np.arange(3000001, 4000001)
np.random.shuffle(test_idx)
save_path = os.path.join(input_split_path, 'test_idx_shuffle.npy')
np.save(save_path, test_idx)
logger.info(f'Shuffled index for test set is saved to {save_path}')

13:49:44 INFO: Shuffled index for test set is saved to ../../input_artifact/input_split\test_idx_shuffle.npy


#### Split Ground Truth

In [13]:
# Load the ground-truth table and shift both label columns down by one.
head = pd.read_csv(os.path.join(train_path,'user.csv'))
head['gender'] = head['gender'] - 1
head['age'] = head['age'] - 1

# The split below selects rows by POSITION (train_idx - 1), which is only
# correct when the first column holds the ids 1..N in ascending order.
# Fail loudly instead of silently pairing labels with the wrong id.
expected_ids = np.arange(1, len(head) + 1)
if not np.array_equal(head.iloc[:, 0].values, expected_ids):
    raise ValueError('user.csv must list ids 1..N in ascending order in its first column')
del expected_ids

# Zero-based row positions matching the shuffled one-based training indices.
truth_idx = train_idx - 1

# Write ten consecutive chunks of 90000 shuffled rows, one .npy file per chunk.
for split_idx in range(1,11):
    sub = head.iloc[truth_idx[(split_idx-1)*90000:split_idx*90000], :].values
    sub_name = f'train_truth_{split_idx}.npy'
    sub_path = os.path.join(input_split_path, sub_name)
    with open(sub_path, 'wb') as f:
        np.save(f, sub)
    logger.info(f'{sub_name} is saved to {sub_path}')
    del sub
    gc.collect()
del head
# Assign the result so the collected-object count is not echoed as cell output.
_ = gc.collect()

18:24:35 INFO: train_truth_1.npy is saved to ../../input_artifact/input_split\train_truth_1.npy
18:24:35 INFO: train_truth_2.npy is saved to ../../input_artifact/input_split\train_truth_2.npy
18:24:35 INFO: train_truth_3.npy is saved to ../../input_artifact/input_split\train_truth_3.npy
18:24:35 INFO: train_truth_4.npy is saved to ../../input_artifact/input_split\train_truth_4.npy
18:24:35 INFO: train_truth_5.npy is saved to ../../input_artifact/input_split\train_truth_5.npy
18:24:35 INFO: train_truth_6.npy is saved to ../../input_artifact/input_split\train_truth_6.npy
18:24:35 INFO: train_truth_7.npy is saved to ../../input_artifact/input_split\train_truth_7.npy
18:24:35 INFO: train_truth_8.npy is saved to ../../input_artifact/input_split\train_truth_8.npy
18:24:35 INFO: train_truth_9.npy is saved to ../../input_artifact/input_split\train_truth_9.npy
18:24:36 INFO: train_truth_10.npy is saved to ../../input_artifact/input_split\train_truth_10.npy


0

#### Split Sequence Data

In [7]:
# For every per-user sequence file, copy the entries belonging to each
# 90000-index slice of train_idx into its own JSON file (ten files per source).
for full_name in ['product_agg_user.json','advertiser_agg_user.json', 'ad_agg_user.json', 'creative_agg_user.json']:
    with open(os.path.join(embedding_path, full_name), 'r') as f:
        full = json.load(f)
    stem = full_name.split('.')[0]
    for split_idx in range(1,11):
        lo, hi = (split_idx-1)*90000, split_idx*90000
        # JSON object keys are strings, hence the str() on both sides.
        sub = {str(key): full[str(key)] for key in train_idx[lo:hi]}
        sub_name = f'train_{stem}_{split_idx}.json'
        sub_path = os.path.join(input_split_path, sub_name)
        with open(sub_path, 'w') as out:
            json.dump(sub, out)
        logger.info(f'{sub_name} is saved to {sub_path}')
        del sub
        gc.collect()
    # Release the full mapping before the next source file is loaded.
    del full
    gc.collect()

14:51:04 INFO: train_product_agg_user_1.json is saved to ../../input_artifact/input_split\train_product_agg_user_1.json
14:51:08 INFO: train_product_agg_user_2.json is saved to ../../input_artifact/input_split\train_product_agg_user_2.json
14:51:12 INFO: train_product_agg_user_3.json is saved to ../../input_artifact/input_split\train_product_agg_user_3.json
14:51:16 INFO: train_product_agg_user_4.json is saved to ../../input_artifact/input_split\train_product_agg_user_4.json
14:51:20 INFO: train_product_agg_user_5.json is saved to ../../input_artifact/input_split\train_product_agg_user_5.json
14:51:24 INFO: train_product_agg_user_6.json is saved to ../../input_artifact/input_split\train_product_agg_user_6.json
14:51:28 INFO: train_product_agg_user_7.json is saved to ../../input_artifact/input_split\train_product_agg_user_7.json
14:51:32 INFO: train_product_agg_user_8.json is saved to ../../input_artifact/input_split\train_product_agg_user_8.json
14:51:36 INFO: train_product_agg_user_9.

In [9]:
# For every per-user sequence file, copy the entries belonging to each
# 100000-index slice of test_idx into its own JSON file (ten files per source).
for full_name in ['product_agg_user.json','advertiser_agg_user.json', 'ad_agg_user.json', 'creative_agg_user.json']:
    with open(os.path.join(embedding_path, full_name), 'r') as f:
        full = json.load(f)
    stem = full_name.split('.')[0]
    for split_idx in range(1,11):
        lo, hi = (split_idx-1)*100000, split_idx*100000
        # JSON object keys are strings, hence the str() on both sides.
        sub = {str(key): full[str(key)] for key in test_idx[lo:hi]}
        sub_name = f'test_{stem}_{split_idx}.json'
        sub_path = os.path.join(input_split_path, sub_name)
        with open(sub_path, 'w') as out:
            json.dump(sub, out)
        logger.info(f'{sub_name} is saved to {sub_path}')
        del sub
        _ = gc.collect()
    # Release the full mapping before the next source file is loaded.
    del full
    _ = gc.collect()

14:50:59 INFO: test_product_agg_user_1.json is saved to ../../input_artifact/input_split\test_product_agg_user_1.json
14:51:04 INFO: test_product_agg_user_2.json is saved to ../../input_artifact/input_split\test_product_agg_user_2.json
14:51:08 INFO: test_product_agg_user_3.json is saved to ../../input_artifact/input_split\test_product_agg_user_3.json
14:51:12 INFO: test_product_agg_user_4.json is saved to ../../input_artifact/input_split\test_product_agg_user_4.json
14:51:16 INFO: test_product_agg_user_5.json is saved to ../../input_artifact/input_split\test_product_agg_user_5.json
14:51:20 INFO: test_product_agg_user_6.json is saved to ../../input_artifact/input_split\test_product_agg_user_6.json
14:51:24 INFO: test_product_agg_user_7.json is saved to ../../input_artifact/input_split\test_product_agg_user_7.json
14:51:29 INFO: test_product_agg_user_8.json is saved to ../../input_artifact/input_split\test_product_agg_user_8.json
14:51:33 INFO: test_product_agg_user_9.json is saved to 

### DataLoader Utility

In [5]:
# Locations of the trained Word2Vec artifacts, resolved against the shared
# embedding_path setting instead of a machine-specific absolute path.
# NOTE(review): assumes embedding_path points at the same embedding_artifact
# directory the previous absolute paths referred to - confirm.
creative_embedding_path = os.path.join(embedding_path, 'creative_id_embed_s160_w64_cbow_38168zon')
ad_embedding_path = os.path.join(embedding_path, 'ad_id_embed_s160_w64_cbow_ibfi8g78')
advertiser_embedding_path = os.path.join(embedding_path, 'advertiser_id_embed_s128_w64_cbow_n4re8tds')
product_embedding_path = os.path.join(embedding_path, 'product_id_embed_s128_w64_cbow_8yemmp45')

def _load_embedded_sequences(label, file_tag, embedding_file, split_id, users, max_seq, slient, start):
    """Embed one id sequence per user for a single feature of a single split.

    Loads the Word2Vec artifact at `embedding_file`, reads
    `train_{file_tag}_agg_user_{split_id}.json` from `input_split_path`, and
    for every entry of `users` stacks the vectors of the first `max_seq` ids
    of that user's sequence into one 2-D array.

    `label` is only used in the progress messages; `start` is the timestamp
    the elapsed times in those messages are measured from.
    Returns a list with one array per entry of `users`, in the same order.
    """
    embedding = Word2Vec.load(embedding_file)
    if not slient: logger.info(f'{label} ID embedding artifact is loaded after {time.time()-start:.2f}s')
    sequence_path = os.path.join(input_split_path, f'train_{file_tag}_agg_user_{split_id}.json')
    with open(sequence_path, 'r') as f:
        sequences = json.load(f)
    # Look the vector table up once instead of once per id.
    vectors = embedding.wv
    embedded = [
        # JSON object keys are strings, hence str(user).
        np.stack([vectors[key] for key in sequences[str(user)][:max_seq]], axis=0)
        for user in users
    ]
    if not slient: logger.info(f'{label} input ready after {time.time()-start:.2f}s')
    # Drop the model and the raw sequences before the caller loads the next feature.
    del embedding, vectors, sequences
    _ = gc.collect()
    return embedded


def prepare_data(split_id, max_seq=100, slient=False):
    """Build the embedded model inputs and the targets for one training split.

    Parameters
    ----------
    split_id : value interpolated into the `train_*_{split_id}` file names
        read from `input_split_path`.
    max_seq : int, default 100
        Only the first `max_seq` ids of each user's sequence are embedded.
    slient : bool, default False
        When True, progress logging is suppressed. (The misspelling of
        "silent" is kept so existing keyword callers continue to work.)

    Returns
    -------
    (inp_creative, inp_ad, inp_advertiser, inp_product, out_age, out_gender)
        The four `inp_*` values are lists holding one 2-D array per user
        (sequence position x embedding dimension). `out_age` and `out_gender`
        are columns 1 and 2 of the saved truth array.

    Raises
    ------
    ValueError
        From `np.stack` when a user's sequence is empty.
    """
    start = time.time()
    if not slient: logger.info(f'Processing Split-{split_id}')
    truth_path = os.path.join(input_split_path, f'train_truth_{split_id}.npy')
    with open(truth_path, 'rb') as f:
        truth = np.load(f)
    # Column 0 is used as the per-user key into the sequence JSON files.
    inp_user, out_age, out_gender = truth[:,0], truth[:,1], truth[:,2]
    if not slient: logger.info(f'Target output ready after {time.time()-start:.2f}s')
    del truth
    _ = gc.collect()

    # The features are processed one after another so that only a single
    # Word2Vec model is loaded at any time.
    inp_creative = _load_embedded_sequences(
        'Creative', 'creative', creative_embedding_path, split_id, inp_user, max_seq, slient, start)
    inp_ad = _load_embedded_sequences(
        'Ad', 'ad', ad_embedding_path, split_id, inp_user, max_seq, slient, start)
    inp_advertiser = _load_embedded_sequences(
        'Advertiser', 'advertiser', advertiser_embedding_path, split_id, inp_user, max_seq, slient, start)
    inp_product = _load_embedded_sequences(
        'Product', 'product', product_embedding_path, split_id, inp_user, max_seq, slient, start)
    del inp_user
    _ = gc.collect()

    return inp_creative, inp_ad, inp_advertiser, inp_product, out_age, out_gender

In [6]:
# Dry run: build the inputs of every split (1-10) once and discard them
# immediately, exercising prepare_data end to end without keeping the data.
for sid in range(1, 11):
    split_data = prepare_data(sid)
    del split_data
    _ = gc.collect()

19:41:38 INFO: Processing Split-1
19:41:38 INFO: Target output ready after 0.02s
19:41:57 INFO: Creative ID embedding artifact is loaded after 18.58s
19:42:08 INFO: Creative input ready after 29.67s
19:42:25 INFO: Ad ID embedding artifact is loaded after 47.25s
19:42:37 INFO: Ad input ready after 59.09s
19:42:39 INFO: Advertiser ID embedding artifact is loaded after 61.10s
19:42:49 INFO: Advertiser input ready after 70.80s
19:42:50 INFO: Product ID embedding artifact is loaded after 71.55s
19:42:59 INFO: Product input ready after 80.77s
19:43:21 INFO: Processing Split-2
19:43:21 INFO: Target output ready after 0.01s
19:43:40 INFO: Creative ID embedding artifact is loaded after 18.29s
19:43:51 INFO: Creative input ready after 29.77s
19:44:09 INFO: Ad ID embedding artifact is loaded after 47.31s
19:44:20 INFO: Ad input ready after 58.94s
19:44:22 INFO: Advertiser ID embedding artifact is loaded after 60.93s
19:44:32 INFO: Advertiser input ready after 70.43s
19:44:33 INFO: Product ID embe