### Attention
https://hyen4110.tistory.com/31   
seq2seq - https://hyen4110.tistory.com/30
seq2seq의 핵심은 2개의 RNN을 encoder-decoder 아키텍처 형식으로 구성하는 것이다. encoder가 입력 단어를 하나씩 읽어 정해진 차원의 벡터 표현을 얻은 후, decoder가 이를 다시 입력값으로 하여 한 단어씩 출력한다.   


# NAML : Neural News Recommendation with Attentive Multi-View Learning


## Global settings and imports

In [1]:
import sys
import os
import zipfile
import numpy as np
import scrapbook as sb
import tensorflow as tf
from tqdm import tqdm
from tempfile import TemporaryDirectory
# Raise the threshold of TensorFlow's Python logger to ERROR.
# NOTE(review): the recorded cell outputs further down still contain "I ..." and
# "W ..." lines from tensorflow/core and tensorflow/stream_executor, so this call
# evidently does not silence those. Presumably they are emitted by the native
# runtime rather than the Python logger — confirm whether TF_CPP_MIN_LOG_LEVEL
# should be set before importing TensorFlow as well.
tf.get_logger().setLevel('ERROR')

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.naml import NAMLModel
from recommenders.models.newsrec.io.mind_all_iterator import MINDAllIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

# Report the interpreter and TensorFlow versions used for this run, one per line.
print(
    "System version: {}".format(sys.version),
    "Tensorflow version: {}".format(tf.__version__),
    sep="\n",
)

System version: 3.7.13 (default, Mar 29 2022, 02:18:16) 
[GCC 7.5.0]
Tensorflow version: 2.7.3


## Download and load data

In [2]:
# Everything this notebook downloads or writes lives under one temporary
# directory; `tmpdir` must stay referenced for as long as the files are needed.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

# The three sub-directories used below.
train_dir = os.path.join(data_path, 'train')
valid_dir = os.path.join(data_path, 'valid')
utils_dir = os.path.join(data_path, "utils")

# News / behaviour logs for the training and validation splits.
train_news_file = os.path.join(train_dir, r'news.tsv')
train_behaviors_file = os.path.join(train_dir, r'behaviors.tsv')
valid_news_file = os.path.join(valid_dir, r'news.tsv')
valid_behaviors_file = os.path.join(valid_dir, r'behaviors.tsv')

# Auxiliary resources: embedding matrix, lookup dictionaries and the model's
# YAML configuration.
wordEmb_file = os.path.join(utils_dir, "embedding_all.npy")
userDict_file = os.path.join(utils_dir, "uid2index.pkl")
wordDict_file = os.path.join(utils_dir, "word_dict_all.pkl")
vertDict_file = os.path.join(utils_dir, "vert_dict.pkl")
subvertDict_file = os.path.join(utils_dir, "subvert_dict.pkl")
yaml_file = os.path.join(utils_dir, r'naml.yaml')

# Base URL and archive names for the 'demo' variant of the dataset.
mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set('demo')

# Fetch each archive only when a file it is expected to provide is missing.
if not os.path.exists(train_news_file):
    download_deeprec_resources(mind_url, train_dir, mind_train_dataset)

if not os.path.exists(valid_news_file):
    download_deeprec_resources(mind_url, valid_dir, mind_dev_dataset)

if not os.path.exists(yaml_file):
    # The utils archive is requested from this fixed URL rather than from mind_url.
    download_deeprec_resources(
        r'https://recodatasets.z20.web.core.windows.net/newsrec/',
        utils_dir,
        mind_utils,
    )

100%|█████████████████████████████████████| 17.0k/17.0k [00:08<00:00, 2.00kKB/s]
100%|█████████████████████████████████████| 9.84k/9.84k [00:05<00:00, 1.86kKB/s]
100%|█████████████████████████████████████| 95.0k/95.0k [00:27<00:00, 3.49kKB/s]


In [3]:
# Keyword arguments passed to prepare_hparams in addition to the YAML file:
# the resource file locations plus the batch size and epoch count for this run.
# The insertion order mirrors the order in which they were originally passed.
hparam_kwargs = {
    'wordEmb_file': wordEmb_file,
    'wordDict_file': wordDict_file,
    'userDict_file': userDict_file,
    'vertDict_file': vertDict_file,
    'subvertDict_file': subvertDict_file,
    'batch_size': 16,
    'epochs': 5,
}
hparams = prepare_hparams(yaml_file, **hparam_kwargs)

# Dump the resulting hyper-parameter set so the run is self-describing.
print(hparams)



HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 16, 'show_step': 100000, 'title_size': 30, 'body_size': 50, 'his_size': 50, 'vert_num': 17, 'subvert_num': 249, 'data_format': 'naml', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'cnn_activation': 'relu', 'model_type': 'naml', 'dense_activation': 'relu', 'loss': 'cross_entropy_loss', 'wordEmb_file': '/tmp/tmpmjvrjmqp/utils/embedding_all.npy', 'wordDict_file': '/tmp/tmpmjvrjmqp/utils/word_dict_all.pkl', 'userDict_file': '/tmp/tmpmjvrjmqp/utils/uid2index.pkl', 'vertDict_file': '/tmp/tmpmjvrjmqp/utils/vert_dict.pkl', 'subvertDict_file': '/tmp/tmpmjvrjmqp/utils/subvert_dict.pkl'}


In [4]:
# Bind the iterator class itself (no instance is created here); it is passed to
# NAMLModel when the model is constructed.
iterator = MINDAllIterator

## Train the NAML model

In [5]:
# Build the model with a fixed seed, then score it on the validation split
# before fit() is called, giving a baseline for the per-epoch metrics.
model = NAMLModel(hparams, iterator, seed=42)
untrained_metrics = model.run_eval(valid_news_file, valid_behaviors_file)
print(untrained_metrics)

2022-07-11 10:17:29.939048: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-11 10:17:30.018883: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-11 10:17:30.043385: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-11 10:17:30.043900: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

{'group_auc': 0.4806, 'mean_mrr': 0.2097, 'ndcg@5': 0.2125, 'ndcg@10': 0.2762}


In [6]:
%%time
model.fit(train_news_file, train_behaviors_file,valid_news_file, valid_behaviors_file)

0it [00:00, ?it/s]2022-07-11 10:18:56.189428: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.18GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-07-11 10:18:56.190077: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.18GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-07-11 10:18:56.414383: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1015.76MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-07-11 10:18:56.414430: W tensorflow/core/common_runtime/bfc_al

at epoch 1
train info: logloss loss:1.4839619423375598
eval info: group_auc:0.5882, mean_mrr:0.2579, ndcg@10:0.3486, ndcg@5:0.2853
at epoch 1 , train time: 491.7 eval time: 44.2


2171it [08:06,  4.46it/s]
18709it [00:15, 1195.10it/s]
7523it [00:23, 316.92it/s]
7538it [00:00, 16017.95it/s]


at epoch 2
train info: logloss loss:1.403804251341598
eval info: group_auc:0.6243, mean_mrr:0.2793, ndcg@10:0.3757, ndcg@5:0.3127
at epoch 2 , train time: 486.7 eval time: 43.1


2171it [08:10,  4.43it/s]
18709it [00:16, 1138.49it/s]
7523it [00:25, 300.39it/s]
7538it [00:00, 13166.76it/s]


at epoch 3
train info: logloss loss:1.3555402696434664
eval info: group_auc:0.6442, mean_mrr:0.29, ndcg@10:0.3875, ndcg@5:0.3247
at epoch 3 , train time: 490.4 eval time: 45.4


2171it [08:04,  4.49it/s]
18709it [00:16, 1145.38it/s]
7523it [00:24, 305.93it/s]
7538it [00:00, 12218.46it/s]


at epoch 4
train info: logloss loss:1.3267070193380763
eval info: group_auc:0.6438, mean_mrr:0.2919, ndcg@10:0.3874, ndcg@5:0.3242
at epoch 4 , train time: 484.1 eval time: 44.8


2171it [08:05,  4.47it/s]
18709it [00:19, 945.95it/s] 
7523it [00:25, 295.44it/s]
7538it [00:00, 8429.40it/s] 


at epoch 5
train info: logloss loss:1.306584160677138
eval info: group_auc:0.6462, mean_mrr:0.2926, ndcg@10:0.3878, ndcg@5:0.3241
at epoch 5 , train time: 485.9 eval time: 49.5
CPU times: user 28min 26s, sys: 1min 18s, total: 29min 44s
Wall time: 44min 25s


<recommenders.models.newsrec.models.naml.NAMLModel at 0x7f7554141050>

In [7]:
# Score the trained model on the validation split and keep the metrics in
# res_syn, which is recorded with scrapbook in the following cell.
res_syn = model.run_eval(
    valid_news_file,
    valid_behaviors_file,
)
print(res_syn)

18709it [00:15, 1175.98it/s]
7523it [00:23, 316.72it/s]
7538it [00:00, 15311.85it/s]


{'group_auc': 0.6462, 'mean_mrr': 0.2926, 'ndcg@5': 0.3241, 'ndcg@10': 0.3878}


In [8]:
# Record the final evaluation metrics in the notebook under the name 'res_syn'
# using scrapbook.
# NOTE(review): presumably so an external runner or test can read them back
# from the executed notebook — confirm.
sb.glue('res_syn', res_syn)

## Save the model

In [9]:
# NOTE: data_path is the path of the TemporaryDirectory created in the data
# loading cell, so the weights written here are removed together with that
# directory when `tmpdir` is cleaned up. Copy them elsewhere to keep them.
model_path = os.path.join(data_path, 'model')
os.makedirs(model_path, exist_ok=True)

# Path prefix handed to save_weights for the checkpoint.
checkpoint_prefix = os.path.join(model_path, 'naml_ckpt')
model.model.save_weights(checkpoint_prefix)

## Output Prediction File
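This code segment generates the prediction.zip file, in the same format as described in the MIND Competition Submission Tutorial.
Please change the MIND_type parameter to large if you want to submit your prediction to the MIND Competition. (Note: this notebook has no variable named `MIND_type`; the dataset is selected by the argument passed to `get_mind_data_set`, which is currently `'demo'`.)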

In [10]:
# Run the fast evaluation path on the validation split and unpack its three
# results: impression indexes, labels and predicted scores, grouped per impression.
fast_eval_result = model.run_fast_eval(valid_news_file, valid_behaviors_file)
group_impr_indexes, group_labels, group_preds = fast_eval_result

18709it [00:15, 1173.31it/s]
7523it [00:24, 310.69it/s]
7538it [00:00, 13872.37it/s]


In [11]:
# Write one line per impression: "<impression id> [r1,r2,...]", where r_k is the
# rank of the k-th candidate (1 = highest predicted score).
prediction_txt = os.path.join(data_path, 'prediction.txt')
with open(prediction_txt, 'w') as pred_out:
    for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
        # The id written to the file is the impression index shifted up by one.
        # NOTE(review): presumably because the submission format numbers
        # impressions from 1 — confirm.
        impr_index = impr_index + 1
        # Candidate positions ordered from highest to lowest score ...
        descending = np.argsort(preds)[::-1]
        # ... inverted, so entry k holds the 1-based rank of candidate k.
        ranks = (np.argsort(descending) + 1).tolist()
        rank_field = '[' + ','.join(map(str, ranks)) + ']'
        pred_out.write(str(impr_index) + ' ' + rank_field + '\n')

7538it [00:00, 100950.10it/s]


In [12]:
# Bundle prediction.txt into prediction.zip (deflate-compressed), stored at the
# archive root under the name 'prediction.txt'.
# The archive is opened as a context manager so it is finalised and closed even
# if write() raises; the manual open/close pair would otherwise leak the handle
# and could leave a truncated zip behind.
with zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED) as f:
    f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')

In [13]:
# Display the temporary directory path that holds the downloaded data, the
# saved weights and the prediction files.
data_path

'/tmp/tmpmjvrjmqp'