<a href="https://colab.research.google.com/github/wendy60/Hybrid-recommender-system/blob/second-submit/LSTUR_Neural_News_Recommendation_with_Long_and_Short_term_User_Representations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

I use an open-source project from GitHub, so I need to declare the copyright for each model. I use the public MIND dataset and Microsoft's Python package `recommenders`.

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# **Global settings and imports**

In [None]:
pip install recommenders

Collecting recommenders
  Downloading recommenders-0.7.0-py3-none-manylinux1_x86_64.whl (314 kB)
[K     |████████████████████████████████| 314 kB 5.5 MB/s 
[?25hCollecting pydocumentdb>=2.3.3<3
  Downloading pydocumentdb-2.3.5-py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 2.4 MB/s 
Collecting cornac<2,>=1.1.2
  Downloading cornac-1.14.1-cp37-cp37m-manylinux1_x86_64.whl (12.4 MB)
[K     |████████████████████████████████| 12.4 MB 63.7 MB/s 
Collecting memory-profiler<1,>=0.54.0
  Downloading memory_profiler-0.58.0.tar.gz (36 kB)
Collecting scikit-surprise<=1.1.1,>=0.19.1
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 39.3 MB/s 
Collecting transformers<5,>=2.5.0
  Downloading transformers-4.12.4-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 53.4 MB/s 
[?25hCollecting pyyaml<6,>=5.4.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |███████████

In [None]:
pip install tensorflow-gpu==1.15.2

Collecting tensorflow-gpu==1.15.2
  Downloading tensorflow_gpu-1.15.2-cp37-cp37m-manylinux2010_x86_64.whl (410.9 MB)
[K     |████████████████████████████████| 410.9 MB 12 kB/s 
[?25hCollecting tensorboard<1.16.0,>=1.15.0
  Downloading tensorboard-1.15.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 21.4 MB/s 
Collecting tensorflow-estimator==1.15.1
  Downloading tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503 kB)
[K     |████████████████████████████████| 503 kB 62.6 MB/s 
Collecting gast==0.2.2
  Downloading gast-0.2.2.tar.gz (10 kB)
Collecting keras-applications>=1.0.8
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 8.3 MB/s 
Building wheels for collected packages: gast
  Building wheel for gast (setup.py) ... [?25l[?25hdone
  Created wheel for gast: filename=gast-0.2.2-py3-none-any.whl size=7554 sha256=b2e93466765df35b427564703decc90b445fa990926ef8f92e7c6222e69af44e
  Stored in d

In [None]:
import sys
import os
import numpy as np
import zipfile
from tqdm import tqdm
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.lstur import LSTURModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

# Report the interpreter and TensorFlow versions used for this run.
print(f"System version: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
Tensorflow version: 1.15.2


# **Prepare Parameters**

In [None]:
# Run configuration read by the later cells: number of training epochs,
# random seed handed to the model, and mini-batch size.
epochs, seed, batch_size = 5, 40, 32

# MIND dataset variant to download. Options: 'demo', 'small', 'large'.
MIND_type = 'small'

# **Download and load data**

In [None]:
# All resources live under a fresh temporary directory. The TemporaryDirectory
# object is kept in `tmpdir` so the directory stays referenced by the kernel.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

train_dir = os.path.join(data_path, 'train')
valid_dir = os.path.join(data_path, 'valid')
utils_dir = os.path.join(data_path, 'utils')

# Training split.
train_news_file = os.path.join(train_dir, 'news.tsv')
train_behaviors_file = os.path.join(train_dir, 'behaviors.tsv')

# Validation split.
valid_news_file = os.path.join(valid_dir, 'news.tsv')
valid_behaviors_file = os.path.join(valid_dir, 'behaviors.tsv')

# Auxiliary resources: word embeddings, lookup dictionaries and the YAML
# configuration file for the model.
wordEmb_file = os.path.join(utils_dir, 'embedding.npy')
userDict_file = os.path.join(utils_dir, 'uid2index.pkl')
wordDict_file = os.path.join(utils_dir, 'word_dict.pkl')
yaml_file = os.path.join(utils_dir, 'lstur.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

# Each entry: the file whose absence triggers a download, followed by the
# three arguments passed to download_deeprec_resources.
pending_downloads = (
    (train_news_file, mind_url, train_dir, mind_train_dataset),
    (valid_news_file, mind_url, valid_dir, mind_dev_dataset),
    (yaml_file, 'https://recodatasets.z20.web.core.windows.net/newsrec/', utils_dir, mind_utils),
)
for marker_file, source_url, dest_dir, resource in pending_downloads:
    if not os.path.exists(marker_file):
        download_deeprec_resources(source_url, dest_dir, resource)


100%|██████████| 17.0k/17.0k [00:00<00:00, 30.1kKB/s]
100%|██████████| 9.84k/9.84k [00:00<00:00, 21.5kKB/s]
100%|██████████| 95.0k/95.0k [00:02<00:00, 42.9kKB/s]



# **Create hyper-parameters**

In [None]:
# Keyword arguments handed to prepare_hparams alongside the YAML file:
# resource paths plus the batch size and epoch count chosen earlier.
hparam_kwargs = dict(
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    batch_size=batch_size,
    epochs=epochs,
)
hparams = prepare_hparams(yaml_file, **hparam_kwargs)

# Show the resulting hyper-parameter set.
print(hparams)

data_format=news,iterator_type=None,support_quick_scoring=True,wordEmb_file=/tmp/tmp2e6_vy6e/utils/embedding.npy,wordDict_file=/tmp/tmp2e6_vy6e/utils/word_dict.pkl,userDict_file=/tmp/tmp2e6_vy6e/utils/uid2index.pkl,vertDict_file=None,subvertDict_file=None,title_size=30,body_size=None,word_emb_dim=300,word_size=None,user_num=None,vert_num=None,subvert_num=None,his_size=50,npratio=4,dropout=0.2,attention_hidden_dim=200,head_num=4,head_dim=100,cnn_activation=relu,dense_activation=None,filter_num=400,window_size=3,vert_emb_dim=100,subvert_emb_dim=100,gru_unit=400,type=ini,user_emb_dim=50,learning_rate=0.0001,loss=cross_entropy_loss,optimizer=adam,epochs=5,batch_size=32,show_step=100000,metrics=['group_auc', 'mean_mrr', 'ndcg@5;10']


In [None]:
# Bind the MINDIterator class itself (it is not instantiated here); the name
# `iterator` is later passed to the LSTURModel constructor.
iterator = MINDIterator

# **Train the LSTUR model**

In [None]:
# Build the LSTUR model from the hyper-parameters, the iterator class and the
# fixed random seed defined in the parameter cell.
model = LSTURModel(hparams, iterator, seed=seed)

Tensor("conv1d/Relu:0", shape=(?, 30, 400), dtype=float32)
Tensor("att_layer2/Sum_1:0", shape=(?, 400), dtype=float32)


In [None]:

# Evaluate on the validation files before fit() has been called, to get a
# reference point for the metrics reported during training.
pretrain_metrics = model.run_eval(valid_news_file, valid_behaviors_file)
print(pretrain_metrics)

1326it [00:01, 704.32it/s]
2286it [00:55, 41.24it/s]
73152it [00:09, 7440.00it/s]


{'group_auc': 0.6438, 'mean_mrr': 0.2928, 'ndcg@5': 0.3229, 'ndcg@10': 0.3897}


In [None]:
%%time
# Train on the training news/behaviors files, passing the validation files as
# well; %%time reports the CPU and wall time of the whole cell.
# The call is the cell's only expression, so its return value is displayed
# after the timing output.
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

7386it [13:59,  8.79it/s]
1326it [00:01, 700.77it/s]
2286it [00:56, 40.81it/s]
73152it [00:09, 7344.38it/s]


at epoch 1
train info: logloss loss:1.2936728832410533
eval info: group_auc:0.66, mean_mrr:0.3071, ndcg@10:0.4036, ndcg@5:0.3406
at epoch 1 , train time: 839.8 eval time: 133.0


7386it [14:01,  8.78it/s]
1326it [00:01, 710.20it/s]
2286it [00:56, 40.74it/s]
73152it [00:10, 6930.21it/s]


at epoch 2
train info: logloss loss:1.269264179760287
eval info: group_auc:0.6636, mean_mrr:0.3117, ndcg@10:0.4083, ndcg@5:0.3441
at epoch 2 , train time: 841.6 eval time: 134.0


7386it [14:03,  8.76it/s]
1326it [00:01, 722.20it/s]
2286it [00:56, 40.63it/s]
73152it [00:10, 7201.88it/s]


at epoch 3
train info: logloss loss:1.2476109264989936
eval info: group_auc:0.673, mean_mrr:0.3218, ndcg@10:0.4183, ndcg@5:0.3559
at epoch 3 , train time: 843.1 eval time: 133.2


7386it [13:57,  8.82it/s]
1326it [00:01, 714.95it/s]
2286it [00:56, 40.36it/s]
73152it [00:10, 7184.20it/s]


at epoch 4
train info: logloss loss:1.2256476001754997
eval info: group_auc:0.6664, mean_mrr:0.3164, ndcg@10:0.412, ndcg@5:0.3483
at epoch 4 , train time: 837.4 eval time: 134.2


7386it [13:58,  8.81it/s]
1326it [00:01, 718.76it/s]
2286it [00:55, 41.07it/s]
73152it [00:09, 7532.17it/s]


at epoch 5
train info: logloss loss:1.19740165509798
eval info: group_auc:0.664, mean_mrr:0.3153, ndcg@10:0.4116, ndcg@5:0.3481
at epoch 5 , train time: 838.8 eval time: 131.1
CPU times: user 1h 37min 19s, sys: 5min 51s, total: 1h 43min 10s
Wall time: 1h 21min 6s


<recommenders.models.newsrec.models.lstur.LSTURModel at 0x7f2cfce507d0>

In [None]:
%%time
# Evaluate the trained model on the validation files, keep the returned
# metrics in `res_syn`, and print them.
# NOTE(review): the saved output of this cell reports 586 / 236 / 7538
# iterations, while the earlier run_eval on the same variables reports
# 1326 / 2286 / 73152, and its group_auc does not match the epoch-5 eval.
# The saved output appears to come from a different dataset or session;
# re-run the notebook top to bottom before quoting these numbers. TODO confirm.
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

586it [00:01, 535.29it/s]
236it [00:07, 31.44it/s]
7538it [00:01, 5255.15it/s]


{'group_auc': 0.6428, 'mean_mrr': 0.2985, 'ndcg@5': 0.3314, 'ndcg@10': 0.3929}
CPU times: user 21.1 s, sys: 2.81 s, total: 23.9 s
Wall time: 19 s
