In [1]:
import os
import pandas as pd
import numpy as np
import random
import pickle
import torch

import warnings
warnings.filterwarnings('ignore')

%config Completer.use_jedi = False

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [7]:
df = pd.read_csv('./data/traindata_demo.csv')
df.dropna(subset=['sapo'], inplace=True)
df

Unnamed: 0,uid,articleID,sapo
0,1002681074,4243622,Galaxy S21 Ultra được trang bị pin 5.000 mAh n...
1,1002681074,4243459,"Thấy nòng súng súng thò ra khỏi cửa, nữ cảnh s..."
2,1002681074,4238172,"Sau khi có chức vô địch Australia Mở rộng, Nov..."
3,1002681074,4237883,Novak Djokovic đạt tỷ lệ thắng 81% trước các đ...
4,1002681074,4231206,Một người không thể đi hơn một xe máy ra đường...
...,...,...,...
2726,1003347668,4154347,Trên chương trình El Larguero của đài Cadena S...
2727,1003347668,4154384,Tiền đạo Robert Lewandowski nói anh xứng đáng ...
2728,1003347668,4155165,Hầu hết những chỉ số tấn công của Lionel Messi...
2729,1003347668,4155219,Indonesia


In [8]:
df.uid.unique().shape

(19,)

In [97]:
tmp = df.groupby('uid')
len_group = []
for name, group in tmp:
    len_group.append(group['articleID'].count())

In [110]:
baskets = pd.DataFrame()
baskets['leng'] = len_group
baskets[baskets.leng <10].count()

leng    2289
dtype: int64

## Preprocessing

In [9]:
# lấy user mà clicked articleID => article khác của user khác
def get_negative_sample(articleID, df, npratio=4):
    uid = df[df.articleID==articleID].uid.values[0]
    tmp_df = df[df.uid != uid].sample(4)
    return [articleID] + list(tmp_df.articleID)

In [7]:
def preprocess_clicked(df, npratio=4):
    uids = df.uid.unique()
    user_count = len(uids)
    userid_dict = {}
    for uid in uids:
        if uid not in userid_dict:
            userid_dict[uid] = len(userid_dict) # map uid_raw -> 0 1 2

    all_train_id = []
    all_train_pn = []
    all_label = []
    
    all_test_id = []
    all_test_pn = []
    all_test_label = []
    all_test_index = []
    
    all_user_pos = []
    all_test_user_pos = []

    for uid in uids:
        tmp_df = df[df.uid==uid] # df of uid
        clicked_news = tmp_df.articleID.values
        clicked_news = set(clicked_news)  # get all unique article which user clicked
        
        for idx in range(len(tmp_df)-1):
            line = tmp_df.iloc[idx]
            all_train_pn.append(get_negative_sample(line.articleID, df))
            all_label.append([1,0,0,0,0])
            all_train_id.append(userid_dict[uid])

            remain_clicked = list(clicked_news - set([line.articleID]))
            remain_clicked = random.sample(remain_clicked, min(50, len(remain_clicked)))
            remain_clicked += [0] * (50-len(remain_clicked)) # <50 cho = 0
            all_user_pos.append(remain_clicked)
        
        # get the last line for testing
        sess_index = []
        sess_index.append(len(all_test_pn))

        line = tmp_df.iloc[-1]
        all_test_pn += get_negative_sample(line.articleID, df)
        sess_index.append(len(all_test_pn))
        all_test_index.append(sess_index)
        all_test_label += [1,0,0,0,0]
        all_test_id += [userid_dict[uid]] * (npratio+1)
        allpos = random.sample(clicked_news, min(50, len(clicked_news)))
        allpos += [0] * (50-len(allpos))
        for i in range(5):
            all_test_user_pos.append(allpos)
    
    all_train_pn = np.array(all_train_pn,dtype='int32')
    all_label = np.array(all_label,dtype='int32')
    all_train_id = np.array(all_train_id,dtype='uint64')
    all_test_pn = np.array(all_test_pn,dtype='int32')
    all_test_label = np.array(all_test_label,dtype='int32')
    all_test_id = np.array(all_test_id,dtype='uint64')
    all_user_pos = np.array(all_user_pos,dtype='int32')
    all_test_user_pos = np.array(all_test_user_pos, dtype='int32')

    return (userid_dict, user_count, all_train_pn, all_label, all_train_id, all_test_pn, all_test_label, all_test_id, all_user_pos, all_test_user_pos, all_test_index)

In [8]:
%%time
userid_dict,user_count, all_train_pn, all_label, all_train_id, all_test_pn, all_test_label, all_test_id, all_user_pos, all_test_user_pos, all_test_index = preprocess_clicked(df)

CPU times: user 4.5 s, sys: 11.7 ms, total: 4.51 s
Wall time: 4.51 s


In [9]:
dataloader = (user_count, all_train_pn, all_label, all_train_id, all_test_pn, all_test_label, all_test_id, all_user_pos, all_test_user_pos, all_test_index)

file = open('./model/thuytt_ver2/dataloader.pkl', 'wb')
pickle.dump(dataloader, file)
file.close()

In [10]:
file = open('./model/thuytt_ver2/dataloader.pkl', 'rb')
user_count, all_train_pn, all_label, all_train_id, all_test_pn, all_test_label, all_test_id, all_user_pos, all_test_user_pos, all_test_index = pickle.load(file)
file.close()

## Tokenizer and make word_dict of articles

In [11]:
from vncorenlp import VnCoreNLP
VnCoreNLP_jar_file = '../vncorenlp/VnCoreNLP-1.1.1.jar'
rdrsegmenter = VnCoreNLP(VnCoreNLP_jar_file, annotators='wseg')
embedding_dim=768

In [12]:
rdrsegmenter.tokenize('Tôi là sinh viên trường đại học Công nghệ.')[0]

['Tôi', 'là', 'sinh_viên', 'trường', 'đại_học', 'Công_nghệ', '.']

In [14]:
def preprocess_news(df):
    sapos = df.sapo.values
    articleIds = df.articleID.values

    news = {} 

    for i in range(len(articleIds)):
        if articleIds[i] not in news:
            tokenized_words = rdrsegmenter.tokenize(sapos[i])[0]
            news[articleIds[i]] = tokenized_words

    
    word_dict_raw = {'PADDING': [0,999999]}
    for articleId in news:
        for word in news[articleId]:
            if word in word_dict_raw:
                word_dict_raw[word][1] += 1 # increase freq
            else:
                word_dict_raw[word] = [len(word_dict_raw), 1] # format: [index, freq]
                
    word_dict = {}
    for i in word_dict_raw:
        if word_dict_raw[i][1] >= 2:
            word_dict[i] = [len(word_dict), word_dict_raw[i][1]]
    print('len word_dict (freq>=2 vs raw):', len(word_dict), len(word_dict_raw)) # chỉ để so sánh (loại bỏ freq =1)
    
    print('leng news (tokenizer):',len(news))

    news_words = [ [0]*30 ] # 
    news_index = {0:0}
    
    for articleId in news: # quét các article
        word_id = []
        news_index[articleId] = len(news_index)
        for word in news[articleId]: # quét các tokens
            if word in word_dict:
                word_id.append(word_dict[word][0])
        word_id = word_id[:30] # lấy word_id của article (embedd)
        news_words.append(word_id + [0]*(30-len(word_id))) # max 30 tokens, <30 cho =0
    
    news_words = np.array(news_words, dtype='int32')

    return word_dict, news_words, news_index, news

In [15]:
%%time
word_dict, news_words, news_index, news = preprocess_news(df)

len word_dict (freq>=2 vs raw): 3111 7097
leng news (tokenizer): 2279
CPU times: user 4.81 s, sys: 268 ms, total: 5.07 s
Wall time: 8.48 s


In [16]:
file = open('./model/thuytt_ver2/phobert_news_preprocess.pkl', 'wb')
pickle.dump((word_dict, news_words, news_index, news), file)
file.close()

In [5]:
file = open('./model/thuytt_ver2/phobert_news_preprocess.pkl', 'rb')
word_dict, news_words, news_index, news = pickle.load(file)
file.close()

## Phobert + embedding_matrix

In [19]:
import argparse

# from transformers import RobertaConfig, RobertaModel

from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary

In [20]:
vocab = Dictionary()
vocab.add_from_file('../PhoBERT_base_transformers/dict.txt')

In [21]:
# load bert model
from fairseq.models.roberta import RobertaModel
import os
phobert = RobertaModel.from_pretrained('../PhoBERT_base_fairseq', checkpoint_file='model.pt')
phobert.eval()  # disable dropout (or leave in train mode to finetune)

# Incorporate the BPE encoder into PhoBERT-base 
from fairseq.data.encoders.fastbpe import fastBPE  
from fairseq import options  
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="../PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
phobert.bpe = fastBPE(args) #Incorporate the BPE encoder into PhoBERT

In [22]:
def get_embedding(word_dict):
    embedding_dict = {}

    for word in word_dict:
        input_ids = vocab.encode_line(word, append_eos=False, add_if_not_exist=False).long()
        embedding_tensor = phobert.extract_features(input_ids)
        embedding_dict[word] = embedding_tensor.data.cpu().numpy()[0][0]
        
    embedding_matrix = [0]*len(word_dict)
    cand = []

    for i in embedding_dict:
        embedding_matrix[word_dict[i][0]] = np.array(embedding_dict[i], dtype='float32')
        cand.append(embedding_matrix[word_dict[i][0]])
    
    cand = np.array(cand, dtype='float32')
    mu = np.mean(cand, axis=0)
    Sigma = np.cov(cand.T)
    norm = np.random.multivariate_normal(mu, Sigma, 1)

    for i in range(len(embedding_matrix)):
        if type(embedding_matrix[i]) == int: # unknown words
            embedding_matrix[i] = np.reshape(norm, embedding_dim)
    
    embedding_matrix[0] = np.zeros(embedding_dim, dtype='float32')
    embedding_matrix = np.array(embedding_matrix, dtype='float32')

    print(embedding_matrix.shape)
    return embedding_matrix

In [23]:
%%time
embedding_mat = get_embedding(word_dict)

(3111, 768)
CPU times: user 32min 14s, sys: 1min 1s, total: 33min 16s
Wall time: 1min 23s


In [24]:
file = open('./model/thuytt_ver2/phobert_embed_mat.pkl', 'wb')
pickle.dump(embedding_mat, file)
file.close()

In [6]:
file = open('./model/thuytt_ver2/phobert_embed_mat.pkl', 'rb')
embedding_mat = pickle.load(file)
file.close()
embedding_dim=768

## Load data and model

In [3]:
# load thuytt_ver2
working_dir = '/home/thuytt/test_bert/NCKH/model/thuytt_ver2/'
os.chdir(working_dir)

file = open('dataloader.pkl', 'rb')
user_count, all_train_pn, all_label, all_train_id, all_test_pn, all_test_label, all_test_id, all_user_pos, all_test_user_pos, all_test_index = pickle.load(file)
file.close()

file = open('phobert_news_preprocess.pkl', 'rb')
word_dict, news_words, news_index, news = pickle.load(file)
file.close()

file = open('phobert_embed_mat.pkl', 'rb')
embedding_mat = pickle.load(file)
file.close()
embedding_dim=768

In [4]:
all_user_pos.shape

(2598, 50)

In [14]:
all_test_user_pos.shape

(95, 50)

# Model

## Metric

In [6]:
def dcg_score(y_true, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10):
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / best


def mrr_score(y_true, y_score):
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order)
    rr_score = y_true / (np.arange(len(y_true)) + 1)
    return np.sum(rr_score) / np.sum(y_true)

In [7]:
def _articles_to_index(arr_np):
    arr_index_articles = []
    for arr_articles in arr_np:
        index_article = []
        for article in arr_articles:
            index_article.append(news_index[article])
        arr_index_articles.append(index_article)
    arr_index_articles = np.array(arr_index_articles, dtype='int32')
    return arr_index_articles

## generate batch

In [8]:
# test generate_batch_data_train
batch_size = 32
inputid = np.arange(len(all_label))
np.random.shuffle(inputid)
y=all_label
batches = [inputid[range(batch_size*i, min(len(y), batch_size*(i+1)))] for i in range(len(y)//batch_size+1)]
for i in batches:
    index_all_train_pn = _articles_to_index(all_train_pn[i])
    candidate = news_words[index_all_train_pn]
    candidate_split=[candidate[:,k,:] for k in range(candidate.shape[1])]
    
    #
    index_all_user_pos = _articles_to_index(all_user_pos[i])
    browsed_news=news_words[index_all_user_pos]
    browsed_news_split=[browsed_news[:,k,:] for k in range(browsed_news.shape[1])]
    userid=np.expand_dims(all_train_id[i],axis=1)
    label=all_label[i]

In [9]:
def generate_batch_data_train(all_train_pn,all_label,all_train_id,all_user_pos,batch_size):
    inputid = np.arange(len(all_label))
    np.random.shuffle(inputid)
    y=all_label
    batches = [inputid[range(batch_size*i, min(len(y), batch_size*(i+1)))] for i in range(len(y)//batch_size+1)]

    while (True):
        for i in batches:
            if(i.size ==0):
                continue
            index_all_train_pn = _articles_to_index(all_train_pn[i])
            candidate = news_words[index_all_train_pn]
            candidate_split=[candidate[:,k,:] for k in range(candidate.shape[1])]

            #
            index_all_user_pos = _articles_to_index(all_user_pos[i])
            browsed_news=news_words[index_all_user_pos]
            browsed_news_split=[browsed_news[:,k,:] for k in range(browsed_news.shape[1])]
            userid=np.expand_dims(all_train_id[i],axis=1)
            label=all_label[i]
            yield (candidate_split +browsed_news_split+[userid], label)

In [4]:
# test generate_batch_data_test
inputid = np.arange(len(all_test_label))
y=all_test_label
batch_size=6
batches = [inputid[range(batch_size*i, min(len(y), batch_size*(i+1)))] for i in range(len(y)//batch_size+1)]

for i in batches:
    index_all_test_pn = [news_index[x] for x in all_test_pn[i]]
    candidate = news_words[index_all_test_pn]
    
    tmp = _articles_to_index(all_test_user_pos[i])
    browsed_news=news_words[_articles_to_index(all_test_user_pos[i])]
    browsed_news_split=[browsed_news[:,k,:] for k in range(browsed_news.shape[1])]
    userid=np.expand_dims(all_test_id[i],axis=1)
    label=all_test_label[i]

NameError: name 'np' is not defined

In [10]:
def generate_batch_data_test(all_test_pn,all_test_label,all_test_id,all_test_user_pos,batch_size):
    inputid = np.arange(len(all_test_label))
    y=all_test_label
    batches = [inputid[range(batch_size*i, min(len(y), batch_size*(i+1)))] for i in range(len(y)//batch_size+1)]

    while (True):
        for i in batches:
            if(i.size ==0):
                continue
            index_all_test_pn = [news_index[x] for x in all_test_pn[i]]
            candidate = news_words[index_all_test_pn]

            browsed_news=news_words[_articles_to_index(all_test_user_pos[i])]
            browsed_news_split=[browsed_news[:,k,:] for k in range(browsed_news.shape[1])]
            userid=np.expand_dims(all_test_id[i],axis=1)
            label=all_test_label[i]

            yield ([candidate]+ browsed_news_split+[userid], label)

## Keras model

In [11]:
import keras
from keras.layers import *
from keras.models import Model
from keras import backend as K
from keras.optimizers import *
from sklearn.metrics import roc_auc_score
from keras.callbacks import ModelCheckpoint
import tensorflow as tf

Using TensorFlow backend.


In [12]:
MAX_SENT_LENGTH=30
MAX_SENTS=50 # maximum clicked news for user embedding
npratio=4
batch_size=32
n_epoch=100

In [13]:
def seed_everything(SEED):
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    tf.compat.v1.set_random_seed(SEED)
    
seed_everything(42)

In [14]:
results=[]


# user embedding
user_id = Input(shape=(1,), dtype='uint64')

user_embedding_layer= Embedding(user_count, MAX_SENTS, trainable=True) #ouput leng == max clicked_news
user_embedding= user_embedding_layer(user_id)
user_embedding_word= Dense(200,activation='relu')(user_embedding)
user_embedding_word= Flatten()(user_embedding_word)
user_embedding_news= Dense(200,activation='relu')(user_embedding)
user_embedding_news= Flatten()(user_embedding_news)


# news embedding architecture
news_input = Input(shape=(MAX_SENT_LENGTH,), dtype='k')
embedding_layer = Embedding(len(word_dict) , embedding_dim, weights=[embedding_mat],trainable=True)
embedded_sequences = embedding_layer(news_input)
embedded_sequences =Dropout(0.2)(embedded_sequences)

cnnouput = Conv1D(padding='same', activation='relu', strides=1, filters=embedding_dim, kernel_size=3)(embedded_sequences)
cnnouput=Dropout(0.2)(cnnouput)

attention_a = Dot((2, 1))([cnnouput, Dense(embedding_dim,activation='tanh')(user_embedding_word)])
attention_weight = Activation('softmax')(attention_a)
news_rep=keras.layers.Dot((1, 1))([cnnouput, attention_weight])
newsEncoder = Model([news_input,user_id], news_rep)


# clicked news embedding
all_news_input = [keras.Input((MAX_SENT_LENGTH,), dtype='int32') for _ in range(MAX_SENTS)]
browsed_news_rep = [newsEncoder([news,user_id]) for news in all_news_input]
browsed_news_rep = concatenate([Lambda(lambda x: K.expand_dims(x,axis=1))(news) for news in browsed_news_rep],axis=1)


# User Embedding
attention_news = keras.layers.Dot((2, 1))([browsed_news_rep, Dense(embedding_dim,activation='tanh')(user_embedding_news)])
attention_weight_news = Activation('softmax')(attention_news)
user_rep=keras.layers.Dot((1, 1))([browsed_news_rep, attention_weight_news])


# candidate news embedding
candidates = [keras.Input((MAX_SENT_LENGTH,), dtype='int32') for _ in range(1+npratio)]
candidate_vecs = [ newsEncoder([candidate,user_id]) for candidate in candidates]

# Click Predictor???
logits = [keras.layers.dot([user_rep, candidate_vec], axes=-1) for candidate_vec in candidate_vecs]
logits = keras.layers.Activation(keras.activations.softmax)(keras.layers.concatenate(logits))


model = Model(candidates+all_news_input+[user_id], logits)
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.00025), metrics=['acc'])


candidate_one = keras.Input((MAX_SENT_LENGTH,))
candidate_one_vec = newsEncoder([candidate_one,user_id])
score = keras.layers.Activation(keras.activations.sigmoid)(keras.layers.dot([user_rep, candidate_one_vec], axes=-1))
model_test = keras.Model([candidate_one]+all_news_input+[user_id], score)

In [15]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            (None, 30)           0                                            
____________________________________________________________________________________________

In [500]:
inputid = np.arange(len(all_test_label))
y=all_test_label
batch_size=100
batches = [inputid[range(batch_size*i, min(len(y), batch_size*(i+1)))] for i in range(len(y)//batch_size+1)]
# print(batches)

for i in batches:
    if(i.size ==0):
        continue
    index_all_test_pn = [news_index[x] for x in all_test_pn[i]]
    candidate = news_words[index_all_test_pn]

    browsed_news=news_words[_articles_to_index(all_test_user_pos[i])]
    browsed_news_split=[browsed_news[:,k,:] for k in range(browsed_news.shape[1])]
    userid=np.expand_dims(all_test_id[i],axis=1)
    label=all_test_label[i]


In [501]:
print(browsed_news_split[0].shape)
print(len(browsed_news_split))
print(browsed_news.shape)
print(batches[0])
browsed_news_split[5]

(95, 30)
50
(95, 50, 30)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94]


array([[ 366,  367,  204, ...,    0,    0,    0],
       [ 366,  367,  204, ...,    0,    0,    0],
       [ 366,  367,  204, ...,    0,    0,    0],
       ...,
       [2473,  296,   38, ...,    0,    0,    0],
       [2473,  296,   38, ...,    0,    0,    0],
       [2473,  296,   38, ...,    0,    0,    0]], dtype=int32)

In [509]:
# test generate_batch_data_train
batch_size = 100
inputid = np.arange(len(all_label))
np.random.shuffle(inputid)
y=all_label
batches = [inputid[range(batch_size*i, min(len(y), batch_size*(i+1)))] for i in range(len(y)//batch_size+1)]
for i in batches:
    index_all_train_pn = _articles_to_index(all_train_pn[i])
    candidate = news_words[index_all_train_pn]
    candidate_split=[candidate[:,k,:] for k in range(candidate.shape[1])]
    
    #
    index_all_user_pos = _articles_to_index(all_user_pos[i])
    browsed_news=news_words[index_all_user_pos]
    browsed_news_split=[browsed_news[:,k,:] for k in range(browsed_news.shape[1])]
    userid=np.expand_dims(all_train_id[i],axis=1)
    label=all_label[i]

[  15 2448  584 1328 2345    8 2415 1633    2 1908 1498 2425  606  761
  129  778  228  884  341 1009 2261 2408 2364  825 1882 1133 2124 2521
 1217 1851 1586 2144 1569 1694 2318 1182 1450 2055 2188  434 2140  166
 1919 2192 2447 1784  649 1465   34 1464 1527 1726 1889  669 1421 2270
 1933 2285 2187  863  776 1347 2111 2122 2074   98 2386 2368 2269   50
 1879  112 1302 1249  190 1966  200 1669  941  868  621 1474  543 2566
 1881    6 2171  998 1378 1045 1135 1068 2359 2350  484 1103  694  957
 1481 1778]
[1535 2339  164 1468 2580  155 1764  174  798 1487 1544 2556 1444  466
 1979  435 2342 1235  388 2248 1855 1070 2541 1094 1159  842 1136  978
 1360 2575 1111 1309  807 2589 1695  331  726  795    1 2097  461 1529
 2532 1300  492  302 1453  206 1040  512  975 1197  883 1073 2467  749
 2578  233 2087 2010  305 1252 1559 2510 2139 2059 2597 1256 1191 2189
 1396 2117 2041 1654 1290 2115 1429  121  712 1575 2070   84 1000  722
  561 1166  517 1840  973 2217 1198  494  697 1348 1162  324  165

In [513]:
print(len(y))
print(len(all_user_pos))
print(len(all_train_pn))
print(browsed_news_split[0].shape)
print(len(browsed_news_split))
print(browsed_news.shape)
print(len(batches))
print((batches[0]))
browsed_news_split[5]

2598
2598
2598
(98, 30)
50
(98, 50, 30)
26
[  15 2448  584 1328 2345    8 2415 1633    2 1908 1498 2425  606  761
  129  778  228  884  341 1009 2261 2408 2364  825 1882 1133 2124 2521
 1217 1851 1586 2144 1569 1694 2318 1182 1450 2055 2188  434 2140  166
 1919 2192 2447 1784  649 1465   34 1464 1527 1726 1889  669 1421 2270
 1933 2285 2187  863  776 1347 2111 2122 2074   98 2386 2368 2269   50
 1879  112 1302 1249  190 1966  200 1669  941  868  621 1474  543 2566
 1881    6 2171  998 1378 1045 1135 1068 2359 2350  484 1103  694  957
 1481 1778]


array([[  97,    4,   29, ...,    0,    0,    0],
       [   4,  110,   61, ...,    0,    0,    0],
       [1681,  533,   24, ...,    0,    0,    0],
       ...,
       [ 158,    0,    0, ...,    0,    0,    0],
       [ 171,  172,    0, ...,    0,    0,    0],
       [2133, 1564,  344, ...,    0,    0,    0]], dtype=int32)

In [515]:
click_score.shape

(95, 1)

In [64]:
batch_size = 100
testgen=generate_batch_data_test(all_test_pn, all_test_label, all_test_id, all_test_user_pos, batch_size)
click_score = model_test.predict_generator(testgen, steps=len(all_test_id)//batch_size, verbose=1)

AttributeError: 'int' object has no attribute 'assign'

In [443]:
news_rec = all_test_pn[click_score.argmax()]
df[df.articleID== news_rec]

Unnamed: 0,uid,articleID,sapo
64,1002681074,4102465,Mi Smart Compact Projector hỗ trợ kích thước t...


In [403]:
all_test_pn.shape
all_test_user_pos.shape
# all_test_label[0:5]
# all_test_id[0:5]

(95, 50)

In [419]:
userid_dict

{1002681074: 0,
 1027182531: 1,
 1004465291: 2,
 1045106859: 3,
 1039533691: 4,
 1065193564: 5,
 1065294702: 6,
 1044039998: 7,
 1007658224: 8,
 1039926550: 9,
 1005981237: 10,
 1054047771: 11,
 1061488042: 12,
 1002624136: 13,
 1062874128: 14,
 1049360283: 15,
 1026748786: 16,
 1013734256: 17,
 1003347668: 18}

In [427]:
all_test_pn[0]

4102465

In [441]:
df[(df.uid==1027182531)]

Unnamed: 0,uid,articleID,sapo
65,1027182531,4243840,Mohamed Salah liên tục lắc đầu và tỏ vẻ ngán n...
66,1027182531,4243738,Liverpool là đội vô địch đầu tiên thua liền nă...
67,1027182531,4243397,"Theo tay vợt số hai thế giới Rafael Nadal, phá..."
68,1027182531,4243189,Scotland
69,1027182531,4242602,HLV Ronald Koeman muốn gia hạn hợp đồng với Ba...
...,...,...,...
2085,1027182531,4135545,HLV Liverpool Jurgen Klopp tuyên bố đứng ngoài...
2086,1027182531,4135545,HLV Liverpool Jurgen Klopp tuyên bố đứng ngoài...
2093,1027182531,4135808,HLV Jurgen Klopp bất bình khi đồng nghiệp bên ...
2094,1027182531,4135810,Ngoại hạng Anh mùa 2020-2021 bắt đầu từ ngày 1...


In [428]:
df[df.articleID == all_test_pn[0]]

Unnamed: 0,uid,articleID,sapo
64,1002681074,4102465,Mi Smart Compact Projector hỗ trợ kích thước t...


In [426]:
click_score.argmax()

0

In [451]:
tmp = next(generate_batch_data_test(all_test_pn, all_test_label, all_test_id[0:5], all_test_user_pos, batch_size))
print(len(tmp[0]))

52


In [546]:
tmp1 = next(generate_batch_data_train(all_train_pn, all_label, all_train_id, all_user_pos, batch_size))
print(len(tmp1[0]))

56


In [454]:
tmp1[0][23][0]

array([171, 172,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0], dtype=int32)

In [152]:
tmp[0][50][0]

array([475, 486, 273, 273, 139, 314, 487, 255, 488,  17,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0], dtype=int32)

## Train

In [13]:
# metrhistoryt of [auc, mrr, ndcg@5, ndcg@10]
best_metric = [0., 0., 0., 0.]

In [1]:
# test
batch_size = 32
for ep in range(2):
    traingen=generate_batch_data_train(all_train_pn, all_label, all_train_id, all_user_pos, batch_size)
    model.fit_generator(traingen, epochs=1, steps_per_epoch=25)
    testgen=generate_batch_data_test(all_test_pn, all_test_label, all_test_id, all_test_user_pos, batch_size)
    click_score = model_test.predict_generator(testgen, steps=len(all_test_id)//batch_size, verbose=1)

    all_auc=[]
    all_mrr=[]
    all_ndcg=[]
    all_ndcg2=[]
    for m in all_test_index:
        if np.sum(all_test_label[m[0]:m[1]])!=0 and m[1]<len(click_score):
            all_auc.append(roc_auc_score(all_test_label[m[0]:m[1]], click_score[m[0]:m[1],0]))
            all_mrr.append(mrr_score(all_test_label[m[0]:m[1]], click_score[m[0]:m[1],0]))
            all_ndcg.append(ndcg_score(all_test_label[m[0]:m[1]], click_score[m[0]:m[1],0],k=5))
            all_ndcg2.append(ndcg_score(all_test_label[m[0]:m[1]], click_score[m[0]:m[1],0],k=10))
    results.append([np.mean(all_auc),np.mean(all_mrr),np.mean(all_ndcg),np.mean(all_ndcg2)])
    
    metric = [np.mean(all_auc),np.mean(all_mrr),np.mean(all_ndcg),np.mean(all_ndcg2)]
    if metric[0] > best_metric[0] and np.mean(metric) > np.mean(best_metric):
        best_metric = metric
        print('Best model: True')
        print(metric)
    else:
        print('Best model: False')
        print(f'AUC: {metric[0]:.4f}')

NameError: name 'n_epoch' is not defined

In [534]:
click_score.shape

(64, 1)

In [526]:
print(m)
print(all_test_label[m[0]:m[1]])
print(click_score[m[0]:m[1]])

[90, 95]
[1 0 0 0 0]
[]


In [15]:
# full n_ecoch
for ep in range(n_epoch):
    traingen=generate_batch_data_train(all_train_pn, all_label, all_train_id, all_user_pos, batch_size)
    model.fit_generator(traingen, epochs=1, steps_per_epoch=len(all_train_id)//batch_size)
    testgen=generate_batch_data_test(all_test_pn, all_test_label, all_test_id, all_test_user_pos, batch_size)
    click_score = model_test.predict_generator(testgen, steps=len(all_test_id)//batch_size, verbose=1)

    all_auc=[]
    all_mrr=[]
    all_ndcg=[]
    all_ndcg2=[]
    for m in all_test_index:
        if np.sum(all_test_label[m[0]:m[1]])!=0 and m[1]<len(click_score):
            all_auc.append(roc_auc_score(all_test_label[m[0]:m[1]], click_score[m[0]:m[1],0]))
            all_mrr.append(mrr_score(all_test_label[m[0]:m[1]], click_score[m[0]:m[1],0]))
            all_ndcg.append(ndcg_score(all_test_label[m[0]:m[1]], click_score[m[0]:m[1],0],k=5))
            all_ndcg2.append(ndcg_score(all_test_label[m[0]:m[1]], click_score[m[0]:m[1],0],k=10))
    results.append([np.mean(all_auc),np.mean(all_mrr),np.mean(all_ndcg),np.mean(all_ndcg2)])
    
    metric = [np.mean(all_auc),np.mean(all_mrr),np.mean(all_ndcg),np.mean(all_ndcg2)]
    if metric[0] > best_metric[0] and np.mean(metric) > np.mean(best_metric):
        best_metric = metric
        print('Best model: True')
        print(metric)
    else:
        print('Best model: False')
        print(f'AUC: {metric[0]:.4f}')

Epoch 1/1
Best model: True
[0.3958333333333333, 0.35000000000000003, 0.5084868257705297, 0.5084868257705297]
Epoch 1/1
Best model: True
[0.6041666666666666, 0.5236111111111111, 0.6418834579783793, 0.6418834579783793]
Epoch 1/1
Best model: False
AUC: 0.3750
Epoch 1/1
Best model: True
[0.6666666666666666, 0.5583333333333333, 0.669482037067506, 0.669482037067506]
Epoch 1/1
Best model: False
AUC: 0.6042
Epoch 1/1
Best model: False
AUC: 0.3750
Epoch 1/1
Best model: False
AUC: 0.3750
Epoch 1/1
Best model: False
AUC: 0.5417
Epoch 1/1
Best model: False
AUC: 0.3958
Epoch 1/1
Best model: False
AUC: 0.6250
Epoch 1/1
Best model: False
AUC: 0.6250
Epoch 1/1
Best model: False
AUC: 0.6042
Epoch 1/1
Best model: False
AUC: 0.5417
Epoch 1/1
Best model: False
AUC: 0.5208
Epoch 1/1
Best model: False
AUC: 0.5208
Epoch 1/1
Best model: False
AUC: 0.5625
Epoch 1/1
Best model: False
AUC: 0.5000
Epoch 1/1
Best model: True
[0.7291666666666666, 0.6416666666666667, 0.7316368389957429, 0.7316368389957429]
Epoch 1/1

Best model: False
AUC: 0.7604
Epoch 1/1
Best model: False
AUC: 0.8021
Epoch 1/1
Best model: False
AUC: 0.7604
Epoch 1/1
Best model: False
AUC: 0.7396
Epoch 1/1
Best model: False
AUC: 0.7917
Epoch 1/1
Best model: False
AUC: 0.7396
Epoch 1/1
Best model: False
AUC: 0.7812
Epoch 1/1
Best model: False
AUC: 0.7812
Epoch 1/1
Best model: False
AUC: 0.7708
Epoch 1/1
Best model: False
AUC: 0.7292
Epoch 1/1
Best model: False
AUC: 0.7708


In [17]:
model_dict = {
    'model' : model,
    'model_test' : model_test
}
file = open('model_ver1.pkl', 'wb')
pickle.dump(model_dict, file)
file.close()

In [18]:
!ls

dataloader.pkl	phobert_embed_mat.pkl
model_ver1.pkl	phobert_news_preprocess.pkl
