In [6]:
from transformers import AutoModel,BertModel

# Load the pretrained `bert-base-uncased` checkpoint into a bare BertModel.
# The "weights ... were not used" message recorded below lists only `cls.*`
# checkpoint parameters that BertModel does not load.
b = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Display the embedding sub-module of the loaded model (word, position and
# token-type embeddings followed by LayerNorm and dropout, per the recorded repr).
b.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
import numpy as np
import re
from torch.utils.data import Dataset
from utils.utils import newsample, getId2idx, my_collate
from data.configs.demo import config
from torch.utils.data import DataLoader

In [12]:
class MIND_bert(Dataset):
    """ Map Style Dataset for MIND, use bert tokenizer

    Every news document (title, abstract, category and subcategory joined by spaces)
    is tokenized once with the ``bert-base-uncased`` tokenizer when the dataset is
    built; ``__getitem__`` only gathers rows of the resulting token-id and
    attention-mask matrices.

    Args:
        config: pre-defined hyper parameters, read by attribute access
            (npratio, batch_size, title_size, abs_size, his_size, k, scale)
        news_file(str): path of news_file; must match ``MIND/.*_(.*)/news`` because
            the mode (train/dev/test) is taken from the directory name
        behaviors_file(str): path of behaviors_file
        shuffle_pos(bool): whether to shuffle the position of the clicked news among
            the candidates of a training sample
        validate(bool): if True, behaviors are parsed and served in dev mode
            regardless of the mode found in news_file

    Raises:
        ValueError: if the mode cannot be inferred from news_file, or the inferred
            mode is not one of train/dev/test
    """

    def __init__(self, config, news_file, behaviors_file, shuffle_pos=False, validate=False):
        # imported here rather than at file level
        from transformers import BertTokenizerFast

        self.npratio = config.npratio
        self.shuffle_pos = shuffle_pos

        self.news_file = news_file
        self.behaviors_file = behaviors_file
        self.col_spliter = '\t'
        self.batch_size = config.batch_size
        self.title_size = config.title_size
        self.abs_size = config.abs_size
        self.his_size = config.his_size

        self.k = config.k

        # the mode is the suffix of the dataset directory, e.g. MIND/MINDdemo_train/news.tsv -> train
        mode_match = re.search('MIND/.*_(.*)/news', news_file)
        if mode_match is None:
            raise ValueError(
                "Cannot infer mode from news_file {!r}, expected a path like "
                "'MIND/<name>_<mode>/news.tsv'".format(news_file))
        self.mode = mode_match.group(1)

        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

        # the news dictionary is looked up with the mode found in the path,
        # i.e. before validate may override the mode below
        self.nid2index = getId2idx(
            'data/dictionaries/nid2idx_{}_{}.json'.format(config.scale, self.mode))
        self.uid2index = getId2idx(
            'data/dictionaries/uid2idx_{}.json'.format(config.scale))
        if validate:
            self.mode = 'dev'

        self.init_news()
        self.init_behaviors()

    def init_news(self):
        """
            init news information given news file: tokenize every news document and
            store the padded token ids (self.encoded_news) and the attention masks
            (self.attn_mask), one row per document.
        """

        # VERY IMPORTANT!!! FIXME
        # The nid2idx dictionary must follow the original order of news in news.tsv

        # row 0 is a placeholder document; histories are padded with news index 0
        documents = ['hello BERT']

        with open(self.news_file, "r", encoding='utf-8') as rd:
            for line in rd:
                row = line.strip("\n")
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                if not row:
                    continue
                _, vert, subvert, title, ab, _, _, _ = row.split(self.col_spliter)
                # concat all fields to form the document
                documents.append(' '.join([title, ab, vert, subvert]))

        # padding=True pads to the longest document, truncation caps it at 512 tokens
        encoded_dict = self.tokenizer(documents, add_special_tokens=False, padding=True, truncation=True, max_length=512, return_tensors='np')
        self.encoded_news = encoded_dict.input_ids
        self.attn_mask = encoded_dict.attention_mask

    def _parse_behavior_line(self, row):
        """
            parse one non-empty line of the behaviors file.

        Args:
            row(str): the line without its trailing newline

        Returns:
            uindex: index of the user
            history: list of clicked news indexes, cut/padded with 0 to his_size
            pad_len: number of trailing history slots to be masked
            candidates: list of raw impression tokens (still strings)
        """
        _, uid, _, history, impr = row.split(self.col_spliter)

        history = [self.nid2index[nid] for nid in history.split()]

        pad_len = max(self.his_size - len(history), 0)
        if self.k:
            # guarantee there are at least k history not masked
            pad_len = min(pad_len, self.his_size - self.k)

        # tailor user's history or pad 0
        history = history[:self.his_size] + [0] * (self.his_size - len(history))

        # user will always in uid2index
        return self.uid2index[uid], history, pad_len, impr.split()

    def init_behaviors(self):
        """
            init behavior logs given behaviors file.

            train: self.imprs holds (impression_index, clicked_news) and
                   self.negtives maps impression_index to its unclicked news
            dev:   self.imprs holds (impression_index, news, label) for every candidate
            test:  self.imprs holds (impression_index, news) for every candidate

        Raises:
            ValueError: if self.mode is not one of train/dev/test
        """
        if self.mode not in ('train', 'dev', 'test'):
            raise ValueError("Mode {} not defined".format(self.mode))

        # list of list of history news index
        self.histories = []
        # list of user index
        self.uindexes = []
        # list of history padding length
        self.his_pad = []
        # list of stored candidates, layout depends on the mode (see docstring)
        self.imprs = []
        if self.mode == 'train':
            # dictionary of list of unclicked candidate news index
            self.negtives = {}

        # impression indexes start from 0, like every list related to behaviors
        impr_index = 0

        with open(self.behaviors_file, "r", encoding='utf-8') as rd:
            for line in rd:
                row = line.strip("\n")
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                if not row:
                    continue

                uindex, history, pad_len, candidates = self._parse_behavior_line(row)

                if self.mode == 'test':
                    # test candidates carry no label
                    for candidate in candidates:
                        self.imprs.append((impr_index, self.nid2index[candidate]))
                else:
                    # train/dev candidates look like <news id>-<label>
                    negatives = []
                    for candidate in candidates:
                        parts = candidate.split("-")
                        news = self.nid2index[parts[0]]
                        label = int(parts[1])

                        if self.mode == 'dev':
                            # store every behavior
                            self.imprs.append((impr_index, news, label))
                        elif label == 1:
                            # only store positive behavior
                            self.imprs.append((impr_index, news))
                        else:
                            negatives.append(news)

                    if self.mode == 'train':
                        self.negtives[impr_index] = negatives

                # 1 impression correspond to 1 of each of the following properties
                self.histories.append(history)
                self.uindexes.append(uindex)
                self.his_pad.append(pad_len)

                impr_index += 1

    def __len__(self):
        """
            return length of the whole dataset
        """
        return len(self.imprs)

    def _history_mask(self, impr_index):
        """
            build the boolean mask of shape (his_size, 1) for one impression,
            true means the corresponding history news is padded.
        """
        his_mask = np.zeros((self.his_size, 1), dtype=bool)
        pad_len = self.his_pad[impr_index]

        # in case the user has no history records (or nothing is padded), do not mask
        if pad_len not in (0, self.his_size):
            his_mask[-pad_len:] = True
        return his_mask

    def __getitem__(self, index):
        """ return data
        Args:
            index: the index for stored impression

        Returns:
            back_dic: dictionary of data slice; "impression_index" is present in
                dev/test mode only, "labels" (numpy array) in train/dev mode only

        Raises:
            ValueError: if self.mode is not one of train/dev/test
        """

        impr = self.imprs[index]  # (impression_index, news_index[, label])
        impr_index = impr[0]
        impr_news = impr[1]

        user_index = [self.uindexes[impr_index]]

        # each time called to return positive one sample and its negative samples
        if self.mode == 'train':
            # user's unclicked news in the same impression; the second value
            # returned by newsample is not needed here
            neg_list, _ = newsample(self.negtives[impr_index], self.npratio)

            cdd_ids = [impr_news] + neg_list
            # always an array, so the type does not depend on shuffle_pos
            labels = np.asarray([1] + [0] * self.npratio)

            if self.shuffle_pos:
                order = np.arange(0, len(labels), 1)
                np.random.shuffle(order)
                cdd_ids = np.asarray(cdd_ids)[order]
                labels = labels[order]

        # each time called return one sample with its label
        elif self.mode == 'dev':
            cdd_ids = [impr_news]
            labels = np.asarray([impr[2]])

        # each time called return one sample, and no labels
        elif self.mode == 'test':
            cdd_ids = [impr_news]
            labels = None

        else:
            raise ValueError("Mode {} not defined".format(self.mode))

        his_ids = self.histories[impr_index]

        back_dic = {}
        if self.mode != 'train':
            # impression indexes are reported starting from 1
            back_dic["impression_index"] = impr_index + 1

        back_dic.update({
            "user_index": np.asarray(user_index),
            'cdd_id': np.asarray(cdd_ids),
            'his_id': np.asarray(his_ids),
            # gather the pre-computed rows of candidate and history news
            "cdd_encoded_index": self.encoded_news[cdd_ids],
            "his_encoded_index": self.encoded_news[his_ids],
            "cdd_attn_mask": self.attn_mask[cdd_ids],
            "his_attn_mask": self.attn_mask[his_ids],
            "his_mask": self._history_mask(impr_index)
        })

        if labels is not None:
            back_dic["labels"] = labels

        return back_dic


In [13]:
# Build the training dataset from the MIND demo split.
# NOTE(review): hard-coded absolute local path. MIND_bert takes its mode from the
# regex 'MIND/.*_(.*)/news', so the path must keep forward slashes and the
# `<name>_<mode>` directory; here the mode resolves to 'train'.
train_path = "C:/MIND/MINDdemo_train/"
a = MIND_bert(config, train_path + 'news.tsv', train_path + 'behaviors.tsv')

In [15]:
# Shape of the candidate token-id matrix of sample 10; recorded output is (5, 512).
# Presumably 5 = 1 clicked news + npratio negatives — TODO confirm against config.
a[10]['cdd_encoded_index'].shape

(5, 512)

In [None]:
# Wrap the dataset in a single-process, unshuffled DataLoader that batches with
# the project's my_collate function.
loader_train = DataLoader(a, batch_size=config.batch_size, pin_memory=False, num_workers=0, drop_last=False, shuffle=False, collate_fn=my_collate)

In [None]:
# Fetch the first batch to check that loading and collation work end to end.
next(iter(loader_train))

In [17]:
from transformers import BertTokenizerFast
# Standalone tokenizer for experimentation.
# NOTE(review): an assignment displays nothing, so the output recorded below
# cannot come from this cell as written — it looks stale.
tok = BertTokenizerFast.from_pretrained('bert-base-uncased')

['i', 'love', 'you', 'you', 'love']

In [19]:
import torch

In [1]:
from models.Encoders.CNN import CNN_Encoder
from models.Encoders.FIM import FIM_Encoder

In [2]:
# NOTE(review): this call fails with the TypeError recorded below, because
# CNN_Encoder requires a `config` argument. It also rebinds `a`, which held the
# dataset in earlier cells.
a = CNN_Encoder()

TypeError: __init__() missing 1 required positional argument: 'config'

In [1]:
from models.Encoders.FIM import FIM_Encoder
import torch
from data.configs.demo import config
# Smoke test of FIM_Encoder on random data.
# Override two fields of the shared demo config in place before building the encoder.
config.embedding_dim = 5
config.hidden_dim = 6
# Random input of shape (2, 3, 4, 5); presumably the last axis is the embedding
# dimension set above — TODO confirm against FIM_Encoder.
a = torch.rand(2,3,4,5)

enc = FIM_Encoder(config)
b = enc(a)

In [3]:
# Sizes of the two encoder outputs; recorded output is [2, 3, 4, 3, 6] and [2, 3, 6].
b[0].size(),b[1].size()

(torch.Size([2, 3, 4, 3, 6]), torch.Size([2, 3, 6]))