In [6]:
import re
import os
import pickle
import numpy as np
import logging
import torch
import math
import time
from torch.utils.data import Dataset
from utils.utils import newsample, getId2idx, tokenize, getVocab, my_collate, Partition_Sampler
from data.configs.demo import config
from torch.utils.data import DataLoader
from collections import defaultdict

from transformers import BertTokenizer,BertModel,BertTokenizerFast
from utils.MIND import MIND

logger = logging.getLogger(__name__)

In [None]:
class MIND(Dataset):
    """ Map Style Dataset for MIND, use bert tokenizer

    Args:
        config(dict): pre-defined dictionary of hyper parameters
        news_file(str): path of news_file
        behaviors_file(str): path of behaviors_file
        shuffle_pos(bool): whether to shuffle the position of the positive candidate among the sampled negatives (training only)
    """

    def __init__(self, config, news_file, behaviors_file, shuffle_pos=False):
        # initiate the whole iterator
        self.npratio = config.npratio
        self.shuffle_pos = shuffle_pos
        self.signal_length = config.signal_length
        self.his_size = config.his_size
        self.impr_size = config.impr_size
        self.k = config.k
        self.ascend_history = config.ascend_history
        self.reducer = config.reducer

        pat = re.search("MIND/(.*_(.*)/)news", news_file)
        self.mode = pat.group(2)

        self.cache_directory = "/".join(["data/cache", config.embedding, pat.group(1)])
        self.news_path = self.cache_directory + "news.pkl"
        self.behav_path = self.cache_directory + "{}/{}".format(self.impr_size, re.search("(\w*\.)tsv", behaviors_file).group(1) + "pkl")

        # only preprocess on the master node, the worker can directly load the cache
        if config.rank in [-1, 0]:
            if os.path.exists(self.behav_path):
                logger.info("loading cached user behavior from {}".format(self.behav_path))
                with open(self.behav_path, "rb") as f:
                    behaviors = pickle.load(f)
                    for k,v in behaviors.items():
                        setattr(self, k, v)

            else:
                logger.info("encoding user behaviors of {}...".format(behaviors_file))
                os.makedirs(self.cache_directory + str(self.impr_size), exist_ok=True)
                self.behaviors_file = behaviors_file
                self.nid2index = getId2idx("data/dictionaries/nid2idx_{}_{}.json".format(config.scale, self.mode))
                self.uid2index = getId2idx("data/dictionaries/uid2idx_{}.json".format(config.scale))

                self.init_behaviors()

            if os.path.exists(self.news_path):
                logger.info("loading cached news tokenization from {}".format(self.news_path))
                with open(self.news_path, "rb") as f:
                    news = pickle.load(f)
                    for k,v in news.items():
                        setattr(self, k, v)
            else:
                from transformers import BertTokenizerFast
                logger.info("encoding news of {}...".format(news_file))
                self.news_file = news_file
                self.max_news_length = 512
                # there are only two types of vocabulary
                self.tokenizer = BertTokenizerFast.from_pretrained(config.bert, cache=config.path + "bert_cache/")
                self.nid2index = getId2idx("data/dictionaries/nid2idx_{}_{}.json".format(config.scale, self.mode))
                self.init_news()

        if config.world_size > 1:
            dist.barrier()
            if config.rank != 0:
                logger.info("child process NO.{} loading cached user behavior from {}".format(config.rank, self.behav_path))
                with open(self.behav_path, "rb") as f:
                    behaviors = pickle.load(f)
                    for k,v in behaviors.items():
                        setattr(self, k, v)
                logger.info("child process NO.{} loading cached news tokenization from {}".format(config.rank, self.news_path))
                with open(self.news_path, "rb") as f:
                    news = pickle.load(f)
                    for k,v in news.items():
                        setattr(self, k, v)

        self.reduction_path = self.cache_directory + self.reducer + ".pkl"
        if config.reducer == "bm25":
            from .utils import BM25
            reducer = BM25(self.signal_length)
        elif config.reducer == "bow":
            from .utils import BagOfWords
            reducer = BagOfWords(self.signal_length, self.k)
        elif config.reducer == "matching":
            from utils.utils import DeDuplicate
            reducer = DeDuplicate(self.signal_length, self.k, ~config.no_dedup)
        else:
            reducer=None
        logger.info("reducing news of {}...".format(news_file))
        self.init_reduction(reducer)


    def init_news(self):
        """
            init news information given news file, such as news_title_array.

        Args:
            bm25: whether to sort the terms by bm25 score
        """

        # VERY IMPORTANT!!!
        # The nid2idx dictionary must follow the original order of news in news.tsv

        documents = [""]
        subwords = [[]]
        with open(self.news_file, "r", encoding="utf-8") as rd:
            for idx in rd:
                nid, vert, subvert, title, ab, url, _, _ = idx.strip("\n").split("\t")
                document = " ".join(["[CLS]", title, vert, subvert, ab])
                tokens = self.tokenizer.tokenize(document)[:512]

                # index for 1 entry
                subword = []

                i = -1
                j = -1
                for token in tokens:
                    if token.startswith('##'):
                        j += 1
                        subword.append([i,j])

                    else:
                        i += 1
                        j += 1
                        subword.append([i,j])

                documents.append(document)
                subwords.append(subword)

        encoded_dict = self.tokenizer(documents, add_special_tokens=False, padding=True, truncation=True, max_length=self.max_news_length, return_tensors="np")
        self.encoded_news = encoded_dict.input_ids
        self.attn_mask = encoded_dict.attention_mask

        max_token_length = self.encoded_news.shape[1]
        for i,subword in enumerate(subwords):
            subwords[i].extend([[0,0]] * (max_token_length - len(subword)))
        self.subwords = torch.as_tensor(subwords)

        with open(self.news_path, "wb") as f:
            pickle.dump(
                {
                    "encoded_news": self.encoded_news,
                    "subwords": self.subwords,
                    "attn_mask": self.attn_mask
                },
                f
            )

    def init_reduction(self, reducer):
        """
            init reduced news
        """
        if not reducer:
            return

        reduced_news_mask = reducer(self.encoded_news, self.attn_mask)
        self.reduced_news = reduced_news_mask[0]

        if self.reducer == "bow":
            self.attn_mask = reduced_news_mask[1]
            self.attn_mask_reduced = reduced_news_mask[2]

        else:
            self.attn_mask_reduced = reduced_news_mask[1]

    def init_behaviors(self):
        """
            init behavior logs given behaviors file.
        """
        # list of list of history news index
        histories = []
        # list of user index
        uindexes = []
        # list of impression indexes
        # self.impr_indexes = []

        impr_index = 0

        # only store positive behavior
        if self.mode == "train":
            # list of lists, each list represents a
            imprs = []
            negatives = []

            with open(self.behaviors_file, "r", encoding="utf-8") as rd:
                for idx in rd:
                    _, uid, time, history, impr = idx.strip("\n").split("\t")

                    history = [self.nid2index[i] for i in history.split()]

                    impr_news = [self.nid2index[i.split("-")[0]] for i in impr.split()]
                    labels = [int(i.split("-")[1]) for i in impr.split()]

                    # user will always in uid2index
                    uindex = self.uid2index[uid]
                    # store negative samples of each impression
                    negative = []

                    for news, label in zip(impr_news, labels):
                        if label == 1:
                            imprs.append((impr_index, news))
                        else:
                            negative.append(news)

                    # 1 impression correspond to 1 of each of the following properties
                    histories.append(history)
                    negatives.append(negative)
                    uindexes.append(uindex)

                    impr_index += 1

            self.imprs = imprs
            self.histories = histories
            self.negatives = negatives
            self.uindexes = uindexes

            save_dict = {
                "imprs": self.imprs,
                "histories": self.histories,
                "negatives": self.negatives,
                "uindexes": self.uindexes
            }

        # store every behavior
        elif self.mode == "dev":
            # list of every cdd news index along with its impression index and label
            imprs = []

            with open(self.behaviors_file, "r", encoding="utf-8") as rd:
                for idx in rd:
                    _, uid, time, history, impr = idx.strip("\n").split("\t")

                    history = [self.nid2index[i] for i in history.split()]

                    impr_news = [self.nid2index[i.split("-")[0]] for i in impr.split()]
                    labels = [int(i.split("-")[1]) for i in impr.split()]
                    # user will always in uid2index
                    uindex = self.uid2index[uid]

                    # store every impression
                    for i in range(0, len(impr_news), self.impr_size):
                        imprs.append((impr_index, impr_news[i:i+self.impr_size], labels[i:i+self.impr_size]))

                    # 1 impression correspond to 1 of each of the following properties
                    histories.append(history)
                    uindexes.append(uindex)

                    impr_index += 1

            self.imprs = imprs
            self.histories = histories
            self.uindexes = uindexes

            save_dict = {
                "imprs": self.imprs,
                "histories": self.histories,
                "uindexes": self.uindexes
            }

        # store every behavior
        elif self.mode == "test":
            # list of every cdd news index along with its impression index and label
            imprs = []

            with open(self.behaviors_file, "r", encoding="utf-8") as rd:
                for idx in rd:
                    _, uid, time, history, impr = idx.strip("\n").split("\t")

                    history = [self.nid2index[i] for i in history.split()]

                    impr_news = [self.nid2index[i] for i in impr.split()]
                    # user will always in uid2index
                    uindex = self.uid2index[uid]

                    # store every impression
                    for i in range(0, len(impr_news), self.impr_size):
                        imprs.append((impr_index, impr_news[i:i+self.impr_size]))

                    # 1 impression correspond to 1 of each of the following properties
                    histories.append(history)
                    uindexes.append(uindex)

                    impr_index += 1

            self.imprs = imprs
            self.histories = histories
            self.uindexes = uindexes

            save_dict = {
                "imprs": self.imprs,
                "histories": self.histories,
                "uindexes": self.uindexes
            }

        with open(self.behav_path, "wb") as f:
            pickle.dump(save_dict, f)


    def __len__(self):
        """
            return length of the whole dataset
        """
        return len(self.imprs)

    def __getitem__(self,index):
        """ return data
        Args:
            index: the index for stored impression

        Returns:
            back_dic: dictionary of data slice
        """

        impr = self.imprs[index] # (impression_index, news_index)
        impr_index = impr[0]
        impr_news = impr[1]


        user_index = [self.uindexes[impr_index]]

        # each time called to return positive one sample and its negative samples
        if self.mode == "train":
            # user"s unhis news in the same impression
            negs = self.negatives[impr_index]
            neg_list, neg_pad = newsample(negs, self.npratio)

            cdd_ids = [impr_news] + neg_list
            cdd_size = self.npratio + 1

            label = np.asarray([1] + [0] * self.npratio)

            if self.shuffle_pos:
                s = np.arange(0, len(label), 1)
                np.random.shuffle(s)
                cdd_ids = np.asarray(cdd_ids)[s]
                label = np.asarray(label)[s]

            label = np.arange(0, len(cdd_ids), 1)[label == 1][0]

            his_ids = self.histories[impr_index][:self.his_size]

            cdd_mask = torch.ones((cdd_size, 1))
            cdd_mask[-neg_pad:] = 0

            # true means the corresponding history news is padded
            his_mask = torch.zeros((self.his_size, 1), dtype=bool)
            his_mask[:len(his_ids)] = 1

            if self.ascend_history:
                his_ids = his_ids + [0] * (self.his_size - len(his_ids))
            else:
                his_ids = his_ids[::-1] + [0] * (self.his_size - len(his_ids))

            cdd_encoded_index = self.encoded_news[cdd_ids][:, :self.signal_length]
            cdd_attn_mask = self.attn_mask[cdd_ids][:, :self.signal_length]
            his_encoded_index = self.encoded_news[his_ids][:, :self.signal_length]
            his_attn_mask = self.attn_mask[his_ids][:, :self.signal_length]

            cdd_subword_index_all = self.subwords[cdd_ids][:, :self.signal_length]
            cdd_subword_index = cdd_subword_index_all[:, :, 0] * self.signal_length + cdd_subword_index_all[:, :, 1]
            his_subword_index_all = self.subwords[his_ids][:, :self.signal_length]
            his_subword_index = his_subword_index_all[:, :, 0] * self.signal_length + his_subword_index_all[:, :, 1]

            cdd_dest = torch.zeros((cdd_size, self.signal_length * self.signal_length))
            cdd_subword_prefix = cdd_dest.index_fill(dim=-1, index=cdd_subword_index, value=1) * cdd_mask
            cdd_subword_prefix = cdd_subword_prefix.view(cdd_size, self.signal_length, self.signal_length)

            his_dest = torch.zeros((self.his_size, self.signal_length * self.signal_length))
            his_subword_prefix = his_dest.index_fill(dim=-1, index=his_subword_index, value=1) * his_mask
            his_subword_prefix = his_subword_prefix.view(self.his_size, self.signal_length, self.signal_length)

            back_dic = {
                "user_index": np.asarray(user_index),
                # "cdd_mask": np.asarray(neg_pad),
                "cdd_id": np.asarray(cdd_ids),
                "his_id": np.asarray(his_ids),
                "cdd_encoded_index": cdd_encoded_index,
                "his_encoded_index": his_encoded_index,
                "cdd_attn_mask": cdd_attn_mask,
                "his_attn_mask": his_attn_mask,
                "cdd_subword_prefix": cdd_subword_prefix,
                "his_subword_prefix": his_subword_prefix,
                "his_mask": his_mask,
                "label": label
            }

            if self.reducer == "bm25":
                his_reduced_index = self.reduced_news[his_ids][:, :self.k + 1]
                his_reduced_mask = self.attn_mask_reduced[his_ids][:, :self.k + 1]
                back_dic["his_reduced_index"] = his_reduced_index
                back_dic["his_reduced_mask"] = his_reduced_mask

            elif self.reducer == "matching":
                cdd_reduced_mask = self.attn_mask_reduced[cdd_ids][:, :self.signal_length]
                his_reduced_mask = self.attn_mask_reduced[his_ids][:, :self.signal_length]
                back_dic["cdd_reduced_mask"] = cdd_reduced_mask
                back_dic["his_reduced_mask"] = his_reduced_mask

            elif self.reducer == "bow":
                his_reduced_index = self.reduced_news[his_ids][:, :self.signal_length]
                his_reduced_mask = self.attn_mask_reduced[his_ids][:, :self.signal_length]
                back_dic["his_reduced_index"] = his_reduced_index
                back_dic["his_reduced_mask"] = his_reduced_mask

            return back_dic

        # each time called return one sample, and no labels
        elif self.mode == "dev":
            cdd_ids = impr_news
            cdd_size = len(cdd_ids)

            his_ids = self.histories[impr_index][:self.his_size]
            # true means the corresponding history news is padded
            his_mask = torch.zeros((self.his_size, 1), dtype=bool)
            his_mask[:len(his_ids)] = 1

            if self.ascend_history:
                his_ids = his_ids + [0] * (self.his_size - len(his_ids))
            else:
                his_ids = his_ids[::-1] + [0] * (self.his_size - len(his_ids))

            user_index = [self.uindexes[impr_index]]
            label = impr[2]

            cdd_encoded_index = self.encoded_news[cdd_ids][:, :self.signal_length]
            cdd_attn_mask = self.attn_mask[cdd_ids][:, :self.signal_length]
            his_encoded_index = self.encoded_news[his_ids][:, :self.signal_length]
            his_attn_mask = self.attn_mask[his_ids][:, :self.signal_length]

            cdd_subword_index_all = self.subwords[cdd_ids][:, :self.signal_length]
            cdd_subword_index = cdd_subword_index_all[:, :, 0] * self.signal_length + cdd_subword_index_all[:, :, 1]
            his_subword_index_all = self.subwords[his_ids][:, :self.signal_length]
            his_subword_index = his_subword_index_all[:, :, 0] * self.signal_length + his_subword_index_all[:, :, 1]

            cdd_dest = torch.zeros((cdd_size, self.signal_length * self.signal_length))
            cdd_subword_prefix = cdd_dest.index_fill(dim=-1, index=cdd_subword_index, value=1)
            cdd_subword_prefix = cdd_subword_prefix.view(cdd_size, self.signal_length, self.signal_length)

            his_dest = torch.zeros((self.his_size, self.signal_length * self.signal_length))
            his_subword_prefix = his_dest.index_fill(dim=-1, index=his_subword_index, value=1) * his_mask
            his_subword_prefix = his_subword_prefix.view(self.his_size, self.signal_length, self.signal_length)

            back_dic = {
                "impr_index": impr_index + 1,
                "user_index": np.asarray(user_index),
                "cdd_id": np.asarray(cdd_ids),
                "his_id": np.asarray(his_ids),
                "cdd_encoded_index": cdd_encoded_index,
                "his_encoded_index": his_encoded_index,
                "cdd_attn_mask": cdd_attn_mask,
                "his_attn_mask": his_attn_mask,
                "cdd_subword_prefix": cdd_subword_prefix,
                "his_subword_prefix": his_subword_prefix,
                "his_mask": his_mask,
                "label": np.asarray(label)
            }

            if self.reducer == "bm25":
                his_reduced_index = self.reduced_news[his_ids][:, :self.k + 1]
                his_reduced_mask = self.attn_mask_reduced[his_ids][:, :self.k + 1]
                back_dic["his_reduced_index"] = his_reduced_index
                back_dic["his_reduced_mask"] = his_reduced_mask

            elif self.reducer == "matching":
                cdd_reduced_mask = self.attn_mask_reduced[cdd_ids][:, :self.signal_length]
                his_reduced_mask = self.attn_mask_reduced[his_ids][:, :self.signal_length]
                back_dic["cdd_reduced_mask"] = cdd_reduced_mask
                back_dic["his_reduced_mask"] = his_reduced_mask

            elif self.reducer == "bow":
                his_reduced_index = self.reduced_news[his_ids][:, :self.signal_length]
                his_reduced_mask = self.attn_mask_reduced[his_ids][:, :self.signal_length]
                back_dic["his_reduced_index"] = his_reduced_index
                back_dic["his_reduced_mask"] = his_reduced_mask

            return back_dic

        elif self.mode == "test":
            cdd_ids = impr_news
            cdd_size = len(cdd_ids)

            his_ids = self.histories[impr_index][:self.his_size]
            # true means the corresponding history news is padded
            his_mask = torch.zeros((self.his_size, 1), dtype=bool)
            his_mask[:len(his_ids)] = 1

            if self.ascend_history:
                his_ids = his_ids + [0] * (self.his_size - len(his_ids))
            else:
                his_ids = his_ids[::-1] + [0] * (self.his_size - len(his_ids))

            user_index = [self.uindexes[impr_index]]

            cdd_encoded_index = self.encoded_news[cdd_ids][:, :self.signal_length]
            cdd_attn_mask = self.attn_mask[cdd_ids][:, :self.signal_length]
            his_encoded_index = self.encoded_news[his_ids][:, :self.signal_length]
            his_attn_mask = self.attn_mask[his_ids][:, :self.signal_length]

            cdd_subword_index_all = self.subwords[cdd_ids][:, :self.signal_length]
            cdd_subword_index = cdd_subword_index_all[:, :, 0] * self.signal_length + cdd_subword_index_all[:, :, 1]
            his_subword_index_all = self.subwords[his_ids][:, :self.signal_length]
            his_subword_index = his_subword_index_all[:, :, 0] * self.signal_length + his_subword_index_all[:, :, 1]

            cdd_dest = torch.zeros((cdd_size, self.signal_length * self.signal_length))
            cdd_subword_prefix = cdd_dest.index_fill(dim=-1, index=cdd_subword_index, value=1)
            cdd_subword_prefix = cdd_subword_prefix.view(cdd_size, self.signal_length, self.signal_length)

            his_dest = torch.zeros((self.his_size, self.signal_length * self.signal_length))
            his_subword_prefix = his_dest.index_fill(dim=-1, index=his_subword_index, value=1) * his_mask
            his_subword_prefix = his_subword_prefix.view(self.his_size, self.signal_length, self.signal_length)

            back_dic = {
                "impr_index": impr_index + 1,
                "user_index": np.asarray(user_index),
                "cdd_id": np.asarray(cdd_ids),
                "his_id": np.asarray(his_ids),
                "cdd_encoded_index": cdd_encoded_index,
                "his_encoded_index": his_encoded_index,
                "cdd_attn_mask": cdd_attn_mask,
                "his_attn_mask": his_attn_mask,
                "cdd_subword_prefix": cdd_subword_prefix,
                "his_subword_prefix": his_subword_prefix,
                "his_mask": his_mask,
            }

            if self.reducer == "bm25":
                his_reduced_index = self.reduced_news[his_ids][:, :self.k + 1]
                his_reduced_mask = self.attn_mask_reduced[his_ids][:, :self.k + 1]
                back_dic["his_reduced_index"] = his_reduced_index
                back_dic["his_reduced_mask"] = his_reduced_mask

            elif self.reducer == "matching":
                cdd_reduced_mask = self.attn_mask_reduced[cdd_ids][:, :self.signal_length]
                his_reduced_mask = self.attn_mask_reduced[his_ids][:, :self.signal_length]
                back_dic["cdd_reduced_mask"] = cdd_reduced_mask
                back_dic["his_reduced_mask"] = his_reduced_mask

            elif self.reducer == "bow":
                his_reduced_index = self.reduced_news[his_ids][:, :self.signal_length]
                his_reduced_mask = self.attn_mask_reduced[his_ids][:, :self.signal_length]
                back_dic["his_reduced_index"] = his_reduced_index
                back_dic["his_reduced_mask"] = his_reduced_mask

            return back_dic

        else:
            raise ValueError("Mode {} not defined".format(self.mode))

In [2]:
# Optional config overrides, left disabled; uncomment one before the dataset is built.
# config.reducer = 'bm25'
# config.reducer = 'bow'
# config.reducer = 'matching'

# config.signal_length = 10
# config.scale = 'large'
# Build the demo training split; `a` is reused by the cells below.
# NOTE(review): the log lines under this cell come from the `utils.MIND` logger, so the output
# was presumably produced by the imported class, not by the class defined in this notebook.
path = config.path + 'MIND/MINDdemo_train/'
a = MIND(config, path + 'news.tsv', path + 'behaviors.tsv')

[2021-08-29 15:32:04,898] INFO (utils.MIND) loading cached user behavior from data/cache/bert/MINDdemo_train/10/behaviors..pkl
[2021-08-29 15:32:04,914] INFO (utils.MIND) loading cached news tokenization from data/cache/bert/MINDdemo_train/news.pkl
[2021-08-29 15:32:05,753] INFO (utils.MIND) reducing news of ../../../Data/MIND/MINDdemo_train/news.tsv...
[2021-08-29 15:32:05,755] INFO (utils.utils) unmasking at least k...
[2021-08-29 15:32:06,019] INFO (utils.utils) deduplicating...


In [3]:
# Materialise every sample of `a` once: one sample per batch, dataset order, main process only.
loader1 = DataLoader(a, batch_size=1, pin_memory=False, num_workers=0, drop_last=False, shuffle=False)
records1 = list(loader1)

In [7]:
# Inspect the history attention mask of the first batch.
records1[0]['his_attn_mask']

tensor([[[1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [18]:
# Scratch check of the flattened (word, token) index used for the subword prefix.
# NOTE(review): tokens 10..19 are sliced while the row stride is 10, so the indices shown in
# the output (up to 189) do not fit the 100 columns of `dest`.
sub = a.subwords[1:3][:,10:20]
index = sub[:,:,0] * 10 + sub[:,:,1]

dest = torch.zeros((2, 10 * 10))
index

tensor([[110, 121, 132, 143, 154, 165, 166, 167, 178, 189],
        [ 90, 101, 112, 123, 134, 145, 156, 167, 178, 189]])

In [28]:
# Row-wise advanced indexing experiment on `dest`.
# NOTE(review): the two-element output does not match the (2, 100) `dest` built in the previous
# cell, so this cell was presumably run against different kernel state.
dest[torch.arange(2), ]

tensor([1., 0.])

In [None]:
# Inspect the cached dev behaviors.
# NOTE(review): absolute, machine-specific path; pickle.load must only be used on trusted files.
with open('/data/workspace/Peitian/Code/Document-Reduction/Code/data/cache/bert/MINDdemo_dev/10/behaviors.pkl', 'rb') as f:
    # the context manager closes the handle that the bare open() call used to leak
    dic = pickle.load(f)
dic['imprs']

In [7]:
# Load the fast tokenizer of 'bert-base-uncased'; `t` is used by the next cell.
t = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [37]:
# Encode two texts without automatically added special tokens; the literal "[CLS]" in the first
# text still maps to id 101 and the shorter text is padded to the 19 ids of the longer one
# (see the output).
res = t(["[CLS] Newark Liberty Airport's Terminal One a $2.7 billion 'transformative' project","shit bro"], add_special_tokens=False, padding=True, max_length=20)
res

{'input_ids': [[101, 12948, 7044, 3199, 1005, 1055, 5536, 2028, 1037, 1002, 1016, 1012, 1021, 4551, 1005, 10938, 8082, 1005, 2622], [4485, 22953, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [21]:
# Drop the '##' markers from the space-joined tokens; the pieces stay separated by spaces
# ("transform ative" in the output).
# NOTE(review): `tokens` is not assigned in any visible cell -- this relies on leftover kernel state.
re.sub('##','',' '.join(tokens))

"[CLS] newark liberty airport ' s terminal one a $ 2 . 7 billion ' transform ative ' project ?"

In [30]:
# Glue every '##' continuation piece onto the word before it; other tokens start a new word.
words = []
for tok in tokens:
    if not tok.startswith('##'):
        words.append(tok)
        continue
    words[-1] = words[-1] + tok[2:]

In [48]:
# str.join also accepts an object-dtype numpy array of strings (see the output).
' '.join(np.array(['I','you','she'],dtype=object))

'I you she'

In [42]:
# Fails as written: token_to_word needs at least `batch_or_token_index` (see the TypeError below).
res.token_to_word()

TypeError: token_to_word() missing 1 required positional argument: 'batch_or_token_index'