In [1]:
import re
import os
import pickle
import numpy as np
import pandas as pd
import json
import logging
import torch
import math
import time
from torch.utils.data import Dataset
from utils.utils import newsample, getId2idx, tokenize, getVocab, my_collate, Partition_Sampler
from data.configs.demo import config
from torch.utils.data import DataLoader
from collections import defaultdict
from transformers import BertTokenizer,BertModel,BertTokenizerFast,DebertaTokenizer,DebertaTokenizerFast, AutoTokenizer
from utils.MIND import MIND
from utils.Manager import Manager

# Logger for this notebook session; getLogger(__name__) keys it on the running module's name.
logger = logging.getLogger(__name__)

In [2]:
# Both tokenizers share one on-disk cache directory under the configured data path.
bert_cache_dir = config.path + "bert_cache/"

# t  -> tokenizer resolved by AutoTokenizer for 'bert-base-uncased'
# t2 -> fast DeBERTa tokenizer for 'microsoft/deberta-base'
t = AutoTokenizer.from_pretrained('bert-base-uncased', cache_dir=bert_cache_dir)
t2 = DebertaTokenizerFast.from_pretrained('microsoft/deberta-base', cache_dir=bert_cache_dir)

In [4]:
# Look up the id stored in the tokenizer's vocab mapping for the underscore token.
t.vocab['_']

1035

In [6]:
# Sort a hand-collected list of token ids into ascending order for easier inspection.
# Most of these ids match the ones convert_tokens_to_ids returns for punctuation characters
# elsewhere in this notebook; 1035 is the id shown for '_'. The origin of 1529 is not
# visible here — TODO confirm.
# NOTE(review): the saved output under this cell is a str.join-style TypeError, which
# sorted() on a list of ints cannot raise — the output looks stale; re-run the cell.
sorted([1031,1012,1004,1008,1006,1007,1009,1027,1013,1032,1026,1028,1010,999,1029,1025,1024,1066,1036,1030,1001,1002,1003,1034,1033,1011,1529,1035])

TypeError: sequence item 0: expected str instance, int found

In [3]:
# Inspect how the DeBERTa tokenizer (t2) splits a short phrase into subword tokens.
t2.tokenize('Word embeddings.')

['Word', 'Ġembed', 'd', 'ings', '.']

In [6]:
# Compare the id that each tokenizer assigns to the same special token.
token = '[PAD]'
tuple(tokenizer.convert_tokens_to_ids(token) for tokenizer in (t, t2))

(0, 0)

In [None]:
# Ids of special tokens under each tokenizer vocabulary.
# Outer keys are the literal special-token strings (brackets included, so they can be
# passed straight to a tokenizer); inner keys name the pretrained vocabulary.
# The separator key was previously spelled "SEP", which did not match the "[CLS]"
# convention or the real token string.
# NOTE(review): the inner key is "deberta-base" while the config elsewhere uses
# 'microsoft/deberta-base' — confirm which spelling lookups are expected to use.
special_token_map = {
    "[CLS]": {
        "bert-base-uncased": 101,
        "deberta-base": 1,
    },
    "[SEP]": {
        "bert-base-uncased": 102,
        "deberta-base": 2,
    },
}

In [4]:
# Write the tokenizer's files into the current working directory.
# NOTE(review): this drops files (the saved output lists ./vocab.txt) next to the
# notebook — confirm that is intended before re-running.
t.save_pretrained('.')

('./vocab.txt',)

In [6]:
# Join subword tokens back into a single string to see how the tokenizer detokenizes them.
t.convert_tokens_to_string(['ings', '.'])

'ings.'

In [5]:
# Tokenize a short sentence to inspect subword splitting.
# NOTE(review): the saved output keeps the capital 'A' and shows 'Ġ'-prefixed tokens, which
# suggests `t` was not the bert-base-uncased tokenizer when this ran — presumably `t` was
# rebound in another cell; confirm and re-run from a fresh kernel.
t.tokenize("A is embeddings.")

['A', 'Ġis', 'Ġembed', 'd', 'ings', '.']

In [3]:
# --- Optional overrides, kept commented out for quick switching -------------------
# config.reducer = 'bm25'
# config.reducer = 'bow'
# config.reducer = 'matching'

# config.signal_length = 10
# config.scale = 'large'
# config.impr_size = 100
# config.mode = 'test'

# --- Active language-model selection: DeBERTa -------------------------------------
config.bert = 'microsoft/deberta-base'
config.embedding = 'deberta'

# BERT alternative:
# config.bert = 'bert-base-uncased'
# config.embedding = 'bert'

# Build the manager from the config, then the MIND dataset over the demo training split.
manager = Manager(config)
path = manager.path + 'MIND/MINDdemo_train/'
news_file = path + 'news.tsv'
behaviors_file = path + 'behaviors.tsv'
a = MIND(manager, news_file, behaviors_file)

[2021-09-10 16:16:53,024] INFO (utils.MIND) encoding user behaviors of ../../../Data/MIND/MINDdemo_train/behaviors.tsv...
2000it [00:00, 17043.71it/s]
[2021-09-10 16:17:03,591] INFO (utils.MIND) encoding news of ../../../Data/MIND/MINDdemo_train/news.tsv...
51282it [00:19, 2593.66it/s]
[2021-09-10 16:17:23,392] INFO (utils.utils) computing BM25 scores...
[2021-09-10 16:17:26,383] INFO (utils.MIND) tokenizing news...
[2021-09-10 16:18:32,037] INFO (utils.MIND) tokenizing bm25 ordered news...
[2021-09-10 16:18:56,648] INFO (utils.MIND) tokenizing entities...
[2021-09-10 16:19:08,314] INFO (utils.MIND) process NO.0 loading cached user behavior from data/cache/deberta/MINDdemo_train/10/behaviors..pkl
[2021-09-10 16:19:08,324] INFO (utils.MIND) process NO.0 loading cached news tokenization from data/cache/deberta/MINDdemo_train/news.pkl
[2021-09-10 16:19:09,034] INFO (utils.utils) deduplicating...


In [25]:
# Run the DeBERTa text parser on a single sentence and display what it returns.
# The third argument (8) is presumably the padded sequence length — the saved output
# arrays have 8 positions; confirm against the function's definition.
# NOTE(review): parse_texts_deberta is neither defined nor imported in the visible cells —
# presumably it came from a deleted cell or another module; confirm, otherwise this cell
# fails on a fresh kernel.
parse_texts_deberta(t, ["Trump is Elizabeth"], 8)

(array([[   1, 7565,   16, 4690,    0,    0,    0,    0]]),
 array([[1, 1, 1, 1, 0, 0, 0, 0]]),
 array([[[0, 0],
         [1, 1],
         [2, 2],
         [3, 3],
         [0, 0],
         [0, 0],
         [0, 0],
         [0, 0]]]),
 array([[[0, 0],
         [1, 1],
         [2, 2],
         [3, 3],
         [0, 0],
         [0, 0],
         [0, 0],
         [0, 0]]]))

In [26]:
# Map a padded id sequence back to token strings to check the special and padding tokens.
t.convert_ids_to_tokens([   1, 7565,   16, 4690,    0,    0,    0,    0])

['[CLS]', 'Trump', 'Ġis', 'ĠElizabeth', '[PAD]', '[PAD]', '[PAD]', '[PAD]']

In [16]:
# Tokenize text that already contains a literal "[CLS]" marker to see how it is treated.
tokens = t.tokenize("[CLS] I love you embeddings.")
# Bare last expression so the notebook displays the token list.
tokens

['[CLS]', 'ĠI', 'Ġlove', 'Ġyou', 'Ġembed', 'd', 'ings', '.']

In [4]:
# Load the cached news tokenization produced by the preprocessing step.
# The file is opened in a `with` block so the handle is closed as soon as the
# pickle has been read (the previous bare open() left it to the garbage collector).
# NOTE(review): pickle.load can execute arbitrary code from the file — only load caches
# generated locally. The absolute path is machine-specific; consider deriving it from config.
with open('/data/workspace/Peitian/Code/Document-Reduction/Code/data/cache/deberta/MINDdemo_train/news.pkl', 'rb') as cache_file:
    news = pickle.load(cache_file)

# Pull out the two cached entries inspected below.
ids = news['encoded_news']
subwords = news['subwords_all']

In [13]:
# Look up the vocabulary id of every character in the punctuation set, one id per character.
# list() on a string yields its characters, matching the original per-character iteration.
t.convert_tokens_to_ids(list(r"[.&*()+=/\<>,!?;:~`@#$%^]"))

[1031,
 1012,
 1004,
 1008,
 1006,
 1007,
 1009,
 1027,
 1013,
 1032,
 1026,
 1028,
 1010,
 999,
 1029,
 1025,
 1024,
 1066,
 1036,
 1030,
 1001,
 1002,
 1003,
 1034,
 1033]