In [1]:
from IPython import get_ipython
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell,
# not only the last one (IPython's default).
InteractiveShell.ast_node_interactivity = 'all'

In [94]:
import os
# Point the Hugging Face cache root at local storage. This is assigned before
# the `transformers` imports below -- presumably so the library picks it up at
# import time (the recorded t5-11b download paths sit under this directory);
# confirm before reordering these lines.
os.environ['HF_HOME'] = '/raid/xd/.cache/torch'
from types import MethodType

import torch
from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler
from torch.utils.data.dataloader import DataLoader

from transformers.data.data_collator import DataCollator, default_data_collator
from transformers import AutoConfig, pipeline
from transformers import RobertaForMaskedLM, RobertaTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import HfArgumentParser, Trainer, TrainingArguments, set_seed
from transformers.trainer_utils import EvaluationStrategy

from utils import *

In [8]:
# Registry of loaded checkpoints keyed by checkpoint name. Later cells store
# (model, tokenizer) tuples here and afterwards append a torch.device.
# NOTE: re-running this cell empties the registry.
models = dict()

In [40]:
# Cache directory handed to from_pretrained for models besides t5-3b/11b.
cache_dir = "/nas/xd/.cache/torch/transformers/"

In [39]:
# Load the roberta-large masked-LM checkpoint together with its tokenizer from
# `cache_dir`, then register the pair under the checkpoint name.
model, tokenizer = (
    RobertaForMaskedLM.from_pretrained('roberta-large', cache_dir=cache_dir),
    RobertaTokenizer.from_pretrained('roberta-large', cache_dir=cache_dir),
)
models['roberta-large'] = model, tokenizer

In cached_path: url_or_filename = https://huggingface.co/roberta-large/resolve/main/config.json
In cached_path: output_path = /nas/xd/.cache/torch/transformers/roberta-large-config.json
In cached_path: url_or_filename = https://huggingface.co/roberta-large/resolve/main/pytorch_model.bin
In cached_path: output_path = /nas/xd/.cache/torch/transformers/roberta-large-pytorch_model.bin


Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-large and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# T5-11B tokenizer; no cache_dir is passed here, unlike the roberta-large load.
tokenizer = T5Tokenizer.from_pretrained('t5-11b')

# Bind two helper functions (not defined in this notebook -- presumably they
# come from the `utils` star import) to this tokenizer instance so they can be
# called as tokenizer.decode_strip_special_tokens(...) / tokenizer.decode_old(...).
setattr(tokenizer, 'decode_strip_special_tokens',
        MethodType(decode_strip_special_tokens, tokenizer))
setattr(tokenizer, 'decode_old', MethodType(decode_old, tokenizer))

In cached_path: url_or_filename = https://huggingface.co/t5-11b/resolve/main/spiece.model
In cached_path: output_path = /raid/xd/.cache/torch/transformers/0172c8f05db06fdc1d9f5be691fa907b7da289cf4b777506b956dc76d9bf1ceb.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d


In [6]:
# Load the T5-11B seq2seq checkpoint, passing an explicit proxy mapping.
# NOTE(review): only the 'http' scheme is mapped, while the download URLs in the
# recorded output are https:// -- confirm the proxy is actually being applied.
model_name = 't5-11b'
proxies = dict(http='192.168.50.1:1081')
model11b = T5ForConditionalGeneration.from_pretrained(model_name, proxies=proxies)
# Keep a dedicated handle (`model11b`) and also make it the current `model`.
model = model11b

In cached_path: url_or_filename = https://huggingface.co/t5-11b/resolve/main/config.json
In cached_path: output_path = /raid/xd/.cache/torch/transformers/81569faf106ccedd12b4204414bfaf719ce664c51e4192954c3e2a45c2b6183d.f3d4f63c19654eaafefa7926880e38ff43a93df01b4df7b7b60c00bb1b10e9a6
In cached_path: url_or_filename = https://huggingface.co/t5-11b/resolve/main/pytorch_model.bin
In cached_path: output_path = /raid/xd/.cache/torch/transformers/3ec200f21984c6b177d08534a7166201616fe542ddd5c6e61927f7908bf9f75f.200226855d13a9f5ec82e28b352f6f771748fba71f240e65cd3dfe99889b4ccc


In [36]:
# Register the T5-11B pair. Use the dedicated `model11b` handle rather than the
# ambient `model`: other cells rebind `model` (e.g. to roberta-large), and the
# recorded execution counts show cells being run out of order, so `model` is
# not guaranteed to be the T5 checkpoint when this cell executes.
# `tokenizer` has no dedicated handle, so it must still be the T5 tokenizer
# at this point.
models['t5-11b'] = model11b, tokenizer

In [10]:
# Assign block indices 0-23 to three GPUs (0-5 -> GPU 0, 6-14 -> GPU 1,
# 15-23 -> GPU 2) and shard the current `model` accordingly.
device_map = {
    gpu: list(range(first, stop))
    for gpu, (first, stop) in enumerate(((0, 6), (6, 15), (15, 24)))
}
model.parallelize(device_map)

# Handle for the first GPU.
device = torch.device('cuda:0')

In [76]:
# Extend each registry entry to a (model, tokenizer, device) triple.
# Slice to the first two elements before appending so that re-running this cell
# replaces the device instead of growing the tuple by one element each time,
# which would break the three-way unpacking in the model-selection cell.
models['roberta-large'] = models['roberta-large'][:2] + (torch.device('cpu'),)
models['t5-11b'] = models['t5-11b'][:2] + (torch.device('cuda:0'),)

In [135]:
# Choose which registered (model, tokenizer, device) triple the following cells
# operate on; swap the commented line to switch checkpoints.
# model, tokenizer, device = models['t5-11b']
model, tokenizer, device = models['roberta-large']

# Treat the tokenizer as a masked-LM tokenizer when it defines a mask token and
# carries no additional special tokens.
masked_lm = tokenizer.mask_token is not None and len(tokenizer.additional_special_tokens) == 0
# Placeholder substituted for '_' in prompts:
# '<mask>' for roberta and '<extra_id_0>' for t5.
mask_token = tokenizer.mask_token if masked_lm else tokenizer.additional_special_tokens[0]
# Rebind `nlp` on every run: when the selected model is not a masked LM, reset
# it to None so a fill-mask pipeline built for a previously selected model
# cannot linger in the kernel and be used by mistake.
nlp = pipeline('fill-mask', model=model, tokenizer=tokenizer, top_k=5) if masked_lm else None

In [131]:
# Candidate prompts; '_' marks the blank the model is asked to fill.
texts = [
    "Big is to small as fast is to _",
    "Bread is to eat as gun is to _",
    "big: small, fast: _",
    "bread: eat, gun: _ .",
    "flower: fragrant, fire: hot, bread: delicious, gun: _ ",
    "Big and small are _ .",
]

# Work on the last prompt in the list.
text = texts[-1]
# Swap the '_' placeholder for the current model's mask / sentinel token.
_text = text.replace("_", mask_token)

In [108]:
if not masked_lm:
    # Seq2seq path: encode the prompt, pass it through prepare_inputs together
    # with the model's device (helper not defined in this notebook -- presumably
    # from `utils`), generate, then show the decoded text plus the input and
    # output token sequences.
    inputs = tokenizer.encode_plus(_text, return_tensors='pt')
    inputs = prepare_inputs(inputs, model.device)
    outputs = model.generate(**inputs)
    print(_text, tokenizer.decode(outputs[0]))
    print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))
    print(tokenizer.convert_ids_to_tokens(outputs[0]))
else:
    # Masked-LM path: print each fill-mask candidate as "<token> <score>".
    print(
        _text,
        ['%s %.3f' % (candidate['token_str'], candidate['score']) for candidate in nlp(_text)],
    )

bread: eat, gun: <extra_id_0>. <pad> <extra_id_0> shoot<extra_id_1>.<extra_id_2> shoot<extra_id_3>.<extra_id_4> kill<extra_id_5> kill<extra_id_6>..
['▁bread', ':', '▁', 'eat', ',', '▁gun', ':', '<extra_id_0>', '▁', '.', '</s>']
['<pad>', '<extra_id_0>', '▁shoot', '<extra_id_1>', '▁', '.', '<extra_id_2>', '▁shoot', '<extra_id_3>', '▁', '.', '<extra_id_4>', '▁kill', '<extra_id_5>', '▁kill', '<extra_id_6>', '▁', '.', '▁', '.']


In [132]:
# Generate from raw token ids twice: first from the full encoding of `_text`,
# then from a copy with the third-from-last id removed. In the recorded T5 run
# that id is the standalone '▁' sitting between '<extra_id_0>' and '.'; for
# other prompts a different token would be dropped.
input_ids = tokenizer.encode_plus(_text)['input_ids']
input_ids_trimmed = input_ids[:-3] + input_ids[-2:]

# One loop replaces the previously copy-pasted print/generate/decode sequence.
for ids in (input_ids, input_ids_trimmed):
    print(tokenizer.convert_ids_to_tokens(ids))
    outputs = model.generate(torch.LongTensor([ids]).to(model.device))
    # The unmodified prompt text is printed both times, including for the
    # trimmed ids.
    print(_text, tokenizer.decode(outputs[0]))

# Leave `input_ids` bound to the trimmed variant, as the original cell did.
input_ids = input_ids_trimmed

['▁Big', '▁and', '▁small', '▁are', '<extra_id_0>', '▁', '.', '</s>']
Big and small are <extra_id_0> . <pad> <extra_id_0> welcome<extra_id_1>.</s>
['▁Big', '▁and', '▁small', '▁are', '<extra_id_0>', '.', '</s>']
Big and small are <extra_id_0> . <pad> <extra_id_0> welcome<extra_id_1> welcome.</s>


In [136]:
# Print the name and shape of every parameter of the currently selected model.
for name, p in model.named_parameters():
    print(name, p.shape)

roberta.embeddings.word_embeddings.weight torch.Size([50265, 1024])
roberta.embeddings.position_embeddings.weight torch.Size([514, 1024])
roberta.embeddings.token_type_embeddings.weight torch.Size([1, 1024])
roberta.embeddings.LayerNorm.weight torch.Size([1024])
roberta.embeddings.LayerNorm.bias torch.Size([1024])
roberta.encoder.layer.0.attention.self.query.weight torch.Size([1024, 1024])
roberta.encoder.layer.0.attention.self.query.bias torch.Size([1024])
roberta.encoder.layer.0.attention.self.key.weight torch.Size([1024, 1024])
roberta.encoder.layer.0.attention.self.key.bias torch.Size([1024])
roberta.encoder.layer.0.attention.self.value.weight torch.Size([1024, 1024])
roberta.encoder.layer.0.attention.self.value.bias torch.Size([1024])
roberta.encoder.layer.0.attention.output.dense.weight torch.Size([1024, 1024])
roberta.encoder.layer.0.attention.output.dense.bias torch.Size([1024])
roberta.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([1024])
roberta.encoder.layer.0