In [1]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)lve/main/config.json: 100%|██████████| 1.35k/1.35k [00:00<?, ?B/s]
Downloading model.safetensors: 100%|██████████| 5.31G/5.31G [00:54<00:00, 96.7MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 3.91MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 2.33MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<?, ?B/s]
Downloading (…)okenizer_config.json: 100%|██████████| 200/200 [00:00<?, ?B/s] 


In [2]:
# 토크나이징을 통한 인코딩(입력문이 하나인 경우)
input = tokenizer.encode("I evaluated the performance of GPT-Neo developed by OpenAI.", return_tensors="pt")
print(input[0])
print(tokenizer.decode(input[0]))

tensor([   40, 16726,   262,  2854,   286,   402, 11571,    12,  8199,    78,
         4166,   416,  4946, 20185,    13])
I evaluated the performance of GPT-Neo developed by OpenAI.


In [3]:
# 토크나이징을 통한 인코딩(입력문이 복수인 경우)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
input = tokenizer.batch_encode_plus(["I evaluated the performance of GPT-Neo developed by OpenAI.","I evaluated the performance of GPT developed by OpenAI."], padding=True, truncation=True, return_tensors="pt")
print(input['input_ids'])
print([tokenizer.decode(input['input_ids'][i]) for i in range(len(input['input_ids']))])


tensor([[   40, 16726,   262,  2854,   286,   402, 11571,    12,  8199,    78,
          4166,   416,  4946, 20185,    13],
        [   40, 16726,   262,  2854,   286,   402, 11571,  4166,   416,  4946,
         20185,    13, 50257, 50257, 50257]])
['I evaluated the performance of GPT-Neo developed by OpenAI.', 'I evaluated the performance of GPT developed by OpenAI. [PAD] [PAD] [PAD]']


In [7]:
# 토크나이징
input = tokenizer.batch_encode_plus(["I evaluated the performance of GPT2 developed by OpenAI.", 
                                     "Vaccine for new coronavirus in the UK",
                                     "3.1415926535"], max_length=5, truncation=True, padding=True, return_tensors="pt")
input['input_ids']
generated = model.generate(input['input_ids'])
generated_text = tokenizer.batch_decode(generated)

for i, sentence in enumerate(generated_text):
  print(f'No.{i+1}')
  print(f"{sentence}\n")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


No.1
I evaluated the performance of the proposed method on the real-world dataset. The results are shown in

No.2
Vaccine for new-borns

The vaccine for new-borns is a vaccine

No.3
3.1415926535897932384626433832795028841971693



In [14]:
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelWithLMHead.from_pretrained("distilgpt2")
model.to(device)

# 토크나이징. 출력은 파이토치 텐서(pt)로 받음
input_ids = tokenizer.encode("I like gpt because it's", return_tensors='pt')
greedy_output = model.generate(input_ids.to(device), max_length=12).cpu()
print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

cuda


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I like gpt because it's a good thing to have


In [18]:
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small")
model.to(device)

input_ids = tokenizer.encode("I like gpt because it's", return_tensors='pt')
output = model.generate(input_ids.to(device), max_length=12).cpu()
print("====출력====")
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


====출력====
I like gpt because it's a good way to get


In [22]:
from transformers import pipeline

unmasker = pipeline('fill-mask', model='bert-base-uncased')
print("="*100)
print(unmasker("MLM and NSP is the [MASK] task of BERT."))
print("="*100)
unmasker = pipeline('fill-mask', model='distilbert-base-uncased')
print(unmasker("MLM and NSP is the [MASK] task of BERT."))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.25727882981300354, 'token': 2364, 'token_str': 'main', 'sequence': 'mlm and nsp is the main task of bert.'}, {'score': 0.2074068784713745, 'token': 3078, 'token_str': 'primary', 'sequence': 'mlm and nsp is the primary task of bert.'}, {'score': 0.06773312389850616, 'token': 2034, 'token_str': 'first', 'sequence': 'mlm and nsp is the first task of bert.'}, {'score': 0.06548517942428589, 'token': 2430, 'token_str': 'central', 'sequence': 'mlm and nsp is the central task of bert.'}, {'score': 0.06167394295334816, 'token': 3937, 'token_str': 'basic', 'sequence': 'mlm and nsp is the basic task of bert.'}]
[{'score': 0.2590245306491852, 'token': 3078, 'token_str': 'primary', 'sequence': 'mlm and nsp is the primary task of bert.'}, {'score': 0.1630989909172058, 'token': 2364, 'token_str': 'main', 'sequence': 'mlm and nsp is the main task of bert.'}, {'score': 0.081827811896801, 'token': 4563, 'token_str': 'core', 'sequence': 'mlm and nsp is the core task of bert.'}, {'score': 0.0

In [23]:
# 모델명이 바뀌었음에 유의
unmasker = pipeline('fill-mask', model='albert-base-v2')
unmasker("mlm and nsp is the [MASK] task of bert.")

Downloading (…)lve/main/config.json: 100%|██████████| 684/684 [00:00<?, ?B/s] 
Downloading model.safetensors: 100%|██████████| 47.4M/47.4M [00:03<00:00, 13.1MB/s]
Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.weight', 'albert.pooler.bias']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading (…)ve/main/spiece.model: 100%|██████████| 760k/760k [00:00<00:00, 21.1MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.31M/1.31M [00:00<00:00, 6.60MB/s]


[{'score': 0.047601066529750824,
  'token': 6612,
  'token_str': 'ultimate',
  'sequence': 'mlm and nsp is the ultimate task of bert.'},
 {'score': 0.024472149088978767,
  'token': 20766,
  'token_str': 'hardest',
  'sequence': 'mlm and nsp is the hardest task of bert.'},
 {'score': 0.023495100438594818,
  'token': 1256,
  'token_str': 'primary',
  'sequence': 'mlm and nsp is the primary task of bert.'},
 {'score': 0.02157510444521904,
  'token': 407,
  'token_str': 'main',
  'sequence': 'mlm and nsp is the main task of bert.'},
 {'score': 0.018088050186634064,
  'token': 18369,
  'token_str': 'foremost',
  'sequence': 'mlm and nsp is the foremost task of bert.'}]