In [5]:
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2", 
                                                    bos_token='<s>', # begin of sentence token
                                                    eos_token='</s>', # end of sentence token
                                                    unk_token='<unk>', # unknown token
                                                    pad_token='<pad>', # pad token(문장의 길이를 맞추기 위해 사용)
                                                    mask_token='<mask>') # mask token(마스킹을 통해 어떤 단어를 예측할지 결정)

Downloading (…)/main/tokenizer.json: 100%|██████████| 2.83M/2.83M [00:01<00:00, 1.53MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.00k/1.00k [00:00<00:00, 635kB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [6]:
tokenizer.tokenize("안녕하세요. 반갑습니다.")

['▁안녕', '하', '세', '요.', '▁반', '갑', '습니다.']

In [8]:
import torch
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")
text = "안녕하세요. 반갑습니다. 저는"
input_ids = tokenizer.encode(text, return_tensors="pt")
gen_ids = model.generate(input_ids,
                         max_length=100,
                         repetition_penalty=2.0,
                         pad_token_id=tokenizer.pad_token_id,
                         eos_token_ids=tokenizer.eos_token_id,
                         bos_token_id=tokenizer.bos_token_id,
                         use_cache=True)

generated = tokenizer.decode(gen_ids[0,:].tolist())
print(generated)

ValueError: The following `model_kwargs` are not used by the model: ['eos_token_ids'] (note: typos in the generate arguments will also show up in this list)

We will use kogpt based GPT-3 later.

https://huggingface.co/kakaobrain/kogpt

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [4]:
tokenizer = AutoTokenizer.from_pretrained(
  'kakaobrain/kogpt', 
  revision = 'KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
  bos_token = '[BOS]', # begin of sentence token
  eos_token = '[EOS]', # end of sentence token
  unk_token = '[UNK]', # unknown token
  pad_token = '[PAD]', # 패드 토큰: 패딩(전체 문장 길이를 맞추기 위해 빈 공간을 채우는 것)을 위해 사용
  mask_token = '[MASK]' # 마스킹 토큰: 마스킹을 통해 언어 모델이 어떤 단어를 예측해야 하는지 알려줌
)

Downloading (…)okenizer_config.json: 100%|██████████| 252/252 [00:00<00:00, 49.3kB/s]
Downloading (…)oat16/tokenizer.json: 100%|██████████| 2.51M/2.51M [00:01<00:00, 1.66MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 88.0/88.0 [00:00<00:00, 57.3kB/s]


In [None]:
model = AutoModelForCausalLM.from_pretrained(
    'kakaobrain/kogpt',
    revision = 'KoGPT6B-ryan1.5b-float16',
    pad_token_id = tokenizer.eos_token_id
    torch_dtype = 'auto',
    low_cpu_mem_usage = True).to(device = 'cuda', non_blocking = True)

_ = model.eval()

