# 2) 문장 만들기

In [1]:
# GPT-Neo : GPT의 오픈소스 버전

In [4]:
# ! pip install transformers sentencepiece

from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# 모델 및 토크나이저 불러오기

model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
#           형태소 분석, 정수 인코딩

In [5]:
# 텍스트=>정수 인코딩(1개의 문장)

input = tokenizer.encode("I evaluated the performance of GPT-Neo developed by OpenAI.",
                         return_tensors="pt")
#                                        pt : pytoch / ts : tensor

print(input[0])
print(tokenizer.decode(input[0]))

tensor([   40, 16726,   262,  2854,   286,   402, 11571,    12,  8199,    78,
         4166,   416,  4946, 20185,    13])
I evaluated the performance of GPT-Neo developed by OpenAI.


In [6]:
# 정수 인코딩(복수 문장)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#                                           제로패딩

input = tokenizer.batch_encode_plus(
    ["I evaluated the performance of GPT-Neo developed by OpenAI.",
     "I evaluated the performance of GPT developed by OpenAI."]
     , padding=True, truncation=True, return_tensors="pt")

print(input['input_ids'])
print([tokenizer.decode(input['input_ids'][i]) for i in range(len(input['input_ids']))])

tensor([[   40, 16726,   262,  2854,   286,   402, 11571,    12,  8199,    78,
          4166,   416,  4946, 20185,    13],
        [   40, 16726,   262,  2854,   286,   402, 11571,  4166,   416,  4946,
         20185,    13, 50257, 50257, 50257]])
['I evaluated the performance of GPT-Neo developed by OpenAI.', 'I evaluated the performance of GPT developed by OpenAI. [PAD] [PAD] [PAD]']


In [7]:
# 복수의 문장 인코딩

input = tokenizer.batch_encode_plus(

    ["I evaluated the performance of GPT2 developed by OpenAI.",
     "Vaccine for new coronavirus in the UK",
    "3.1415926535"]
    , max_length=5, truncation=True, padding=True, return_tensors="pt")

In [8]:
# 인코딩 결과 확인

input['input_ids']

tensor([[   40, 16726,   262,  2854,   286],
        [   53,  4134,   500,   329,   649],
        [   18,    13,  1415, 19707, 22980]])

In [9]:
# 문장 만들기

generated = model.generate(input['input_ids'])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [10]:
# 생성된 문장 디코딩

generated_text = tokenizer.batch_decode(generated)

for i, sentence in enumerate(generated_text):
  print(f'No.{i+1}')
  print(f"{sentence}\n")

No.1
I evaluated the performance of the proposed method on the real-world dataset. The results are shown in

No.2
Vaccine for new-borns

The vaccine for new-borns is a vaccine

No.3
3.1415926535897932384626433832795028841971693



In [11]:
# DistilGPT2 모형 활용

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model

Downloading (…)lve/main/config.json: 100%|██████████| 762/762 [00:00<00:00, 71.6kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 5.40MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.29MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 42.6MB/s]
Downloading model.safetensors: 100%|██████████| 353M/353M [00:03<00:00, 113MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 41.4kB/s]


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [12]:
# DistilGPT2 모형으로 문장 만들기

input_ids = tokenizer.encode("I like gpt because it's", return_tensors='pt')
greedy_output = model.generate(input_ids, max_length=12)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I like gpt because it's a good thing to have


In [13]:
# DialoGPT 모형 활용
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")

model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
model

Downloading (…)okenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 8.60kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 641/641 [00:00<00:00, 214kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 5.47MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 40.9MB/s]
Downloading model.safetensors: 100%|██████████| 351M/351M [00:03<00:00, 106MB/s]  
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 11.4kB/s]


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [14]:
#문장 만들기

input_ids = tokenizer.encode("I like gpt because it's", return_tensors='pt')
greedy_output = model.generate(input_ids, max_length=30)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

#좀더 자연스러운 문장이 만들어짐

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I like gpt because it's a good way to get a feel for the game.


In [15]:
input_ids = tokenizer.encode("Covid19 delta is spreading", return_tensors='pt')
greedy_output = model.generate(input_ids, max_length=10)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Covid19 delta is spreading the word


In [16]:
# MLM(Masked Language Model) : 문서의 일부를 가리고(mask) 원래 단어를 추측하는 빈칸 채우기

from transformers import pipeline

unmasker = pipeline('fill-mask', model='bert-base-uncased')
#                                                 대소문자 구분 X

# [MASK]가 포함된 문장 입력

unmasker("I [MASK] apple.")

# [MASK] 위치에 입력될 수 있는 단어들을 추천하여 문장 완성

Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 190kB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [00:04<00:00, 104MB/s]  
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 2.75kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232

[{'score': 0.1829017698764801,
  'token': 2293,
  'token_str': 'love',
  'sequence': 'i love apple.'},
 {'score': 0.12623995542526245,
  'token': 2066,
  'token_str': 'like',
  'sequence': 'i like apple.'},
 {'score': 0.11780297756195068,
  'token': 2359,
  'token_str': 'wanted',
  'sequence': 'i wanted apple.'},
 {'score': 0.0684230700135231,
  'token': 2215,
  'token_str': 'want',
  'sequence': 'i want apple.'},
 {'score': 0.05560746043920517,
  'token': 3866,
  'token_str': 'loved',
  'sequence': 'i loved apple.'}]

In [17]:
from transformers import pipeline

# distilbert 모델

unmasker = pipeline('fill-mask', model='distilbert-base-uncased')
unmasker("I [MASK] apple.")

# 모형에 따라 추천되는 단어가 달라짐

[{'score': 0.06269748508930206,
  'token': 8823,
  'token_str': 'ate',
  'sequence': 'i ate apple.'},
 {'score': 0.0586443729698658,
  'token': 2293,
  'token_str': 'love',
  'sequence': 'i love apple.'},
 {'score': 0.056702181696891785,
  'token': 3866,
  'token_str': 'loved',
  'sequence': 'i loved apple.'},
 {'score': 0.051368582993745804,
  'token': 6283,
  'token_str': 'hated',
  'sequence': 'i hated apple.'},
 {'score': 0.04913158714771271,
  'token': 4521,
  'token_str': 'eat',
  'sequence': 'i eat apple.'}]

In [18]:
from transformers import pipeline

# albert 모델 : bert의 정확도를 높이고 경량화한 모형

unmasker = pipeline('fill-mask', model='albert-base-v2')
unmasker("I [MASK] apple.")

Downloading (…)lve/main/config.json: 100%|██████████| 684/684 [00:00<00:00, 68.6kB/s]
Downloading model.safetensors: 100%|██████████| 47.4M/47.4M [00:00<00:00, 112MB/s]
Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading (…)ve/main/spiece.model: 100%|██████████| 760k/760k [00:00<00:00, 54.2MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.31M/1.31M [00:00<00:00, 6.29MB/s]


[{'score': 0.1261998564004898,
  'token': 339,
  'token_str': 'love',
  'sequence': 'i love apple.'},
 {'score': 0.092204749584198,
  'token': 3345,
  'token_str': 'liked',
  'sequence': 'i liked apple.'},
 {'score': 0.056264329701662064,
  'token': 2199,
  'token_str': 'loved',
  'sequence': 'i loved apple.'},
 {'score': 0.04441859945654869,
  'token': 5285,
  'token_str': 'hated',
  'sequence': 'i hated apple.'},
 {'score': 0.03994071111083031,
  'token': 3223,
  'token_str': 'hate',
  'sequence': 'i hate apple.'}]