# 10) MobileBERT / DistilGPT2 / DialoGPT

Mobile BERT : BERT를 압축하고 속도를 개선한 모형

In [2]:
from transformers import MobileBertTokenizer, MobileBertModel
import torch

# 모델 및 토크나이저 불러오기
tokenizer_mbert = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')

model_mbert = MobileBertModel.from_pretrained('google/mobilebert-uncased')
model_mbert

MobileBertModel(
  (embeddings): MobileBertEmbeddings(
    (word_embeddings): Embedding(30522, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 512)
    (token_type_embeddings): Embedding(2, 512)
    (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
    (LayerNorm): NoNorm()
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): MobileBertEncoder(
    (layer): ModuleList(
      (0-23): 24 x MobileBertLayer(
        (attention): MobileBertAttention(
          (self): MobileBertSelfAttention(
            (query): Linear(in_features=128, out_features=128, bias=True)
            (key): Linear(in_features=128, out_features=128, bias=True)
            (value): Linear(in_features=512, out_features=128, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): MobileBertSelfOutput(
            (dense): Linear(in_features=128, out_features=128, bias=True)
            (LayerNorm): NoNorm()
          )
      

In [3]:
from transformers import BertTokenizer, BertModel

# 모델 및 토크나이저 불러오기
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')

model_bert = BertModel.from_pretrained('bert-base-uncased')
model_bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
import torch

text = "Mobile bert is more practical than bert."

# Mobile BERT 토크나이징
inputs = tokenizer_mbert.tokenize(text)
print(inputs)

# BERT 토크나이징
inputs = tokenizer_bert.tokenize(text)
print(inputs)

# 두 모델의 실행 결과가 같음

['mobile', 'bert', 'is', 'more', 'practical', 'than', 'bert', '.']
['mobile', 'bert', 'is', 'more', 'practical', 'than', 'bert', '.']


In [5]:
import torch

text = "Mobile bert is more practical than bert."

inputs = tokenizer_mbert.encode(text)
#                          단어 => 숫자
# squeeze() 사이즈가 1인 차원 제거, unsqueeze() 첫번째 위치에 1인 차원 추가
outputs = model_mbert(torch.tensor(inputs).unsqueeze(0))
print(outputs.last_hidden_state.shape)

inputs = tokenizer_bert.encode(text)
outputs = model_bert(torch.tensor(inputs).unsqueeze(0))
print(outputs.last_hidden_state.shape)

# torch.Size([1, 10, 512])  BERT 모형
# torch.Size([1, 10, 768])  Mobile BERT 모형

torch.Size([1, 10, 512])
torch.Size([1, 10, 768])


In [6]:
# Mobile BERT 추론

from transformers import MobileBertTokenizer, MobileBertForMaskedLM
import torch

# 토크나이저 및 모델 불러오기
tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
model = MobileBertForMaskedLM.from_pretrained('google/mobilebert-uncased')

# 마스크한 문장 및 정답 문장을 각기 토크나이징
inputs = tokenizer("The capital of Korea is [MASK].", return_tensors="pt")
labels = tokenizer("The capital of Korea is Seoul.", return_tensors="pt")["input_ids"]
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits

print(' '.join([tokenizer.decode(i.item()).replace(" ", "") for i in logits.argmax(-1)[0]][1:-1]))

the capital of korea is seoul .


In [7]:
# BERT 추론
# Mobile BERT 때와 비교하여 토크나이저와 모델이 다름

from transformers import BertTokenizer, BertForMaskedLM
import torch

# 토크나이저 및 모델 불러오기
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
inputs = tokenizer("The capital of Korea is [MASK].", return_tensors="pt")
labels = tokenizer("The capital of Korea is Seoul.", return_tensors="pt")["input_ids"]
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits

print(' '.join([tokenizer.decode(i.item()).replace(" ", "") for i in logits.argmax(-1)[0]][1:-1]))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


the capital of korea is seoul .


In [8]:
from transformers import AutoTokenizer, AutoModelWithLMHead

# 토크나이저 및 모델 불러오기
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelWithLMHead.from_pretrained("distilgpt2")



In [9]:
# 문장 완성

input_ids = tokenizer.encode("I like gpt because it's", return_tensors='pt')
greedy_output = model.generate(input_ids, max_length=12)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I like gpt because it's a good thing to have


In [10]:
from transformers import AutoTokenizer, AutoModelWithLMHead

# 토크나이저 및 모델 불러오기
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small")

In [11]:
input_ids = tokenizer.encode("I like gpt because it's", return_tensors='pt')
greedy_output = model.generate(input_ids, max_length=30)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
I like gpt because it's a good way to get a feel for the game.
