In [1]:
pip install transformers



## 1. 다음 문장 예측 모델과 토크나이저

In [2]:
import torch
from transformers import BertForNextSentencePrediction
from transformers import AutoTokenizer

In [3]:
# Load the pretrained BERT next-sentence-prediction model and the tokenizer
# that belongs to the same checkpoint.
checkpoint = 'bert-base-uncased'
model = BertForNextSentencePrediction.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



## 2. BERT의 입력

In [4]:
# Sentence pair used for the first prediction: `next_sentence` is the candidate
# follow-up to `prompt`.
prompt, next_sentence = (
    "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced.",
    "pizza is eaten with the use of a knife and fork. In casual settings, however, it is cut into wedges to be eaten while held in the hand.",
)

In [5]:
# Tokenize both sentences together as a single sentence-pair input and
# return the result as PyTorch tensors.
encoding = tokenizer(text=prompt, text_pair=next_sentence, return_tensors='pt')

In [6]:
# Show the integer token ids produced for the sentence pair.
input_ids = encoding['input_ids']
print(input_ids)

tensor([[  101,  1999,  3304,  1010, 10733,  2366,  1999,  5337, 10906,  1010,
          2107,  2004,  2012,  1037,  4825,  1010,  2003,  3591,  4895, 14540,
          6610,  2094,  1012,   102, 10733,  2003,  8828,  2007,  1996,  2224,
          1997,  1037,  5442,  1998,  9292,  1012,  1999, 10017, 10906,  1010,
          2174,  1010,  2009,  2003,  3013,  2046, 17632,  2015,  2000,  2022,
          8828,  2096,  2218,  1999,  1996,  2192,  1012,   102]])


In [7]:
# Print the [CLS] and [SEP] special tokens next to their vocabulary ids.
special_tokens = (
    (tokenizer.cls_token, tokenizer.cls_token_id),
    (tokenizer.sep_token, tokenizer.sep_token_id),
)
for special_token, special_token_id in special_tokens:
    print(special_token, ':', special_token_id)

[CLS] : 101
[SEP] : 102


In [8]:
# Convert the ids of the first (and only) encoded sequence back to text
# so the inserted special tokens become visible.
first_sequence_ids = encoding['input_ids'][0]
decoded_text = tokenizer.decode(first_sequence_ids)
print(decoded_text)

[CLS] in italy, pizza served in formal settings, such as at a restaurant, is presented unsliced. [SEP] pizza is eaten with the use of a knife and fork. in casual settings, however, it is cut into wedges to be eaten while held in the hand. [SEP]


In [9]:
# Show the segment ids: one value per token marking which of the two
# sentences the token belongs to.
segment_ids = encoding['token_type_ids']
print(segment_ids)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


## 3. 다음 문장 예측하기

In [10]:
# Inference only: run the forward pass with autograd disabled so no computation
# graph is built (the resulting tensors therefore carry no grad_fn).
with torch.no_grad():
    pred = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])
probs = torch.nn.functional.softmax(pred.logits, dim=1)  # apply softmax to turn the logits into probabilities
print(probs)

tensor([[1.0000e+00, 2.8382e-06]], grad_fn=<SoftmaxBackward0>)


In [11]:
# The predicted label is the index of the largest probability in the row.
next_sentence_label = probs.argmax(dim=1).item()
print('최종 예측 레이블 :', next_sentence_label)

최종 예측 레이블 : 0


In [12]:
# Two unrelated sentences
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer(prompt, next_sentence, return_tensors='pt')

# Inference only: disable autograd for the forward pass so no computation graph is built.
with torch.no_grad():
    pred = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])
probs = torch.nn.functional.softmax(pred.logits, dim=1)  # apply softmax to get probabilities
next_sentence_label = torch.argmax(probs, dim=1).item()  # index of the most probable label
print('최종 예측 레이블 :', next_sentence_label)

최종 예측 레이블 : 1


## 4. 한국어 모델의 다음 문장 예측 모델과 토크나이저

In [2]:
import torch
from transformers import BertForNextSentencePrediction
from transformers import AutoTokenizer

In [3]:
# Load the pretrained KLUE BERT next-sentence-prediction model and the
# tokenizer that belongs to the same checkpoint.
checkpoint = 'klue/bert-base'
model = BertForNextSentencePrediction.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



## 5. 다음 문장 예측하기

In [4]:
# Two sentences where the second continues the first
prompt = "2002년 월드컵 축구대회는 일본과 공동으로 개최되었던 세계적인 큰 잔치입니다."
next_sentence = "여행을 가보니 한국의 2002년 월드컵 축구대회의 준비는 완벽했습니다."
encoding = tokenizer(prompt, next_sentence, return_tensors='pt')

# Inference only: disable autograd for the forward pass so no computation graph is built.
with torch.no_grad():
    pred = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])
probs = torch.nn.functional.softmax(pred.logits, dim=1)  # apply softmax to get probabilities
next_sentence_label = torch.argmax(probs, dim=1).item()  # index of the most probable label
print('최종 예측 레이블 :', next_sentence_label)

최종 예측 레이블 : 0


In [5]:
# Two unrelated sentences
prompt = "2002년 월드컵 축구대회는 일본과 공동으로 개최되었던 세계적인 큰 잔치입니다."
next_sentence = "극장가서 로맨스 영화를 보고싶어요"
encoding = tokenizer(prompt, next_sentence, return_tensors='pt')

# Inference only: disable autograd for the forward pass so no computation graph is built.
with torch.no_grad():
    pred = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])
probs = torch.nn.functional.softmax(pred.logits, dim=1)  # apply softmax to get probabilities
next_sentence_label = torch.argmax(probs, dim=1).item()  # index of the most probable label
print('최종 예측 레이블 :', next_sentence_label)

최종 예측 레이블 : 1
