In [13]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# 모델과 토크나이저 초기화
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [17]:
# 예제 문장과 토큰 위치
sentence =  "Have you ever read/watched Game of Thrones? In that show House Lannister has crimson red as their family color. [SEP] I have heard so much about It, I have never watched it before! It looks like the series is ending next year after eight seasons!!"
# token_index = 6  # [MASK]로 대체할 토큰의 위치
candidates = ['"crimson. I prefer baby pink"', 'red', 'Game of Thrones', 'House Lannister']  # 후보 단어 리스트

# 문장을 토큰화하고 [MASK] 토큰으로 지정된 위치의 단어 대체
tokens = tokenizer.tokenize(sentence)
#print index and token in tokens
print([(i,t) for i,t in enumerate(tokens)])

[(0, 'have'), (1, 'you'), (2, 'ever'), (3, 'read'), (4, '/'), (5, 'watched'), (6, 'game'), (7, 'of'), (8, 'throne'), (9, '##s'), (10, '?'), (11, 'in'), (12, 'that'), (13, 'show'), (14, 'house'), (15, 'lan'), (16, '##nist'), (17, '##er'), (18, 'has'), (19, 'crimson'), (20, 'red'), (21, 'as'), (22, 'their'), (23, 'family'), (24, 'color'), (25, '.'), (26, '[SEP]'), (27, 'i'), (28, 'have'), (29, 'heard'), (30, 'so'), (31, 'much'), (32, 'about'), (33, 'it'), (34, ','), (35, 'i'), (36, 'have'), (37, 'never'), (38, 'watched'), (39, 'it'), (40, 'before'), (41, '!'), (42, 'it'), (43, 'looks'), (44, 'like'), (45, 'the'), (46, 'series'), (47, 'is'), (48, 'ending'), (49, 'next'), (50, 'year'), (51, 'after'), (52, 'eight'), (53, 'seasons'), (54, '!'), (55, '!')]


In [18]:
token_index = 33
tokens[token_index] = '[MASK]'
mask_token_index = tokens.index('[MASK]')

# [CLS]와 [SEP] 토큰 추가
tokens = ['[CLS]'] + tokens + ['[SEP]']
mask_token_index += 1  # [CLS] 토큰이 추가되어 인덱스 조정
tokens

['[CLS]',
 'have',
 'you',
 'ever',
 'read',
 '/',
 'watched',
 'game',
 'of',
 'throne',
 '##s',
 '?',
 'in',
 'that',
 'show',
 'house',
 'lan',
 '##nist',
 '##er',
 'has',
 'crimson',
 'red',
 'as',
 'their',
 'family',
 'color',
 '.',
 '[SEP]',
 'i',
 'have',
 'heard',
 'so',
 'much',
 'about',
 '[MASK]',
 ',',
 'i',
 'have',
 'never',
 'watched',
 'it',
 'before',
 '!',
 'it',
 'looks',
 'like',
 'the',
 'series',
 'is',
 'ending',
 'next',
 'year',
 'after',
 'eight',
 'seasons',
 '!',
 '!',
 '[SEP]']

In [21]:
# 예제 문장과 토큰 위치
sentence =  "Have you ever read/watched Game of Thrones? In that show House Lannister has crimson red as their family color. [SEP] I have heard so much about It, I have never watched it before! It looks like the series is ending next year after eight seasons!!"
# token_index = 6  # [MASK]로 대체할 토큰의 위치
candidates = ['"crimson. I prefer baby pink"', 'red', 'Game of Thrones', 'House Lannister']  # 후보 단어 리스트

# 문장을 토큰화하고 [MASK] 토큰으로 지정된 위치의 단어 대체
tokens = tokenizer.tokenize(sentence)
#print index and token in tokens
# print([(i,t) for i,t in enumerate(tokens)])

token_index = 33
tokens[token_index] = '[MASK]'
mask_token_index = tokens.index('[MASK]')

# [CLS]와 [SEP] 토큰 추가
tokens = ['[CLS]'] + tokens + ['[SEP]']
mask_token_index += 1  # [CLS] 토큰이 추가되어 인덱스 조정

# 토큰을 모델의 입력 형태로 변환
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_tensor = torch.tensor([input_ids])

# [MASK] 위치에 대한 예측 수행
with torch.no_grad():
    outputs = model(input_tensor)
    predictions = outputs[0]

# 후보 단어들의 확률 계산
candidate_ids = tokenizer.convert_tokens_to_ids(candidates)
probs = torch.softmax(predictions[0, mask_token_index], dim=-1)
candidate_probs = probs[candidate_ids]

# 후보 단어들과 그 확률 출력
results = {}
for word, prob in zip(candidates, candidate_probs):
    # print(f"{word}: {prob.item()}")
    results[word] = prob.item()

# 확률이 높은 순서대로 출력
sorted(results.items(), key=lambda x: x[1], reverse=True)

[('red', 0.0003169028495904058),
 ('"crimson. I prefer baby pink"', 8.241360660576902e-08),
 ('Game of Thrones', 8.241360660576902e-08),
 ('House Lannister', 8.241360660576902e-08)]

In [22]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# 모델과 토크나이저 초기화
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

# 원본 문장과 후보 단어들
sentence_template = "Have you ever read/watched Game of Thrones? In that show House Lannister has crimson red as their family color. [SEP] I have heard so much about {}, I have never watched it before! It looks like the series is ending next year after eight seasons!!"
candidates = ['"crimson. I prefer baby pink"', 'red', 'Game of Thrones', 'House Lannister']

def score(sentence):
    tokenize_input = tokenizer.encode(sentence)
    tensor_input = torch.tensor([tokenize_input])
    with torch.no_grad():
        loss = model(tensor_input, labels=tensor_input)[0]
    return -loss.item()

# 각 후보에 대한 문장의 확률 계산
scores = {}
for candidate in candidates:
    candidate_sentence = sentence_template.format(candidate)
    candidate_score = score(candidate_sentence)
    scores[candidate] = candidate_score

# 확률이 높은 순서대로 후보 출력
sorted_candidates = sorted(scores.items(), key=lambda x: x[1], reverse=True)

for candidate, score in sorted_candidates:
    print(f"{candidate}: {score}")


100%|██████████| 1042301/1042301 [00:00<00:00, 2608032.34B/s]
100%|██████████| 456318/456318 [00:00<00:00, 1456247.89B/s]
100%|██████████| 665/665 [00:00<00:00, 677322.04B/s]
100%|██████████| 548118077/548118077 [00:18<00:00, 29041753.87B/s]
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens

Game of Thrones: -3.566688060760498
House Lannister: -3.587468385696411
red: -3.770730972290039
"crimson. I prefer baby pink": -4.092601299285889


"Game of Thrones"를 가장 자연스러운 선택으로 예측하는 데 있어서 GPT-2가 BERT보다 더 성공적이었던 이유는 다음과 같은 모델의 설계 및 학습 방식의 차이에서 기인할 수 있습니다:

학습 목표와 방식의 차이: GPT-2는 연속된 텍스트를 예측하는 언어 생성 모델입니다. 이는 주어진 문맥을 바탕으로 다음에 올 텍스트를 예측하는 방식으로 학습되며, 전체 문장의 흐름과 문맥적 자연스러움에 중점을 둡니다. 반면, BERT는 문장 내 빈칸을 채우는 방식(Masked Language Model, MLM)으로 학습되어, 주어진 문맥 내에서 단어나 구의 적합성을 평가하는 데 초점을 맞춥니다. 이러한 차이로 인해, 전체 문장의 자연스러움을 평가하는 작업에서 GPT-2가 더 유리할 수 있습니다.

언어 이해 및 생성 능력: GPT-2는 문장 생성 작업에 특화된 모델로, 주어진 문맥을 기반으로 문장을 이어 나가는 능력이 매우 뛰어납니다. 이는 모델이 전체 문장 구조와 문맥을 더 효과적으로 이해하고, 그에 따라 더 자연스러운 텍스트를 생성할 수 있음을 의미합니다. 따라서, "Game of Thrones"와 같이 특정 문맥에서 자연스럽게 등장할 수 있는 구나 문구를 예측하는 데 있어 GPT-2가 더 정확할 수 있습니다.

토큰 처리 방식의 차이: BERT는 주로 문장 내의 단일 토큰이나 짧은 구를 대상으로 학습되며, 한 번에 하나의 [MASK] 토큰만 예측합니다. 반면, GPT-2는 전체 문장을 생성하는 과정에서 여러 단어나 구를 연속적으로 예측할 수 있습니다. 이로 인해, "Game of Thrones"와 같이 여러 토큰으로 구성된 구나 문구를 더 자연스럽게 처리하고 예측할 수 있습니다.

이러한 이유들로 인해, 주어진 문장 내에서 "Game of Thrones"를 가장 적합한 선택으로 예측하는 데 GPT-2가 BERT보다 더 성공적일 수 있습니다. GPT-2의 학습 방식과 모델 구조는 전체 문장의 문맥적 자연스러움과 흐름을 더 잘 파악하고 반영할 수 있기 때문입니다.

In [25]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def evaluate_sentence_with_candidates(sentence_template, candidates):
    # 모델과 토크나이저 초기화
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.eval()

    # 로그 확률을 계산하는 함수
    def score(sentence):
        tokenize_input = tokenizer.encode(sentence)
        tensor_input = torch.tensor([tokenize_input])
        with torch.no_grad():
            loss = model(tensor_input, labels=tensor_input)[0]
        return -loss.item()

    # 각 후보에 대한 문장의 확률 계산
    scores = {}
    for candidate in candidates:
        candidate_sentence = sentence_template.format(candidate)
        candidate_score = score(candidate_sentence)
        scores[candidate] = candidate_score

    # 확률이 높은 순서대로 후보 정렬 및 출력
    sorted_candidates = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_candidates

# 함수 사용 예시
sentence_template = "Have you ever read/watched Game of Thrones? In that show House Lannister has crimson red as their family color. [SEP] I have heard so much about {}, I have never watched it before! It looks like the series is ending next year after eight seasons!!"
candidates = ['"crimson. I prefer baby pink"', 'red', 'Game of Thrones', 'House Lannister']
sorted_candidates = evaluate_sentence_with_candidates(sentence_template, candidates)
print(sorted_candidates)


{'context_text': ['My favorite color is red.',
                  'Red is at the end of the spectrum of light, its with orange '
                  'and opposite of violet.',
                  "I didn't know that. What else do you know about red?",
                  "It's actually a primary color for the RGB and CMYK color "
                  'model.',
                  'I learned about primary colors in school when I was little.',
                  'Well, the reason that Mars is red is because of the iron '
                  "oxide on it. That's pretty cool!",
                  'I wish we could travel to Mars.',
                  'Me too! Actually, red pigment is one of the first colors '
                  'that was used way back in prehistoric times.',
                  'I guess they got red pigment from the dirt or something.'],
 'doc_tokens': ['My',
                'favorite',
                'color',
                'is',
                'red.',
                'Red',
              

In [27]:
def evaluate_sentence_with_candidates(sentence_template, candidates):
    # 모델과 토크나이저 초기화
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.eval()

    # 로그 확률을 계산하는 함수
    def score(sentence):
        tokenize_input = tokenizer.encode(sentence)
        tensor_input = torch.tensor([tokenize_input])
        with torch.no_grad():
            loss = model(tensor_input, labels=tensor_input)[0]
        return -loss.item()

    # 각 후보에 대한 문장의 확률 계산
    scores = {}
    for candidate in candidates:
        candidate_sentence = sentence_template.format(candidate)
        candidate_score = score(candidate_sentence)
        scores[candidate] = candidate_score

    # 확률이 높은 순서대로 후보 정렬 및 출력
    sorted_candidates = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_candidates

def generate_filled_sentences(context, response, pronoun_index, candidates):
    # 문맥에서 마지막 두 문장을 가져옵니다.
    context_part = " ".join(context[-2:])
    
    # 대응하는 응답에서 대명사를 {}로 대체합니다.
    response_tokens = response.split()
    response_tokens[pronoun_index] = '{}'
    response_with_placeholder = " ".join(response_tokens)
    
    # 완성된 문장 템플릿을 생성합니다.
    sentence_template = f"{context_part} {response_with_placeholder}"
    
    # 각 후보에 대한 문장의 확률을 계산하고 정렬합니다.
    sorted_candidates = evaluate_sentence_with_candidates(sentence_template, candidates)
    
    return sorted_candidates

# 샘플 데이터
sample1 = {
    "context_text": [
        "My favorite color is red.  Do you have a favorite color? ",
        "Red is a good color, but I think I prefer pink, which is similar. I never knew it was named after a flower of the same name!",
        "What is your favorite shade of pink? My favorite shade of red is crimson.",
        "I prefer baby pink, I like the lighter shade of it, it's just so appealing to me, the name pink has been around for a long time, since the late 17th century!",
        "Wow that is so cool.  Have you ever read/watched Game of Thrones? In that show House Lannister has crimson red as their family color."
    ],
    "response": "I have heard so much about Game of Thrones, I have never watched it before! It looks like the series is ending next year after eight seasons!!",
    "pronoun_index": 17
}

sample2 = [
    {"text": "crimson. I prefer baby pink"},
    {"text": "red"},
    {"text": "Game of Thrones"},
    {"text": "House Lannister"}
]

# 샘플 데이터를 함수에 적용합니다.
candidates = [cand["text"] for cand in sample2]
sorted_candidates = generate_filled_sentences(
    sample1["context_text"],
    sample1["response"],
    sample1["pronoun_index"],
    candidates
)

sorted_candidates


This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.


[('Game of Thrones', -3.6056041717529297),
 ('red', -3.617605686187744),
 ('House Lannister', -3.619537353515625),
 ('crimson. I prefer baby pink', -3.6559503078460693)]

In [30]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting attrs>=19.2.0 (from jsonlines)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Downloading attrs-23.2.0-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m389.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: attrs, jsonlines
Successfully installed attrs-23.2.0 jsonlines-4.0.0


In [31]:

predictions = []

#read predictions_path (json file)
with open(predictions_path, 'r') as f:
    predictions = json.load(f)

In [34]:
print(len(all_info), len(predictions))

7411 7411


In [49]:
all_info[0]

{'qas_id': '275___6--0_0_2',
 'question_text': "Considering the context, 'Yes, it has, but what i think is just vanity too, thats why fitness just attractiveness', what exactly does 'it' referring to an object, concept, or situation?",
 'doc_tokens': ['I',
  'believe',
  'physical',
  'fitness',
  'is',
  'not',
  'just',
  'body',
  'but',
  'health',
  'as',
  'well',
  'I',
  'agree.',
  'It',
  'helps',
  'with',
  'a',
  'lot',
  'of',
  'things.',
  'I',
  'think',
  'it',
  'helps',
  'with',
  'confidence',
  'in',
  'oneself',
  'too.',
  'yes,',
  'physical',
  'fitness',
  'is',
  'often',
  'achieved',
  'by',
  'proper',
  'diet,',
  'rest,',
  'and',
  'activity',
  'I',
  'try',
  'to',
  'get',
  'exercise',
  'every',
  'day,',
  'especially',
  'as',
  'I',
  'get',
  'older',
  'I',
  'realize',
  'how',
  'important',
  'it',
  'is.',
  'Yes,',
  'me',
  'too,',
  'it',
  'is',
  'funny',
  'to',
  'think',
  'that',
  'before',
  'the',
  'industrial',
  'revolutio

In [45]:
predictions['0']

[{'orig_doc_start': 2,
  'orig_doc_end': 7,
  'text': 'physical fitness is not just body'},
 {'orig_doc_start': 9, 'orig_doc_end': 9, 'text': 'health'}]

In [53]:
#read all item in all_info, predictions at the same time
for i, sample1 in enumerate(all_info):
    if i==100:
        sample2 = predictions[str(i)]
        print(f'sample2: {sample2}')
        candidates = [cand["text"] for cand in sample2]
        print(f'candidates: {candidates}')
        print(f'context: {sample1["context_text"]}')
        print(f'orig_response: {sample1["orig_response"]}')
        sorted_candidates = generate_filled_sentences(
                                                        sample1["context_text"],
                                                        sample1["orig_response"],
                                                        sample1["pronoun_index"],
                                                        candidates
                                                    )   
        print(sorted_candidates)
        break
    else:
        continue

sample2: [{'orig_doc_start': 38, 'orig_doc_end': 38, 'text': 'vineyard'}, {'orig_doc_start': 152, 'orig_doc_end': 152, 'text': 'Chianti'}]
candidates: ['vineyard', 'Chianti']
context: ['have you ever tried wine tasting? it is a sensory examination and evaluation of wines', 'I have! We recently rewatched the movie Sideways, where there are some pretty funny scenes about wine tasting. Do you have a favorite vineyard or type of wine?', 'with all the varities of grapes and strains there are a lot of types, but they are all so good, I am not sure if i have a favorite!', "Of course a lot of people use wine tasting as an excuse to *drink* wine. Really, you're supposed to just hold it in your mouth for the taste, then spit it out!", 'Yeah there is certainly a portion of people that are more informal and do it for recreation in a much less analytical way', 'The most beautiful setting I ever did wine tasting was in Italy. Of course ;-) Nothing like being surrounded by the Tuscan countryside whil

This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens. Input is returned with no modification.
This tokenizer does not make use of special tokens.


[('Chianti', -4.431573390960693), ('vineyard', -4.527164936065674)]
