In [13]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# 모델과 토크나이저 초기화
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [17]:
# 예제 문장과 토큰 위치
sentence =  "Have you ever read/watched Game of Thrones? In that show House Lannister has crimson red as their family color. [SEP] I have heard so much about It, I have never watched it before! It looks like the series is ending next year after eight seasons!!"
# token_index = 6  # [MASK]로 대체할 토큰의 위치
candidates = ['"crimson. I prefer baby pink"', 'red', 'Game of Thrones', 'House Lannister']  # 후보 단어 리스트

# 문장을 토큰화하고 [MASK] 토큰으로 지정된 위치의 단어 대체
tokens = tokenizer.tokenize(sentence)
#print index and token in tokens
print([(i,t) for i,t in enumerate(tokens)])

[(0, 'have'), (1, 'you'), (2, 'ever'), (3, 'read'), (4, '/'), (5, 'watched'), (6, 'game'), (7, 'of'), (8, 'throne'), (9, '##s'), (10, '?'), (11, 'in'), (12, 'that'), (13, 'show'), (14, 'house'), (15, 'lan'), (16, '##nist'), (17, '##er'), (18, 'has'), (19, 'crimson'), (20, 'red'), (21, 'as'), (22, 'their'), (23, 'family'), (24, 'color'), (25, '.'), (26, '[SEP]'), (27, 'i'), (28, 'have'), (29, 'heard'), (30, 'so'), (31, 'much'), (32, 'about'), (33, 'it'), (34, ','), (35, 'i'), (36, 'have'), (37, 'never'), (38, 'watched'), (39, 'it'), (40, 'before'), (41, '!'), (42, 'it'), (43, 'looks'), (44, 'like'), (45, 'the'), (46, 'series'), (47, 'is'), (48, 'ending'), (49, 'next'), (50, 'year'), (51, 'after'), (52, 'eight'), (53, 'seasons'), (54, '!'), (55, '!')]


In [18]:
token_index = 33
tokens[token_index] = '[MASK]'
mask_token_index = tokens.index('[MASK]')

# [CLS]와 [SEP] 토큰 추가
tokens = ['[CLS]'] + tokens + ['[SEP]']
mask_token_index += 1  # [CLS] 토큰이 추가되어 인덱스 조정
tokens

['[CLS]',
 'have',
 'you',
 'ever',
 'read',
 '/',
 'watched',
 'game',
 'of',
 'throne',
 '##s',
 '?',
 'in',
 'that',
 'show',
 'house',
 'lan',
 '##nist',
 '##er',
 'has',
 'crimson',
 'red',
 'as',
 'their',
 'family',
 'color',
 '.',
 '[SEP]',
 'i',
 'have',
 'heard',
 'so',
 'much',
 'about',
 '[MASK]',
 ',',
 'i',
 'have',
 'never',
 'watched',
 'it',
 'before',
 '!',
 'it',
 'looks',
 'like',
 'the',
 'series',
 'is',
 'ending',
 'next',
 'year',
 'after',
 'eight',
 'seasons',
 '!',
 '!',
 '[SEP]']

In [21]:
# 예제 문장과 토큰 위치
sentence =  "Have you ever read/watched Game of Thrones? In that show House Lannister has crimson red as their family color. [SEP] I have heard so much about It, I have never watched it before! It looks like the series is ending next year after eight seasons!!"
# token_index = 6  # [MASK]로 대체할 토큰의 위치
candidates = ['"crimson. I prefer baby pink"', 'red', 'Game of Thrones', 'House Lannister']  # 후보 단어 리스트

# 문장을 토큰화하고 [MASK] 토큰으로 지정된 위치의 단어 대체
tokens = tokenizer.tokenize(sentence)
#print index and token in tokens
# print([(i,t) for i,t in enumerate(tokens)])

token_index = 33
tokens[token_index] = '[MASK]'
mask_token_index = tokens.index('[MASK]')

# [CLS]와 [SEP] 토큰 추가
tokens = ['[CLS]'] + tokens + ['[SEP]']
mask_token_index += 1  # [CLS] 토큰이 추가되어 인덱스 조정

# 토큰을 모델의 입력 형태로 변환
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_tensor = torch.tensor([input_ids])

# [MASK] 위치에 대한 예측 수행
with torch.no_grad():
    outputs = model(input_tensor)
    predictions = outputs[0]

# 후보 단어들의 확률 계산
candidate_ids = tokenizer.convert_tokens_to_ids(candidates)
probs = torch.softmax(predictions[0, mask_token_index], dim=-1)
candidate_probs = probs[candidate_ids]

# 후보 단어들과 그 확률 출력
results = {}
for word, prob in zip(candidates, candidate_probs):
    # print(f"{word}: {prob.item()}")
    results[word] = prob.item()

# 확률이 높은 순서대로 출력
sorted(results.items(), key=lambda x: x[1], reverse=True)

[('red', 0.0003169028495904058),
 ('"crimson. I prefer baby pink"', 8.241360660576902e-08),
 ('Game of Thrones', 8.241360660576902e-08),
 ('House Lannister', 8.241360660576902e-08)]

# utils_select_predictions.py

In [12]:
import logging
logging.basicConfig(level=logging.ERROR)

from pprint import pprint
import spacy
import json, jsonlines
import logging
from collections import Counter
import argparse

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

nlp = spacy.load("en_core_web_sm")

# 모델과 토크나이저 초기화
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

def write_json(data, file_path):
    """
    Write data to a JSON file. w/ args.output_file
    """
    logging.info(f'Writing {len(data)} items to {file_path}.')
    with open(file_path, 'w') as f:
        json.dump(data, f, indent=4)

    
def evaluate_sentence_with_candidates(sentence_template, candidates):
    # 모델과 토크나이저 초기화
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.eval()

    # 로그 확률을 계산하는 함수
    def score(sentence):
        tokenize_input = tokenizer.encode(sentence)
        tensor_input = torch.tensor([tokenize_input])
        with torch.no_grad():
            loss = model(tensor_input, labels=tensor_input)[0]
        return -loss.item()

    # 각 후보에 대한 문장의 확률 계산
    scores = {}
    for candidate in candidates:
        candidate_sentence = sentence_template.format(candidate)
        candidate_score = score(candidate_sentence)
        scores[candidate] = candidate_score

    # 확률이 높은 순서대로 후보 정렬 및 출력
    sorted_candidates = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_candidates

# def generate_filled_sentences(found_pronoun, context, response, pronoun_index, candidates):
#     # 문맥에서 마지막 두 문장을 가져옵니다.
#     context_part = " ".join(context[-2:])
    
#     # 대응하는 응답에서 대명사를 {}로 대체합니다.
#     doc = nlp(response)
#     if doc[pronoun_index] == found_pronoun: # 지시대명사를 빈칸으로 대체하기
#         doc[pronoun_index] = '{}'
#     response_with_placeholder = " ".join(doc)
    
#     # 완성된 문장 템플릿을 생성합니다.
#     sentence_template = f"{context_part} {response_with_placeholder}"
    
#     # 각 후보에 대한 문장의 확률을 계산하고 정렬합니다.
#     sorted_candidates = evaluate_sentence_with_candidates(sentence_template, candidates)
    
#     return sorted_candidates

def generate_filled_sentences(found_pronoun, context, response, pronoun_index, candidates):
    # 문맥에서 마지막 두 문장을 가져옵니다.
    context_part = " ".join(context[-2:])
    
    # 대응하는 응답을 토큰화합니다.
    doc = nlp(response)
    tokens = [token.text for token in doc]  # spacy 토큰을 문자열 리스트로 변환
    
    # 지시대명사의 인덱스에 해당하는 토큰을 '{}'로 대체합니다.
    if tokens[pronoun_index] == found_pronoun:
        tokens[pronoun_index] = '{}'
    response_with_placeholder = " ".join(tokens)  # 수정된 토큰 리스트를 다시 문자열로 결합
    
    # 완성된 문장 템플릿을 생성합니다.
    sentence_template = f"{context_part} {response_with_placeholder}"
    
    # 각 후보에 대한 문장의 확률을 계산하고 정렬합니다.
    sorted_candidates = evaluate_sentence_with_candidates(sentence_template, candidates)
    
    return sorted_candidates

def select_predictions(frame_file, pred_file, output_file):
    # logging.info('>>>>>>>>>>>>>>>>> Start selecting predictions.')
    print('>>>>>>>>>>>>>>>>> Start selecting predictions.')
    frames = []
    predictions = []
    with jsonlines.open(frame_file) as reader:
        for line in reader:
            frames.append(line)
    with open(pred_file, 'r') as f:
        predictions = json.load(f)

    # assert len(frames) == len(predictions)
    # logging.info('>>>>>>>>>>>>>>>>> len(frames): {}'.format(len(frames)))
    # logging.info('>>>>>>>>>>>>>>>>> len(preds): {}'.format(len(predictions)))
    print('>>>>>>>>>>>>>>>>> len(frames): {}'.format(len(frames)))
    print('>>>>>>>>>>>>>>>>> len(preds): {}'.format(len(predictions)))
    return
    results = {}

    qasid_empty = []
    for i, frame in enumerate(frames):
        candidates = [cand["text"] for cand in predictions[str(i)]]
        sorted_candidates = generate_filled_sentences(
                                                        frame['found_pronoun'],
                                                        frame["context_text"],
                                                        frame["orig_response"],
                                                        frame["pronoun_index"],
                                                        candidates
                                                    )   
        
        if not sorted_candidates: # No predictions: "empty" in predictsion_.json, [] in nbest_predictions_.json
            logging.info(f'[Empty] ---> {frame["qas_id"]}')
            qasid_empty.append(frame['qas_id'])
        else:
            predicted_noun = sorted_candidates[0][0]
            logging.info(f'[{predicted_noun}] ---> {frame["qas_id"]}')
            results[frame['qas_id']] = [predicted_noun]
        # break
        # print logging every 10% in for loop
        if i % (len(frames) // 10) == 0:
            logging.info(f'>>>>>>>>>>>>>>>>> {i} / {len(frames)} done.')
    write_json(results, output_file)
    print(f'>>>>>>>>>>>>>>>>> The Number of empty predictions: {len(qasid_empty)} out of {len(frames)}.')


### Dialfact / Valid

In [13]:
# [1] dialfact valid
FRAME_FILE='/data/scratch/acw722/corefbert/result/resolved_data/dialfact/dialfact_valid_with_pronouns.jsonl'
PRED_FILE='/data/scratch/acw722/corefbert/result/inference/nbest_predictions_dialfact_valid.json'
OUTPUT_FILE='/data/scratch/acw722/corefbert/result/inference/thebest_predictions_dialfact_valid.json'

select_predictions(FRAME_FILE, PRED_FILE, OUTPUT_FILE)

>>>>>>>>>>>>>>>>> Start selecting predictions.


>>>>>>>>>>>>>>>>> len(frames): 7411
>>>>>>>>>>>>>>>>> len(preds): 7411


In [19]:
# [1] dialfact test
FRAME_FILE='/data/scratch/acw722/corefbert/result/resolved_data/dialfact/dialfact_test_with_pronouns.jsonl'
# PRED_FILE='/data/scratch/acw722/corefbert/result/inference/nbest_predictions_dialfact_test.json'
PRED_FILE='/data/scratch/acw722/corefbert/result/inference/predictions_dialfact_test.json'
OUTPUT_FILE='/data/scratch/acw722/corefbert/result/inference/thebest_predictions_dialfact_test.json'

select_predictions(FRAME_FILE, PRED_FILE, OUTPUT_FILE)

>>>>>>>>>>>>>>>>> Start selecting predictions.
>>>>>>>>>>>>>>>>> len(frames): 10155
>>>>>>>>>>>>>>>>> len(preds): 10155


In [16]:
# [3] augwow dev
FRAME_FILE='/data/scratch/acw722/corefbert/result/resolved_data/augwow/augwow_dev_with_pronouns.jsonl'
PRED_FILE='/data/scratch/acw722/corefbert/result/inference/nbest_predictions_augwow_dev.json'
OUTPUT_FILE='/data/scratch/acw722/corefbert/result/inference/thebest_predictions_augwow_dev.json'

select_predictions(FRAME_FILE, PRED_FILE, OUTPUT_FILE)

>>>>>>>>>>>>>>>>> Start selecting predictions.
>>>>>>>>>>>>>>>>> len(frames): 1463
>>>>>>>>>>>>>>>>> len(preds): 1463


In [20]:
# [4] augwow train
FRAME_FILE='/data/scratch/acw722/corefbert/result/resolved_data/augwow/augwow_train_with_pronouns.jsonl'
# PRED_FILE='/data/scratch/acw722/corefbert/result/inference/nbest_predictions_augwow_train.json'
PRED_FILE='/data/scratch/acw722/corefbert/result/inference/predictions_augwow_train.json'
OUTPUT_FILE='/data/scratch/acw722/corefbert/result/inference/thebest_predictions_augwow_train.json'

select_predictions(FRAME_FILE, PRED_FILE, OUTPUT_FILE)

>>>>>>>>>>>>>>>>> Start selecting predictions.
>>>>>>>>>>>>>>>>> len(frames): 168347
>>>>>>>>>>>>>>>>> len(preds): 168347


- "Game of Thrones"를 가장 자연스러운 선택으로 예측하는 데 있어서 GPT-2가 BERT보다 더 성공적이었던 이유?

    - 학습 목표와 방식의 차이: GPT-2는 연속된 텍스트를 예측하는 언어 생성 모델입니다. 이는 주어진 문맥을 바탕으로 다음에 올 텍스트를 예측하는 방식으로 학습되며, 전체 문장의 흐름과 문맥적 자연스러움에 중점을 둡니다. 반면, BERT는 문장 내 빈칸을 채우는 방식(Masked Language Model, MLM)으로 학습되어, 주어진 문맥 내에서 단어나 구의 적합성을 평가하는 데 초점을 맞춥니다. 이러한 차이로 인해, 전체 문장의 자연스러움을 평가하는 작업에서 GPT-2가 더 유리할 수 있습니다.

    - 언어 이해 및 생성 능력: GPT-2는 문장 생성 작업에 특화된 모델로, 주어진 문맥을 기반으로 문장을 이어 나가는 능력이 매우 뛰어납니다. 이는 모델이 전체 문장 구조와 문맥을 더 효과적으로 이해하고, 그에 따라 더 자연스러운 텍스트를 생성할 수 있음을 의미합니다. 따라서, "Game of Thrones"와 같이 특정 문맥에서 자연스럽게 등장할 수 있는 구나 문구를 예측하는 데 있어 GPT-2가 더 정확할 수 있습니다.

    - 토큰 처리 방식의 차이: BERT는 주로 문장 내의 단일 토큰이나 짧은 구를 대상으로 학습되며, 한 번에 하나의 [MASK] 토큰만 예측합니다. 반면, GPT-2는 전체 문장을 생성하는 과정에서 여러 단어나 구를 연속적으로 예측할 수 있습니다. 이로 인해, "Game of Thrones"와 같이 여러 토큰으로 구성된 구나 문구를 더 자연스럽게 처리하고 예측할 수 있습니다.

    - 이러한 이유들로 인해, 주어진 문장 내에서 "Game of Thrones"를 가장 적합한 선택으로 예측하는 데 GPT-2가 BERT보다 더 성공적일 수 있습니다. GPT-2의 학습 방식과 모델 구조는 전체 문장의 문맥적 자연스러움과 흐름을 더 잘 파악하고 반영할 수 있기 때문입니다.

# resolved all/subset 확인
- original/resolved 각각 매칭확인

In [8]:
def print_matching_responses(json_list1, json_list2):
    for json_obj1, json_obj2 in zip(json_list1, json_list2):
        if json_obj1['id'] == json_obj2['id']:
            print(json_obj1['response'])
            print(json_obj2['response'])

### Dialfact/test/subset

In [7]:
import jsonlines


# read dialfact all resolved
file_path = '/data/scratch/acw722/corefbert/result/resolved_all/dialfact/test_split.jsonl'

dialfact_resolved_all = []
with jsonlines.open(file_path, 'r') as reader:
    for line in reader:
        dialfact_resolved_all.append(line)
print(len(dialfact_resolved_all))
print(dialfact_resolved_all[0].keys())
print(dialfact_resolved_all[0])

# read dialfact all unresolved
file_path = '/data/scratch/acw722/corefbert/result/unresolved_all/dialfact/test_split.jsonl'

dialfact_unresolved_all = []
with jsonlines.open(file_path, 'r') as reader:
    for line in reader:
        dialfact_unresolved_all.append(line)
print(len(dialfact_unresolved_all))
print(dialfact_unresolved_all[0].keys())
print(dialfact_unresolved_all[0])

11809
dict_keys(['context_id', 'id', 'data_type', 'context', 'response', 'evidence_list', 'response_label', 'type_label'])
{'context_id': '588___9', 'id': '588___9--0', 'data_type': 'written', 'context': ['Hello! I was a cheerleader in high school. Do you have any experience with cheerleading? ', 'Cheerleading could be chanting or activity.', 'yes, it could be. By chanting, do you mean saying things in unison? ', 'Cheerleaders cheer for their team to root for them.', 'Yes, they do. I believe it gives the team a lot of momentum. ', 'Competitive routines can be from one to three minutes.', 'Yes, I like seeing the cheerleaders do coordinated actions. It looks really cool', 'Cheerleading was founded in the United States.', "Oh, that's cool. I didn't know that. When did it start? "], 'response': 'In 1997 ESPN broadcast a global Cheerleading competition.', 'evidence_list': [['Cheerleading', 'https://en.wikipedia.org/wiki/Cheerleading', 'The global presentation of cheerleading was led by the 

In [9]:
def print_matching_responses(json_list1, json_list2):
    idx = 0
    for json_obj1, json_obj2 in zip(json_list1, json_list2):
        idx += 1
        if idx == 10: break
        if json_obj1['id'] == json_obj2['id']:
            print(json_obj1['response'])
            print(json_obj2['response'])
            print()

print_matching_responses(dialfact_unresolved_all, dialfact_resolved_all)

In 1997 ESPN broadcast a global Cheerleading competition.
In 1997 ESPN broadcast a global Cheerleading competition.

"The King"'s music career began in Memphis, Tennessee.  His producer, Sam Philips, wanted to bring the sound of African American music to a wider audience.  He became popular quite quickly.
" The King " 's music career began in Memphis , Tennessee .   Elvis Aaron Presley producer , Sam Philips , wanted to bring the sound of African American music to a wider audience .   King of Rock and Roll became popular quite quickly .

His music career began in Nashville, Tennessee.  His producer, Sam Wilson, wanted to bring the sound of African American music to a wider audience.
Elvis Aaron Presley music career began in Nashville , Tennessee .   Elvis Aaron Presley producer , Sam Wilson , wanted to bring the sound of African American music to a wider audience .

His music career began in Miami.  His producer, Sam Samers, wanted to bring the sound of African American music to a wide

In [10]:
# read dialfact subset resolved
file_path = '/data/scratch/acw722/corefbert/result/resolved_subset/dialfact/test_split.jsonl'

dialfact_resolved_subset = []
with jsonlines.open(file_path, 'r') as reader:
    for line in reader:
        dialfact_resolved_subset.append(line)
print(len(dialfact_resolved_subset))
print(dialfact_resolved_subset[0].keys())
print(dialfact_resolved_subset[0])

# read dialfact subset unresolved
file_path = '/data/scratch/acw722/corefbert/result/unresolved_subset/dialfact/test_split.jsonl'

dialfact_unresolved_subset = []
with jsonlines.open(file_path, 'r') as reader:
    for line in reader:
        dialfact_unresolved_subset.append(line)
print(len(dialfact_unresolved_subset))
print(dialfact_unresolved_subset[0].keys())
print(dialfact_unresolved_subset[0])

5784
dict_keys(['context_id', 'id', 'data_type', 'context', 'response', 'evidence_list', 'response_label', 'type_label', 'words'])
{'context_id': '557___2', 'id': '557___2--0', 'data_type': 'written', 'context': ['Elvis Aaron Presley was a famous musician and singer known as the "King of Rock and Roll."  He was born in 1935, and died in 1977.  Are you a fan?', "I've definitely heard of him and his music, since he was so ridiculously popular. What else do you know about him?"], 'response': '" The King " \'s music career began in Memphis , Tennessee .   Elvis Aaron Presley producer , Sam Philips , wanted to bring the sound of African American music to a wider audience .   King of Rock and Roll became popular quite quickly .', 'evidence_list': [['Elvis Presley', 'https://en.wikipedia.org/wiki/Elvis_Presley', 'His music career began there in 1954, recording at Sun Records with producer Sam Phillips, who wanted to bring the sound of African American music to a wider audience.', '0']], 'resp

In [11]:
def print_matching_responses(json_list1, json_list2):
    idx = 0
    for json_obj1, json_obj2 in zip(json_list1, json_list2):
        idx += 1
        if idx == 10: break
        if json_obj1['id'] == json_obj2['id']:
            print(json_obj1['response'])
            print(json_obj2['response'])
            print()

print_matching_responses(dialfact_unresolved_subset, dialfact_resolved_subset)

"The King"'s music career began in Memphis, Tennessee.  His producer, Sam Philips, wanted to bring the sound of African American music to a wider audience.  He became popular quite quickly.
" The King " 's music career began in Memphis , Tennessee .   Elvis Aaron Presley producer , Sam Philips , wanted to bring the sound of African American music to a wider audience .   King of Rock and Roll became popular quite quickly .

His music career began in Nashville, Tennessee.  His producer, Sam Wilson, wanted to bring the sound of African American music to a wider audience.
Elvis Aaron Presley music career began in Nashville , Tennessee .   Elvis Aaron Presley producer , Sam Wilson , wanted to bring the sound of African American music to a wider audience .

His music career began in Miami.  His producer, Sam Samers, wanted to bring the sound of African American music to a wider audience.
Elvis Aaron Presley music career began in Miami .   Elvis Aaron Presley producer , Sam Samers , wanted to