In [1]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# model, tokenizer

In [2]:
additional_tokens = {'additional_special_tokens': ['[entity1]', '[entity2]', '[learn1]', '[learn2]', '[learn3]', '[learn4]', '[learn5]', '[learn6]']}

In [3]:
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
print_trainable_parameters(model)

trainable params: 124439808 || all params: 124439808 || trainable%: 100.0


In [5]:
num_added_toks = tokenizer.add_special_tokens(additional_tokens)

In [6]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50267, 768)

In [7]:
# save the tokenizer

tokenizer.save_pretrained('DocRED/GPT_w_ner[]/gpt2_tokenizer')

('DocRED/GPT_w_ner[]/gpt2_tokenizer/tokenizer_config.json',
 'DocRED/GPT_w_ner[]/gpt2_tokenizer/special_tokens_map.json',
 'DocRED/GPT_w_ner[]/gpt2_tokenizer/vocab.json',
 'DocRED/GPT_w_ner[]/gpt2_tokenizer/merges.txt',
 'DocRED/GPT_w_ner[]/gpt2_tokenizer/added_tokens.json')

# data

In [15]:
# load the json file from DocRED/data/test.json and DocRED/data/rel_info.json

import json

with open('DocRED/data/train_annotated.json') as f:
    train_set = json.load(f)


with open('DocRED/data/rel_info.json') as f:
    rel_info = json.load(f)

In [78]:
relation_dict = {
    'id': [],
    'text': [],
    'head': [],
    'tail': [],
    'head_first': [],
    'relation': [],
    'head_start_pos' : [],
    'tail_start_pos' : []
}

for i in range(len(train_set)):
    sents = ""
    for sent in train_set[i]['sents']:
        # flatten the sent list
        a = " ".join(sent)
        sents += a.lower() + " "
    # relation_dict['text'].append(sents)

    for relation_pair in train_set[i]['labels']:
        relation_dict['id'].append(i)
        relation_dict['text'].append(sents)
        head = []
        head_ = []
        head_start_pos = []
        head.append([[item['name'].lower()] for item in train_set[i]['vertexSet'][relation_pair['h']]])
        for j, item in enumerate(head[0]):
            if item not in head_:
                head_.append(item)
                head_start_pos.append(train_set[i]['vertexSet'][relation_pair['h']][j]['pos'][0])

        relation_dict['head'].append(head_)
        relation_dict['head_start_pos'].append(head_start_pos)

        tail = []
        tail_ = []
        tail_start_pos = []
        tail.append([[item['name'].lower()] for item in train_set[i]['vertexSet'][relation_pair['t']]])
        for j, item in enumerate(tail[0]):
            if item not in tail_:
                tail_.append(item)
                tail_start_pos.append(train_set[i]['vertexSet'][relation_pair['t']][j]['pos'][0])
        relation_dict['tail'].append(tail_)
        relation_dict['tail_start_pos'].append(tail_start_pos)

        
        if train_set[i]['vertexSet'][relation_pair['h']][0]['pos'][0] < train_set[i]['vertexSet'][relation_pair['t']][0]['pos'][0]:
            relation_dict['head_first'].append(1)
        else:
            relation_dict['head_first'].append(0)
        
        relation_dict['relation'].append(relation_pair['r'])
    # break


# save the relation_dict to a json file

with open('DocRED/data/DocRED_baseline_metadata/relation_dict.json', 'w') as f:
    json.dump(relation_dict, f)
        

In [16]:
ner = 1

In [17]:
import json



from datasets import Dataset

if ner:
    with open('DocRED/data/DocRED_baseline_metadata/relation_dict.json') as f:
        relation_dict = json.load(f)

    dataset = Dataset.from_dict(
        {
            'text': relation_dict['text'],
            'head': relation_dict['head'],
            'tail': relation_dict['tail'],
            'head_first': relation_dict['head_first'],
            'relation': relation_dict['relation']
        }
    )

else:
    with open('DocRED/data/DocRED_baseline_metadata/relation_dict_ner.json') as f:
        relation_dict = json.load(f)

    dataset = Dataset.from_dict(
        {
            'text': relation_dict['text'],
            'pair': relation_dict['pair'],
            'relation': relation_dict['relation']
        }
    )

In [18]:
dataset

Dataset({
    features: ['text', 'head', 'tail', 'head_first', 'relation'],
    num_rows: 38180
})

In [20]:
def pro_processing_ner(example, tokenizer, padding=True):
    texts = example['text']

    # special_tokens = [50259, 50260, 50261, 50262, 50263, 50264]

    output_texts = []

    text_ids = tokenizer(texts, add_special_tokens=False)['input_ids']

    for i in range(len(example['head'])):
        head = ""
        for item in example['head'][i]:
            head += item[0] + " ; "
        head = head[:-2]
        head += ". "

        tail = ""
        for item in example['tail'][i]:
            tail += item[0] + " ; "
        tail = tail[:-2]
        tail += ". "
        
        if example['head_first'][i] == 1:
            output_line = " [entity1] : " + head + "[entity2] : " + tail + "[learn1] [learn2] [learn3] [learn4] [learn5] [learn6] "
            output_line = output_line + f"the relation between source [entity1] and target [entity2] is {rel_info[example['relation'][i]]} . " + tokenizer.eos_token

        else:
            output_line = " [entity1] : " + tail + "[entity2] : " + head + "[learn1] [learn2] [learn3] [learn4] [learn5] [learn6] "
            output_line = output_line + f"the relation between source [entity2] and target [entity1] is {rel_info[example['relation'][i]]} . " + tokenizer.eos_token
        
        output_texts.append(output_line)


    output_ids = tokenizer(output_texts, add_special_tokens=False)['input_ids']

    # input_ids = []
    attention_mask = []

    count = 0
    for i, ids in enumerate(output_ids):
        if len(text_ids[i]) + len(ids) > 1024:
            text_ids[i] = text_ids[:1024 - len(ids)]
            count += 1
        text_ids[i] = text_ids[i] + output_ids[i]
        assert len(text_ids[i]) <= 1024
        attention_mask.append([1] * len(text_ids[i]) + [0] * (1024 - len(text_ids[i])))
    if count != 0:
        print(f"truncated {count} examples")

    if padding:
        for i, ids in enumerate(output_ids):
            output_ids[i] = ids + [tokenizer.pad_token_id] * (1024 - len(ids))
            text_ids[i] = text_ids[i] + [tokenizer.pad_token_id] * (1024 - len(text_ids[i]))

    return {
        'input_ids': text_ids,
        'attention_mask': attention_mask,
        }

In [21]:
tokenized_dataset = dataset.map(lambda example: pro_processing_ner(example, tokenizer), batched=True, remove_columns=['text', 'head', 'tail', 'head_first', 'relation'])

Map:   0%|          | 0/38180 [00:00<?, ? examples/s]

In [24]:
# save the datasets type tokenized_dataset

tokenized_dataset.save_to_disk('DocRED/data/DocRED_baseline_metadata/tokenized_dataset_w_ner_[]')


Saving the dataset (0/1 shards):   0%|          | 0/38180 [00:00<?, ? examples/s]

In [9]:
# load the tokenized_dataset

from datasets import load_from_disk

tokenized_dataset = load_from_disk('DocRED/data/DocRED_baseline_metadata/tokenized_dataset_w_ner_[]')

In [10]:
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [11]:
tokenizer.decode(tokenized_dataset[0]['input_ids'])

"zest airways, inc. operated as airasia zest ( formerly asian spirit and zest air ), was a low - cost airline based at the ninoy aquino international airport in pasay city, metro manila in the philippines. it operated scheduled domestic and international tourist services, mainly feeder services linking manila and cebu with 24 domestic destinations in support of the trunk route operations of other airlines. in 2013, the airline became an affiliate of philippines airasia operating their brand separately. its main base was ninoy aquino international airport, manila. the airline was founded as asian spirit, the first airline in the philippines to be run as a cooperative. on august 16, 2013, the civil aviation authority of the philippines ( caap ), the regulating body of the government of the republic of the philippines for civil aviation, suspended zest air flights until further notice because of safety issues. less than a year after airasia and zest air's strategic alliance, the airline h

# trainer

In [12]:
import wandb

wandb.init(
    # set the wandb project where this run will be logged
    project="GPT2-normal",
    # notes="PubmedBERT-FT-NER_w_NERin_10epochs",
    name="GPT2-DocRED-w-ner-[]-5epochs"
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m309439737[0m ([33mtian1995[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
import transformers
from transformers import DataCollatorForLanguageModeling

trainer = transformers.Trainer(
    model=model, 
    train_dataset=tokenized_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1, 
        gradient_accumulation_steps=1,
        warmup_steps=1000, 
        num_train_epochs=5,
        learning_rate=2e-4, 
        # fp16=True,
        logging_steps=100, 
        report_to="wandb",
        save_strategy="epoch",
        output_dir='DocRED/GPT_w_ner[]'
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [14]:
trainer.train()



  0%|          | 0/190900 [00:00<?, ?it/s]

{'loss': 31.3334, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 4.2714, 'learning_rate': 4e-05, 'epoch': 0.01}
{'loss': 3.5394, 'learning_rate': 6e-05, 'epoch': 0.01}
{'loss': 3.3138, 'learning_rate': 8e-05, 'epoch': 0.01}
{'loss': 3.2583, 'learning_rate': 0.0001, 'epoch': 0.01}
{'loss': 3.1792, 'learning_rate': 0.00012, 'epoch': 0.02}
{'loss': 3.1823, 'learning_rate': 0.00014, 'epoch': 0.02}
{'loss': 3.1748, 'learning_rate': 0.00016, 'epoch': 0.02}
{'loss': 3.1869, 'learning_rate': 0.00018, 'epoch': 0.02}
{'loss': 3.1112, 'learning_rate': 0.0002, 'epoch': 0.03}
{'loss': 3.1083, 'learning_rate': 0.0001998946814112691, 'epoch': 0.03}
{'loss': 3.0954, 'learning_rate': 0.0001997893628225382, 'epoch': 0.03}
{'loss': 3.0576, 'learning_rate': 0.00019968404423380726, 'epoch': 0.03}
{'loss': 2.9796, 'learning_rate': 0.00019957872564507637, 'epoch': 0.04}
{'loss': 2.9982, 'learning_rate': 0.00019947340705634545, 'epoch': 0.04}
{'loss': 3.0452, 'learning_rate': 0.00019936808846761453, 'epoch': 

TrainOutput(global_step=190900, training_loss=0.3814519633492479, metrics={'train_runtime': 46070.7, 'train_samples_per_second': 4.144, 'train_steps_per_second': 4.144, 'train_loss': 0.3814519633492479, 'epoch': 5.0})

In [15]:
wandb.finish()
trainer.save_model("DocRED/GPT_w_ner[]/model")

# save the tokenizer
# tokenizer.save_pretrained("DocRED/GPT_w_ner/tokenizer")

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▆▅▄▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,5.0
train/global_step,190900.0
train/learning_rate,0.0
train/loss,0.0436
train/total_flos,9.97612978176e+16
train/train_loss,0.38145
train/train_runtime,46070.7
train/train_samples_per_second,4.144
train/train_steps_per_second,4.144


# Inference

In [16]:
from transformers import AutoModelForCausalLM

checkpoint = "DocRED/GPT_w_ner[]/model"
# checkpoint = "DocRED/GPT_without_ner/model"

In [33]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("DocRED/GPT_w_ner[]/gpt2_tokenizer")
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [34]:
import torch

model.eval()
model.to("cpu")
inputs = tokenizer("Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. Label :", return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])



Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. Label : or get rid of a bottle from a public transport


test data

In [24]:
# load the json file from DocRED/data/test.json and DocRED/data/rel_info.json

import json

with open('DocRED/data/dev.json') as f:
    test_set = json.load(f)


with open('DocRED/data/rel_info.json') as f:
    rel_info = json.load(f)

In [54]:
# with ner
from tqdm.notebook import trange, tqdm

relation_dict = {
    'id': [],
    'text': [],
    'head': [],
    'tail': [],
    'head_first': [],
    'relation': [],
    'head_start_pos' : [],
    'tail_start_pos' : []
}

for i in tqdm(range(len(test_set))):
    sents = ""
    for sent in test_set[i]['sents']:
        # flatten the sent list
        a = " ".join(sent)
        sents += a.lower() + " "
    # relation_dict['text'].append(sents)

    for relation_pair in test_set[i]['labels']:
        relation_dict['id'].append(i)
        relation_dict['text'].append(sents)
        head = []
        head_ = []
        head_start_pos = []
        head.append([[item['name'].lower()] for item in test_set[i]['vertexSet'][relation_pair['h']]])
        for j, item in enumerate(head[0]):
            if item not in head_:
                head_.append(item)
                head_start_pos.append(test_set[i]['vertexSet'][relation_pair['h']][j]['pos'][0])

        relation_dict['head'].append(head_)
        relation_dict['head_start_pos'].append(head_start_pos)

        tail = []
        tail_ = []
        tail_start_pos = []
        tail.append([[item['name'].lower()] for item in test_set[i]['vertexSet'][relation_pair['t']]])
        for j, item in enumerate(tail[0]):
            if item not in tail_:
                tail_.append(item)
                tail_start_pos.append(test_set[i]['vertexSet'][relation_pair['t']][j]['pos'][0])
        relation_dict['tail'].append(tail_)
        relation_dict['tail_start_pos'].append(tail_start_pos)

        
        if test_set[i]['vertexSet'][relation_pair['h']][0]['pos'][0] < test_set[i]['vertexSet'][relation_pair['t']][0]['pos'][0]:
            relation_dict['head_first'].append(1)
        else:
            relation_dict['head_first'].append(0)
        
        relation_dict['relation'].append(relation_pair['r'])
    # break


# save the relation_dict to a json file

with open('DocRED/data/DocRED_baseline_metadata/dev_relation_dict.json', 'w') as f:
    json.dump(relation_dict, f)

  0%|          | 0/998 [00:00<?, ?it/s]

In [19]:
ner = 1

In [20]:
import json



from datasets import Dataset

if ner:
    with open('DocRED/data/DocRED_baseline_metadata/dev_relation_dict.json') as f:
        relation_dict = json.load(f)

    dataset = Dataset.from_dict(
        {
            'text': relation_dict['text'],
            'head': relation_dict['head'],
            'tail': relation_dict['tail'],
            'head_first': relation_dict['head_first'],
            'relation': relation_dict['relation']
        }
    )

else:
    with open('DocRED/data/DocRED_baseline_metadata/dev_relation_dict_without_ner.json') as f:
        relation_dict = json.load(f)

    dataset = Dataset.from_dict(
        {
            'text': relation_dict['text'],
            'pair': relation_dict['pair'],
            'relation': relation_dict['relation']
        }
    )

In [26]:
# load the rel_info

with open('DocRED/data/rel_info.json') as f:
    rel_info = json.load(f)

rel2id = {v: k for k, v in rel_info.items()}

In [25]:
n = 45
print(rel_info[dataset[n]['relation']])
dataset[n]

country of citizenship


{'text': 'conrad oberon johnson ( november 15 , 1915 – february 3 , 2008 ) was an american music educator , long associated with the city of houston , who was inducted into the texas bandmasters hall of fame in 2000 . born in victoria , texas , conrad johnson was nine when his family moved to houston . following studies at yates high school , he attended houston college for negroes and graduated from wiley college . he was an active member of omega psi phi fraternity . he started his career in music education in 1941 and , following a thirty - seven - year career , retired from his position at kashmere high school in 1978 , but continued to remain active in shaping music in houston by conducting summer programs and in - home tutoring . johnson was a proficient musician in his own right and , at one point , played with count basie . erskine hawkins tried to convince him to join his orchestra , but johnson declined , citing a love of teaching and obligations to his family . later , johns

In [40]:
def pro_processing_ner_infer(example, tokenizer):
    texts = example['text']

    # special_tokens = [50259, 50260, 50261, 50262, 50263, 50264]

    output_texts = []

    text_ids = tokenizer(texts, add_special_tokens=False)['input_ids']

    for i in range(len(example['head'])):
        head = ""
        for item in example['head'][i]:
            head += item[0] + " ; "
        head = head[:-2]
        head += ". "

        tail = ""
        for item in example['tail'][i]:
            tail += item[0] + " ; "
        tail = tail[:-2]
        tail += ". "
        
        if example['head_first'][i] == 1:
            output_line = "[entity1] : " + head + "[entity2] : " + tail + "[learn1] [learn2] [learn3] [learn4] [learn5] [learn6] "
            # output_line = output_line + f"the relation between source entity 1 and target entity 2 is {rel_info[example['relation'][i]]} . " + tokenizer.eos_token

        else:
            output_line = "[entity1] : " + tail + "[entity2] : " + head + "[learn1] [learn2] [learn3] [learn4] [learn5] [learn6] "
            # output_line = output_line + f"the relation between source entity 2 and target entity 1 is {rel_info[example['relation'][i]]} . " + tokenizer.eos_token
        
        output_texts.append(output_line)


    output_ids = tokenizer(output_texts, add_special_tokens=False)['input_ids']

    # input_ids = []
    # attention_mask = []

    count = 0
    for i, ids in enumerate(output_ids):
        if len(text_ids[i]) + len(ids) > 1024:
            text_ids[i] = text_ids[:1024 - len(ids)]
            count += 1
        text_ids[i] = text_ids[i] + output_ids[i]
        assert len(text_ids[i]) <= 1024
        # attention_mask.append([1] * len(text_ids[i]) + [0] * (1024 - len(text_ids[i])))
    if count != 0:
        print(f"truncated {count} examples")

    # if padding:
    #     for i, ids in enumerate(output_ids):
    #         output_ids[i] = ids + [tokenizer.pad_token_id] * (1024 - len(ids))
    #         text_ids[i] = text_ids[i] + [tokenizer.pad_token_id] * (1024 - len(text_ids[i]))

    return {
        'input_ids': text_ids,
        # 'attention_mask': attention_mask,
        }


In [24]:
dataset

Dataset({
    features: ['text', 'head', 'tail', 'head_first', 'relation'],
    num_rows: 12275
})

In [41]:
# tokenized_dataset = dataset.map(lambda example: pro_processing_without_ner_infer(example, tokenizer), batched=True, remove_columns=['text', 'pair', 'relation'])
tokenized_dataset = dataset.map(lambda example: pro_processing_ner_infer(example, tokenizer), batched=True, remove_columns=['text', 'head', 'tail', 'head_first', 'relation'])

Map:   0%|          | 0/12275 [00:00<?, ? examples/s]

In [42]:
# # save tokenizer_dataset

# tokenized_dataset.save_to_disk('DocRED/data/DocRED_baseline_metadata/dev_tokenized_dataset_dev_w_ner_[]')

Saving the dataset (0/1 shards):   0%|          | 0/12275 [00:00<?, ? examples/s]

In [43]:
from datasets import load_from_disk

tokenized_dataset = load_from_disk('DocRED/data/DocRED_baseline_metadata/dev_tokenized_dataset_dev_w_ner_[]')
# tokenized_dataset = load_from_disk('DocRED/data/DocRED_baseline_metadata/dev_tokenized_dataset_without_ner')

In [44]:
tokenized_dataset.set_format(type='torch', columns=['input_ids'])

In [45]:
tokenizer.decode(tokenized_dataset[45]['input_ids'])

'conrad oberon johnson ( november 15, 1915 – february 3, 2008 ) was an american music educator, long associated with the city of houston, who was inducted into the texas bandmasters hall of fame in 2000. born in victoria, texas, conrad johnson was nine when his family moved to houston. following studies at yates high school, he attended houston college for negroes and graduated from wiley college. he was an active member of omega psi phi fraternity. he started his career in music education in 1941 and, following a thirty - seven - year career, retired from his position at kashmere high school in 1978, but continued to remain active in shaping music in houston by conducting summer programs and in - home tutoring. johnson was a proficient musician in his own right and, at one point, played with count basie. erskine hawkins tried to convince him to join his orchestra, but johnson declined, citing a love of teaching and obligations to his family. later, johnson made his lasting contributio

inference

In [47]:
# load the rel_info

with open('DocRED/data/rel_info.json') as f:
    rel_info = json.load(f)

rel2id = {v: k for k, v in rel_info.items()}

In [49]:
from tqdm.notebook import trange, tqdm
import torch

model.eval()
outputs = []
model.to("cuda")
with torch.no_grad():
    for i in tqdm(range(len(tokenized_dataset))):
    # for i in range(1):
        output = model.generate(input_ids=tokenized_dataset["input_ids"][i].unsqueeze(0).to("cuda"), max_new_tokens=100, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
        output_text = tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=False)[0]
        try:
            outputs.append(output_text.split("[learn1] [learn2] [learn3] [learn4] [learn5] [learn6]")[1].strip())
        except:
            outputs.append(output_text.split("[learn1][learn2][learn3][learn4][learn5][learn6]")[1].strip())
        if i % 100 == 0:
            print(outputs[-1])

    # print(tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=False)[0])

  0%|          | 0/12275 [00:00<?, ?it/s]

the relation between source [entity1] and target [entity2] is country. <|endoftext|>
the relation between source [entity1] and target [entity2] is country. <|endoftext|>
the relation between source [entity1] and target [entity2] is contains administrative territorial entity. <|endoftext|>
the relation between source [entity2] and target [entity1] is country of citizenship. <|endoftext|>
the relation between source [entity2] and target [entity1] is located in the administrative territorial entity. <|endoftext|>
the relation between source [entity1] and target [entity2] is inception. <|endoftext|>
the relation between source [entity2] and target [entity1] is located in the administrative territorial entity. <|endoftext|>
the relation between source [entity2] and target [entity1] is mother. <|endoftext|>
the relation between source [entity1] and target [entity2] is country. <|endoftext|>
the relation between source [entity1] and target [entity2] is inception. <|endoftext|>
the relation be

In [52]:
outputs[45]

'the relation between source [entity1] and target [entity2] is country of citizenship. <|endoftext|>'

In [53]:
dataset[45]

{'text': 'conrad oberon johnson ( november 15 , 1915 – february 3 , 2008 ) was an american music educator , long associated with the city of houston , who was inducted into the texas bandmasters hall of fame in 2000 . born in victoria , texas , conrad johnson was nine when his family moved to houston . following studies at yates high school , he attended houston college for negroes and graduated from wiley college . he was an active member of omega psi phi fraternity . he started his career in music education in 1941 and , following a thirty - seven - year career , retired from his position at kashmere high school in 1978 , but continued to remain active in shaping music in houston by conducting summer programs and in - home tutoring . johnson was a proficient musician in his own right and , at one point , played with count basie . erskine hawkins tried to convince him to join his orchestra , but johnson declined , citing a love of teaching and obligations to his family . later , johns

In [54]:
rel_info['P27']

'country of citizenship'

In [55]:
# post processing for the outputs w ner
# (source, target, relation)
# (2, 1, relation)
pairs = []
count = 0
for output in outputs:
    # if the output doesn't end with "<|endoftext|>", find the lastest ";" of the output and only take the previous part
    try:
        source = output.split("between source [entity")[1].strip()
        source = source.split("] and target ")[0].strip()

        target = output.split(" and target [entity")[1].strip()
        target = target.split("] is ")[0].strip()

        relation = output.split(" is ")[-1].strip()
        relation = relation.split(". <|endoftext|>")[0].strip()

        try:
            relation = rel2id[relation]
        except:
            count += 1
            pass

        pairs.append((source, target, relation))
    except:
        pairs.append(("none", "none", "none"))

print(f"{count} / {len(outputs)}")

2 / 12275


In [56]:
outputs[45]

'the relation between source [entity1] and target [entity2] is country of citizenship. <|endoftext|>'

In [57]:
result = {
    "output": [],
    "label": []
}

for i, output in enumerate(pairs):
    result['output'].append(output)
    if dataset[i]['head_first'] == 1:
        result['label'].append(('1', '2', dataset[i]['relation']))
    else:
        result['label'].append(('2', '1', dataset[i]['relation']))

In [58]:
# save the result dictionary
# import pickle
# with open("DocRED/GPT_w_ner[]/result/epoch_5_result.pkl", "wb") as f:
#     pickle.dump(result, f)

In [59]:
result['label'][48]

('2', '1', 'P17')

In [60]:
result['output'][48]

('2', '1', 'P17')

# Analysis

w ner

In [61]:
import pickle
with open("DocRED/GPT_w_ner[]/result/epoch_5_result.pkl", "rb") as f:
    result = pickle.load(f)

In [62]:
print(f'the length: {len(result["output"])}, {len(result["label"])}')
print(f'instance:\n{result["output"][0]}\n{result["label"][0]}')

the length: 12275, 12275
instance:
('1', '2', 'P17')
('1', '2', 'P17')


In [63]:
# source and target, relation
st_tp = 0
st_fp = 0
st_fn = 0
st_tn = 0

r_tp = 0
r_fp = 0
r_fn = 0
r_tn = 0

tuple_tp = 0
tuple_fp = 0  
tuple_fn = 0
tuple_tn = 0


for output, label in zip(result['output'], result['label']):
    pair = False
    relation = False
    if output[0] == label[0] and output[1] == label[1]:
        st_tp += 1
        pair = True
    else:
        st_fn += 1
        st_fp += 1
    
    if output[2] == label[2]:
        r_tp += 1
        relation = True
    else:
        r_fn += 1
        r_fp += 1

    if pair and relation:
        tuple_tp += 1
    else:
        tuple_fn += 1
        tuple_fp += 1

In [64]:
# calculate the precision, recall and f1 score

# for source and target
st_precision = st_tp / (st_tp + st_fp)
st_recall = st_tp / (st_tp + st_fn)
st_f1 = 2 * st_precision * st_recall / (st_precision + st_recall)
print(f"source and target precision: {st_precision}, recall: {st_recall}, f1: {st_f1}")

# for relation
r_precision = r_tp / (r_tp + r_fp)
r_recall = r_tp / (r_tp + r_fn)
r_f1 = 2 * r_precision * r_recall / (r_precision + r_recall)
print(f"relation precision: {r_precision}, recall: {r_recall}, f1: {r_f1}")

# for tuple
tuple_precision = tuple_tp / (tuple_tp + tuple_fp)
tuple_recall = tuple_tp / (tuple_tp + tuple_fn)
tuple_f1 = 2 * tuple_precision * tuple_recall / (tuple_precision + tuple_recall)
print(f"tuple precision: {tuple_precision}, recall: {tuple_recall}, f1: {tuple_f1}")

source and target precision: 0.8537678207739308, recall: 0.8537678207739308, f1: 0.8537678207739308
relation precision: 0.7141344195519348, recall: 0.7141344195519348, f1: 0.7141344195519348
tuple precision: 0.6980855397148676, recall: 0.6980855397148676, f1: 0.6980855397148676
