In [2]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# model, tokenizer

In [1]:
additional_tokens = {'additional_special_tokens': ['[learn1]', '[learn2]', '[learn3]', '[learn4]', '[learn5]', '[learn6]']}

In [3]:
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
print_trainable_parameters(model)

trainable params: 124439808 || all params: 124439808 || trainable%: 100.0


In [5]:
num_added_toks = tokenizer.add_special_tokens(additional_tokens)

In [6]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50265, 768)

In [7]:
# save the tokenizer

tokenizer.save_pretrained('DocRED/GPT_w_ner_short_relation_onlyRC/gpt2_tokenizer')

('DocRED/GPT_w_ner_short_relation_onlyRC/gpt2_tokenizer/tokenizer_config.json',
 'DocRED/GPT_w_ner_short_relation_onlyRC/gpt2_tokenizer/special_tokens_map.json',
 'DocRED/GPT_w_ner_short_relation_onlyRC/gpt2_tokenizer/vocab.json',
 'DocRED/GPT_w_ner_short_relation_onlyRC/gpt2_tokenizer/merges.txt',
 'DocRED/GPT_w_ner_short_relation_onlyRC/gpt2_tokenizer/added_tokens.json')

# data

In [8]:
# load the json file from DocRED/data/test.json and DocRED/data/rel_info.json

import json

with open('DocRED/data/train_annotated.json') as f:
    train_set = json.load(f)


with open('DocRED/data/rel_info.json') as f:
    rel_info = json.load(f)

In [9]:
train_set[0]

{'vertexSet': [[{'pos': [0, 4],
    'type': 'ORG',
    'sent_id': 0,
    'name': 'Zest Airways, Inc.'},
   {'sent_id': 0,
    'type': 'ORG',
    'pos': [10, 15],
    'name': 'Asian Spirit and Zest Air'},
   {'name': 'AirAsia Zest', 'pos': [6, 8], 'sent_id': 0, 'type': 'ORG'},
   {'name': 'AirAsia Zest', 'pos': [19, 21], 'sent_id': 6, 'type': 'ORG'}],
  [{'name': 'Ninoy Aquino International Airport',
    'pos': [4, 8],
    'sent_id': 3,
    'type': 'LOC'},
   {'name': 'Ninoy Aquino International Airport',
    'pos': [26, 30],
    'sent_id': 0,
    'type': 'LOC'}],
  [{'name': 'Pasay City', 'pos': [31, 33], 'sent_id': 0, 'type': 'LOC'}],
  [{'name': 'Metro Manila', 'pos': [34, 36], 'sent_id': 0, 'type': 'LOC'}],
  [{'name': 'Philippines', 'pos': [38, 39], 'sent_id': 0, 'type': 'LOC'},
   {'name': 'Philippines', 'pos': [13, 14], 'sent_id': 4, 'type': 'LOC'},
   {'sent_id': 5,
    'type': 'LOC',
    'pos': [25, 29],
    'name': 'Republic of the Philippines'}],
  [{'name': 'Manila', 'pos': 

In [10]:
"""
the names of vertextSet can be the same, but the pos should be different
structure:
'vertexSet': 
    [
        (for the same entity but in different synonyms and different sentences)
        [
            {
                'pos':[start, end],
                'type': 'NER',
                'sent_id': 0,
                'name': 'string',
            },
            {}
        ],
        [entity-2]
    ]
'labels':
    [
        {
            'r': 'Pxx',
            'h': 0,
            't': 1,
            'evidence': [2, 3, 4],
        },
        {}
    ]
'title': 'string',
'sents':
    [
        ['word0', 'word1',]
        ['word0', 'word1',]
    ]
"""

"\nthe names of vertextSet can be the same, but the pos should be different\nstructure:\n'vertexSet': \n    [\n        {\n            'pos':[start, end],\n            'type': 'NER',\n            'sent_id': 0,\n            'name': 'string',\n        },\n        {}\n    ]\n'labels':\n    [\n        {\n            'r': 'Pxx',\n            'h': 0,\n            't': 1,\n            'evidence': [2, 3, 4],\n        },\n        {}\n    ]\n'title': 'string',\n'sents':\n    [\n        ['word0', 'word1',]\n        ['word0', 'word1',]\n    ]\n"

In [89]:
with open('DocRED/data/ner_info.json') as f:
    ner_info = json.load(f)

with open('DocRED/data/rel_info.json') as f:
    relation_info = json.load(f)

In [12]:
"""# doc-level is too long for gpt-2, so we need to split the doc-level into bi-sent-level

relation_dict = {
    'id': [],
    'text':[],
    'entity': [],
    'relation': []
}

for i in range(len(train_set)):
    # id
    relation_dict['id'].append(i)

    # text
    sents = ""
    for sent in train_set[i]['sents']:
        # flatten the sent list
        a = " ".join(sent)
        sents += a.lower() + " "
    # if there are space, delete the first and last space of the sents
    sents = sents.strip()
    # delete double space in the sents
    sents = sents.replace("  ", " ")
    relation_dict['text'].append(sents)
    del sents

    # entity
    entity = []
    entity_list = []
    entity_flat = {}
    entity_count = 0
    for sent_item in train_set[i]['vertexSet']:
        for item in sent_item:
            entity_item = []
            if item['name'].lower() not in entity_list:
                entity_list.append(item['name'].lower().strip())
                entity_item.append(item['name'].lower().strip())
                entity_item.append(ner_info[item['type']])

                entity.append(entity_item)
            
            # add the entity_flat
            entity_flat[entity_count] = item['name'].lower().strip()
            entity_count += 1

    # release the entity_list and entity_item
    del entity_item
    del entity_count
        

    # relation pairs
    relation_pairs = {}
    for relation_item in train_set[i]['labels']:
        pair = []
        head = entity_flat[relation_item['h']]
        tail = entity_flat[relation_item['t']]
        pair.append(head)
        pair.append(tail)

        relation  = relation_info[relation_item['r']]
        if relation not in relation_pairs.keys():
            relation_pairs[relation] = []

        relation_pairs[relation].append(pair)
    del pair
    del head
    del tail

    # add the entity and relation pairs to the relation_dict
    relation_dict['entity'].append(entity)
    relation_dict['relation'].append(relation_pairs)
    break


# save the relation_dict to a json file

# with open('DocRED/data/DocRED_baseline_metadata/relation_dict.json', 'w') as f:
#     json.dump(relation_dict, f)"""

In [91]:
relation_dict = {
    'text':[],
    'entity': [],
    'relation': []
}
id_count = 0

for i in range(len(train_set)):
    

    # text
    sent_pairs = []
    for sent_index, sent in enumerate(train_set[i]['sents']):
        sents = ""

        # flatten the sent list
        a = " ".join(sent)
        sents += a.lower() + " "

        # and the next_sent if it exists
        try:
            next_sent = train_set[i]['sents'][sent_index + 1]
            b = " ".join(next_sent)
            sents += b.lower() + " "
        except:
            pass
        # post process the sents for some spaces
        sents = sents.strip()
        sents = sents.replace("  ", " ")

        relation_dict['text'].append(sents)
            
    del sents


    # entity
    entity = []
    for index in range(len(train_set[i]['sents'])):
        # focus on the current sent and the next sent if it exists
        if index + 1 < len(train_set[i]['sents']):
            next_index = index + 1
        else:
            next_index = index
        # group the entities for every 2 sents, no repeated entities in one group
        c_sent_entity_lists = []
        next_sent_entity_lists = []
        entity_for_each_2_sents = []

        for entity_spans in train_set[i]['vertexSet']:
            for item in entity_spans:
                entity_item = []
                # if neither in the current sent nor in the next sent, continue
                if item['sent_id'] != index and item['sent_id'] != next_index:
                    continue
                # also store the first pos of the entity in the entity_item
                # it will look like this: [[entity_name, sent_index, pos1, ner_type]]
                entity_item = [item['name'].lower().strip(), item['sent_id'], item['pos'][0], ner_info[item['type']]]

                if entity_item[1] == index:
                    c_sent_entity_lists.append(entity_item)
                else:
                    next_sent_entity_lists.append(entity_item)

        # sort the c_sent_entity_lists and next_sent_entity_lists by the pos in ascending order
        c_sent_entity_lists.sort(key=lambda x: x[2])
        if index != next_index:
            next_sent_entity_lists.sort(key=lambda x: x[2])
        
        entity_list = []
        for item in c_sent_entity_lists:
            if item[0] not in entity_list:
                entity_list.append(item[0])
                entity_for_each_2_sents.append([item[0], item[3]])
            
        if index != next_index:
            for item in next_sent_entity_lists:
                if item[0] not in entity_list:
                    entity_list.append(item[0])
                    entity_for_each_2_sents.append([item[0], item[3]])

        relation_dict['entity'].append(entity_for_each_2_sents)

    del entity_item
    del c_sent_entity_lists
    del next_sent_entity_lists
    del entity_for_each_2_sents
        

    # relation pairs
    relation_pairs = []

    for index in range(len(train_set[i]['sents'])):
        relation_pairs_for_each_2_sents = {}
        # focus on the current sent and the next sent if it exists
        if index + 1 < len(train_set[i]['vertexSet']):
            next_index = index + 1
        else:
            next_index = index

        # heads, tails: ['entity_name', start_pos]
        for relation_item in train_set[i]['labels']:
            heads = []
            tails = []
            
            # head
            head_exist = False
            for head_span in train_set[i]['vertexSet'][relation_item['h']]:
                if head_span['sent_id'] == index or head_span['sent_id'] == next_index:
                    heads.append([head_span['name'].lower().strip(), head_span['pos'][0]])
                    head_exist = True
            if not head_exist:
                continue
    
            # tail
            tail_exist = False
            for tail_span in train_set[i]['vertexSet'][relation_item['t']]:
                if tail_span['sent_id'] == index or tail_span['sent_id'] == next_index:
                    tails.append([tail_span['name'].lower().strip(), tail_span['pos'][0]])
                    tail_exist = True
            if not tail_exist:
                continue
            

            if relation_info[relation_item['r']] not in relation_pairs_for_each_2_sents.keys():
                relation_pairs_for_each_2_sents[relation_info[relation_item['r']]] = []
            for head in heads:
                for tail in tails:
                    relation_pairs_for_each_2_sents[relation_info[relation_item['r']]].append([head[0], tail[0]])

        relation_dict['relation'].append(relation_pairs_for_each_2_sents)


    
    # break


# save the relation_dict to a json file

# with open('DocRED/data/bi-sent-pre-process.json', 'w') as f:
    # json.dump(relation_dict, f)

In [65]:
len(relation_dict['text']) == len(relation_dict['entity']) == len(relation_dict['relation'])

True

In [98]:
relation_dict['relation'][0]

{'headquarters location': [['zest airways, inc.', 'pasay city'],
  ['asian spirit and zest air', 'pasay city'],
  ['airasia zest', 'pasay city']],
 'country': [['zest airways, inc.', 'philippines'],
  ['asian spirit and zest air', 'philippines'],
  ['airasia zest', 'philippines'],
  ['pasay city', 'philippines'],
  ['manila', 'philippines'],
  ['metro manila', 'philippines'],
  ['ninoy aquino international airport', 'philippines']],
 'located in the administrative territorial entity': [['pasay city',
   'metro manila'],
  ['metro manila', 'philippines'],
  ['ninoy aquino international airport', 'pasay city']],
 'contains administrative territorial entity': [['philippines',
   'metro manila'],
  ['metro manila', 'pasay city']]}

In [9]:
import json

ner = 1
from datasets import Dataset

relation_dict = {}
if ner:
    with open('DocRED/data/bi-sent-pre-process.json') as f:
        relation_dict = json.load(f)

    dataset = Dataset.from_dict(
        {
            'text': relation_dict['text'],
            'entity': relation_dict['entity'],
            'relation': relation_dict['relation']
        }
    )

else:
    pass

In [11]:
dataset

Dataset({
    features: ['text', 'entity', 'relation'],
    num_rows: 24256
})

In [10]:
dataset[0]

{'text': 'zest airways , inc. operated as airasia zest ( formerly asian spirit and zest air ) , was a low - cost airline based at the ninoy aquino international airport in pasay city , metro manila in the philippines . it operated scheduled domestic and international tourist services , mainly feeder services linking manila and cebu with 24 domestic destinations in support of the trunk route operations of other airlines .',
 'entity': [['zest airways, inc.', 'organization'],
  ['airasia zest', 'organization'],
  ['asian spirit and zest air', 'organization'],
  ['ninoy aquino international airport', 'location'],
  ['pasay city', 'location'],
  ['metro manila', 'location'],
  ['philippines', 'location'],
  ['manila', 'location'],
  ['cebu', 'location'],
  ['24', 'number']],
 'relation': {'applies to jurisdiction': None,
  'author': None,
  'award received': None,
  'basin country': None,
  'capital': None,
  'capital of': None,
  'cast member': None,
  'chairperson': None,
  'characters':

In [12]:
len(dataset['entity'][0])

10

In [13]:
"""relation_info_dict = {}
for id, relation in enumerate(dataset[0]['relation'].keys()):
    relation_info_dict[relation] = id

with open('DocRED/data/relation-index.json', 'w') as f:
    json.dump(relation_info_dict, f)"""

with open('DocRED/data/relation-index.json') as f:
    relation_info_dict = json.load(f)

In [14]:
def pro_processing_ner(example, tokenizer, padding=True):
    texts = example['text']
    input_texts = []
    for index in range(len(texts)):
        # entity extraction and NER
        text = texts[index].lower().strip() + " [learn1] [learn2]"
        for entity in example['entity'][index]:
            text = text + " entity : " + entity[0] + " , type : " + entity[1] + " ;"
        text = text[:-1] + "."
        # print("1")
        # add relation classificaiton
        text = text.lower().strip() + " [learn3] [learn4]"
        for relation_type, relation_pair in example['relation'][index].items():
            if relation_pair:
                text_w_relation = text + " for the relation " + relation_type + " : 1 ." + tokenizer.eos_token

            else:
                text_w_relation = text + " for the relation " + relation_type + " : 0 ." + tokenizer.eos_token
            
            input_texts.append(text_w_relation)

    return {
        'input_ids': input_texts
        }

In [15]:
# feed the dataset:dataset to the pro_processing_ner() function with tokenizer, at each time, we feed 30 examples to the function, and then save the output to a json file
# each time the return of the function is a dict, we need to save the dict to a list, and then save the list to a json file

import json
from tqdm import tqdm


output = {"input_texts": []}

for i in tqdm(range(0, len(dataset), 30)):
    result = pro_processing_ner(dataset[i:i+30], tokenizer)
    output["input_texts"].extend(result["input_ids"])

100%|██████████| 809/809 [00:14<00:00, 54.76it/s]


In [16]:
len(output['input_texts'])

2328576

In [17]:
# make the output["input_texts"] into a dataset
from datasets import Dataset

input_text_dataset = Dataset.from_dict(
    {
        'input_texts': output['input_texts'],
    }
)


In [18]:
tokenized_dataset = input_text_dataset.map(lambda example: tokenizer(example['input_texts'], padding='max_length', truncation=True, max_length=1024, pad_to_max_length=True), batched=True)

Map:   0%|          | 0/2328576 [00:00<?, ? examples/s]

In [19]:
# remove the column of input_texts in the tokenized_dataset
tokenized_dataset.remove_columns('input_texts')

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2328576
})

In [20]:
# save the tokenized_dataset

tokenized_dataset.save_to_disk('DocRED/GPT_w_ner_short_relation_onlyRC/train_data_ner_short_relation_onlyRC')
# with open('DocRED/data/train_ner_short_relation.json', 'w') as f:
#     json.dump(tokenized_dataset, f)

Saving the dataset (0/27 shards):   0%|          | 0/2328576 [00:00<?, ? examples/s]

In [21]:
from datasets import Dataset

tokenized_dataset = Dataset.load_from_disk('DocRED/GPT_w_ner_short_relation_onlyRC/train_data_ner_short_relation_onlyRC')
tokenized_dataset = tokenized_dataset.remove_columns('input_texts')

In [26]:
# only take the first len(tokenized_dataset) // 50 examples to train the model

tokenized_dataset = tokenized_dataset.select(range(len(tokenized_dataset) // 50))

In [27]:
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [28]:
tokenizer.decode(tokenized_dataset[66]['input_ids'])

'zest airways, inc. operated as airasia zest ( formerly asian spirit and zest air ), was a low - cost airline based at the ninoy aquino international airport in pasay city, metro manila in the philippines. it operated scheduled domestic and international tourist services, mainly feeder services linking manila and cebu with 24 domestic destinations in support of the trunk route operations of other airlines. [learn1] [learn2] entity : zest airways, inc., type : organization ; entity : airasia zest, type : organization ; entity : asian spirit and zest air, type : organization ; entity : ninoy aquino international airport, type : location ; entity : pasay city, type : location ; entity : metro manila, type : location ; entity : philippines, type : location ; entity : manila, type : location ; entity : cebu, type : location ; entity : 24, type : number. [learn3] [learn4] for the relation participant of : 0.<|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|>

In [29]:
tokenized_dataset.__getitems__([1,4])

[{'input_ids': tensor([   89,   395,  1633,  ..., 50258, 50258, 50258]),
  'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])},
 {'input_ids': tensor([   89,   395,  1633,  ..., 50258, 50258, 50258]),
  'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}]

# trainer

In [30]:
import wandb

wandb.init(
    # set the wandb project where this run will be logged
    project="GPT2-intermediate",
    # notes="PubmedBERT-FT-NER_w_NERin_10epochs",
    name="GPT2-short_relation_onlyRC_DocRED-w-ner-5epochs"
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m309439737[0m ([33mtian1995[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [31]:
import transformers
from transformers import DataCollatorForLanguageModeling

trainer = transformers.Trainer(
    model=model, 
    train_dataset=tokenized_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2, 
        gradient_accumulation_steps=2,
        warmup_steps=1000, 
        num_train_epochs=5,
        learning_rate=2e-4, 
        # fp16=True,
        logging_steps=100, 
        report_to="wandb",
        save_strategy="epoch",
        output_dir='DocRED/GPT_w_ner_short_relation_onlyRC'
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [32]:
trainer.train()



  0%|          | 0/1165 [00:00<?, ?it/s]

{'loss': 24.2481, 'learning_rate': 2e-05, 'epoch': 0.43}
{'loss': 1.1111, 'learning_rate': 4e-05, 'epoch': 0.86}
{'loss': 0.1935, 'learning_rate': 6e-05, 'epoch': 1.29}
{'loss': 0.1151, 'learning_rate': 8e-05, 'epoch': 1.72}
{'loss': 0.0929, 'learning_rate': 0.0001, 'epoch': 2.15}
{'loss': 0.0797, 'learning_rate': 0.00012, 'epoch': 2.58}
{'loss': 0.0785, 'learning_rate': 0.00014, 'epoch': 3.0}
{'loss': 0.0747, 'learning_rate': 0.00016, 'epoch': 3.43}
{'loss': 0.0629, 'learning_rate': 0.00018, 'epoch': 3.86}
{'loss': 0.0676, 'learning_rate': 0.0002, 'epoch': 4.29}
{'loss': 0.0568, 'learning_rate': 7.878787878787879e-05, 'epoch': 4.72}
{'train_runtime': 996.1626, 'train_samples_per_second': 4.673, 'train_steps_per_second': 1.169, 'train_loss': 2.25011079792301, 'epoch': 5.0}


TrainOutput(global_step=1165, training_loss=2.25011079792301, metrics={'train_runtime': 996.1626, 'train_samples_per_second': 4.673, 'train_steps_per_second': 1.169, 'train_loss': 2.25011079792301, 'epoch': 5.0})

In [33]:
wandb.finish()
trainer.save_model("DocRED/GPT_w_ner_short_relation_onlyRC/model")

# save the tokenizer
tokenizer.save_pretrained("DocRED/GPT_w_ner_short_relation_onlyRC/tokenizer")

0,1
train/epoch,▁▂▂▃▄▄▅▆▆▇██
train/global_step,▁▂▂▃▄▄▅▆▆▇██
train/learning_rate,▁▂▃▃▄▅▆▆▇█▃
train/loss,█▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,5.0
train/global_step,1165.0
train/learning_rate,8e-05
train/loss,0.0568
train/total_flos,2432628817920000.0
train/train_loss,2.25011
train/train_runtime,996.1626
train/train_samples_per_second,4.673
train/train_steps_per_second,1.169


('DocRED/GPT_w_ner_short_relation_onlyRC/tokenizer/tokenizer_config.json',
 'DocRED/GPT_w_ner_short_relation_onlyRC/tokenizer/special_tokens_map.json',
 'DocRED/GPT_w_ner_short_relation_onlyRC/tokenizer/vocab.json',
 'DocRED/GPT_w_ner_short_relation_onlyRC/tokenizer/merges.txt',
 'DocRED/GPT_w_ner_short_relation_onlyRC/tokenizer/added_tokens.json')

# Inference

In [2]:
from transformers import AutoModelForCausalLM

checkpoint = "DocRED/GPT_w_ner_short_relation_onlyRC/model"

In [3]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("DocRED/GPT_w_ner_short_relation/tokenizer")
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [4]:
# output all of the special tokens in the tokenizer
tokenizer.all_special_tokens

['<|startoftext|>',
 '<|endoftext|>',
 '<|pad|>',
 '[learn1]',
 '[learn2]',
 '[learn3]',
 '[learn4]',
 '[learn5]',
 '[learn6]']

In [37]:
import torch

model.eval()
model.to("cpu")
# inputs = tokenizer("Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. [learn1] [learn2] entity :", return_tensors="pt")

inputs = tokenizer("Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again.", return_tensors="pt", padding='max_length', max_length=1000)

with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=20, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=False)[0])

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad

test data pre-processing

In [6]:
import json

from datasets import Dataset

ner = 1

test_relation_dict = {}
if ner:
    with open('DocRED/data/bi-sent-pre-process_test.json') as f:
        test_relation_dict = json.load(f)

    test_dataset = Dataset.from_dict(
        {
            'text': test_relation_dict['text'],
            'entity': test_relation_dict['entity'],
            'relation': test_relation_dict['relation']
        }
    )

else:
    pass


dataset = test_dataset

In [33]:
from datasets import Dataset

def pro_processing_ner(example, tokenizer, padding=True):
    texts = example['text']
    input_texts = []
    for index in range(len(texts)):
        # entity extraction and NER
        text = texts[index].lower().strip() + " [learn1] [learn2]"
        for entity in example['entity'][index]:
            text = text + " entity : " + entity[0] + " , type : " + entity[1] + " ;"
        text = text[:-1] + "."
        # print("1")
        # add relation classificaiton
        text = text.lower().strip() + " [learn3] [learn4]"
        for relation_type, relation_pair in example['relation'][index].items():
            if relation_pair:
                text_w_relation = text + " for the relation " + relation_type + " : 1 ." + tokenizer.eos_token

            else:
                text_w_relation = text + " for the relation " + relation_type + " : 0 ." + tokenizer.eos_token
            
            input_texts.append(text_w_relation)

    return {
        'input_ids': input_texts
        }

import json
from tqdm import tqdm


output = {"input_texts": []}

for i in tqdm(range(0, len(dataset), 30)):
    result = pro_processing_ner(dataset[i:i+30], tokenizer)
    output["input_texts"].extend(result["input_ids"])


from datasets import Dataset

input_text_dataset = Dataset.from_dict(
    {
        'input_texts': output['input_texts'],
    }
)


100%|██████████| 269/269 [00:04<00:00, 61.32it/s]


In [5]:
tokenized_dataset = input_text_dataset.map(lambda example: tokenizer(example['input_texts'], padding='max_length', truncation=True, max_length=1024, pad_to_max_length=True), batched=True)

tokenized_dataset.remove_columns('input_texts')

tokenized_dataset.save_to_disk('DocRED/GPT_w_ner_short_relation_onlyRC/test_data_ner_short_relation_onlyRC')
# with open('DocRED/data/train_ner_short_relation.json', 'w') as f:
#     json.dump(tokenized_dataset, f)



NameError: name 'input_text_dataset' is not defined

randomly select test data

In [7]:
from datasets import Dataset

tokenized_test_dataset = Dataset.load_from_disk('DocRED/GPT_w_ner_short_relation_onlyRC/test_data_ner_short_relation_onlyRC')
tokenized_test_dataset = tokenized_test_dataset.remove_columns('input_texts')

tokenized_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [10]:
len(tokenized_test_dataset)

773472

In [12]:
tokenizer.decode(tokenized_test_dataset[0]['input_ids'])

'skai tv is a greek free - to - air television network based in piraeus. it is part of the skai group, one of the largest media groups in the country. [learn1] [learn2] entity : skai tv, type : organization ; entity : greek, type : location ; entity : piraeus, type : location ; entity : skai group, type : organization. [learn3] [learn4] for the relation applies to jurisdiction : 0.<|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|

whole inference

In [23]:
import json

def for_the_relation (index, input, learn_token=False, tokenizer=tokenizer):
    index = index % 96
    rel_info = {}
    with open('DocRED/data/relation-index.json') as f:
        rel_info = json.load(f)
    rel_info_list = [relation for relation in rel_info.keys()]
    if learn_token:
        relation_input = "[learn3] [learn4] for the relation " + rel_info_list[index].lower() + " : "
    else:
        relation_input = "for the relation " + rel_info_list[index].lower() + " : "
    tokenized_relation_input = tokenizer.encode(relation_input, add_special_tokens=False, return_tensors="pt")
    # print("before relation classification: ", tokenizer.decode(input))
    return (torch.cat((input, tokenized_relation_input[0]), dim=0))


In [37]:
from tqdm.notebook import trange, tqdm
import torch
import numpy as np


model.eval()
outputs = []
model.to("cuda")
output_results = []


right_relation_1 = []
right_relation_0 = []
wrong_relation_1 = []
wrong_relation_0 = []


with torch.no_grad():
    # feed the actually_input to the model by 10 examples each time
    for input_index, input_ids_list in enumerate(tqdm(tokenized_test_dataset)):
        # print(input_index + 1, " / ", len(tokenized_test_dataset))
        end = input_ids_list['input_ids'].tolist().index(tokenizer.convert_tokens_to_ids("[learn2]")) + 1
        input_ids = torch.tensor(input_ids_list['input_ids'][:end]).to("cuda")
        relation_classfication = False

        while((input_ids[-1].item() != tokenizer.eos_token_id) and (len(input_ids) < 1024)):
            output = model(input_ids=input_ids)
            current_output = np.array(output['logits'].cpu())
            max_index = np.argmax(current_output[-1, :], axis=0)

            # if len(input_ids) - end > 40 and (not relation_classfication):
            #     input_ids = torch.cat((input_ids, torch.tensor(max_index).unsqueeze(0).to("cuda")), dim=0)
            #     input_ids = for_the_relation(input_index, input_ids.to("cpu"), learn_token=True)
            #     input_ids = input_ids.to("cuda")
            #     relation_classfication = True
            #     continue

            if max_index == tokenizer.convert_tokens_to_ids("[learn4]") and (not relation_classfication):
                input_ids = torch.cat((input_ids, torch.tensor(max_index).unsqueeze(0).to("cuda")), dim=0)
                input_ids = for_the_relation(input_index, input_ids.to("cpu"))
                input_ids = input_ids.to("cuda")
                relation_classfication = True
                continue

            if relation_classfication:

                zero_index = tokenizer.convert_tokens_to_ids("0")
                one_index = tokenizer.convert_tokens_to_ids("1")
                zero_possibility = current_output[-1, zero_index]
                # print("zero_possibility: ", zero_possibility)
                one_possibility = current_output[-1, one_index]
                # print("one_possibility: ", one_possibility)


                if zero_possibility > one_possibility:
                    output_results.append(0)
                else:
                    output_results.append(1)
                
                if output_results[-1]:
                    if input_text_dataset[input_index]['input_texts'][-16] == "1":
                        right_relation_1.append(input_index)
                    else:
                        wrong_relation_1.append(input_index)

                else:
                    if input_text_dataset[input_index]['input_texts'][-16] == "0":
                        right_relation_0.append(input_index)
                    else:
                        wrong_relation_0.append(input_index)
                break
            
            input_ids = torch.cat((input_ids, torch.tensor(max_index).unsqueeze(0).to("cuda")), dim=0)
            
        if input_index % 96 == 0:
            print("accuracy for golden 1:", len(right_relation_1) / (len(right_relation_1) + len(wrong_relation_0) + 1e-10))
            print("accuracy for golden 0:", len(right_relation_0) / (len(right_relation_0) + len(wrong_relation_1) + 1e-10))
            print("total accuracy: ", (len(right_relation_1) + len(right_relation_0)) / (len(right_relation_1) + len(right_relation_0) + len(wrong_relation_1) + len(wrong_relation_0) + 1e-10))


        # # only take the first 96 * 100 examples to test
        # if input_index + 1 == 96 * 100:
        #     break
        
    # print(tokenizer.batch_decode(max_index, skip_special_tokens=False)[0])

  0%|          | 0/773472 [00:00<?, ?it/s]

  input_ids = torch.tensor(input_ids_list['input_ids'][:end]).to("cuda")


accuracy for golden 1: 0.0
accuracy for golden 0: 0.0
total accuracy:  0.0
accuracy for golden 1: 0.99999999995
accuracy for golden 0: 0.010526315789462604
total accuracy:  0.030927835051514505
accuracy for golden 1: 0.99999999995
accuracy for golden 0: 0.02094240837695239
total accuracy:  0.0310880829015383
accuracy for golden 1: 0.99999999995
accuracy for golden 0: 0.013937282229960301
total accuracy:  0.0207612456747333
accuracy for golden 1: 0.9999999999666667
accuracy for golden 0: 0.010471204188478935
total accuracy:  0.01818181818181346
accuracy for golden 1: 0.999999999975
accuracy for golden 0: 0.014675052410898391
total accuracy:  0.022869022869018114
accuracy for golden 1: 0.99999999998
accuracy for golden 0: 0.01398601398601154
total accuracy:  0.02253032928942417
accuracy for golden 1: 0.99999999998
accuracy for golden 0: 0.011976047904189824
total accuracy:  0.019316493313518673
accuracy for golden 1: 0.9999999999888889
accuracy for golden 0: 0.010526315789472298
total ac

ValueError: 50260 is not in list

In [39]:
input_ids_list['input_ids'].tolist().index(tokenizer.convert_tokens_to_ids("[learn2]")) + 1

ValueError: 50260 is not in list

In [41]:
tokenizer.decode(input_ids_list['input_ids'].tolist())

'the third strategic area is nanotechnology applied to the development of the smarter devices for the intermittent production industry. this technology can be applied to, for example, blood testing or recovering oil from existing fields. [learn1] [learn2. [learn3] [learn4] for the relation applies to jurisdiction : 0.<|endoftext|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|p

In [42]:
print("right_relation_1: ", len(right_relation_1))
print("right_relation_0: ", len(right_relation_0))
print("wrong_relation_1: ", len(wrong_relation_1))
print("wrong_relation_0: ", len(wrong_relation_0))
print("accuracy for golden 1:", len(right_relation_1) / (len(right_relation_1) + len(wrong_relation_0)))
print("accuracy for golden 0:", len(right_relation_0) / (len(right_relation_0) + len(wrong_relation_1)))
print("total accuracy: ", (len(right_relation_1) + len(right_relation_0)) / (len(right_relation_1) + len(right_relation_0) + len(wrong_relation_1) + len(wrong_relation_0)))

right_relation_1:  17
right_relation_0:  15
wrong_relation_1:  1792
wrong_relation_0:  0
accuracy for golden 1: 1.0
accuracy for golden 0: 0.008301051466519093
total accuracy:  0.017543859649122806


In [288]:
import json

from datasets import Dataset

ner = 1

test_relation_dict = {}
if ner:
    with open('DocRED/data/bi-sent-pre-process_test.json') as f:
        test_relation_dict = json.load(f)

    test_dataset = Dataset.from_dict(
        {
            'text': test_relation_dict['text'],
            'entity': test_relation_dict['entity'],
            'relation': test_relation_dict['relation']
        }
    )

else:
    pass


In [289]:
# save the output_texts to a json file

with open(f'DocRED/GPT_w_ner_short_relation/test_output_texts_for_test_dataset_{random_test_index}.json', 'w') as f:
    json.dump(output_texts, f)

In [290]:
rel_info = {}
with open('DocRED/data/relation-index.json') as f:
    rel_info = json.load(f)
rel_info_list = [relation for relation in rel_info.keys()]

In [291]:
test_dataset[random_test_index]

{'text': 'durgada is a rural village in gollaprolu mandal , east godavari district , andhra pradesh , india . the village was formerly known as durga ooda , durga vaahini .',
 'entity': [['durgada', 'location'],
  ['gollaprolu', 'location'],
  ['east godavari', 'location'],
  ['andhra pradesh', 'location'],
  ['india', 'location'],
  ['durga ooda', 'location'],
  ['durga vaahini', 'location']],
 'relation': {'applies to jurisdiction': None,
  'author': None,
  'award received': None,
  'basin country': None,
  'capital': None,
  'capital of': None,
  'cast member': None,
  'chairperson': None,
  'characters': None,
  'child': None,
  'composer': None,
  'conflict': None,
  'contains administrative territorial entity': [['india', 'andhra pradesh']],
  'continent': None,
  'country': [['east godavari', 'india'],
   ['durga ooda', 'india'],
   ['durga vaahini', 'india'],
   ['andhra pradesh', 'india'],
   ['gollaprolu', 'india'],
   ['durgada', 'india']],
  'country of citizenship': None,

In [292]:
output_texts[0]

'entity : italy, type : location ; entity : greece, type : location ; entity : st. louis, type : location ; entity : india, type : location ; entity : daimaru, type : location ; entity : cowichan lake, type : location ; entity : kahlo, type : head of government. [learn3] [learn4] for the relation applies to jurisdiction : 1. [learn5] [learn6] and the entity for the relation applies to jurisdiction are : head entity: italy, tail entity: india; head entity: italy, tail entity: india; head entity: italy, tail entity: india; head entity: italy, tail entity: india; head entity: italy, tail entity: india; head entity: italy, tail entity: india; head entity: italy, tail entity: india; head entity: italy, tail entity: india; head entity: italy, tail entity: india; head entity: italy, tail entity: india; head entity: italy, tail entity: india.<|endoftext|>'

In [308]:
right_relation_1 = []
right_relation_0 = []
wrong_relation_1 = []
wrong_relation_0 = []
entity_pairs = []

# text = output_texts[0]

# rel_index = 0

for rel_index in range(len(rel_info_list)):
    text = output_texts[rel_index]
    relation_text = "for the relation " + rel_info_list[rel_index].lower() + " : "
    relation_classfication = text.split(relation_text)[1].split(".")[0].strip()

    if "1" in relation_classfication:
        if test_dataset[random_test_index]['relation'][rel_info_list[rel_index]]:
            print("relation : ", rel_info_list[rel_index])
            print("gold truth: ", test_dataset[random_test_index]['relation'][rel_info_list[rel_index]])
            right_relation_1.append(rel_index)

            entity_pair_for_this_relation = []
            remain_text = text
            while("tail entity" in remain_text):
                if ";" in remain_text.split("tail entity")[1]:
                    head_entity = remain_text.split("head entity: ")[1].split(", tail entity: ")[0].strip()
                    tail_entity = remain_text.split("tail entity: ")[1].split(";")[0].strip()
                    if [head_entity, tail_entity] not in entity_pair_for_this_relation:
                        entity_pair_for_this_relation.append([head_entity, tail_entity])
                        print("head entity: ", head_entity)
                        print("tail entity: ", tail_entity)
                    remain_text = remain_text.split(";")[1].strip()
                else:
                    head_entity = remain_text.split("head entity: ")[1].split(", tail entity: ")[0].strip()
                    tail_entity = remain_text.split("tail entity: ")[1].split(".<|endoftext|>")[0].strip()
                    if [head_entity, tail_entity] not in entity_pair_for_this_relation:
                        entity_pair_for_this_relation.append([head_entity, tail_entity])
                        print("head entity: ", head_entity)
                        print("tail entity: ", tail_entity)
                    break
            entity_pairs.append(entity_pair_for_this_relation)

        else:
            wrong_relation_1.append(rel_index)
    else:
        if test_dataset[random_test_index]['relation'][rel_info_list[rel_index]]:
            wrong_relation_0.append(rel_index)
        else:
            right_relation_0.append(rel_index)

relation :  contains administrative territorial entity
[['india', 'andhra pradesh']]
head entity:  india
tail entity:  kahlo
relation :  country
[['east godavari', 'india'], ['durga ooda', 'india'], ['durga vaahini', 'india'], ['andhra pradesh', 'india'], ['gollaprolu', 'india'], ['durgada', 'india']]
head entity:  bharatiya
tail entity:  india
relation :  located in the administrative territorial entity
[['east godavari', 'andhra pradesh'], ['andhra pradesh', 'india'], ['gollaprolu', 'east godavari'], ['gollaprolu', 'andhra pradesh'], ['durgada', 'gollaprolu']]
head entity:  greece
tail entity:  india


In [306]:
print("right_relation_1: ", len(right_relation_1))
print("right_relation_0: ", len(right_relation_0))
print("wrong_relation_1: ", len(wrong_relation_1))
print("wrong_relation_0: ", len(wrong_relation_0))

right_relation_1:  3
right_relation_0:  4
wrong_relation_1:  89
wrong_relation_0:  0


In [307]:
entity_pairs

[[['india', 'kahlo']], [['bharatiya', 'india']], [['greece', 'india']]]

play ground

In [315]:
tokenizer.convert_tokens_to_ids("[learn5]")

50263

In [316]:
# find the index of 50262 in input_ids_lists[12]

tokenizer.decode(input_ids_lists[12][:input_ids_lists[12].index(50263) - 2])

'durgada is a rural village in gollaprolu mandal, east godavari district, andhra pradesh, india. the village was formerly known as durga ooda, durga vaahini. [learn1] [learn2] entity : durgada, type : location ; entity : gollaprolu, type : location ; entity : east godavari, type : location ; entity : andhra pradesh, type : location ; entity : india, type : location ; entity : durga ooda, type : location ; entity : durga vaahini, type : location. [learn3] [learn4] for the relation contains administrative territorial entity :'

In [440]:
input_ids = torch.tensor(input_ids_lists[12][:input_ids_lists[12].index(50263) - 2]).to("cuda")

In [434]:
with torch.no_grad():
    output = model(input_ids=input_ids)
    current_output = np.array(output['logits'].cpu())
    max_index = np.argmax(current_output[-1, :], axis=0)
    sorted_index = np.argsort(current_output[-1, :])[::-1]
    print(tokenizer.decode(max_index))

    

atar


In [439]:
tokenizer.decode(sorted_index[5])

'ito'

In [433]:
with torch.no_grad():
    input_ids = torch.cat((input_ids, torch.tensor(sorted_index [0]).unsqueeze(0).to("cuda")), dim=0)
    print(tokenizer.decode(input_ids))

durgada is a rural village in gollaprolu mandal, east godavari district, andhra pradesh, india. the village was formerly known as durga ooda, durga vaahini. [learn1] [learn2] entity : durgada, type : location ; entity : gollaprolu, type : location ; entity : east godavari, type : location ; entity : andhra pradesh, type : location ; entity : india, type : location ; entity : durga ooda, type : location ; entity : durga vaahini, type : location. [learn3] [learn4] for the relation contains administrative territorial entity : 1. [learn5] [learn6] and the entity for the relation contains administrative administrative territorial entity are : head entity: india, tail entity: godav
