In [1]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# model, tokenizer

In [2]:
additional_tokens = {'additional_special_tokens': ['[learn1]', '[learn2]', '[learn3]', '[learn4]', '[learn5]', '[learn6]']}

In [3]:
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

configuration = GPT2Config.from_pretrained('gpt2-medium', output_hidden_states=False)

model = GPT2LMHeadModel.from_pretrained("gpt2-medium", config=configuration)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
print_trainable_parameters(model)

trainable params: 354823168 || all params: 354823168 || trainable%: 100.0


In [5]:
num_added_toks = tokenizer.add_special_tokens(additional_tokens)

In [6]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50265, 1024)

In [86]:
# save the tokenizer

# tokenizer.save_pretrained('DocRED/GPT_w_ner[]/gpt2_tokenizer')

('DocRED/GPT_w_ner[]/gpt2_tokenizer/tokenizer_config.json',
 'DocRED/GPT_w_ner[]/gpt2_tokenizer/special_tokens_map.json',
 'DocRED/GPT_w_ner[]/gpt2_tokenizer/vocab.json',
 'DocRED/GPT_w_ner[]/gpt2_tokenizer/merges.txt',
 'DocRED/GPT_w_ner[]/gpt2_tokenizer/added_tokens.json')

# PEFT

In [7]:
import torch

for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

In [11]:
# more with LoRAconfig: https://huggingface.co/docs/peft/conceptual_guides/lora

from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftType

peft_config = LoraConfig(
    # r: the rank of the update matrices, expressed in int. Lower rank results in smaller update matrices with fewer trainable parameters.
    r=16,
    # alpha: LoRA scaling factor.
    lora_alpha=32, 
    # target_modules: The modules (for example, attention blocks) to apply the LoRA update matrices.
    target_modules=["c_attn", "c_proj","c_fc", "c_proj"],
    fan_in_fan_out=True,
    lora_dropout=0.05,
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 6291456 || all params: 361122816 || trainable%: 1.7421928832101266


In [12]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.shape, param.dtype)

base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight torch.Size([3072, 16]) torch.float32
base_model.model.transformer.h.0.attn.c_proj.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.transformer.h.0.attn.c_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.transformer.h.0.mlp.c_fc.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.transformer.h.0.mlp.c_fc.lora_B.default.weight torch.Size([4096, 16]) torch.float32
base_model.model.transformer.h.0.mlp.c_proj.lora_A.default.weight torch.Size([16, 4096]) torch.float32
base_model.model.transformer.h.0.mlp.c_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight torch

# data

In [87]:
# load the json file from DocRED/data/test.json and DocRED/data/rel_info.json

import json

with open('DocRED/data/train_annotated.json') as f:
    train_set = json.load(f)


with open('DocRED/data/rel_info.json') as f:
    rel_info = json.load(f)

In [88]:
train_set[0]

{'vertexSet': [[{'pos': [0, 4],
    'type': 'ORG',
    'sent_id': 0,
    'name': 'Zest Airways, Inc.'},
   {'sent_id': 0,
    'type': 'ORG',
    'pos': [10, 15],
    'name': 'Asian Spirit and Zest Air'},
   {'name': 'AirAsia Zest', 'pos': [6, 8], 'sent_id': 0, 'type': 'ORG'},
   {'name': 'AirAsia Zest', 'pos': [19, 21], 'sent_id': 6, 'type': 'ORG'}],
  [{'name': 'Ninoy Aquino International Airport',
    'pos': [4, 8],
    'sent_id': 3,
    'type': 'LOC'},
   {'name': 'Ninoy Aquino International Airport',
    'pos': [26, 30],
    'sent_id': 0,
    'type': 'LOC'}],
  [{'name': 'Pasay City', 'pos': [31, 33], 'sent_id': 0, 'type': 'LOC'}],
  [{'name': 'Metro Manila', 'pos': [34, 36], 'sent_id': 0, 'type': 'LOC'}],
  [{'name': 'Philippines', 'pos': [38, 39], 'sent_id': 0, 'type': 'LOC'},
   {'name': 'Philippines', 'pos': [13, 14], 'sent_id': 4, 'type': 'LOC'},
   {'sent_id': 5,
    'type': 'LOC',
    'pos': [25, 29],
    'name': 'Republic of the Philippines'}],
  [{'name': 'Manila', 'pos': 

In [10]:
"""
the names of vertextSet can be the same, but the pos should be different
structure:
'vertexSet': 
    [
        (for the same entity but in different synonyms and different sentences)
        [
            {
                'pos':[start, end],
                'type': 'NER',
                'sent_id': 0,
                'name': 'string',
            },
            {}
        ],
        [entity-2]
    ]
'labels':
    [
        {
            'r': 'Pxx',
            'h': 0,
            't': 1,
            'evidence': [2, 3, 4],
        },
        {}
    ]
'title': 'string',
'sents':
    [
        ['word0', 'word1',]
        ['word0', 'word1',]
    ]
"""

"\nthe names of vertextSet can be the same, but the pos should be different\nstructure:\n'vertexSet': \n    [\n        {\n            'pos':[start, end],\n            'type': 'NER',\n            'sent_id': 0,\n            'name': 'string',\n        },\n        {}\n    ]\n'labels':\n    [\n        {\n            'r': 'Pxx',\n            'h': 0,\n            't': 1,\n            'evidence': [2, 3, 4],\n        },\n        {}\n    ]\n'title': 'string',\n'sents':\n    [\n        ['word0', 'word1',]\n        ['word0', 'word1',]\n    ]\n"

In [89]:
with open('DocRED/data/ner_info.json') as f:
    ner_info = json.load(f)

with open('DocRED/data/rel_info.json') as f:
    relation_info = json.load(f)

In [12]:
"""# doc-level is too long for gpt-2, so we need to split the doc-level into bi-sent-level

relation_dict = {
    'id': [],
    'text':[],
    'entity': [],
    'relation': []
}

for i in range(len(train_set)):
    # id
    relation_dict['id'].append(i)

    # text
    sents = ""
    for sent in train_set[i]['sents']:
        # flatten the sent list
        a = " ".join(sent)
        sents += a.lower() + " "
    # if there are space, delete the first and last space of the sents
    sents = sents.strip()
    # delete double space in the sents
    sents = sents.replace("  ", " ")
    relation_dict['text'].append(sents)
    del sents

    # entity
    entity = []
    entity_list = []
    entity_flat = {}
    entity_count = 0
    for sent_item in train_set[i]['vertexSet']:
        for item in sent_item:
            entity_item = []
            if item['name'].lower() not in entity_list:
                entity_list.append(item['name'].lower().strip())
                entity_item.append(item['name'].lower().strip())
                entity_item.append(ner_info[item['type']])

                entity.append(entity_item)
            
            # add the entity_flat
            entity_flat[entity_count] = item['name'].lower().strip()
            entity_count += 1

    # release the entity_list and entity_item
    del entity_item
    del entity_count
        

    # relation pairs
    relation_pairs = {}
    for relation_item in train_set[i]['labels']:
        pair = []
        head = entity_flat[relation_item['h']]
        tail = entity_flat[relation_item['t']]
        pair.append(head)
        pair.append(tail)

        relation  = relation_info[relation_item['r']]
        if relation not in relation_pairs.keys():
            relation_pairs[relation] = []

        relation_pairs[relation].append(pair)
    del pair
    del head
    del tail

    # add the entity and relation pairs to the relation_dict
    relation_dict['entity'].append(entity)
    relation_dict['relation'].append(relation_pairs)
    break


# save the relation_dict to a json file

# with open('DocRED/data/DocRED_baseline_metadata/relation_dict.json', 'w') as f:
#     json.dump(relation_dict, f)"""

In [91]:
relation_dict = {
    'text':[],
    'entity': [],
    'relation': []
}
id_count = 0

for i in range(len(train_set)):
    

    # text
    sent_pairs = []
    for sent_index, sent in enumerate(train_set[i]['sents']):
        sents = ""

        # flatten the sent list
        a = " ".join(sent)
        sents += a.lower() + " "

        # and the next_sent if it exists
        try:
            next_sent = train_set[i]['sents'][sent_index + 1]
            b = " ".join(next_sent)
            sents += b.lower() + " "
        except:
            pass
        # post process the sents for some spaces
        sents = sents.strip()
        sents = sents.replace("  ", " ")

        relation_dict['text'].append(sents)
            
    del sents


    # entity
    entity = []
    for index in range(len(train_set[i]['sents'])):
        # focus on the current sent and the next sent if it exists
        if index + 1 < len(train_set[i]['sents']):
            next_index = index + 1
        else:
            next_index = index
        # group the entities for every 2 sents, no repeated entities in one group
        c_sent_entity_lists = []
        next_sent_entity_lists = []
        entity_for_each_2_sents = []

        for entity_spans in train_set[i]['vertexSet']:
            for item in entity_spans:
                entity_item = []
                # if neither in the current sent nor in the next sent, continue
                if item['sent_id'] != index and item['sent_id'] != next_index:
                    continue
                # also store the first pos of the entity in the entity_item
                # it will look like this: [[entity_name, sent_index, pos1, ner_type]]
                entity_item = [item['name'].lower().strip(), item['sent_id'], item['pos'][0], ner_info[item['type']]]

                if entity_item[1] == index:
                    c_sent_entity_lists.append(entity_item)
                else:
                    next_sent_entity_lists.append(entity_item)

        # sort the c_sent_entity_lists and next_sent_entity_lists by the pos in ascending order
        c_sent_entity_lists.sort(key=lambda x: x[2])
        if index != next_index:
            next_sent_entity_lists.sort(key=lambda x: x[2])
        
        entity_list = []
        for item in c_sent_entity_lists:
            if item[0] not in entity_list:
                entity_list.append(item[0])
                entity_for_each_2_sents.append([item[0], item[3]])
            
        if index != next_index:
            for item in next_sent_entity_lists:
                if item[0] not in entity_list:
                    entity_list.append(item[0])
                    entity_for_each_2_sents.append([item[0], item[3]])

        relation_dict['entity'].append(entity_for_each_2_sents)

    del entity_item
    del c_sent_entity_lists
    del next_sent_entity_lists
    del entity_for_each_2_sents
        

    # relation pairs
    relation_pairs = []

    for index in range(len(train_set[i]['sents'])):
        relation_pairs_for_each_2_sents = {}
        # focus on the current sent and the next sent if it exists
        if index + 1 < len(train_set[i]['vertexSet']):
            next_index = index + 1
        else:
            next_index = index

        # heads, tails: ['entity_name', start_pos]
        for relation_item in train_set[i]['labels']:
            heads = []
            tails = []
            
            # head
            head_exist = False
            for head_span in train_set[i]['vertexSet'][relation_item['h']]:
                if head_span['sent_id'] == index or head_span['sent_id'] == next_index:
                    heads.append([head_span['name'].lower().strip(), head_span['pos'][0]])
                    head_exist = True
            if not head_exist:
                continue
    
            # tail
            tail_exist = False
            for tail_span in train_set[i]['vertexSet'][relation_item['t']]:
                if tail_span['sent_id'] == index or tail_span['sent_id'] == next_index:
                    tails.append([tail_span['name'].lower().strip(), tail_span['pos'][0]])
                    tail_exist = True
            if not tail_exist:
                continue
            

            if relation_info[relation_item['r']] not in relation_pairs_for_each_2_sents.keys():
                relation_pairs_for_each_2_sents[relation_info[relation_item['r']]] = []
            for head in heads:
                for tail in tails:
                    relation_pairs_for_each_2_sents[relation_info[relation_item['r']]].append([head[0], tail[0]])

        relation_dict['relation'].append(relation_pairs_for_each_2_sents)


    
    # break


# save the relation_dict to a json file

# with open('DocRED/data/bi-sent-pre-process.json', 'w') as f:
    # json.dump(relation_dict, f)

In [65]:
len(relation_dict['text']) == len(relation_dict['entity']) == len(relation_dict['relation'])

True

In [98]:
relation_dict['relation'][0]

{'headquarters location': [['zest airways, inc.', 'pasay city'],
  ['asian spirit and zest air', 'pasay city'],
  ['airasia zest', 'pasay city']],
 'country': [['zest airways, inc.', 'philippines'],
  ['asian spirit and zest air', 'philippines'],
  ['airasia zest', 'philippines'],
  ['pasay city', 'philippines'],
  ['manila', 'philippines'],
  ['metro manila', 'philippines'],
  ['ninoy aquino international airport', 'philippines']],
 'located in the administrative territorial entity': [['pasay city',
   'metro manila'],
  ['metro manila', 'philippines'],
  ['ninoy aquino international airport', 'pasay city']],
 'contains administrative territorial entity': [['philippines',
   'metro manila'],
  ['metro manila', 'pasay city']]}

In [10]:
ner = 1

In [9]:
import json


from datasets import Dataset

relation_dict = {}
if ner:
    with open('DocRED/data/bi-sent-pre-process.json') as f:
        relation_dict = json.load(f)

    dataset = Dataset.from_dict(
        {
            'text': relation_dict['text'],
            'entity': relation_dict['entity'],
            'relation': relation_dict['relation']
        }
    )

else:
    pass

NameError: name 'ner' is not defined

In [9]:
dataset

Dataset({
    features: ['text', 'entity', 'relation'],
    num_rows: 24256
})

In [10]:
dataset[0]

{'text': 'zest airways , inc. operated as airasia zest ( formerly asian spirit and zest air ) , was a low - cost airline based at the ninoy aquino international airport in pasay city , metro manila in the philippines . it operated scheduled domestic and international tourist services , mainly feeder services linking manila and cebu with 24 domestic destinations in support of the trunk route operations of other airlines .',
 'entity': [['zest airways, inc.', 'organization'],
  ['airasia zest', 'organization'],
  ['asian spirit and zest air', 'organization'],
  ['ninoy aquino international airport', 'location'],
  ['pasay city', 'location'],
  ['metro manila', 'location'],
  ['philippines', 'location'],
  ['manila', 'location'],
  ['cebu', 'location'],
  ['24', 'number']],
 'relation': {'applies to jurisdiction': None,
  'author': None,
  'award received': None,
  'basin country': None,
  'capital': None,
  'capital of': None,
  'cast member': None,
  'chairperson': None,
  'characters':

In [11]:
len(dataset['entity'][0])

10

In [12]:
"""relation_info_dict = {}
for id, relation in enumerate(dataset[0]['relation'].keys()):
    relation_info_dict[relation] = id

with open('DocRED/data/relation-index.json', 'w') as f:
    json.dump(relation_info_dict, f)"""

with open('DocRED/data/relation-index.json') as f:
    relation_info_dict = json.load(f)

In [13]:
def pro_processing_ner(example, tokenizer, padding=True):
    texts = example['text']

    for index in range(len(texts)):
        # entity extraction and NER
        texts[index] = texts[index].lower().strip() + " [learn1] [learn2] "
        for entity in example['entity'][index]:
            texts[index] = texts[index] + " entity : " + entity[0] + " , type : " + entity[1] + " ;"
        texts[index] = texts[index][:-1] + "."
        # print("1")
        # add relation classificaiton
        texts[index] = texts[index].lower().strip() + " [learn3] [learn4]"
        for relation_type, relation_pair in example['relation'][index].items():
            if relation_pair:
                texts[index] = texts[index] + " relation " + str(relation_info_dict[relation_type]) + " : 1 ;"
            else:
                texts[index] = texts[index] + " relation " + str(relation_info_dict[relation_type]) + " : 0 ;"

        texts[index] = texts[index][:-1] + "."

        # add relation extraction
        texts[index] = texts[index].lower().strip() + " [learn5] [learn6]"
        # print("2")
        for relation_type, relation_pair in example['relation'][index].items():
            if relation_pair:
                # print("text: ", texts[index])
                # print("relation_pair: ", relation_pair)
                texts[index] = texts[index] + " for relation " + str(relation_info_dict[relation_type]) + " ,"
                for pair in relation_pair:
                    texts[index] = texts[index] + " head : " + pair[0] + " , tail : " + pair[1] + ";"
                texts[index] = texts[index][:-1] + "."
                
        if texts[index][-2:] != "6]":
            texts[index] = texts[index][:-1] + ". " + tokenizer.eos_token
        else:
            texts[index] = texts[index] + ". " + tokenizer.eos_token

        # print("text: ", texts[index])
    # print("3")
    # print(texts[0])
    output_ids = tokenizer(texts, add_special_tokens=False)['input_ids']

    # input_ids = []
    attention_mask = []
    # print("4")
    count = 0
    for i in range(len(output_ids)):
        ids = output_ids[i]
        if len(ids) > 1024:
            output_ids[i] = output_ids[i][:1023] + [tokenizer.eos_token_id]
            count += 1
        assert len(output_ids[i]) <= 1024
        attention_mask.append([1] * len(output_ids[i]) + [0] * (1024 - len(output_ids[i])))
        assert len(attention_mask[i]) == 1024
    if count != 0:
        print(f"truncated {count} examples")

    # print("5")
    if padding:
        for i in range(len(output_ids)):
            ids = output_ids[i]
            output_ids[i] = ids + [tokenizer.pad_token_id] * (1024 - len(ids))
            assert len(output_ids[i]) == 1024
    # print("6")
    return {
        'input_ids': output_ids,
        'attention_mask': attention_mask,
        }




In [209]:
# feed the dataset:dataset to the pro_processing_ner() function with tokenizer, at each time, we feed 30 examples to the function, and then save the output to a json file
# each time the return of the function is a dict, we need to save the dict to a list, and then save the list to a json file

import json

output = {"input_ids": [], "attention_mask": []}

for i in range(0, len(dataset), 30):
    print(i)
    result = pro_processing_ner(dataset[i:i+30], tokenizer)
    output["input_ids"].extend(result["input_ids"])
    output["attention_mask"].extend(result["attention_mask"])
    

with open('DocRED/data/train_ner.json', 'w') as f:
    json.dump(output, f)


0
30
60
90
120
150
180
210
240
270
300
330
truncated 1 examples
360
390
420
450
480
510
540
570
600
630
660
690
truncated 3 examples
720
truncated 1 examples
750
780
810
840
truncated 1 examples
870
900
truncated 1 examples
930
960
truncated 1 examples
990
1020
1050
1080
1110
1140
1170
1200
1230
1260
1290
1320
1350
1380
1410
1440
1470
1500
1530
truncated 1 examples
1560
1590
truncated 2 examples
1620
1650
truncated 2 examples
1680
1710
truncated 2 examples
1740
1770
1800
1830
1860
truncated 1 examples
1890
1920
truncated 2 examples
1950
1980
truncated 1 examples
2010
2040
2070
truncated 1 examples
2100
truncated 1 examples
2130
2160
2190
2220
truncated 1 examples
2250
2280
2310
2340
2370
2400
2430
2460
2490
truncated 2 examples
2520
truncated 2 examples
2550
truncated 2 examples
2580
2610
2640
2670
2700
truncated 1 examples
2730
2760
truncated 1 examples
2790
2820
2850
2880
2910
2940
2970
3000
3030
truncated 1 examples
3060
3090
3120
truncated 1 examples
3150
3180
3210
truncated 1 exam

In [16]:
import json
from datasets import Dataset

with open('DocRED/data/train_ner.json') as f:
    ner_dataset = json.load(f)

tokenized_dataset = Dataset.from_dict(
    {
        'input_ids': ner_dataset['input_ids'],
        'attention_mask': ner_dataset['attention_mask'],
    }
)

# del ner_dataset

In [17]:
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [18]:
tokenizer.decode(tokenized_dataset[66]['input_ids'])

'jonas geirnaert ( born july 28, 1982 ) studied animation at the kask in ghent. in may 2004 he won the short film jury prize at the cannes film festival with his animated short flatlife ( 11 min ). [learn1] [learn2] entity : jonas geirnaert, type : head of government ; entity : july 28, 1982, type : time ; entity : kask, type : organization ; entity : ghent, type : location ; entity : may 2004, type : time ; entity : short film jury prize, type : miscellaneous ; entity : cannes film festival, type : miscellaneous ; entity : flatlife, type : miscellaneous ; entity : 11 min, type : number. [learn3] [learn4] relation 0 : 0 ; relation 1 : 0 ; relation 2 : 0 ; relation 3 : 0 ; relation 4 : 0 ; relation 5 : 0 ; relation 6 : 0 ; relation 7 : 0 ; relation 8 : 0 ; relation 9 : 0 ; relation 10 : 0 ; relation 11 : 0 ; relation 12 : 0 ; relation 13 : 0 ; relation 14 : 0 ; relation 15 : 0 ; relation 16 : 0 ; relation 17 : 0 ; relation 18 : 1 ; relation 19 : 0 ; relation 20 : 0 ; relation 21 : 0 ; r

In [19]:
tokenized_dataset.__getitems__([1,4])

[{'input_ids': tensor([  270, 12228,  7530,  ..., 50258, 50258, 50258]),
  'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])},
 {'input_ids': tensor([ 1169, 18091,   373,  ..., 50258, 50258, 50258]),
  'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}]

# trainer

In [14]:
import wandb

wandb.init(
    # set the wandb project where this run will be logged
    project="GPT2-intermediate",
    # notes="PubmedBERT-FT-NER_w_NERin_10epochs",
    name="GPT2-medium-peft-DocRED-w-ner-5epochs"
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m309439737[0m ([33mtian1995[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [20]:
import transformers
from transformers import DataCollatorForLanguageModeling

trainer = transformers.Trainer(
    model=model, 
    train_dataset=tokenized_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=8, 
        gradient_accumulation_steps=8,
        warmup_steps=1000, 
        num_train_epochs=5,
        learning_rate=2e-4, 
        fp16=True,
        logging_steps=100, 
        report_to="wandb",
        save_strategy="epoch",
        output_dir='DocRED/GPT_medium_peft_w_ner'
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [21]:
trainer.train()



  0%|          | 0/1895 [00:00<?, ?it/s]

{'loss': 21.7149, 'learning_rate': 1.86e-05, 'epoch': 0.26}
{'loss': 2.6904, 'learning_rate': 3.86e-05, 'epoch': 0.53}
{'loss': 0.7528, 'learning_rate': 5.86e-05, 'epoch': 0.79}
{'loss': 0.5316, 'learning_rate': 7.860000000000001e-05, 'epoch': 1.06}
{'loss': 0.4609, 'learning_rate': 9.86e-05, 'epoch': 1.32}
{'loss': 0.4351, 'learning_rate': 0.0001186, 'epoch': 1.58}
{'loss': 0.4182, 'learning_rate': 0.0001386, 'epoch': 1.85}
{'loss': 0.406, 'learning_rate': 0.0001586, 'epoch': 2.11}
{'loss': 0.3986, 'learning_rate': 0.0001786, 'epoch': 2.37}
{'loss': 0.3896, 'learning_rate': 0.0001986, 'epoch': 2.64}
{'loss': 0.3869, 'learning_rate': 0.00017921787709497208, 'epoch': 2.9}
{'loss': 0.3809, 'learning_rate': 0.00015687150837988826, 'epoch': 3.17}
{'loss': 0.3798, 'learning_rate': 0.00013452513966480446, 'epoch': 3.43}
{'loss': 0.3739, 'learning_rate': 0.00011217877094972067, 'epoch': 3.69}
{'loss': 0.3698, 'learning_rate': 8.983240223463688e-05, 'epoch': 3.96}
{'loss': 0.3664, 'learning_ra

TrainOutput(global_step=1895, training_loss=1.6639082775266945, metrics={'train_runtime': 58398.3457, 'train_samples_per_second': 2.077, 'train_steps_per_second': 0.032, 'train_loss': 1.6639082775266945, 'epoch': 5.0})

In [22]:
wandb.finish()
trainer.save_model("DocRED/GPT_medium_peft_w_ner")

model.save_pretrained("DocRED/GPT_medium_peft_w_ner/model/GPT_medium_peft_w_ner.peft")

# save the tokenizer
# tokenizer.save_pretrained("DocRED/GPT_w_ner/tokenizer")

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
train/learning_rate,▁▂▃▃▄▅▆▆▇█▇▆▆▅▄▃▂▁
train/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,5.0
train/global_step,1895.0
train/learning_rate,2e-05
train/loss,0.3642
train/total_flos,2.299536831676416e+17
train/train_loss,1.66391
train/train_runtime,58398.3457
train/train_samples_per_second,2.077
train/train_steps_per_second,0.032


In [26]:
trainer.save_model("DocRED/GPT_medium_peft_w_ner")

# Inference

In [24]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

checkpoint = "DocRED/GPT_medium_peft_w_ner"

peft_model_id = "DocRED/GPT_medium_peft_w_ner/model/GPT_medium_peft_w_ner.peft"
# checkpoint = "DocRED/GPT_without_ner/model"

In [30]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("DocRED/GPT_w_ner/tokenizer")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium", config=configuration, ignore_mismatched_sizes=True)
model.resize_token_embeddings(len(tokenizer))

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2-medium and are newly initialized because the shapes did not match:
- wte.weight: found shape torch.Size([50257, 1024]) in the checkpoint and torch.Size([50265, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
model = PeftModel.from_pretrained(model, peft_model_id)

In [33]:
# output all of the special tokens in the tokenizer
tokenizer.all_special_tokens

['<|startoftext|>',
 '<|endoftext|>',
 '<|pad|>',
 '[learn1]',
 '[learn2]',
 '[learn3]',
 '[learn4]',
 '[learn5]',
 '[learn6]']

In [34]:
import torch

model.eval()
model.to("cpu")
# inputs = tokenizer("Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. [learn1] [learn2] entity :", return_tensors="pt")

inputs = tokenizer("Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. [learn1] [learn2] entity : ", return_tensors="pt", padding='max_length', max_length=1000)

with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=20, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again.entity : blogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogsblogs


test data pre-processing

In [221]:
# load the json file from DocRED/data/test.json and DocRED/data/rel_info.json

import json

with open('DocRED/data/dev.json') as f:
    test_set = json.load(f)


with open('DocRED/data/rel_info.json') as f:
    rel_info = json.load(f)

In [222]:
relation_dict = {
    'text':[],
    'entity': [],
    'relation': []
}
id_count = 0

def pro_processing_data(train_set=train_set):
    for i in range(len(train_set)):
        

        # text
        sent_pairs = []
        for sent_index, sent in enumerate(train_set[i]['sents']):
            sents = ""

            # flatten the sent list
            a = " ".join(sent)
            sents += a.lower() + " "

            # and the next_sent if it exists
            try:
                next_sent = train_set[i]['sents'][sent_index + 1]
                b = " ".join(next_sent)
                sents += b.lower() + " "
            except:
                pass
            # post process the sents for some spaces
            sents = sents.strip()
            sents = sents.replace("  ", " ")

            relation_dict['text'].append(sents)
                
        del sents


        # entity
        entity = []
        for index in range(len(train_set[i]['sents'])):
            # focus on the current sent and the next sent if it exists
            if index + 1 < len(train_set[i]['sents']):
                next_index = index + 1
            else:
                next_index = index
            # group the entities for every 2 sents, no repeated entities in one group
            c_sent_entity_lists = []
            next_sent_entity_lists = []
            entity_for_each_2_sents = []

            for entity_spans in train_set[i]['vertexSet']:
                for item in entity_spans:
                    entity_item = []
                    # if neither in the current sent nor in the next sent, continue
                    if item['sent_id'] != index and item['sent_id'] != next_index:
                        continue
                    # also store the first pos of the entity in the entity_item
                    # it will look like this: [[entity_name, sent_index, pos1, ner_type]]
                    entity_item = [item['name'].lower().strip(), item['sent_id'], item['pos'][0], ner_info[item['type']]]

                    if entity_item[1] == index:
                        c_sent_entity_lists.append(entity_item)
                    else:
                        next_sent_entity_lists.append(entity_item)

            # sort the c_sent_entity_lists and next_sent_entity_lists by the pos in ascending order
            c_sent_entity_lists.sort(key=lambda x: x[2])
            if index != next_index:
                next_sent_entity_lists.sort(key=lambda x: x[2])
            
            entity_list = []
            for item in c_sent_entity_lists:
                if item[0] not in entity_list:
                    entity_list.append(item[0])
                    entity_for_each_2_sents.append([item[0], item[3]])
                
            if index != next_index:
                for item in next_sent_entity_lists:
                    if item[0] not in entity_list:
                        entity_list.append(item[0])
                        entity_for_each_2_sents.append([item[0], item[3]])

            relation_dict['entity'].append(entity_for_each_2_sents)

        del entity_item
        del c_sent_entity_lists
        del next_sent_entity_lists
        del entity_for_each_2_sents
            

        # relation pairs
        relation_pairs = []

        for index in range(len(train_set[i]['sents'])):
            relation_pairs_for_each_2_sents = {}
            # focus on the current sent and the next sent if it exists
            if index + 1 < len(train_set[i]['vertexSet']):
                next_index = index + 1
            else:
                next_index = index

            # heads, tails: ['entity_name', start_pos]
            for relation_item in train_set[i]['labels']:
                heads = []
                tails = []
                
                # head
                head_exist = False
                for head_span in train_set[i]['vertexSet'][relation_item['h']]:
                    if head_span['sent_id'] == index or head_span['sent_id'] == next_index:
                        heads.append([head_span['name'].lower().strip(), head_span['pos'][0]])
                        head_exist = True
                if not head_exist:
                    continue
        
                # tail
                tail_exist = False
                for tail_span in train_set[i]['vertexSet'][relation_item['t']]:
                    if tail_span['sent_id'] == index or tail_span['sent_id'] == next_index:
                        tails.append([tail_span['name'].lower().strip(), tail_span['pos'][0]])
                        tail_exist = True
                if not tail_exist:
                    continue
                

                if relation_info[relation_item['r']] not in relation_pairs_for_each_2_sents.keys():
                    relation_pairs_for_each_2_sents[relation_info[relation_item['r']]] = []
                for head in heads:
                    for tail in tails:
                        relation_pairs_for_each_2_sents[relation_info[relation_item['r']]].append([head[0], tail[0]])

            relation_dict['relation'].append(relation_pairs_for_each_2_sents)


        
    return relation_dict


test_relation_dict = pro_processing_data(test_set)

# save the relation_dict to a json file

with open('DocRED/data/bi-sent-pre-process_test.json', 'w') as f:
    json.dump(test_relation_dict, f)

In [223]:
import json

with open('DocRED/data/ner_info.json') as f:
    ner_info = json.load(f)

with open('DocRED/data/rel_info.json') as f:
    relation_info = json.load(f)

In [224]:
def pro_processing_ner(example, tokenizer, padding=True):
    texts = example['text']

    for index in range(len(texts)):
        # entity extraction and NER
        texts[index] = texts[index].lower().strip() + " [learn1] [learn2] "
        for entity in example['entity'][index]:
            texts[index] = texts[index] + " entity : " + entity[0] + " , type : " + entity[1] + " ;"
        texts[index] = texts[index][:-1] + "."
        # print("1")
        # add relation classificaiton
        texts[index] = texts[index].lower().strip() + " [learn3] [learn4]"
        for relation_type, relation_pair in example['relation'][index].items():
            if relation_pair:
                texts[index] = texts[index] + " relation " + str(relation_info_dict[relation_type]) + " : 1 ;"
            else:
                texts[index] = texts[index] + " relation " + str(relation_info_dict[relation_type]) + " : 0 ;"

        texts[index] = texts[index][:-1] + "."

        # add relation extraction
        texts[index] = texts[index].lower().strip() + " [learn5] [learn6]"
        # print("2")
        for relation_type, relation_pair in example['relation'][index].items():
            if relation_pair:
                # print("text: ", texts[index])
                # print("relation_pair: ", relation_pair)
                texts[index] = texts[index] + " for relation " + str(relation_info_dict[relation_type]) + " ,"
                for pair in relation_pair:
                    texts[index] = texts[index] + " head : " + pair[0] + " , tail : " + pair[1] + ";"
                texts[index] = texts[index][:-1] + "."
                
        if texts[index][-2:] != "6]":
            texts[index] = texts[index][:-1] + ". " + tokenizer.eos_token
        else:
            texts[index] = texts[index] + ". " + tokenizer.eos_token

        # print("text: ", texts[index])
    # print("3")
    # print(texts[0])
    output_ids = tokenizer(texts, add_special_tokens=False)['input_ids']

    # input_ids = []
    attention_mask = []
    # print("4")
    count = 0
    for i in range(len(output_ids)):
        ids = output_ids[i]
        if len(ids) > 1024:
            output_ids[i] = output_ids[i][:1023] + [tokenizer.eos_token_id]
            count += 1
        assert len(output_ids[i]) <= 1024
        attention_mask.append([1] * len(output_ids[i]) + [0] * (1024 - len(output_ids[i])))
        assert len(attention_mask[i]) == 1024
    if count != 0:
        print(f"truncated {count} examples")

    # print("5")
    if padding:
        for i in range(len(output_ids)):
            ids = output_ids[i]
            output_ids[i] = ids + [tokenizer.pad_token_id] * (1024 - len(ids))
            assert len(output_ids[i]) == 1024
    # print("6")
    return {
        'input_ids': output_ids,
        'attention_mask': attention_mask,
        }



In [35]:
import json

from datasets import Dataset

ner = 1

test_relation_dict = {}
if ner:
    with open('DocRED/data/bi-sent-pre-process_test.json') as f:
        test_relation_dict = json.load(f)

    test_dataset = Dataset.from_dict(
        {
            'text': test_relation_dict['text'],
            'entity': test_relation_dict['entity'],
            'relation': test_relation_dict['relation']
        }
    )

else:
    pass



output = {"input_ids": [], "attention_mask": []}

for i in range(0, len(test_dataset), 30):
    print(i)
    result = pro_processing_ner(test_dataset[i:i+30], tokenizer)
    output["input_ids"].extend(result["input_ids"])
    output["attention_mask"].extend(result["attention_mask"])
    

with open('DocRED/data/test_ner.json', 'w') as f:
    json.dump(output, f)

0


NameError: name 'pro_processing_ner' is not defined

randomly select test data

In [36]:
import json
from datasets import Dataset


with open('DocRED/data/test_ner.json') as f:
    ner_dataset = json.load(f)

tokenized_test_dataset = Dataset.from_dict(
    {
        'input_ids': ner_dataset['input_ids'],
        'attention_mask': ner_dataset['attention_mask'],
    }
)

tokenized_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [37]:
# random sample 1 example from the test_dataset
import random
# have a random seed
random.seed(80)


index = random.randint(0, len(tokenized_test_dataset))
print("index: ", index)
print(tokenizer.decode(tokenized_test_dataset[index]['input_ids']))

# output the length of tokenized_test_dataset[index]['input_ids'] except the padding tokens. the tokenized_test_dataset[index]['input_ids'] is tensor

input_ids_list = tokenized_test_dataset[index]['input_ids'].tolist()
valid_length = len(input_ids_list) - input_ids_list.count(tokenizer.pad_token_id)

print(tokenizer.decode(tokenized_test_dataset[index]['input_ids'][:valid_length]))

# generate a lower triangle matrix of 1s with the shape is (valid_length, valid_length), using torch

low_triangle_matrix = torch.tril(torch.ones((valid_length, valid_length), dtype=torch.long))

# find the index of the token id of "[learn2]" in the tokenized_test_dataset[index]['input_ids'] tensor

learn2_index = tokenized_test_dataset[index]['input_ids'].tolist().index(tokenizer.convert_tokens_to_ids("[learn2]"))

# have a vector to store the token of okenized_test_dataset[index]['input_ids'][learn2_index + 1:valid_length]

gold_truth = tokenized_test_dataset[index]['input_ids'][learn2_index + 1:valid_length]

# multiply the low_triangle_matrix with the tokenized_test_dataset[index]['input_ids'][:valid_length]

batch_input = low_triangle_matrix * tokenized_test_dataset[index]['input_ids'][:valid_length]

# making all 0 in batch_input to tokenizer.pad_token_id

batch_input[batch_input == 0] = tokenizer.pad_token_id

# input from the [learn2] token

actually_input = batch_input[learn2_index:-1]

# have tensors actually_input_attention_mask, when the token is not padding token, the value is 1, otherwise, the value is 0

actually_input_attention_mask = torch.ones(actually_input.shape, dtype=torch.long)
actually_input_attention_mask[actually_input == tokenizer.pad_token_id] = 0


index:  2224
kungliga hovkapellet (, the royal court orchestra ) is a swedish orchestra, originally part of the royal court in sweden's capital stockholm. its existence was first recorded in 1526. [learn1] [learn2] entity : kungliga hovkapellet, type : organization ; entity : the royal court orchestra, type : organization ; entity : royal court orchestra, type : organization ; entity : swedish, type : location ; entity : royal court, type : organization ; entity : sweden, type : location ; entity : stockholm, type : location ; entity : 1526, type : time. [learn3] [learn4] relation 0 : 0 ; relation 1 : 0 ; relation 2 : 0 ; relation 3 : 0 ; relation 4 : 1 ; relation 5 : 1 ; relation 6 : 0 ; relation 7 : 0 ; relation 8 : 0 ; relation 9 : 0 ; relation 10 : 0 ; relation 11 : 0 ; relation 12 : 0 ; relation 13 : 0 ; relation 14 : 1 ; relation 15 : 0 ; relation 16 : 0 ; relation 17 : 0 ; relation 18 : 0 ; relation 19 : 0 ; relation 20 : 0 ; relation 21 : 0 ; relation 22 : 0 ; relation 23 : 0 ;

inference

In [38]:
from tqdm.notebook import trange, tqdm
import torch
import numpy as np


model.eval()
outputs = []
model.to("cuda")

batch_output = []

with torch.no_grad():
    # feed the actually_input to the model by 10 examples each time
    for i in tqdm(range(0, len(actually_input), 10)):
        print(i)
        output = model(input_ids=actually_input[i:i+10].to("cuda"), attention_mask=actually_input_attention_mask[i:i+10].to("cuda"))
        current_output = np.array(output['logits'].cpu())
        max_index = np.argmax(current_output[:, -1, :], axis=1)
        batch_output.extend(max_index)
        # break

    # print(tokenizer.batch_decode(max_index, skip_special_tokens=False)[0])

  0%|          | 0/63 [00:00<?, ?it/s]

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620


In [39]:
len(batch_output) == len(gold_truth)

True

In [40]:
# calculate the accuracy

accuracy = sum(np.array(batch_output) == np.array(gold_truth.tolist())) / len(batch_output)

In [42]:
accuracy

0.0

In [43]:
tokenizer.decode(input_ids_list)

"kungliga hovkapellet (, the royal court orchestra ) is a swedish orchestra, originally part of the royal court in sweden's capital stockholm. its existence was first recorded in 1526. [learn1] [learn2] entity : kungliga hovkapellet, type : organization ; entity : the royal court orchestra, type : organization ; entity : royal court orchestra, type : organization ; entity : swedish, type : location ; entity : royal court, type : organization ; entity : sweden, type : location ; entity : stockholm, type : location ; entity : 1526, type : time. [learn3] [learn4] relation 0 : 0 ; relation 1 : 0 ; relation 2 : 0 ; relation 3 : 0 ; relation 4 : 1 ; relation 5 : 1 ; relation 6 : 0 ; relation 7 : 0 ; relation 8 : 0 ; relation 9 : 0 ; relation 10 : 0 ; relation 11 : 0 ; relation 12 : 0 ; relation 13 : 0 ; relation 14 : 1 ; relation 15 : 0 ; relation 16 : 0 ; relation 17 : 0 ; relation 18 : 0 ; relation 19 : 0 ; relation 20 : 0 ; relation 21 : 0 ; relation 22 : 0 ; relation 23 : 0 ; relation 24

In [44]:
# output the index of the correct prediction

for index, item in enumerate(zip(batch_output, gold_truth.tolist())):
    if item[0] == item[1]:
        print(index, ": ", tokenizer.decode(item[0]))

# Analysis

w ner

In [61]:
gold_truth
with open("DocRED/GPT_w_ner[]/result/epoch_5_result.pkl", "rb") as f:
    result = pickle.load(f)

In [62]:
print(f'the length: {len(result["output"])}, {len(result["label"])}')
print(f'instance:\n{result["output"][0]}\n{result["label"][0]}')

the length: 12275, 12275
instance:
('1', '2', 'P17')
('1', '2', 'P17')


In [63]:
# source and target, relation
st_tp = 0
st_fp = 0
st_fn = 0
st_tn = 0

r_tp = 0
r_fp = 0
r_fn = 0
r_tn = 0

tuple_tp = 0
tuple_fp = 0  
tuple_fn = 0
tuple_tn = 0


for output, label in zip(result['output'], result['label']):
    pair = False
    relation = False
    if output[0] == label[0] and output[1] == label[1]:
        st_tp += 1
        pair = True
    else:
        st_fn += 1
        st_fp += 1
    
    if output[2] == label[2]:
        r_tp += 1
        relation = True
    else:
        r_fn += 1Some inputs may be truncated because they are too long.
        tuple_fn += 1
        tuple_fp += 1

In [64]:
# calculate the precision, recall and f1 score

# for source and target
st_precision = st_tp / (st_tp + st_fp)
st_recall = st_tp / (st_tp + st_fn)
st_f1 = 2 * st_precision * st_recall / (st_precision + st_recall)
print(f"source and target precision: {st_precision}, recall: {st_recall}, f1: {st_f1}")

# for relation
r_precision = r_tp / (r_tp + r_fp)
r_recall = r_tp / (r_tp + r_fn)
r_f1 = 2 * r_precision * r_recall / (r_precision + r_recall)
print(f"relation precision: {r_precision}, recall: {r_recall}, f1: {r_f1}")

# for tuple
tuple_precision = tuple_tp / (tuple_tp + tuple_fp)
tuple_recall = tuple_tp / (tuple_tp + tuple_fn)
tuple_f1 = 2 * tuple_precision * tuple_recall / (tuple_precision + tuple_recall)
print(f"tuple precision: {tuple_precision}, recall: {tuple_recall}, f1: {tuple_f1}")

source and target precision: 0.8537678207739308, recall: 0.8537678207739308, f1: 0.8537678207739308
relation precision: 0.7141344195519348, recall: 0.7141344195519348, f1: 0.7141344195519348
tuple precision: 0.6980855397148676, recall: 0.6980855397148676, f1: 0.6980855397148676
