In [1]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once and prints, on a single line, the
    element count of parameters with ``requires_grad`` set, the element count
    of all parameters, and the former as a percentage of the latter.
    """
    # One (is_trainable, element_count) pair per parameter tensor.
    sizes = [(param.requires_grad, param.numel()) for _, param in model.named_parameters()]
    all_param = sum(count for _, count in sizes)
    trainable_params = sum(count for is_trainable, count in sizes if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# model, tokenizer

In [5]:
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification

# Load the tokenizer for 'bert-base-cased'; the encoder defined further down is
# loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# Earlier sequence-classification variant, left disabled:
# model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=25, problem_type = "multi_label_classification")

In [6]:
# Token-level tag set used for entity detection: 'B' is assigned to the first
# token of an entity, 'IN' to its remaining tokens, and 'OUT' to everything else.
outputid2label = dict(enumerate(('B', 'IN', 'OUT')))
# Inverse lookup, derived from the mapping above so the two cannot drift apart.
outputlabel2id = {label: idx for idx, label in outputid2label.items()}

In [7]:
import torch.nn as nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification

class MultiLabelClassifier(nn.Module):
    """
    Per-token tagger: a pretrained ``bert-base-cased`` encoder followed by a
    linear layer that scores every token position against ``num_labels`` classes.
    """

    def __init__(self, num_labels):
        """Load the encoder and build a ``hidden_size -> num_labels`` linear head."""
        super().__init__()
        self.bert = AutoModel.from_pretrained('bert-base-cased')
        self.config = self.bert.config
        self.num_labels = num_labels
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)
        # Constructed but not applied: the dropout call in forward() is disabled.
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Encode the batch and classify every token position.

        A cross-entropy loss (positions labelled -100 are ignored) is computed
        only when ``labels`` is given and ``self.config.problem_type`` is None.

        Returns:
            When ``return_dict`` is falsy (including the default ``None``), a
            plain tuple: ``(loss, logits, *extras)`` if a loss was computed,
            otherwise ``(logits, *extras)``, where ``extras`` is ``outputs[2:]``
            of the encoder. When ``return_dict`` is truthy, a
            ``SequenceClassifierOutput``.
        """
        encoder_inputs = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        outputs = self.bert(**encoder_inputs)

        # First encoder output; it is fed position-by-position to the linear head.
        token_states = outputs[0]

        # token_states = self.dropout(token_states)
        logits = self.classifier(token_states)

        loss = None
        if labels is not None and self.config.problem_type is None:
            criterion = nn.CrossEntropyLoss(ignore_index=-100)
            # Flatten so that every token position is one classification example.
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        tuple_output = (logits,) + outputs[2:]
        return tuple_output if loss is None else (loss,) + tuple_output

# One output class per entry of the tag set.
model = MultiLabelClassifier(num_labels=len(outputid2label))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Write the tokenizer files into 'nyt/BERT_ED', the same directory that is
# later passed to the Trainer as its output_dir.

tokenizer.save_pretrained('nyt/BERT_ED')

('nyt/BERT_ED/tokenizer_config.json',
 'nyt/BERT_ED/special_tokens_map.json',
 'nyt/BERT_ED/vocab.txt',
 'nyt/BERT_ED/added_tokens.json')

# data

In [9]:
# Load the NYT training split together with its word and relation vocabularies
# (nyt/train.json, nyt/words2id.json, nyt/relations2id.json).

import json

# JSON text is UTF-8 by specification; the encoding is passed explicitly so
# that decoding does not depend on the platform's default locale.
with open('nyt/train.json', encoding='utf-8') as f:
    train_set = json.load(f)


with open('nyt/words2id.json', encoding='utf-8') as f:
    word2id = json.load(f)

# Inverse vocabulary: word id -> word string.
id2word = {v: k for k, v in word2id.items()}


with open('nyt/relations2id.json', encoding='utf-8') as f:
    rel2id_original = json.load(f)

In [10]:
# Split every relation key into its components, indexed by relation id.
# Non-"None" keys are split on '/' and segments 1, 2 and 3 are stored as the
# head type, tail type and relation name respectively.
id2head_type, id2tail_type, id2rel = {}, {}, {}

for rel_path, rel_id in rel2id_original.items():
    if rel_path != "None":
        segments = rel_path.split('/')
        id2head_type[rel_id] = segments[1]
        id2tail_type[rel_id] = segments[2]
        id2rel[rel_id] = segments[3]
    else:
        # The "no relation" entry has no components; store the marker in all three maps.
        id2head_type[rel_id] = id2tail_type[rel_id] = id2rel[rel_id] = "None"

# Relation name -> id. If two keys share the same segment 3, the id seen last wins.
rel2id = {rel_name: rel_id for rel_id, rel_name in id2rel.items()}

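`train_set` is a list of three parallel lists, each indexed by example:

- `train_set[0]` — [length]: one entry per example (presumably the sentence length; the code below only uses the number of entries).
- `train_set[1]` — [words]: the word ids of each sentence.
- `train_set[2]` — [head, tail, relation, head, tail, relation, ...]: a flat list of triples per sentence, where head and tail are word positions within the sentence and relation is a relation id.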

In [None]:
from tqdm import tqdm

# Build one record per training sentence:
#   'text'     - the sentence as a single space-separated string
#   'entity'   - [(word, type), ...] ordered by position in the sentence
#   'relation' - {relation name: [(head word, tail word), ...] or None}
relation_dict = {
    'text': [],
    'entity': [],
    'relation': []
}
id_count = 0

for i in tqdm(range(len(train_set[0]))):
    word_ids = train_set[1][i]   # word ids of sentence i
    triples = train_set[2][i]    # flat [head, tail, relation, head, tail, relation, ...]

    # text: join the words, then normalise the spacing.
    # Fix: the cleaned string used to be computed into a separate variable from
    # the wrong source and then discarded, so the raw string (with its trailing
    # space) was what got stored. The cleaned sentence is stored now.
    sent = " ".join(id2word[word_id] for word_id in word_ids)
    sent = sent.strip().replace("  ", " ")
    relation_dict['text'].append(sent)

    # (position, type) pairs of every word that takes part in a relation
    entity_for_order = []

    # relations: one (initially empty) list of (head word, tail word) per relation name
    relation_pairs = {rel: [] for rel in rel2id.keys()}

    for j in range(0, len(triples), 3):
        head_index = triples[j]
        tail_index = triples[j + 1]
        rel_id = triples[j + 2]
        relation = id2rel[rel_id]

        # The entity type is taken from the relation the word participates in.
        head_entry = (head_index, id2head_type[rel_id])
        if head_entry not in entity_for_order:
            entity_for_order.append(head_entry)
        tail_entry = (tail_index, id2tail_type[rel_id])
        if tail_entry not in entity_for_order:
            entity_for_order.append(tail_entry)

        relation_pairs[relation].append(
            (id2word[word_ids[head_index]], id2word[word_ids[tail_index]])
        )

    # List the entities in the order in which they appear in the sentence.
    entity_for_order.sort(key=lambda x: x[0])
    entity = [(id2word[word_ids[position]], type_name) for position, type_name in entity_for_order]

    # Relations with no pair in this sentence are stored as None rather than [].
    for rel in rel2id.keys():
        if relation_pairs[rel] == []:
            relation_pairs[rel] = None

    relation_dict['entity'].append(entity)
    relation_dict['relation'].append(relation_pairs)


# save the relation_dict to a json file (disabled; a later cell reads this path)

# with open('nyt/data/sent-pre-process.json', 'w') as f:
    # json.dump(relation_dict, f)

In [None]:
# Sanity check: the three parallel lists must hold the same number of entries.
len(relation_dict['text']) == len(relation_dict['entity']) == len(relation_dict['relation'])

In [11]:
import json

# Switch for this cell: when falsy, nothing is loaded and `dataset` is left undefined.
ner = 1
from datasets import Dataset

relation_dict = {}
if ner:
    # Read the records written by the preprocessing step.
    with open('nyt/data/sent-pre-process.json') as f:
        relation_dict = json.load(f)

    # Keep exactly the three columns the labelling code consumes.
    dataset = Dataset.from_dict(
        {column: relation_dict[column] for column in ('text', 'entity', 'relation')}
    )

In [13]:
# Show the id -> tag mapping for reference; the labelling function below writes these ids.
outputid2label

{0: 'B', 1: 'IN', 2: 'OUT'}

In [14]:
def is_subset(a, b):
    """
    Find the first position at which ``b`` occurs as a contiguous run inside ``a``.

    Returns:
        ``(True, start)`` where ``a[start:start + len(b)] == b``.

    Raises:
        ValueError: if ``b`` is longer than ``a``, or if ``b`` does not occur in ``a``.
    """
    width = len(b)
    if width > len(a):
        raise ValueError("entity is longer than text")

    # Candidate start positions are scanned left to right; the first match wins.
    candidates = range(len(a) - width + 1)
    start = next((pos for pos in candidates if a[pos:pos + width] == b), None)
    if start is None:
        raise ValueError("entity is not in text")
    return True, start

def pro_processing_ner(example, max_length=512):
    """
    Turn a batch of sentences and their entities into model-ready text and
    per-token label sequences.

    Args:
        example: a batch dict with parallel lists under ``'text'`` (sentence
            strings) and ``'entity'`` (for each sentence, a list of
            ``(entity_string, entity_type)`` pairs).
        max_length: length of every returned label sequence. Defaults to 512,
            the ``max_length`` the tokenization step in this notebook pads and
            truncates to.

    Returns:
        A dict with
          * ``'input_ids'``: the sentences wrapped as ``"[CLS] ... [SEP]"``
            (strings, despite the key name), and
          * ``'labels'``: one list of ``max_length`` ints per sentence, where
            0 marks the first token of an entity, 1 its remaining tokens,
            2 any other token, and -100 the padding positions.

    Raises:
        ValueError: propagated from ``is_subset`` when an entity's tokens are
            not found in the tokenized sentence.
    """
    input_texts = []
    labels = []
    for row, raw_text in enumerate(example['text']):
        # entity extraction and NER
        text = raw_text.strip()
        text_w_relation = "[CLS] " + text + " [SEP]"

        tokenized_text = tokenizer.tokenize(text_w_relation)
        tokenized_entity = [tokenizer.tokenize(entity) for entity, _ in example['entity'][row]]

        # Every token starts as 'OUT' (2); entity spans are overwritten below.
        label = [2] * len(tokenized_text)

        for entity_tokens in tokenized_entity:
            # Position of the first occurrence of the entity's tokens in the sentence.
            _, start = is_subset(tokenized_text, entity_tokens)

            label[start] = 0
            if len(entity_tokens) > 1:
                label[start + 1: start + len(entity_tokens)] = [1] * (len(entity_tokens) - 1)

        # Fix: force exactly max_length labels. Sentences longer than max_length
        # used to keep one label per token and so ended up longer than the
        # truncated input_ids. Spans are marked before cutting, so an entity
        # that lies beyond the cut cannot cause an index error.
        label = label[:max_length]
        label += [-100] * (max_length - len(label))

        input_texts.append(text_w_relation)
        labels.append(label)
    return {
        'input_ids': input_texts,
        'labels': labels
        }

In [15]:
# feed the dataset:dataset to the pro_processing_ner() function with tokenizer, at each time, we feed 30 examples to the function, and then save the output to a json file
# each time the return of the function is a dict, we need to save the dict to a list, and then save the list to a json file

import json
from tqdm import tqdm


# Accumulators for the bracketed sentences and their label sequences.
output = {"input_texts": []}
labels = {"labels": []}

# Process the dataset in slices of 30 rows; each slice is a dict of column lists.
for batch_start in tqdm(range(0, len(dataset), 30)):
    batch = dataset[batch_start:batch_start + 30]
    result = pro_processing_ner(batch)
    output["input_texts"] += result["input_ids"]
    labels["labels"] += result["labels"]

100%|██████████| 1874/1874 [00:31<00:00, 60.01it/s]


In [16]:
# Spot-check one example: print the bracketed sentence and its padded label sequence.
num = 7

print(output["input_texts"][num])
print(labels["labels"][num])

[CLS] United States Representative Charles B. Rangel said yesterday that he would endorse C. Virginia Fields in New York City 's Democratic mayoral primary , giving her a jolt of momentum and delivering a setback to the efforts of Fernando Ferrer , the former Bronx borough president , to win black support . [SEP]
[2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 

In [17]:
# make the output["input_texts"] into a dataset
from datasets import Dataset

# Single-column dataset holding the bracketed sentences; this is what gets tokenized.
input_text_dataset = Dataset.from_dict({'input_texts': output['input_texts']})

# Single-column dataset holding the label sequences.
labels_dataset = Dataset.from_dict({'labels': labels['labels']})


In [18]:
# Tokenize every sentence to exactly 512 ids. The text already contains its own
# [CLS]/[SEP] markers, hence add_special_tokens=False.
# The deprecated `pad_to_max_length=True` argument was dropped: padding is
# already requested through padding='max_length'.
tokenized_dataset = input_text_dataset.map(
    lambda example: tokenizer(
        example['input_texts'],
        add_special_tokens=False,
        padding='max_length',
        truncation=True,
        max_length=512,
    ),
    batched=True,
)

Map:   0%|          | 0/56195 [00:00<?, ? examples/s]

In [19]:
import torch

# Attach the label sequences produced by pro_processing_ner as a 'labels' column.
# add_column returns a new Dataset, hence the reassignment.

tokenized_dataset = tokenized_dataset.add_column('labels', labels['labels'])

In [21]:
# Inspect the dataset schema after adding the labels column.
tokenized_dataset

Dataset({
    features: ['input_texts', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 56195
})

In [22]:
# Save the tokenized dataset, including its 'input_texts' column.
# A `tokenized_dataset.remove_columns('input_texts')` call used to sit here, but
# it had no effect: remove_columns returns a new Dataset and the result was
# discarded. The column is kept on disk deliberately, because the cell that
# reloads this dataset removes 'input_texts' itself.

tokenized_dataset.save_to_disk('nyt/BERT_ED/train_data_ED')

Saving the dataset (0/1 shards):   0%|          | 0/56195 [00:00<?, ? examples/s]

In [10]:
from datasets import Dataset

# Reload the tokenized training set from disk and drop the raw-text column.
# remove_columns returns a new Dataset, hence the reassignment.
tokenized_dataset = Dataset.load_from_disk('nyt/BERT_ED/train_data_ED')
tokenized_dataset = tokenized_dataset.remove_columns('input_texts')

In [11]:
# Return only these three columns, as torch tensors, when the dataset is indexed.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [12]:
# Peek at rows 1 and 4 to check the formatted tensors.
tokenized_dataset.__getitems__([1,4])

[{'input_ids': tensor([  101,  1456,  2938, 23616,  9272,  9637,  2249,   150, 13329,  9741,
            143,  9919, 21669, 12152,  2162, 22412, 10090,   117,  1340,  1512,
            118,  1351,  1476,   119,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [13]:
import torch

model.eval()

# Smoke test: run the first two training examples through the untrained head,
# with labels, so the returned tuple starts with the loss.
first_two = slice(0, 2)
with torch.no_grad():
    outputs = model(
        input_ids=tokenized_dataset['input_ids'][first_two],
        attention_mask=tokenized_dataset['attention_mask'][first_two],
        labels=tokenized_dataset['labels'][first_two],
    )
    # Variant without labels (the returned tuple then starts with the logits):
    # outputs = model(input_ids=tokenized_dataset['input_ids'][0:2], attention_mask=tokenized_dataset['attention_mask'][0:2])

In [14]:
# Labels were passed above, so element 0 of the returned tuple is the loss.
outputs[0]

tensor(1.2252)

# Evaluation

In [15]:
import evaluate

import numpy as np

# Load the NYT training split together with its word and relation vocabularies
# (nyt/train.json, nyt/words2id.json, nyt/relations2id.json). This repeats the
# loading done in the data section so the evaluation cell can run on its own.

import json

# JSON text is UTF-8 by specification; the encoding is passed explicitly so
# that decoding does not depend on the platform's default locale.
with open('nyt/train.json', encoding='utf-8') as f:
    train_set = json.load(f)


with open('nyt/words2id.json', encoding='utf-8') as f:
    word2id = json.load(f)

# Inverse vocabulary: word id -> word string.
id2word = {v: k for k, v in word2id.items()}


with open('nyt/relations2id.json', encoding='utf-8') as f:
    rel2id_original = json.load(f)



# Split every relation key into its components, indexed by relation id.
# Non-"None" keys are split on '/' and segments 1, 2 and 3 are stored as the
# head type, tail type and relation name respectively.
id2head_type, id2tail_type, id2rel = {}, {}, {}

for rel_path, rel_id in rel2id_original.items():
    if rel_path != "None":
        segments = rel_path.split('/')
        id2head_type[rel_id] = segments[1]
        id2tail_type[rel_id] = segments[2]
        id2rel[rel_id] = segments[3]
    else:
        # The "no relation" entry has no components; store the marker in all three maps.
        id2head_type[rel_id] = id2tail_type[rel_id] = id2rel[rel_id] = "None"

# Relation name -> id. If two keys share the same segment 3, the id seen last wins.
rel2id = {rel_name: rel_id for rel_id, rel_name in id2rel.items()}


def compute_metrics(eval_preds):
    """
    Score per-token predictions against the reference tags.

    Args:
        eval_preds: a ``(logits, labels)`` pair. ``logits`` holds one score per
            class in its last dimension; ``labels`` holds the reference class
            ids, with -100 marking positions to ignore. Either may be a NumPy
            array or a torch tensor (on any device).

    Returns:
        A dict with four entries. The keys say "precision" but each per-class
        value is the fraction of positions whose reference label is that class
        and that were predicted as that class; ``precision_for_all`` is the
        fraction of non-ignored positions predicted correctly. The key names
        are kept because other cells index the result by them. A class with no
        reference positions yields NaN.
    """
    logits, labels = eval_preds

    # Convert to host NumPy arrays before any NumPy call, so that tensors living
    # on a GPU or tracking gradients are handled as well.
    if isinstance(logits, torch.Tensor):
        logits = logits.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Index of the highest score along the class dimension:
    # [batch, seq, classes] -> [batch, seq].
    predictions = np.argmax(logits, axis=-1)
    correct = predictions == labels

    def _fraction_correct(mask):
        # NaN, without NumPy's empty-slice warning, when nothing is selected.
        if not mask.any():
            return float("nan")
        return np.mean(correct[mask])

    # Overall score, ignoring the -100 positions.
    total_accuracy = _fraction_correct(labels != -100)

    # Per-class scores for B (0), IN (1) and OUT (2).
    accuracy_0 = _fraction_correct(labels == 0)
    accuracy_1 = _fraction_correct(labels == 1)
    accuracy_2 = _fraction_correct(labels == 2)

    return {
        "precision_for_begin": accuracy_0,
        "precision_for_in": accuracy_1,
        "precision_for_out": accuracy_2,
        "precision_for_all": total_accuracy,
    }

In [17]:
# Try the metric function on the two-example forward pass above; labels were
# passed there, so outputs[1] holds the logits.
compute_metrics((outputs[1], tokenized_dataset['labels'][0:2]))

{'precision_for_begin': 0.0,
 'precision_for_in': 0.5714285714285714,
 'precision_for_out': 0.09803921568627451,
 'precision_for_all': 0.14516129032258066}

In [57]:
# import json
# 
# from datasets import Dataset
# 
# ner = 1
# 
# test_relation_dict = {}
# if ner:
#     with open('nyt/data/test-sent-pre-process.json') as f:
#         test_relation_dict = json.load(f)
# 
#     test_dataset = Dataset.from_dict(
#         {
#             'text': test_relation_dict['text'],
#             'entity': test_relation_dict['entity'],
#             'relation': test_relation_dict['relation']
#         }
#     )
# 
# else:
#     pass
# 
# 
# dataset = test_dataset

In [58]:
# import json
# from tqdm import tqdm
# 
# 
# output = {"input_texts": []}
# test_labels = []
# 
# for i in tqdm(range(0, len(dataset), 30)):
#     result = pro_processing_ner(dataset[i:i+30])
#     output["input_texts"].extend(result["input_ids"])
#     test_labels.extend(result["labels"])
# 
# 
# from datasets import Dataset
# 
# input_text_dataset = Dataset.from_dict(
#     {
#         'input_texts': output['input_texts'],
#     }
# )

100%|██████████| 167/167 [00:02<00:00, 58.12it/s]


In [62]:
# tokenized_dataset = input_text_dataset.map(lambda example: tokenizer(example['input_texts'], padding='max_length', truncation=True, max_length=512, add_special_tokens=False, pad_to_max_length=True), batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [64]:
# tokenized_dataset.remove_columns('input_texts')
# tokenized_dataset = tokenized_dataset.add_column('labels', test_labels)
# tokenized_dataset.save_to_disk('nyt/BERT_ED/test_data_RC')

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [18]:
# valid_dataset

from datasets import Dataset

valid_dataset = Dataset.load_from_disk('nyt/BERT_ED/test_data_RC')
valid_dataset = valid_dataset.remove_columns('input_texts')


# Pick a reproducible window of 10 consecutive test examples to validate on.
import random
# Fixed seed so the same window is chosen on every run.
random.seed(65)

window_size = 10

# Fix: random.randint includes its upper bound, so randint(0, len(valid_dataset))
# could start the window at or past the end of the data and return fewer than
# 10 (or zero) examples. The start is now capped so a full window always fits.
random_test_index = random.randint(0, max(len(valid_dataset) - window_size, 0))


# Slicing the Dataset yields a dict of columns, from which a new Dataset is rebuilt.
valid_dataset = valid_dataset[random_test_index : random_test_index + window_size]

valid_dataset = Dataset.from_dict(
    {
        'input_ids': valid_dataset['input_ids'],
        'attention_mask': valid_dataset['attention_mask'],
        'labels': valid_dataset['labels']
    }
)


valid_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# training

In [19]:
import wandb

# Start a Weights & Biases run; the TrainingArguments below use report_to="wandb".
wandb.init(
    # set the wandb project where this run will be logged
    project="BERT-intermediate",
    # notes="PubmedBERT-FT-NER_w_NERin_10epochs",
    name="ED_nyt_5epochs"
)

[34m[1mwandb[0m: Currently logged in as: [33m309439737[0m ([33mtian1995[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [23]:
# Confirm the training set schema before building the Trainer.
tokenized_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 56195
})

In [22]:
import transformers
from transformers import TrainingArguments, Trainer
import torch.nn as nn


# Training configuration. Evaluation and checkpointing both run once per epoch,
# and the best checkpoint is reloaded when training ends.
training_args = TrainingArguments(
    output_dir='nyt/BERT_ED',  # checkpoints go to the same directory as the saved tokenizer
    num_train_epochs=5,
    auto_find_batch_size=True,  # no batch size is set explicitly in this notebook
    load_best_model_at_end=True,
    warmup_steps=1000,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    report_to="wandb",
    save_strategy="epoch",
)


class EntityDetectionTrainer(Trainer):
    """Trainer that computes a per-token cross-entropy loss from the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Run the model on ``inputs`` and return the token-level cross-entropy loss.

        Args:
            model: the model being trained. It is called with ``**inputs`` and,
                because ``inputs`` carries labels, is expected to return a
                sequence whose element 1 is the logits.
            inputs: batch dict; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: accepted so the override also works with
                Trainer versions that pass this keyword; it is not used.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs[1]
        # -100 is already the default ignore_index; it is spelled out because
        # the padding positions of the labels rely on it.
        loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        # Flatten to one row per token; the class count is read from the logits
        # instead of being hard-coded.
        loss = loss_fct(logits.view(-1, logits.size(-1)),
                        labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Wire the model, the data and the metric function into the custom Trainer.
# eval_dataset is the 10-example window sampled from the test set above.
trainer = EntityDetectionTrainer(
    model=model, 
    train_dataset=tokenized_dataset,
    eval_dataset=valid_dataset,
    args=training_args,
    compute_metrics=compute_metrics,
    )

In [None]:
# Run fine-tuning; valid_dataset is evaluated with compute_metrics once per epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Precision For Begin,Precision For In,Precision For Out,Precision For All
1,0.0267,0.040267,0.9,0.5,0.992523,0.982238


In [None]:
# Close the wandb run, then write the final weights and the tokenizer to disk.
wandb.finish()
trainer.save_model("nyt/BERT_ED/model-5epochs")

# save the tokenizer
tokenizer.save_pretrained("nyt/BERT_ED/tokenizer")

# Inference

In [None]:
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# NOTE(review): the training section saves its weights under
# "nyt/BERT_ED/model-5epochs", while this path points at "nyt/BERT_ED/model" —
# confirm which checkpoint is intended.
checkpoint = "nyt/BERT_ED/model/pytorch_model.bin"

In [None]:
import torch.nn as nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification

class MultiLabelClassifier(nn.Module):
    """
    Per-token tagger: a pretrained ``bert-base-cased`` encoder followed by a
    linear layer that scores every token position against ``num_labels`` classes.

    This repeats the class defined in the model section of the notebook, so the
    inference cells can rebuild the architecture before loading saved weights.
    """

    def __init__(self, num_labels):
        """Load the encoder and build a ``hidden_size -> num_labels`` linear head."""
        super().__init__()
        self.bert = AutoModel.from_pretrained('bert-base-cased')
        self.config = self.bert.config
        self.num_labels = num_labels
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)
        # Constructed but not applied: the dropout call in forward() is disabled.
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Encode the batch and classify every token position.

        A cross-entropy loss (positions labelled -100 are ignored) is computed
        only when ``labels`` is given and ``self.config.problem_type`` is None.

        Returns:
            When ``return_dict`` is falsy (including the default ``None``), a
            plain tuple: ``(loss, logits, *extras)`` if a loss was computed,
            otherwise ``(logits, *extras)``, where ``extras`` is ``outputs[2:]``
            of the encoder. When ``return_dict`` is truthy, a
            ``SequenceClassifierOutput``.
        """
        encoder_inputs = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        outputs = self.bert(**encoder_inputs)

        # First encoder output; it is fed position-by-position to the linear head.
        token_states = outputs[0]

        # token_states = self.dropout(token_states)
        logits = self.classifier(token_states)

        loss = None
        if labels is not None and self.config.problem_type is None:
            criterion = nn.CrossEntropyLoss(ignore_index=-100)
            # Flatten so that every token position is one classification example.
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        tuple_output = (logits,) + outputs[2:]
        return tuple_output if loss is None else (loss,) + tuple_output

# NOTE: relies on outputid2label from the label-map cell near the top of the notebook.
model = MultiLabelClassifier(num_labels=len(outputid2label))

In [None]:
import torch

# Load the saved weights onto the CPU first, so a checkpoint written on a GPU
# machine can also be restored where no GPU is present. load_state_dict copies
# the values into the model's existing parameters.
state_dict = torch.load(checkpoint, map_location="cpu")
model.load_state_dict(state_dict)

In [None]:
from datasets import Dataset

# Load the tokenized test split from the path written by the (now commented-out)
# test preprocessing cells.
test_dataset = Dataset.load_from_disk('nyt/BERT_ED/test_data_RC')
# test_dataset = valid_dataset.remove_columns('input_texts')

# Return only these three columns, as torch tensors, when the dataset is indexed.
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# CPU smoke test of the loaded weights on the first two test examples.
model.eval()
model.to("cpu")

with torch.no_grad():
    outputs = model(input_ids=test_dataset['input_ids'][0:2], attention_mask=test_dataset['attention_mask'][0:2])
    
# No labels were passed, so element 0 of the returned tuple is the logits.
compute_metrics((outputs[0], test_dataset['labels'][0:2]))

In [None]:
from tqdm.notebook import trange, tqdm
import torch
import numpy as np

# Per-batch metric values, one list per metric key returned by compute_metrics.
final_result = {
        "precision_for_begin": [],
        "precision_for_in": [],
        "precision_for_out": [],
        "precision_for_all": []
    }


model.eval()
outputs = []
output_results = []

# Use the GPU when one is available and fall back to the CPU otherwise
# (the device used to be hard-coded to "cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


with torch.no_grad():
    # Feed the test set to the model 50 examples at a time.
    for input_index in tqdm(range(0, len(test_dataset), 50)):
        # Slice the dataset once per batch and reuse the columns.
        batch = test_dataset[input_index : input_index + 50]
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)
        # No labels were passed, so output[0] is the logits; score them on the CPU.
        pred = output[0].cpu()
        result = compute_metrics((pred, batch["labels"]))

        for k, v in result.items():
            final_result[k].append(v)


# Average each metric over the batches. The divisor used to be a hard-coded 100,
# which is only correct when there are exactly 100 batches. Batches in which a
# metric is NaN (its class did not occur) are left out of that metric's average.
# Note that this is a mean of per-batch scores, not a score over all tokens.
result_sum = {}
for k, v in final_result.items():
    result_sum[k] = float(np.nanmean(v)) if len(v) else float("nan")

print(result_sum)


In [None]:
# using matplotlib to plot the final_result

import matplotlib.pyplot as plt

# One point per averaged metric, labelled with the metric name on the x axis.
metric_names = list(result_sum.keys())
metric_values = list(result_sum.values())

plt.figure(figsize=(20, 10))
plt.plot(metric_values)
plt.xticks(range(len(metric_names)), metric_names, rotation=90)
plt.show()