https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing#scrollTo=T-gy-LxM0yAi

In [2]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
checkpoint = "microsoft/biogpt"
from relations import relations
from datasets import DatasetDict, Dataset
import pandas as pd
from tqdm.notebook import trange, tqdm
from labels import get_labels

In [3]:
# load labels for bert_w_ner
additional_tokens, _, _, _ = get_labels(mode='GPT_w_ner')
print(additional_tokens, "\n", additional_tokens)

{'additional_special_tokens': ['[learn1]', '[learn2]', '[learn3]', '[learn4]', '[learn5]', '[learn6]']} 
 {'additional_special_tokens': ['[learn1]', '[learn2]', '[learn3]', '[learn4]', '[learn5]', '[learn6]']}


# load the model

In [4]:
# load the model in 8-bit quantization configuration
# the max length of the input is 1024
model = AutoModelForCausalLM.from_pretrained(checkpoint, 
    # load_in_8bit=True, 
    device_map={'':torch.cuda.current_device()},)

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [6]:
print_trainable_parameters(model)

trainable params: 346763264 || all params: 346763264 || trainable%: 100.0


# Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

In [8]:
# adding new tokens to the tokenizer
# since I haven't load the model so I will resize the embedding of the model later]
num_added_toks = tokenizer.add_special_tokens(additional_tokens)
print('We have added', num_added_toks, 'tokens')

# save the tokenizer
tokenizer.save_pretrained("GPT_without_ner/GPT_w_ner_tokenizer")

We have added 6 tokens


('GPT_without_ner/GPT_w_ner_tokenizer/tokenizer_config.json',
 'GPT_without_ner/GPT_w_ner_tokenizer/special_tokens_map.json',
 'GPT_without_ner/GPT_w_ner_tokenizer/vocab.json',
 'GPT_without_ner/GPT_w_ner_tokenizer/merges.txt',
 'GPT_without_ner/GPT_w_ner_tokenizer/added_tokens.json')

In [9]:
model.resize_token_embeddings(len(tokenizer))

Embedding(42390, 1024)

# PEFT

Finally, we need to apply some post-processing on the 8-bit model to enable training, let's freeze all our layers, and cast the layer-norm in float32 for stability.

We also cast the output of the last layer and embedding layer in float32 for the same reasons.

In [10]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)

model.biogpt.embed_tokens = CastOutputToFloat(model.biogpt.embed_tokens)
model.output_projection = CastOutputToFloat(model.output_projection)

In [11]:
# more with LoRAconfig: https://huggingface.co/docs/peft/conceptual_guides/lora

from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftType

peft_config = LoraConfig(
    # r: the rank of the update matrices, expressed in int. Lower rank results in smaller update matrices with fewer trainable parameters.
    r=16,
    # alpha: LoRA scaling factor.
    lora_alpha=32, 
    # target_modules: The modules (for example, attention blocks) to apply the LoRA update matrices.
    target_modules=["q_proj", "v_proj"],
    fan_in_fan_out=True,
    lora_dropout=0.05,
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
print_trainable_parameters(model)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/tian/mambaforge/envs/BioRED/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /home/tian/mambaforge/envs/BioRED/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


trainable params: 1572864 || all params: 348342272 || trainable%: 0.4515283175278825


In [12]:
# make model's embed_tokens layer also trainable

model.biogpt.embed_tokens[0].weight.requires_grad = True
model.output_projection[0].weight.requires_grad = True

print_trainable_parameters(model)

trainable params: 44980224 || all params: 348342272 || trainable%: 12.912651611803232


In [13]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BioGptForCausalLM(
      (biogpt): BioGptModel(
        (embed_tokens): CastOutputToFloat(
          (0): Embedding(42390, 1024)
        )
        (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
        (layers): ModuleList(
          (0-23): 24 x BioGptDecoderLayer(
            (self_attn): BioGptAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(
                in_features=1024, out_features=1024, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): Pa

In [14]:
# for model, print the layer's name if the layer is trainable, and print the precision of the layer

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.shape, param.dtype)

base_model.model.biogpt.embed_tokens.0.weight torch.Size([42390, 1024]) torch.float32
base_model.model.biogpt.layers.0.self_attn.v_proj.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.biogpt.layers.0.self_attn.v_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.biogpt.layers.0.self_attn.q_proj.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.biogpt.layers.0.self_attn.q_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.biogpt.layers.1.self_attn.v_proj.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.biogpt.layers.1.self_attn.v_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.biogpt.layers.1.self_attn.q_proj.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.biogpt.layers.1.self_attn.q_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.biogpt.layers.2.self_attn

# pre-process the text

In [15]:
# from data_preprocessing import make_GPT_re_data, GPT_w_ner_preprocess_function

from data_preprocessing import all_line_of_pmid, get_original_text, get_identifier_and_entity, reorder_list, get_relations

In [16]:
# train and valid file paths
train_file_path = 'data/BioRED/processed/train.tsv'
valid_file_path = 'data/BioRED/processed/dev.tsv'

In [22]:
file_path=train_file_path
lower=True
from relations import relations
import random
seed = 42

In [27]:
# def make_GPT_re_data_no_ner(file_path, lower=True):
"""make a dictionary for the dataset for GPT_re
input is the .tsv file path
return the hugging face dataset
data_dict = {
    "pmids": [],
    "text": [],
    "entities": [],
    "outputs": []
}

"""

"""make a dictionary for the dataset for GPT_re
input is the .tsv file path
return the hugging face dataset
data_dict = {
    "pmids": [],
    "text": [],
    "entities": [],
    "outputs": [],
    "relations": [] (lower)
    }

"""
random.seed(10)
data_dict = {
    "pmids": [],
    "text": [],
    "entities": [],
    "outputs": [],
    "relation": []
}
dropped = []
dataset = pd.read_csv(file_path, delimiter="\t", header=None)

# relations_dict = {relations[i]: f"[RELATION{i}]" for i in range(len(relations))}

# tag_dict = {}
start = 0
while start < (len(dataset) - 1):
# while start == 0:
    pmid, start, end = all_line_of_pmid(dataset, start)
    # the text
    text = get_original_text(dataset, start, end, lower)

    pmid = dataset.iloc[start, 0]
    included_relations = {}
    for i in range(start, end):
        if dataset.iloc[i, 8] == "None":
            continue
        else:
            included_relations[dataset.iloc[i, 8].lower()] = {
                'entities':[],
                'outputs':[],
                'output_lines':[]
            }
    
    # entities_list: [(source, target), (source, target), ...)] 

    # for i in range(start, start + 1):
    for i in range(start, end):
        if dataset.iloc[i, 8] == "None":
            continue
        # get entities and their identifiers of this line. the 1st item of entities is the src and the 2nd is the tgt
        entities, entity_to_identifier = get_identifier_and_entity(dataset, i, i + 1, lower)

        # 1) reorder the entities, find the first occurred entity as the entity1
        reordered_e1 = reorder_list(text, {list(entities.items())[0][0]:list(entities.items())[0][1]}, lower, mode='length')
        reordered_e2 = reorder_list(text, {list(entities.items())[1][0]:list(entities.items())[1][1]}, lower, mode='length')
        if len(reordered_e1) == 0 or len(reordered_e2) == 0:
            dropped.append(i)
            continue
        # for strings in reordered_e1 and reordered_e2, if there is a space before or after a dot, delete the space
        for j in range(len(reordered_e1)):
            new_string = ".".join(reordered_e1[j].split(" . "))
            new_string = ".".join(new_string.split(" ."))
            new_string = ".". join (new_string.split(". "))
            reordered_e1[j] = new_string

        for j in range(len(reordered_e2)):
            new_string = ".".join(reordered_e2[j].split(" . "))
            new_string = ".".join(new_string.split(" ."))
            new_string = ".". join (new_string.split(". "))
            reordered_e2[j] = new_string

        if dataset.iloc[i, 3] == list(entities.keys())[0]:
            # reordered_e1 is the source
            output_line = f"the source is {reordered_e1[0]} and the target is {reordered_e2[0]}"
            included_relations[dataset.iloc[i, 8].lower()]['outputs'].append(output_line)
            included_relations[dataset.iloc[i, 8].lower()]['entities'].append((reordered_e1, reordered_e2))
        elif dataset.iloc[i, 3] == list(entities.keys())[1]:
            # reordered_e2 is the source
            output_line = f"the source is {reordered_e2[0]} and the target is {reordered_e1[0]}"
            included_relations[dataset.iloc[i, 8].lower()]['outputs'].append(output_line)
            included_relations[dataset.iloc[i, 8].lower()]['entities'].append((reordered_e2, reordered_e1))
        else:
            dropped.append(i)
            print("error in line: ", i)
            continue

    for r, v in included_relations.items():
        data_dict["pmids"].append(str(pmid))
        data_dict["text"].append(text.strip())
        data_dict["relation"].append(r.lower().strip())
        out_line = ""
        for line in v['outputs']:
            out_line += line.lower().strip() + " ; "
        out_line = out_line[:-3] + " . "
        data_dict["outputs"].append(out_line)
        data_dict['entities'].append(v['entities'])
    # randomly choosing a itwm that is in the relations and not in the included_relations.keys()
    # have a random index in len(relations)
    # if the index is in the included_relations.keys(), continue
    # else, add the relation to the included_relations.keys()
    random_index = random.randint(0, len(relations) - 1)
    while relations[random_index].lower() in included_relations.keys() or relations[random_index].lower() == "none":
        random_index = random.randint(0, len(relations) - 1)
    
    # add the relation to the included_relations.keys()
    data_dict["pmids"].append(str(pmid))
    data_dict["text"].append(text.strip())
    data_dict["relation"].append(relations[random_index].lower().strip())
    data_dict["outputs"].append("the source is none . ")
    data_dict['entities'].append([(['none'], ['none'])])
    start = end

if dropped:
    print(f"Dropped {len(dropped)} line:\n {dropped}")

# for k,v in data_dict.items():
    # print(k, v)
# return data_dict

IndexError: list index out of range

In [34]:
len(reordered_e2)

0

In [63]:
data_dict['entities'][0][0]

(['type ii diabetes',
  'type ii diabetic',
  'maturity-onset diabetes',
  'type ii diabetes mellitus',
  'type ii ( non-insulin-dependent ) diabetes mellitus'],
 ['hnf-6', 'hepatocyte nuclear factor-6', 'hepatocyte nuclear factor (hnf)-6'])

In [51]:
list(entities.keys())[1]

'3175'

In [46]:
included_relations

{'association': {'entities': [], 'outputs': []},
 'positive_correlation': {'entities': [], 'outputs': []}}

In [45]:
included_relations[dataset.iloc[i, 8].lower()]['outputs']

KeyError: 'none'

In [32]:
reordered_e2[0]

'hnf-6'

In [30]:
reordered_e2

['hnf-6', 'hepatocyte nuclear factor-6', 'hepatocyte nuclear factor (hnf)-6']

In [28]:
reordered_e1

['type ii diabetes',
 'type ii diabetic',
 'maturity-onset diabetes',
 'type ii diabetes mellitus',
 'type ii ( non-insulin-dependent ) diabetes mellitus']

In [23]:
entity_to_identifier

{'hnf-6': {'entity_type': 'GeneOrGeneProduct', 'identifier': '3175'},
 'hepatocyte nuclear factor-6': {'entity_type': 'GeneOrGeneProduct',
  'identifier': '3175'},
 'hepatocyte nuclear factor (hnf)-6': {'entity_type': 'GeneOrGeneProduct',
  'identifier': '3175'},
 'mody4': {'entity_type': 'GeneOrGeneProduct', 'identifier': '3651'},
 'mody': {'entity_type': 'GeneOrGeneProduct', 'identifier': '3651'}}

In [24]:
reordered_e1

['hepatocyte nuclear factor-6', 'hepatocyte nuclear factor (hnf)-6', 'hnf-6']

In [16]:
# make bert_re data
train_data_raw = make_GPT_re_data(file_path=train_file_path, lower=True)
valid_data_raw = make_GPT_re_data(file_path=valid_file_path, lower=True)

NameError: name 'make_GPT_re_data' is not defined

In [17]:
# save the raw data
# import json

# with open('GPT_w_ner/data/train_data_dict.json', 'w') as f:
#     json.dump(train_data_raw, f)

# with open('GPT_w_ner/data/valid_data_dict.json', 'w') as f:
#     json.dump(valid_data_raw, f)

In [18]:
# import json

# with open('GPT_w_ner/data/train_data_dict.json', 'r') as f:
#     train_data_raw = json.load(f)

# with open('GPT_w_ner/data/valid_data_dict.json', 'r') as f:
#     valid_data_raw = json.load(f)

# print(train_data_raw.keys())
# for k, v in train_data_raw.items():
#     print(k, len(v))

# # make into Dataset type
# train_data_raw = Dataset.from_dict(train_data_raw)
# valid_data_raw = Dataset.from_dict(valid_data_raw)

In [19]:
from torch.utils.data import Subset
"""
for the train_dataset:
{'[None]': 18720,
 '[Association]': 2183,
 '[Bind]': 60,
 '[Comparison]': 28,
 '[Conversion]': 3,
 '[Cotreatment]': 31,
 '[Drug_Interaction]': 11,
 '[Negative_Correlation]': 763,
 '[Positive_Correlation]': 1088}

so it is neccessary to balance the dataset, we randomly choose 3000 samples from the [None] class with the seed 42
"""
import random
random.seed(42)

# get the index of the [None] class of the datasets type of train_data_raw
none_index = [i for i, example in enumerate(train_data_raw) if example['relation'] == 'none']

# randomly choose 18720-3000 samples from the [None] class
none_index = random.sample(none_index, 18720-3000)
keep_indices = [i for i in range(len(train_data_raw)) if i not in none_index]

# delete the [None] class samples from the train_data_raw
train_data_raw_balanced = train_data_raw.select(keep_indices)

In [20]:
"""train_data_raw_balanced"""

'train_data_raw_balanced'

In [21]:
"""train_data_raw_balanced[0]"""

'train_data_raw_balanced[0]'

In [22]:
"""dataset = DatasetDict({
    "train": train_data_raw_balanced,
    "valid": valid_data_raw
})"""

'dataset = DatasetDict({\n    "train": train_data_raw_balanced,\n    "valid": valid_data_raw\n})'

In [23]:
"""tokenized_datasets = dataset.map(lambda example: GPT_w_ner_preprocess_function(example, tokenizer, mode="gpt_w_ner"), batched=True, remove_columns=['pmids', 'text', 'entities', 'outputs', 'relation'])"""

'tokenized_datasets = dataset.map(lambda example: GPT_w_ner_preprocess_function(example, tokenizer, mode="gpt_w_ner"), batched=True, remove_columns=[\'pmids\', \'text\', \'entities\', \'outputs\', \'relation\'])'

In [24]:
# tokenized_datasets.save_to_disk('GPT_w_ner/data/tokenized_dataset_w_ner')


In [25]:
from datasets import load_from_disk

tokenized_datasets = load_from_disk('GPT_w_ner/data/tokenized_dataset_w_ner')

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 7167
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 6650
    })
})

In [26]:
# to tensor
tokenized_datasets.set_format(type='torch', columns=['input_ids'])

In [27]:
tokenizer.decode(tokenized_datasets['train']['input_ids'][0])

'hepatocyte nuclear factor-6: associations between genetic variability and type ii diabetes and between genetic variability and estimates of insulin secretion. the transcription factor hepatocyte nuclear factor (hnf) -6 is an upstream regulator of several genes involved in the pathogenesis of maturity-onset diabetes of the young. we therefore tested the hypothesis that variability in the hnf-6 gene is associated with subsets of type ii (non-insulin-dependent) diabetes mellitus and estimates of insulin secretion in glucose tolerant subjects. we cloned the coding region as well as the intron-exon boundaries of the hnf-6 gene. w e then examined them on genomic dna in six mody probands without mutations in the mody1, mody3 and mody4 genes and in 54 patients with late-onset type ii diabetes by combined single strand conformational polymorphism-heteroduplex analysis followed by direct sequencing of identified variants. an identified missense variant was examined in association studies and ge

# Training

wandb

In [28]:
import wandb

wandb.init(
    # set the wandb project where this run will be logged
    project="GPT2",
    # notes="PubmedBERT-FT-NER_w_NERin_10epochs",
    name="BioGPT_w_ner_epoch_5_balanced_train_data_no_[]",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m309439737[0m ([33mtian1995[0m). Use [1m`wandb login --relogin`[0m to force relogin


training

In [29]:
from transformers import DataCollatorForLanguageModeling

In [30]:
import transformers

trainer = transformers.Trainer(
    model=model, 
    train_dataset=tokenized_datasets['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=8, 
        gradient_accumulation_steps=8,
        warmup_steps=1000, 
        num_train_epochs=15,
        learning_rate=2e-4, 
        fp16=True,
        logging_steps=1, 
        report_to="wandb",
        save_strategy="epoch",
        output_dir='GPT_w_ner'
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



  0%|          | 0/1680 [00:00<?, ?it/s]

{'loss': 3.2103, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.01}
{'loss': 3.2309, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.02}
{'loss': 3.2165, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.03}
{'loss': 3.2313, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.04}
{'loss': 3.1868, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.04}
{'loss': 3.1758, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.05}
{'loss': 3.1678, 'learning_rate': 1.4000000000000001e-06, 'epoch': 0.06}
{'loss': 3.2831, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.07}
{'loss': 3.3207, 'learning_rate': 1.8e-06, 'epoch': 0.08}
{'loss': 3.1115, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.09}
{'loss': 3.2277, 'learning_rate': 2.2e-06, 'epoch': 0.1}
{'loss': 3.2029, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.11}
{'loss': 3.2163, 'learning_rate': 2.6e-06, 'epoch': 0.12}
{'loss': 3.3188, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.12}
{'loss': 3.2118, 'learning

TrainOutput(global_step=1680, training_loss=0.7817108644749082, metrics={'train_runtime': 34418.62, 'train_samples_per_second': 3.123, 'train_steps_per_second': 0.049, 'train_loss': 0.7817108644749082, 'epoch': 15.0})

In [31]:
import wandb
wandb.finish()
trainer.save_model("GPT_w_ner/models/GPT_w_ner_epoch_15_balanced")

VBox(children=(Label(value='0.001 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.070750…

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇▇███▇▇▆▆▅▅▅▄▄▃▃▂▂▁▁
train/loss,███▆▆▅▄▄▄▃▃▃▂▃▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,15.0
train/global_step,1680.0
train/learning_rate,0.0
train/loss,0.1464
train/total_flos,2.0071882986356736e+17
train/train_loss,0.78171
train/train_runtime,34418.62
train/train_samples_per_second,3.123
train/train_steps_per_second,0.049


In [36]:
model.save_pretrained("GPT_w_ner/models/GPT_w_ner_epoch_15_balanced.peft")

In [37]:
# Since there are key-unmatches in the trainer.save_model(), we need to rename the keys and load the paras in the model

embed_tokens_state_dict = torch.load("GPT_w_ner/models/GPT_w_ner_epoch_15_balanced/pytorch_model.bin")

old_keys = ["base_model.model.biogpt.embed_tokens.0.weight", "base_model.model.output_projection.0.weight"]
new_keys = ["base_model.model.biogpt.embed_tokens.weight", "base_model.model.output_projection.weight"]

for old_key, new_key in zip(old_keys, new_keys):
    # Get the value of the old key
    value = embed_tokens_state_dict[old_key]

    # Create a new key-value pair with the updated name
    embed_tokens_state_dict[new_key] = value

    # Delete the old key if desired
    del embed_tokens_state_dict[old_key]

torch.save(embed_tokens_state_dict, "GPT_w_ner/models/GPT_w_ner_epoch_15_balanced/pytorch_model-af.bin")

In [35]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BioGptForCausalLM(
      (biogpt): BioGptModel(
        (embed_tokens): CastOutputToFloat(
          (0): Embedding(42390, 1024)
        )
        (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
        (layers): ModuleList(
          (0-23): 24 x BioGptDecoderLayer(
            (self_attn): BioGptAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(
                in_features=1024, out_features=1024, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): Pa

# load model and inference

In [33]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "microsoft/biogpt"

peft_model_id = "GPT_w_ner/models/GPT_w_ner_epoch_15_balanced_train_data.peft"
# config = PeftConfig.from_pretrained(peft_model_id)
model_p = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained("GPT_w_ner/GPT_w_ner_tokenizer")

# resize the token embeddings to match the tokenizer
model_p.resize_token_embeddings(len(tokenizer))

# Load the Lora model
# the resized embedding layer are still uncorrected, need to load the weights manually
model_p = PeftModel.from_pretrained(model_p, peft_model_id)


In [34]:
model_p

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BioGptForCausalLM(
      (biogpt): BioGptModel(
        (embed_tokens): Embedding(42390, 1024)
        (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
        (layers): ModuleList(
          (0-23): 24 x BioGptDecoderLayer(
            (self_attn): BioGptAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(
                in_features=1024, out_features=1024, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embeddin

In [38]:
model_p.load_state_dict(torch.load("GPT_w_ner/models/GPT_w_ner_epoch_15_balanced/pytorch_model-af.bin"))

<All keys matched successfully>

In [40]:
model_p.eval()
model_p.to("cpu")
inputs = tokenizer("Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. Label :", return_tensors="pt")

with torch.no_grad():
    outputs = model_p.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

Tweet text: @ HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. Label: the new drugs are more complex and variable than ever


In [4]:
import pandas as pd
import re
from tqdm.notebook import trange, tqdm
from torch import nn
from labels import get_labels
from relations import relations
from datasets import DatasetDict, Dataset

from data_preprocessing import make_GPT_re_data, GPT_w_ner_preprocess_function
additional_tokens, _, _, _ = get_labels(mode='GPT_w_ner')


In [5]:
# load test data and preprocess
test_file_path = 'data/BioRED/processed/test.tsv'
test_data = make_GPT_re_data(file_path=test_file_path, lower=True)

test_dataset_raw = Dataset.from_dict(test_data)
# test_dataset = test_dataset_raw.map(NER_preprocess_function, batched=False)
# with bert only:
test_dataset = test_dataset_raw.map(lambda example: GPT_w_ner_preprocess_function(example, tokenizer, mode="gpt_w_ner", infer=True), batched=True, remove_columns=['pmids', 'text', 'entities', 'outputs'])
test_dataset.set_format(type='torch', columns=['input_ids'])
# the test_dataset has two columns: input_ids and labels, split the labels coloumn into test_dataset_labels

Dropped 0 line:
 []


Map:   0%|          | 0/7590 [00:00<?, ? examples/s]

In [14]:
model.eval()
outputs = []
model.to("cuda")
with torch.no_grad():
    for i in tqdm(range(len(test_dataset))):
    # for i in range(1):
        output = model.generate(input_ids=test_dataset[i]["input_ids"].unsqueeze(0).to("cuda"), max_new_tokens=50)
        output_text = tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=False)[0]
        outputs.append(output_text.split("[learn1] [learn2] [learn3] [learn4] [learn5] [learn6] ")[1])

    # print(tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=False)[0])

  0%|          | 0/7590 [00:00<?, ?it/s]

In [13]:
outputs

['the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [entity2] is [None]. </s>',
 'the relation between source [entity1] and target [

In [11]:
test_dataset['labels'][30:80]

['the relation between source [entity1] and target [entity2] is [None] . ',
 'the relation between source [entity1] and target [entity2] is [None] . ',
 'the relation between source [entity1] and target [entity2] is [None] . ',
 'the relation between source [entity1] and target [entity2] is [Association] . ',
 'the relation between source [entity1] and target [entity2] is [None] . ',
 'the relation between source [entity1] and target [entity2] is [None] . ',
 'the relation between source [entity1] and target [entity2] is [None] . ',
 'the relation between source [entity1] and target [entity2] is [None] . ',
 'the relation between source [entity1] and target [entity2] is [Positive_Correlation] . ',
 'the relation between source [entity1] and target [entity2] is [None] . ',
 'the relation between source [entity1] and target [entity2] is [Negative_Correlation] . ',
 'the relation between source [entity1] and target [entity2] is [None] . ',
 'the relation between source [entity1] and targe

In [41]:
relation_dict = {f"[{v}]": 0 for v in relations}

In [43]:
for lines in train_dataset['labels']:
    relation_dict[lines.split(" ")[-3]] += 1

In [44]:
relation_dict

{'[None]': 18720,
 '[Association]': 2183,
 '[Bind]': 60,
 '[Comparison]': 28,
 '[Conversion]': 3,
 '[Cotreatment]': 31,
 '[Drug_Interaction]': 11,
 '[Negative_Correlation]': 763,
 '[Positive_Correlation]': 1088}

In [16]:
result = {
    "output": [],
    "label": []
}

for output, label in zip(outputs, test_dataset['labels']):
    result['output'].append(output)
    result['label'].append(label)

In [17]:
# save the result dictionary
import pickle
with open("GPT_w_ner/result/GPT_w_ner_epoch_15_balanced_result.pkl", "wb") as f:
    pickle.dump(result, f)

post-processing and evaluation

In [18]:
# load the result dictionary
import pickle
with open("GPT_w_ner/result/GPT_w_ner_epoch_15_balanced_result.pkl", "rb") as f:
    result = pickle.load(f)

In [30]:
uncorrected = 0
for i in range(len(result['output'])):
    if result['output'][i][:-6] != result['label'][i][:-3]:
        uncorrected += 1

In [32]:
len(result['output'])

7590

In [31]:
uncorrected

1181

In [40]:
count = 0
for i in range(len(result['output'])):
    if result['label'][i][-9:-3] != '[None]':
        count += 1

In [41]:
count

1163

In [54]:
for i in range(5360, 5380):
    print(result['output'][i])

the relation between source [entity1] and target [entity2] is [None]. </s>
the relation between source [entity1] and target [entity2] is [None]. </s>
the relation between source [entity1] and target [entity2] is [None]. </s>
the relation between source [entity1] and target [entity2] is [Positive_Correlation]. </s>
the relation between source [entity1] and target [entity2] is [Positive_Correlation]. </s>
the relation between source [entity1] and target [entity2] is [Positive_Correlation]. </s>
the relation between source [entity1] and target [entity2] is [Positive_Correlation]. </s>
the relation between source [entity1] and target [entity2] is [Positive_Correlation]. </s>
the relation between source [entity1] and target [entity2] is [Positive_Correlation]. </s>
the relation between source [entity1] and target [entity2] is [None]. </s>
the relation between source [entity1] and target [entity2] is [Positive_Correlation]. </s>
the relation between source [entity1] and target [entity2] is [

In [55]:
for i in range(5360, 5380):
    print(result['label'][i])

the relation between source [entity1] and target [entity2] is [None] . 
the relation between source [entity1] and target [entity2] is [None] . 
the relation between source [entity1] and target [entity2] is [None] . 
the relation between source [entity1] and target [entity2] is [None] . 
the relation between source [entity1] and target [entity2] is [Positive_Correlation] . 
the relation between source [entity1] and target [entity2] is [None] . 
the relation between source [entity1] and target [entity2] is [Positive_Correlation] . 
the relation between source [entity1] and target [entity2] is [Negative_Correlation] . 
the relation between source [entity1] and target [entity2] is [None] . 
the relation between source [entity1] and target [entity2] is [Cotreatment] . 
the relation between source [entity1] and target [entity2] is [Positive_Correlation] . 
the relation between source [entity1] and target [entity2] is [Cotreatment] . 
the relation between source [entity1] and target [entity2]

In [53]:
count = 0
for i in range(len(result['output'])):
    if result['output'][i][-12:-6] != '[None]':
        print(i,"  ", result['output'][i][-30:-6])
        count += 1

970    s [Positive_Correlation]
971    s [Positive_Correlation]
972    s [Positive_Correlation]
973    s [Positive_Correlation]
974    s [Positive_Correlation]
975    s [Positive_Correlation]
977    s [Positive_Correlation]
978    s [Positive_Correlation]
980    s [Positive_Correlation]
981    s [Positive_Correlation]
982    s [Positive_Correlation]
983    s [Positive_Correlation]
984    s [Positive_Correlation]
985    s [Positive_Correlation]
986    s [Positive_Correlation]
989    s [Positive_Correlation]
991    s [Positive_Correlation]
992    s [Positive_Correlation]
993    s [Positive_Correlation]
996    s [Positive_Correlation]
997    s [Positive_Correlation]
998    s [Positive_Correlation]
999    s [Positive_Correlation]
1468    ntity2] is [Association]
1483    ntity2] is [Association]
1488    ntity2] is [Association]
1490    ntity2] is [Association]
1494    ntity2] is [Association]
1502    ntity2] is [Association]
1509    ntity2] is [Association]
1539    ntity2] is [Association]
