https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing#scrollTo=T-gy-LxM0yAi

In [15]:
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
checkpoint = "microsoft/biogpt"
from relations import relations
from datasets import DatasetDict, Dataset
import pandas as pd
from tqdm.notebook import trange, tqdm
from labels import get_labels

In [16]:
# load labels for bert_w_ner
additional_tokens, _, _, _ = get_labels(mode='GPT_w_ner')
print(additional_tokens, "\n", additional_tokens)

{'additional_special_tokens': ['[learn1]', '[learn2]', '[learn3]', '[learn4]', '[learn5]', '[learn6]']} 
 {'additional_special_tokens': ['[learn1]', '[learn2]', '[learn3]', '[learn4]', '[learn5]', '[learn6]']}


# load the model

In [17]:
# load the model in 8-bit quantization configuration
# the max length of the input is 1024
model = AutoModelForCausalLM.from_pretrained(checkpoint, 
    # load_in_8bit=True, 
    device_map={'':torch.cuda.current_device()},)

In [18]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [19]:
print_trainable_parameters(model)

trainable params: 346763264 || all params: 346763264 || trainable%: 100.0


# Tokenizer

In [20]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")

In [21]:
# adding new tokens to the tokenizer
# since I haven't load the model so I will resize the embedding of the model later]
num_added_toks = tokenizer.add_special_tokens(additional_tokens)
print('We have added', num_added_toks, 'tokens')

# save the tokenizer
tokenizer.save_pretrained("GPT_without_ner/GPT_w_ner_tokenizer")

We have added 6 tokens


('GPT_without_ner/GPT_w_ner_tokenizer/tokenizer_config.json',
 'GPT_without_ner/GPT_w_ner_tokenizer/special_tokens_map.json',
 'GPT_without_ner/GPT_w_ner_tokenizer/vocab.json',
 'GPT_without_ner/GPT_w_ner_tokenizer/merges.txt',
 'GPT_without_ner/GPT_w_ner_tokenizer/added_tokens.json')

In [22]:
model.resize_token_embeddings(len(tokenizer))

Embedding(42390, 1024)

# PEFT

Finally, we need to apply some post-processing on the 8-bit model to enable training, let's freeze all our layers, and cast the layer-norm in float32 for stability.

We also cast the output of the last layer and embedding layer in float32 for the same reasons.

In [23]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)

model.biogpt.embed_tokens = CastOutputToFloat(model.biogpt.embed_tokens)
model.output_projection = CastOutputToFloat(model.output_projection)

In [24]:
# more with LoRAconfig: https://huggingface.co/docs/peft/conceptual_guides/lora

from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftType

peft_config = LoraConfig(
    # r: the rank of the update matrices, expressed in int. Lower rank results in smaller update matrices with fewer trainable parameters.
    r=16,
    # alpha: LoRA scaling factor.
    lora_alpha=32, 
    # target_modules: The modules (for example, attention blocks) to apply the LoRA update matrices.
    target_modules=["q_proj", "v_proj"],
    fan_in_fan_out=True,
    lora_dropout=0.05,
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 1572864 || all params: 348342272 || trainable%: 0.4515283175278825


In [25]:
# make model's embed_tokens layer also trainable

model.biogpt.embed_tokens[0].weight.requires_grad = True
model.output_projection[0].weight.requires_grad = True

print_trainable_parameters(model)

trainable params: 44980224 || all params: 348342272 || trainable%: 12.912651611803232


In [26]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BioGptForCausalLM(
      (biogpt): BioGptModel(
        (embed_tokens): CastOutputToFloat(
          (0): Embedding(42390, 1024)
        )
        (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
        (layers): ModuleList(
          (0-23): 24 x BioGptDecoderLayer(
            (self_attn): BioGptAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(
                in_features=1024, out_features=1024, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): Pa

In [27]:
# for model, print the layer's name if the layer is trainable, and print the precision of the layer

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.shape, param.dtype)

base_model.model.biogpt.embed_tokens.0.weight torch.Size([42390, 1024]) torch.float32
base_model.model.biogpt.layers.0.self_attn.v_proj.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.biogpt.layers.0.self_attn.v_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.biogpt.layers.0.self_attn.q_proj.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.biogpt.layers.0.self_attn.q_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.biogpt.layers.1.self_attn.v_proj.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.biogpt.layers.1.self_attn.v_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.biogpt.layers.1.self_attn.q_proj.lora_A.default.weight torch.Size([16, 1024]) torch.float32
base_model.model.biogpt.layers.1.self_attn.q_proj.lora_B.default.weight torch.Size([1024, 16]) torch.float32
base_model.model.biogpt.layers.2.self_attn

# pre-process the text

In [28]:
from data_preprocessing import make_GPT_re_data_no_ner, GPT_no_ner_preprocess_function

# from data_preprocessing import all_line_of_pmid, get_original_text, get_identifier_and_entity, reorder_list, get_relations

In [29]:
# train and valid file paths
train_file_path = 'data/BioRED/processed/train.tsv'
valid_file_path = 'data/BioRED/processed/dev.tsv'

In [30]:
# make bert_re data
train_data_raw = make_GPT_re_data_no_ner(file_path=train_file_path, lower=True, random_seed=42)
valid_data_raw = make_GPT_re_data_no_ner(file_path=valid_file_path, lower=True, random_seed=42)

Dropped 10 line:
 [4548, 6646, 6758, 6776, 6866, 10222, 11775, 14657, 18818, 21689]
Dropped 9 line:
 [467, 941, 2220, 2233, 2261, 5335, 5337, 5378, 5490]


In [32]:
train_data_raw.keys()

dict_keys(['pmids', 'text', 'entities', 'outputs', 'relation'])

In [33]:
# save the raw data
# import json

# with open('GPT_without_ner/data/train_data_dict.json', 'w') as f:
#     json.dump(train_data_raw, f)

# with open('GPT_without_ner/data/valid_data_dict.json', 'w') as f:
#     json.dump(valid_data_raw, f)

In [42]:
import json

with open('GPT_without_ner/data/train_data_dict.json', 'r') as f:
    train_data_raw = json.load(f)

with open('GPT_without_ner/data/valid_data_dict.json', 'r') as f:
    valid_data_raw = json.load(f)

print(train_data_raw.keys())
for k, v in train_data_raw.items():
    print(k, len(v))

# make into Dataset type
train_data_raw = Dataset.from_dict(train_data_raw)
valid_data_raw = Dataset.from_dict(valid_data_raw)

dict_keys(['pmids', 'text', 'entities', 'outputs', 'relation'])
pmids 1283
text 1283
entities 1283
outputs 1283
relation 1283


In [44]:
dataset = DatasetDict({
    "train": train_data_raw,
    "valid": valid_data_raw
})

In [45]:
dataset['train'][0]

{'pmids': '10491763',
 'text': 'hepatocyte nuclear factor-6 : associations between genetic variability and type ii diabetes and between genetic variability and estimates of insulin secretion . the transcription factor hepatocyte nuclear factor (hnf)-6 is an upstream regulator of several genes involved in the pathogenesis of maturity-onset diabetes of the young . we therefore tested the hypothesis that variability in the hnf-6 gene is associated with subsets of type ii ( non-insulin-dependent ) diabetes mellitus and estimates of insulin secretion in glucose tolerant subjects . we cloned the coding region as well as the intron-exon boundaries of the hnf-6 gene . w e then examined them on genomic dna in six mody probands without mutations in the mody1 , mody3 and mody4 genes and in 54 patients with late-onset type ii diabetes by combined single strand conformational polymorphism-heteroduplex analysis followed by direct sequencing of identified variants . an identified missense variant was

In [46]:
tokenized_datasets = dataset.map(lambda example: GPT_no_ner_preprocess_function(example, tokenizer), batched=True, remove_columns=['pmids', 'text', 'entities', 'outputs', 'relation'])

Map:   0%|          | 0/1283 [00:00<?, ? examples/s]

truncated 4 examples


Map:   0%|          | 0/333 [00:00<?, ? examples/s]

truncated 2 examples


In [48]:
tokenized_datasets.save_to_disk('GPT_without_ner/data/tokenized_dataset_no_ner')


Saving the dataset (0/1 shards):   0%|          | 0/1283 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/333 [00:00<?, ? examples/s]

In [49]:
from datasets import load_from_disk

tokenized_datasets = load_from_disk('GPT_without_ner/data/tokenized_dataset_no_ner')

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1283
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 333
    })
})

In [26]:
# to tensor
tokenized_datasets.set_format(type='torch', columns=['input_ids'])

In [47]:
tokenizer.decode(tokenized_datasets['train']['input_ids'][0])

'hepatocyte nuclear factor-6: associations between genetic variability and type ii diabetes and between genetic variability and estimates of insulin secretion. the transcription factor hepatocyte nuclear factor (hnf) -6 is an upstream regulator of several genes involved in the pathogenesis of maturity-onset diabetes of the young. we therefore tested the hypothesis that variability in the hnf-6 gene is associated with subsets of type ii (non-insulin-dependent) diabetes mellitus and estimates of insulin secretion in glucose tolerant subjects. we cloned the coding region as well as the intron-exon boundaries of the hnf-6 gene. w e then examined them on genomic dna in six mody probands without mutations in the mody1, mody3 and mody4 genes and in 54 patients with late-onset type ii diabetes by combined single strand conformational polymorphism-heteroduplex analysis followed by direct sequencing of identified variants. an identified missense variant was examined in association studies and ge

# Training

wandb

In [50]:
import wandb

wandb.init(
    # set the wandb project where this run will be logged
    project="GPT2",
    # notes="PubmedBERT-FT-NER_w_NERin_10epochs",
    name="BioGPT_no_ner_epoch_15",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33m309439737[0m ([33mtian1995[0m). Use [1m`wandb login --relogin`[0m to force relogin


training

In [51]:
from transformers import DataCollatorForLanguageModeling

In [52]:
import transformers

trainer = transformers.Trainer(
    model=model, 
    train_dataset=tokenized_datasets['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=8, 
        gradient_accumulation_steps=8,
        warmup_steps=1000, 
        num_train_epochs=15,
        learning_rate=2e-4, 
        fp16=True,
        logging_steps=1, 
        report_to="wandb",
        save_strategy="epoch",
        output_dir='GPT_without_ner'
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 3.0152, 'learning_rate': 2.0000000000000002e-07, 'epoch': 0.05}
{'loss': 3.1082, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.1}
{'loss': 3.0445, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.15}
{'loss': 3.0369, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.2}
{'loss': 2.9729, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.25}
{'loss': 2.9954, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.3}
{'loss': 3.0043, 'learning_rate': 1.4000000000000001e-06, 'epoch': 0.35}
{'loss': 2.9991, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.4}
{'loss': 3.0874, 'learning_rate': 1.8e-06, 'epoch': 0.45}
{'loss': 3.0998, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.5}
{'loss': 3.0013, 'learning_rate': 2.2e-06, 'epoch': 0.55}
{'loss': 3.0844, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.6}
{'loss': 3.1047, 'learning_rate': 2.6e-06, 'epoch': 0.65}
{'loss': 2.9493, 'learning_rate': 2.8000000000000003e-06, 'epoch': 0.7}
{'loss': 2.9872, 'learning_rate'

TrainOutput(global_step=300, training_loss=2.3627151997884113, metrics={'train_runtime': 6143.1653, 'train_samples_per_second': 3.133, 'train_steps_per_second': 0.049, 'train_loss': 2.3627151997884113, 'epoch': 14.91})

In [53]:
import wandb
wandb.finish()
trainer.save_model("GPT_without_ner/models/GPT_no_ner_epoch_15")

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/loss,██▇█▇█▇█▇▇▇▇▆▇▆▆▆▅▅▅▅▄▄▄▃▄▃▃▂▂▂▂▂▂▂▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,14.91
train/global_step,300.0
train/learning_rate,6e-05
train/loss,1.6486
train/total_flos,3.571695470247936e+16
train/train_loss,2.36272
train/train_runtime,6143.1653
train/train_samples_per_second,3.133
train/train_steps_per_second,0.049


In [54]:
model.save_pretrained("GPT_without_ner/models/GPT_no_ner_epoch_15.peft")

In [55]:
# Since there are key-unmatches in the trainer.save_model(), we need to rename the keys and load the paras in the model

embed_tokens_state_dict = torch.load("GPT_without_ner/models/GPT_no_ner_epoch_15/pytorch_model.bin")

old_keys = ["base_model.model.biogpt.embed_tokens.0.weight", "base_model.model.output_projection.0.weight"]
new_keys = ["base_model.model.biogpt.embed_tokens.weight", "base_model.model.output_projection.weight"]

for old_key, new_key in zip(old_keys, new_keys):
    # Get the value of the old key
    value = embed_tokens_state_dict[old_key]

    # Create a new key-value pair with the updated name
    embed_tokens_state_dict[new_key] = value

    # Delete the old key if desired
    del embed_tokens_state_dict[old_key]

torch.save(embed_tokens_state_dict, "GPT_without_ner/models/GPT_no_ner_epoch_15/pytorch_model-af.bin")

In [35]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BioGptForCausalLM(
      (biogpt): BioGptModel(
        (embed_tokens): CastOutputToFloat(
          (0): Embedding(42390, 1024)
        )
        (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
        (layers): ModuleList(
          (0-23): 24 x BioGptDecoderLayer(
            (self_attn): BioGptAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(
                in_features=1024, out_features=1024, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): Pa

# load model and inference

In [1]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "microsoft/biogpt"

peft_model_id = "GPT_without_ner/models/GPT_no_ner_epoch_15.peft"
# config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained("GPT_without_ner/GPT_w_ner_tokenizer")

# resize the token embeddings to match the tokenizer
model.resize_token_embeddings(len(tokenizer))

# Load the Lora model
# the resized embedding layer are still uncorrected, need to load the weights manually
model = PeftModel.from_pretrained(model, peft_model_id)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/tian/mambaforge/envs/BioRED/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /home/tian/mambaforge/envs/BioRED/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [2]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BioGptForCausalLM(
      (biogpt): BioGptModel(
        (embed_tokens): Embedding(42390, 1024)
        (embed_positions): BioGptLearnedPositionalEmbedding(1026, 1024)
        (layers): ModuleList(
          (0-23): 24 x BioGptDecoderLayer(
            (self_attn): BioGptAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(
                in_features=1024, out_features=1024, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embeddin

In [2]:
model.load_state_dict(torch.load("GPT_without_ner/models/GPT_no_ner_epoch_15/pytorch_model-af.bin"))

<All keys matched successfully>

In [3]:
model.eval()
model.to("cpu")
inputs = tokenizer("Tweet text : @HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. Label :", return_tensors="pt")

with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0])

Tweet text: @ HondaCustSvc Your customer service has been horrible during the recall process. I will never purchase a Honda again. Label: 'the best' is the best for the patient


In [4]:
import pandas as pd
import re
from tqdm.notebook import trange, tqdm
from torch import nn
from labels import get_labels
from relations import relations
from datasets import DatasetDict, Dataset

from data_preprocessing import make_GPT_re_data_no_ner, GPT_no_ner_preprocess_function
additional_tokens, _, _, _ = get_labels(mode='GPT_w_ner')

In [5]:
import json

"""# load test data and preprocess
test_file_path = 'data/BioRED/processed/test.tsv'
test_data = make_GPT_re_data_no_ner(file_path=test_file_path, lower=True, random_seed=42)

# save the raw data
with open('GPT_without_ner/data/test_data_dict.json', 'w') as f:
    json.dump(test_data, f)"""
with open('GPT_without_ner/data/test_data_dict.json', 'r') as f:
    test_data= json.load(f)


test_dataset_raw = Dataset.from_dict(test_data)

test_dataset = test_dataset_raw.map(lambda example: GPT_no_ner_preprocess_function(example, tokenizer,  infer=True), batched=True, remove_columns=['pmids', 'text', 'entities', 'outputs', 'relation'])

test_dataset.save_to_disk('GPT_without_ner/data/test_tokenized_dataset_no_ner')



from datasets import load_from_disk

test_dataset = load_from_disk('GPT_without_ner/data/test_tokenized_dataset_no_ner')

test_dataset.set_format(type='torch', columns=['input_ids'])

Map:   0%|          | 0/322 [00:00<?, ? examples/s]

truncated 4 examples


Saving the dataset (0/1 shards):   0%|          | 0/322 [00:00<?, ? examples/s]

In [10]:
test_dataset

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 322
})

In [6]:
tokenizer.decode(test_dataset['input_ids'][0])

'a novel scn5a mutation manifests as a malignant form of long qt syndrome with perinatal onset of tachycardia / bradycardia. objective: congenital long qt syndrome (lqts) with in utero onset of the rhythm disturbances is associated with a poor prognosis. in this study we investigated a newborn patient with fetal bradycardia, 2: 1 atrioventricular block and ventricular tachycardia soon after birth. methods: mutational analysis and dna sequencing were conducted in a newborn. the 2: 1 atrioventricular block improved to 1: 1 conduction only after intravenous lidocaine infusion or a high dose of mexiletine, which also controlled the ventricular tachycardia. results: a novel, spontaneous lqts-3 mutation was identified in the transmembrane segment 6 of domain iv of the na (v) 1.5 cardiac sodium channel, with a g-- > a substitution at codon 1763, which changed a valine (gtg) to a methionine (atg). the proband was heterozygous but the mutation was absent in the parents and the sister. expressio

In [7]:
tokenizer.eos_token_id

2

In [10]:
from tqdm.notebook import trange, tqdm
import torch


model.eval()
outputs = []
model.to("cuda")
with torch.no_grad():
    for i in tqdm(range(len(test_dataset))):
    # for i in range(1):
        output = model.generate(input_ids=test_dataset[i]["input_ids"].unsqueeze(0).to("cuda"), max_new_tokens=50, eos_token_id=tokenizer.eos_token_id)
        output_text = tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=False)[0]
        try:
            outputs.append(output_text.split("[learn6]")[1].strip())
        except:
            outputs.append(output_text.strip())
        if i % 10 == 0:
            print(outputs[-1])

    # print(tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=False)[0])

  0%|          | 0/322 [00:00<?, ?it/s]

the source is none. </s>
the source is none. </s>
the source is hbv and the target is rtl180m m m; the source is hbv and the target is rt180m m; the source is hbv and the target is rt180m m; the source is
the source is none. </s>
the source is pethidine and the target is pethidine. </s>
the source is cambium and the target is lrr-rlk; the source is cambium and the target is cle41 / pxy; the source is cambium and the target is lrr-rlk; the
the source is hawthorn and the target is isoproterenol; the source is hawthorn and the target is isoproterenol; the source is crataegus and the target is isoproterenol; the source is crataegus and the target is
the source is meth and the target is 5-ht6 receptor; the source is meth and the target is 5-ht6 receptor; the source is meth and the target is 5-ht6 receptor; the source is meth and the target is
the source is none. </s>
the source is none. </s>
the source is none. </s>
the source is rsv and the target is lif; the source is rsv and the target i

In [12]:
test_dataset['labels'][30:80]

['the source is lithium and the target is polyuria ; the source is ndi and the target is pkca ; the source is lithium and the target is pkca ; the source is lithium and the target is ndi . ',
 'the source is none . ',
 'the source is alcohol and the target is il-1beta ; the source is alcohol and the target is tnf-alpha ; the source is alcohol and the target is lc3b-i and -ii ; the source is alcohol and the target is s6 ; the source is alcohol and the target is atg12 - 5 ; the source is alcohol and the target is eif4b ; the source is alcohol and the target is atg12 - 5 ; the source is alcohol and the target is il-6 ; the source is alcohol and the target is hsl ; the source is alcohol and the target is ulk1 ; the source is alcohol and the target is atgl . ',
 'the source is alcohol and the target is ppargamma ; the source is alcohol and the target is c/ebpalpha . ',
 'the source is alcohol and the target is s6k1 . ',
 'the source is none . ',
 'the source is scopolamine-induced and the t

post-processing and evaluation

In [23]:
# post processing for the outputs without ner
pairs = []
for output in outputs:
    pair = []
    # if the output doesn't end with "<|endoftext|>", find the lastest ";" of the output and only take the previous part
    if output.endswith("<|endoftext|>"):
        string = output

        for line in string.split(";"):
            try:
                source = line.split("the source is")[1].strip()
                source = source.split("and the target is")[0].strip()
                if source.startswith("none"):
                    source = "none"
                    target = "none"
                    continue
                target = line.split("the target is")[1].strip()
                if target.endswith(". <|endoftext|>"):
                    target = target.split(". <|endoftext|>")[0].strip()
                    
                if (source, target) not in pair:
                    pair.append((source, target))
            except:
                continue

    else:
        string = output.split(";")[:-1]
        string = [line.strip() for line in string]
        for line in string:
            try:
                source = line.split("the source is")[1].strip()
                source = source.split("and the target is")[0].strip()

                target = line.split("the target is")[1].strip()
                if (source, target) not in pair:
                    pair.append((source, target))
            except:
                continue
    if len(pair) == 0:
        pair.append(("none", "none"))
    pairs.append(pair)
    
output_pairs = pairs

In [24]:
# post processing for the labels without ner
pairs = []
for output in test_dataset['labels']:
    pair = []
    # if the output doesn't end with "<|endoftext|>", find the lastest ";" of the output and only take the previous part
    if output.endswith("<|endoftext|>"):
        string = output

        for line in string.split(";"):
            try:
                source = line.split("the source is")[1].strip()
                source = source.split("and the target is")[0].strip()
                if source.startswith("none"):
                    source = "none"
                    target = "none"
                    continue
                target = line.split("the target is")[1].strip()
                if target.endswith(". <|endoftext|>"):
                    target = target.split(". <|endoftext|>")[0].strip()
                    
                if (source, target) not in pair:
                    pair.append((source, target))
            except:
                continue

    else:
        string = output.split(";")[:-1]
        string = [line.strip() for line in string]
        for line in string:
            try:
                source = line.split("the source is")[1].strip()
                source = source.split("and the target is")[0].strip()

                target = line.split("the target is")[1].strip()
                if (source, target) not in pair:
                    pair.append((source, target))
            except:
                continue
    if len(pair) == 0:
        pair.append(("none", "none"))
    pairs.append(pair)

label_pairs = pairs

In [25]:
result = {
    "output": [],
    "label": []
}

for output, label in zip(output_pairs, label_pairs):
    result['output'].append(output)
    result['label'].append(label)

In [27]:
# save the result dictionary
import pickle
with open("GPT_without_ner/result/epoch_15_result.pkl", "wb") as f:
    pickle.dump(result, f)

# Analysis

In [29]:
import pickle
with open("GPT_without_ner/result/epoch_15_result.pkl", "rb") as f:
    result = pickle.load(f)

In [30]:
print(f'the length: {len(result["output"])}, {len(result["label"])}')
print(f'instance:\n{result["output"][0]}\n{result["label"][0]}')

the length: 322, 322
instance:
[('none', 'none')]
[('lidocaine', 'ventricular tachycardia'), ('mexiletine', 'arrhythmias'), ('lidocaine', 'lqts'), ('mexiletine', 'lqts'), ('mexiletine', 'atrioventricular block'), ('lidocaine', 'atrioventricular block'), ('mexiletine', 'ventricular tachycardia')]


In [31]:
tuple_tp = 0
tuple_fp = 0  
tuple_fn = 0
tuple_tn = 0

for output, label in zip(result['output'], result['label']):
    for pair in output:
        true_tuple = False
        for label_pair in label:
            if pair[0] == label_pair[0] and pair[1] == label_pair[1]:
                tuple_tp += 1
                true_tuple = True
                break
        if not true_tuple:
            tuple_fp += 1
    
    for label_pair in label:
        true_tuple = False
        for pair in output:
            if pair[0] == label_pair[0] and pair[1] == label_pair[1]:
                true_tuple = True
                break
        if not true_tuple:
            tuple_fn += 1

In [32]:
# calculate the precision, recall and f1 score

# for tuple
tuple_precision = tuple_tp / (tuple_tp + tuple_fp)
tuple_recall = tuple_tp / (tuple_tp + tuple_fn)
tuple_f1 = 2 * tuple_precision * tuple_recall / (tuple_precision + tuple_recall)
print(f"tuple precision: {tuple_precision}, recall: {tuple_recall}, f1: {tuple_f1}")

tuple precision: 0.28835978835978837, recall: 0.11077235772357724, f1: 0.16005873715124816


In [33]:
# loosen the condition for the tp


tuple_tp = 0
tuple_fp = 0  
tuple_fn = 0
tuple_tn = 0

for output, label in zip(result['output'], result['label']):
    for pair in output:
        true_tuple = False
        for label_pair in label:
            if (pair[0] in label_pair[0]) and (pair[1] in label_pair[1]):
                tuple_tp += 1
                true_tuple = True
                break
        if not true_tuple:
            tuple_fp += 1
    
    for label_pair in label:
        true_tuple = False
        for pair in output:
            if (pair[0] in label_pair[0]) and (pair[1] in label_pair[1]):
                true_tuple = True
                break
        if not true_tuple:
            tuple_fn += 1

# calculate the precision, recall and f1 score

# for tuple
tuple_precision = tuple_tp / (tuple_tp + tuple_fp)
tuple_recall = tuple_tp / (tuple_tp + tuple_fn)
tuple_f1 = 2 * tuple_precision * tuple_recall / (tuple_precision + tuple_recall)
print(f"tuple precision: {tuple_precision}, recall: {tuple_recall}, f1: {tuple_f1}")

tuple precision: 0.30687830687830686, recall: 0.11776649746192894, f1: 0.17021276595744683
