In [1]:
import pickle as pkl
import os 
import sys
import numpy as np

import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from torch_geometric.data import Data
from accelerate import Accelerator


sys.path.append("/home/ec2-user/proj/code/graphbert/src")

from utility.prompting import (
    Item,
    get_prompt_tuning_prompt
)
DATA_PATH = "../data/text_graph"
DATA_NAME = "text_graph_pubmed" # "text_graph_cora"

with open(os.path.join(DATA_PATH, f"{DATA_NAME}.pkl"), 'rb') as f:
    graph = pkl.load(f)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
graph

Data(text_nodes=[19717], text_labels=[19717], y=[19717], x=[19717, 768], edge_index=[2, 44338])

# Create task prompt

In [3]:
task_name = 'prompt_tuning'
pubmed_item = Item(
    desc = "Question: Which category from the list that the paper most likely belong to?",
    categories = ['Diabetes Mellitus Type 1', 'Diabetes Mellitus Type 2','Diabetes Mellitus, Experimental'],
    question = "Given the keywords of a research paper, identify one category from a distinct list of research topics that you predict the paper will most likely belong to."
    )
hard_prompt = get_prompt_tuning_prompt(
    task_name = task_name,
    task_item = pubmed_item
)

In [4]:
print(hard_prompt)

### USER: Question: Which category from the list that the paper most likely belong to? 

Belows are 3 potential categories to consider:
Category [1](Diabetes Mellitus Type 1) 
Category [2](Diabetes Mellitus Type 2) 
Category [3](Diabetes Mellitus, Experimental) 

Given the keywords of a research paper, identify one category from a distinct list of research topics that you predict the paper will most likely belong to.
### ASSISTANT:


# Here is how to use one GPU for training

In [5]:
# load gpt-2 model
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from tqdm import tqdm

model_name_or_path = "/home/ec2-user/proj/llm_models/vicuna-7b-v1.5"
tokenizer_name_or_path = "/home/ec2-user/proj/llm_models/vicuna-7b-v1.5"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.41s/it]


In [6]:
# build our graph prompt tuning model
from tuner import GraphPeftType, GraphPromptTuningConfig
from mapping import get_peft_graph_model
from peft import TaskType
peft_config = GraphPromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    input_embedding_dim=768,
    num_virtual_tokens=4,
    encoder_hidden_size=1024,
    embed_projection=True
)
model = get_peft_graph_model(model, peft_config)
print(model.print_trainable_parameters())

trainable params: 17,581,056 || all params: 6,755,996,672 || trainable%: 0.26022890261127707
None


In [11]:
state_dict = model.prompt_encoder['default'].transform.state_dict()

In [12]:
state_dict

OrderedDict([('0.weight',
              tensor([[ 1.5565e-02, -8.9188e-03, -1.8593e-02,  ..., -1.4404e-03,
                       -3.3002e-02, -1.6373e-02],
                      [ 2.2331e-02,  3.5551e-02,  1.4876e-02,  ..., -3.3919e-03,
                        1.7511e-02,  1.0691e-02],
                      [ 3.4553e-02,  2.1867e-03, -1.0429e-02,  ...,  1.8481e-02,
                       -2.3085e-02,  5.7828e-03],
                      ...,
                      [-5.5246e-03, -3.5043e-02, -1.2255e-02,  ...,  2.7078e-02,
                       -3.2238e-02,  2.6146e-02],
                      [ 1.9129e-02,  7.3476e-05,  1.8694e-02,  ..., -3.7632e-03,
                       -1.6735e-02, -1.7149e-02],
                      [ 2.6266e-02,  1.8581e-02,  3.0920e-02,  ..., -3.3916e-02,
                       -4.3927e-03, -2.8788e-02]])),
             ('0.bias',
              tensor([-0.0221,  0.0203,  0.0139,  ...,  0.0120, -0.0332,  0.0146])),
             ('2.weight',
              tensor([[

# prepare input dataset

In [26]:
from datasets import Dataset
data = Dataset.from_dict(
    {   
        'embeds': graph.x,
        'labels': graph.text_labels,
    }
)

# settings 
max_length = 128
lr = 5e-3
num_epochs = 2
batch_size = 1

# Split data into train and test with 50% for train and 50% for test
split_data = data.train_test_split(test_size=0.5)


In [21]:
def preprocess_function(examples):
    batch_size = len(examples['labels'])
    embeds = examples['embeds']
    targets = examples['labels']

    # tokenize task prompt and targets
    prompt = tokenizer(hard_prompt)
    labels = tokenizer(targets) 
    labels['labels'] = []
    labels['prompt_tokens'] = []

    for i in range(batch_size):
        # extract graph embedding, prompt ids, and target ids
        graph_embeds = embeds[i]
        prompt_ids = prompt["input_ids"]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]

        sample_input_ids = prompt_ids + label_input_ids
        attention_mask = [1] * len(labels["input_ids"][i])

        model_input_ids = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        attention_mask = [0] * (max_length - len(sample_input_ids)) + [1] * len(sample_input_ids)
        label_ids = [-100] * (max_length - len(label_input_ids)) + label_input_ids

        labels["input_ids"][i] = torch.tensor(model_input_ids[:max_length])
        labels["attention_mask"][i] = torch.tensor(attention_mask[:max_length])
        labels["labels"].append(torch.tensor(label_ids[:max_length]))
        labels['prompt_tokens'].append(torch.tensor(graph_embeds))

    return labels

In [22]:
processed_datasets = split_data.map(
    preprocess_function,
    batched=True,
    num_proc=16,
    load_from_cache_file=False,
    desc="Prepare dataset for Graph input",
)

Prepare dataset for Graph input (num_proc=16):   0%|          | 0/9858 [00:00<?, ? examples/s]

Prepare dataset for Graph input (num_proc=16): 100%|██████████| 9858/9858 [00:04<00:00, 2336.10 examples/s]
Prepare dataset for Graph input (num_proc=16): 100%|██████████| 9859/9859 [00:04<00:00, 2387.75 examples/s]


In [23]:
train_dataloader = DataLoader(
    processed_datasets['train'], shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
test_dataloader = DataLoader(processed_datasets['test'], collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)


In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [25]:
accelerator = Accelerator()

model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)

In [27]:
device = accelerator.device
#model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        #batch = {k: v.to(device) for k, v in batch.items()}
        batch = {k: v for k, v in batch.items()}
        outputs = model(input_ids=batch["input_ids"], prompt_tokens=batch['embeds'], attention_mask=batch["attention_mask"], labels=batch['labels'])
        loss = outputs.loss
        total_loss += loss.detach().float()
        #loss.backward()
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(test_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(input_ids=batch["input_ids"], prompt_tokens=batch['embeds'], attention_mask=batch["attention_mask"], labels=batch['labels'])
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(test_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    #print(f"{epoch=}: {train_ppl=} {train_epoch_loss=}")
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 9858/9858 [45:23<00:00,  3.62it/s]
100%|██████████| 9859/9859 [23:21<00:00,  7.04it/s]


epoch=0: train_ppl=tensor(1.7064, device='cuda:0') train_epoch_loss=tensor(0.5344, device='cuda:0') eval_ppl=tensor(1.6939, device='cuda:0') eval_epoch_loss=tensor(0.5270, device='cuda:0')


100%|██████████| 9858/9858 [45:23<00:00,  3.62it/s]
100%|██████████| 9859/9859 [23:20<00:00,  7.04it/s]

epoch=1: train_ppl=tensor(1.6454, device='cuda:0') train_epoch_loss=tensor(0.4980, device='cuda:0') eval_ppl=tensor(1.6216, device='cuda:0') eval_epoch_loss=tensor(0.4834, device='cuda:0')





In [28]:
inputs = tokenizer(hard_prompt,return_tensors="pt")
device = 'cuda:0'
generated_answer = []
original_answer = []
for step, batch in enumerate(tqdm(split_data['test'])):
    if step>10:
        continue
    with torch.no_grad():
        input_ids = inputs["input_ids"].to(device).view(1,-1)
        attention_mask = inputs["attention_mask"].to(device).view(1,-1)
        embeds = torch.tensor(batch['embeds'], device=input_ids.device).view(1,-1)
        outputs = model.generate(
            input_ids=input_ids, 
            prompt_tokens=embeds, 
            attention_mask=attention_mask,
            max_new_tokens=64
            ).detach().cpu()
        generated_answer.append(tokenizer.decode(outputs[0])) 
        original_answer.append(batch['labels']) 
    

100%|██████████| 9859/9859 [00:22<00:00, 436.23it/s] 


In [29]:
correct_num = 0
total_num = 0
generated_results = [item.split('###')[-1] for item in generated_answer]
for prediction, groundtruth in zip(generated_results, original_answer):
    if groundtruth in prediction:
        correct_num += 1
    total_num += 1
print(f"Accuracy: {correct_num/total_num}")

Accuracy: 0.0


In [30]:
generated_results

[' ASSISTANT:, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type',
 ' ASSISTANT:, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type, Type',
 ' ASSISTANT: Type12 Type12 Type12 Type12 Type12 Type112 Type112 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type1',
 ' ASSISTANT: Type12 Type Type Type12 Type12 Type12 Type12 Type112 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type',
 ' ASSISTANT: Type2 Type Type Type Type Type2 Type12 Type12 Type12 Type112 Type12 Type12 Type12 Type12 Type12 Type12 Type12 Type2 Type12 Type12 Type12 Type12 Type2 Type12 Type12',
 ' ASSISTANT: Type2 Type Type Type Type12 Type12 Type12 Type12 Type1

In [17]:

for step, batch in enumerate(tqdm(train_dataloader)):
    #batch = {k: v.to(device) for k, v in batch.items()}
    batch = {k: v for k, v in batch.items()}

100%|██████████| 9858/9858 [00:12<00:00, 766.93it/s]


In [18]:
batch.keys()

dict_keys(['embeds', 'labels', 'input_ids', 'attention_mask', 'prompt_tokens'])

In [19]:
batch['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')