In [1]:
import pickle as pkl
import os 
import sys
import numpy as np

import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from torch_geometric.data import Data
sys.path.append("/home/ec2-user/proj/code/graphbert/src")

from utility.prompting import (
    Item,
    get_prompt_tuning_prompt
)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the pickled graph, and the base name of the dataset to load.
DATA_PATH = "/home/ec2-user/proj/datasets/graph/text_graph/node_pubmed"
DATA_NAME = "text_graph_pubmed"  # alternatives: "text_graph_aids", "text_graph_cora"

# Resolve the pickle path first, then deserialize the graph object from it.
graph_pickle_path = os.path.join(DATA_PATH, DATA_NAME + ".pkl")
with open(graph_pickle_path, mode="rb") as graph_file:
    graph = pkl.load(graph_file)

In [3]:
# Display the repr of the graph object unpickled above.
graph

Data(text_nodes=[19717], text_labels=[19717], y=[19717], x=[19717, 768], edge_index=[2, 44338])

In [10]:
def _read_split(split_name):
    """Unpickle and return the object stored as ``<split_name>.pkl`` under DATA_PATH."""
    with open(os.path.join(DATA_PATH, f"{split_name}.pkl"), 'rb') as split_file:
        return pkl.load(split_file)


# Base names (without extension) of the pickled train/test index files.
TRAIN_SPLIT_NAME = 'train_index'
TEST_SPLIT_NAME = 'test_index'

train_split = _read_split(TRAIN_SPLIT_NAME)
test_split = _read_split(TEST_SPLIT_NAME)

In [12]:
# Display the training split loaded from train_index.pkl.
# NOTE(review): this echoes the whole list into the notebook output;
# consider showing only a slice (e.g. the first few entries) plus its length.
train_split

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [5]:
# List the distinct textual class labels stored on the graph.
# The loaded Data object exposes them as `text_labels` (its repr lists
# text_nodes, text_labels, y, x, edge_index); it has no `text_node_labels`
# attribute, so the previous attribute name would raise on this dataset.
np.unique(graph.text_labels)

array(['Artificial Intelligence', 'Computation and Language',
       'Computational Complexity',
       'Computational Engineering, Finance, and Science',
       'Computational Geometry', 'Computer Science and Game Theory',
       'Computer Vision and Pattern Recognition', 'Computers and Society',
       'Cryptography and Security', 'Data Structures and Algorithms',
       'Databases', 'Digital Libraries', 'Discrete Mathematics',
       'Distributed, Parallel, and Cluster Computing',
       'Emerging Technologies', 'Formal Languages and Automata Theory',
       'General Literature', 'Graphics', 'Hardware Architecture',
       'Human-Computer Interaction', 'Information Retrieval',
       'Information Theory', 'Logic in Computer Science',
       'Machine Learning', 'Mathematical Software', 'Multiagent Systems',
       'Multimedia', 'Networking and Internet Architecture',
       'Neural and Evolutionary Computing', 'Numerical Analysis',
       'Operating Systems', 'Other Computer Science'

# Create prompt

In [3]:
task_name = 'prompt_tuning'

# Candidate class names offered to the model for the PubMed dataset.
pubmed_categories = [
    'Diabetes Mellitus Type 1',
    'Diabetes Mellitus Type 2',
    'Diabetes Mellitus, Experimental',
]

# Bundle the task description, the candidate categories and the instruction text.
pubmed_item = Item(
    desc="Question: Which category from the list that the paper most likely belong to?",
    categories=pubmed_categories,
    question="Given the keywords of a research paper, identify one category from a distinct list of research topics that you predict the paper will most likely belong to.",
)

# Render the textual ("hard") prompt for this task from the item.
hard_prompt = get_prompt_tuning_prompt(task_name=task_name, task_item=pubmed_item)

In [4]:
# Print the assembled hard prompt; print() shows its newlines as real line breaks.
print(hard_prompt)

### USER: Question: Which category from the list that the paper most likely belong to? 

Belows are 3 potential categories to consider:
Category [1](Diabetes Mellitus Type 1) 
Category [2](Diabetes Mellitus Type 2) 
Category [3](Diabetes Mellitus, Experimental) 

Given the keywords of a research paper, identify one category from a distinct list of research topics that you predict the paper will most likely belong to.
### ASSISTANT:


### Toy example with a prompt-tunable model

In [5]:
"""
Here we create a toy example with gpt-2 and pubmed dataset with bert embedding for a prompt tunable model
"""
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, PrefixTuningConfig

# Local paths of the pretrained GPT-2 checkpoint and of its tokenizer.
model_name_or_path = "/home/ec2-user/proj/llm_models/gpt2"
tokenizer_name_or_path = "/home/ec2-user/proj/llm_models/gpt2"

# Prompt-tuning configuration: 8 virtual tokens initialised from a text string.
prompt_peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    # NOTE(review): this init text is about tweet complaints and looks like a
    # leftover from another example; it is unrelated to the PubMed categories
    # used in this notebook. Kept unchanged so recorded outputs stay valid.
    prompt_tuning_init_text="Classify if the tweet is a complaint or not:",
    # Use the dedicated tokenizer path variable rather than the model path
    # (both currently point at the same directory).
    tokenizer_name_or_path=tokenizer_name_or_path,
)


In [6]:
dataset_name = "pubmed"

# Flat checkpoint file name built from dataset, task and model directory name.
# Any "/" is replaced by "_" so the result is a single file name; the previous
# call had the arguments swapped and turned every "_" into a path separator
# (yielding "checkpoints/pubmed/prompt/tuning/gpt2/v1.pt").
checkpoint_name = f"checkpoints_{dataset_name}_{task_name}_{model_name_or_path.split('/')[-1]}_v1.pt".replace(
    "/", "_"
)

# NOTE(review): "Tweet text" looks like a leftover from a tweet dataset;
# confirm the text column name that applies to the PubMed data.
text_column = "Tweet text"
label_column = "text_label"

# Tokenisation and optimisation hyperparameters.
max_length = 64
lr = 2e-3
num_epochs = 100
batch_size = 8

In [7]:
# Load the base causal LM and its tokenizer from the local checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Wrap the base model with the prompt-tuning adapter.
prompt_model = get_peft_model(model, prompt_peft_config)

# print_trainable_parameters() writes the summary itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output.
prompt_model.print_trainable_parameters()

trainable params: 6,144 || all params: 124,445,952 || trainable%: 0.00493708304790822
None


In [8]:
# Show the module structure of the PEFT-wrapped prompt-tuning model.
prompt_model

PeftModelForCausalLM(
  (base_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (prompt_encoder): Mo

# Now build our graph prompt-tuning model

In [9]:
from tuner import GraphPeftType, GraphPromptTuningConfig
from peft import TaskType

# Local paths of the pretrained GPT-2 checkpoint and of its tokenizer.
model_name_or_path = "/home/ec2-user/proj/llm_models/gpt2"
tokenizer_name_or_path = "/home/ec2-user/proj/llm_models/gpt2"

# Collect the graph prompt-tuning settings, then build the config from them.
graph_prompt_settings = dict(
    task_type=TaskType.CAUSAL_LM,
    input_embedding_dim=768,
    num_virtual_tokens=8,
    encoder_hidden_size=2048,
    embed_projection=True,
)
peft_config = GraphPromptTuningConfig(**graph_prompt_settings)

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from mapping import get_peft_graph_model

In [11]:
# Reload the tokenizer and a fresh copy of the base causal LM from the local checkpoint.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

In [12]:
# Wrap the base model with the graph prompt-tuning adapter.
graph_prompt_model = get_peft_graph_model(model, peft_config)

# The summary call prints on its own and returns None (the recorded output
# shows a trailing "None"), so it is no longer wrapped in print().
graph_prompt_model.print_trainable_parameters()

trainable params: 14,163,968 || all params: 138,603,776 || trainable%: 10.219034725287715
None


In [13]:
# Inspect the prompt encoder attached to the graph prompt-tuning model.
graph_prompt_model.prompt_encoder

ModuleDict(
  (default): GraphPromptEncoder(
    (transform): Sequential(
      (0): Linear(in_features=768, out_features=2048, bias=True)
      (1): Tanh()
      (2): Linear(in_features=2048, out_features=6144, bias=True)
    )
  )
)

# How to generate with the prompt-tuning model and our graph model

In [14]:
# Compare trainable-parameter counts of the two adapters. Each call prints its
# own summary and returns None, so print() wrappers would only add "None" lines.
prompt_model.print_trainable_parameters()
graph_prompt_model.print_trainable_parameters()

trainable params: 6,144 || all params: 124,445,952 || trainable%: 0.00493708304790822
None
trainable params: 14,163,968 || all params: 138,603,776 || trainable%: 10.219034725287715
None


In [15]:
# Shared text prompt for the generation comparison, tokenized to PyTorch tensors.
input_text = "How are you doing?"
inputs = tokenizer(
    input_text,
    return_tensors="pt",
)

In [16]:
# base model
output = model.generate(
    input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=32
)
print(tokenizer.decode(output[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


How are you doing?

I'm doing a lot of work on my website. I'm doing a lot of work on my website. I'm doing a lot of work on


In [17]:
# prompt tuning model
output = prompt_model.generate(
    input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=32
)
print(tokenizer.decode(output[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


How are you doing?

I'm doing fine. I'm not going to be tweeting about it. I'm not going to be tweeting about it. I'm not going to


In [25]:
# graph prompt tuning model
graph_embed = torch.rand((1,768))
output = graph_prompt_model.generate(
    input_ids=inputs["input_ids"], prompt_tokens=graph_embed, attention_mask=inputs["attention_mask"], max_new_tokens=32
)
print(tokenizer.decode(output[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


How are you doing?

I'm a big fan of the new "Giant" and "Giant" games. I've been playing the "Giant" and "


In [19]:
# Show the raw token ids returned by the most recent generate() call.
output

tensor([[2437,  389,  345, 1804,   30,  198,  198,   40, 1101,  257, 1310, 1643,
          286,  257, 4336,  286,  262,  366,   40, 1101,  257, 1310, 1643,  286,
          257, 4336,  286,  262,  366,   40, 1101,  257, 1310, 1643,  286,  257,
         4336]])