In [1]:
import pickle as pkl
import os 
import sys
import numpy as np

import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
from torch_geometric.data import Data
sys.path.append("/home/ec2-user/proj/code/graphbert/src")

from utility.prompting import (
    Item,
    get_prompt_tuning_prompt
)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from tuner import GraphPeftType, GraphPromptTuningConfig
from peft import TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from mapping import get_peft_graph_model
# load a pretrained gpt-2 model
model_name_or_path = "/home/ec2-user/proj/llm_models/vicuna-7b-v1.5"
tokenizer_name_or_path = "/home/ec2-user/proj/llm_models/vicuna-7b-v1.5"
peft_config = GraphPromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    input_embedding_dim=768,
    num_virtual_tokens=4,
    encoder_hidden_size=1024,
    embed_projection=True
)

In [3]:
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.51s/it]


In [4]:
graph_prompt_model = get_peft_graph_model(model, peft_config)
print(graph_prompt_model.print_trainable_parameters())

trainable params: 17,581,056 || all params: 6,755,996,672 || trainable%: 0.26022890261127707
None


In [5]:
graph_prompt_model.prompt_encoder['default'].transform[0].weight

Parameter containing:
tensor([[ 0.0200,  0.0046, -0.0208,  ..., -0.0274, -0.0224,  0.0160],
        [ 0.0316,  0.0214, -0.0300,  ..., -0.0263,  0.0119,  0.0053],
        [ 0.0349,  0.0080, -0.0024,  ..., -0.0080,  0.0105,  0.0010],
        ...,
        [-0.0105,  0.0175,  0.0234,  ..., -0.0070, -0.0318,  0.0046],
        [ 0.0006,  0.0028, -0.0201,  ..., -0.0029,  0.0320, -0.0133],
        [-0.0058,  0.0061, -0.0187,  ..., -0.0026, -0.0347, -0.0049]],
       requires_grad=True)

In [6]:
peft_model_id = '/home/ec2-user/proj/code/graphbert/saved_models/arxiv-vicuna-7b-v1.5'
graph_prompt_model.load_adapter(peft_model_id, adapter_name='default')

_IncompatibleKeys(missing_keys=['base_model.model.embed_tokens.weight', 'base_model.model.layers.0.self_attn.q_proj.weight', 'base_model.model.layers.0.self_attn.k_proj.weight', 'base_model.model.layers.0.self_attn.v_proj.weight', 'base_model.model.layers.0.self_attn.o_proj.weight', 'base_model.model.layers.0.mlp.gate_proj.weight', 'base_model.model.layers.0.mlp.up_proj.weight', 'base_model.model.layers.0.mlp.down_proj.weight', 'base_model.model.layers.0.input_layernorm.weight', 'base_model.model.layers.0.post_attention_layernorm.weight', 'base_model.model.layers.1.self_attn.q_proj.weight', 'base_model.model.layers.1.self_attn.k_proj.weight', 'base_model.model.layers.1.self_attn.v_proj.weight', 'base_model.model.layers.1.self_attn.o_proj.weight', 'base_model.model.layers.1.mlp.gate_proj.weight', 'base_model.model.layers.1.mlp.up_proj.weight', 'base_model.model.layers.1.mlp.down_proj.weight', 'base_model.model.layers.1.input_layernorm.weight', 'base_model.model.layers.1.post_attention_l

In [7]:
graph_prompt_model.prompt_encoder['default'].transform[0].weight

Parameter containing:
tensor([[ 0.0034,  0.0167,  0.0274,  ..., -0.0363, -0.0467, -0.0033],
        [-0.0399, -0.0430,  0.0293,  ..., -0.0030,  0.0526, -0.0337],
        [-0.0327,  0.0273, -0.0103,  ..., -0.0046,  0.0975,  0.0366],
        ...,
        [ 0.0271, -0.0375,  0.0170,  ...,  0.0162,  0.0319, -0.0098],
        [ 0.0135,  0.0292, -0.0323,  ..., -0.0199, -0.0278, -0.0274],
        [-0.0347, -0.0009, -0.0374,  ..., -0.0263, -0.0378,  0.0112]],
       requires_grad=True)

In [8]:
from datasets import Dataset
from datasets import load_from_disk

PREPARED_DATASET_PATH = "/home/ec2-user/proj/code/graphbert/last-run-prepared/text_graph_arxiv"

# Load the processed training dataset
processed_datasets = load_from_disk(f'{PREPARED_DATASET_PATH}/train_dataset')

# Load the processed test dataset
processed_datasets_test = load_from_disk(f'{PREPARED_DATASET_PATH}/test_dataset')

  table = cls._concat_blocks(blocks, axis=0)


In [9]:
processed_datasets['train'][0].keys()

dict_keys(['embeds', 'labels', 'input_ids', 'attention_mask', 'prompt_tokens'])

In [12]:
sys.path.append("/home/ec2-user/proj/code/graphbert/src")
from utility.prompting import (
    Item,
    get_prompt_tuning_prompt
)

PROMPT_SETTINGS = {
    'arxiv':{
        'desc': "Question: Which category from the list that the paper most likely belong to?",
        'categories': ['Artificial Intelligence', 'Computation and Language',
                        'Computational Complexity',
                        'Computational Engineering, Finance, and Science',
                        'Computational Geometry', 'Computer Science and Game Theory',
                        'Computer Vision and Pattern Recognition', 'Computers and Society',
                        'Cryptography and Security', 'Data Structures and Algorithms',
                        'Databases', 'Digital Libraries', 'Discrete Mathematics',
                        'Distributed, Parallel, and Cluster Computing',
                        'Emerging Technologies', 'Formal Languages and Automata Theory',
                        'General Literature', 'Graphics', 'Hardware Architecture',
                        'Human-Computer Interaction', 'Information Retrieval',
                        'Information Theory', 'Logic in Computer Science',
                        'Machine Learning', 'Mathematical Software', 'Multiagent Systems',
                        'Multimedia', 'Networking and Internet Architecture',
                        'Neural and Evolutionary Computing', 'Numerical Analysis',
                        'Operating Systems', 'Other Computer Science', 'Performance',
                        'Programming Languages', 'Robotics',
                        'Social and Information Networks', 'Software Engineering', 'Sound',
                        'Symbolic Computation', 'Systems and Control'],
        'question': "Given the title and abstract of a research paper, identify one category from a distinct list of research topics that you predict the paper will most likely belong to."
    },
    'pubmed':{
        'desc': "Question: Which category from the list that the paper most likely belong to?",
        'categories': ['Diabetes Mellitus Type 1', 'Diabetes Mellitus Type 2','Diabetes Mellitus, Experimental'],
        'question': "Given the keywords of a research paper, identify one category from a distinct list of research topics that you predict the paper will most likely belong to."
    },
    'aids':{
        'desc': "Question: Which category from the list that the input molecule most likely belong to?",
        'categories': ['HIV antiviral active compound', 'HIV antiviral inactive compound'],
        'question': "Given the atoms type and their connection structure of a compound, identify if the given compound is HIV antiviral active or not."
    },
}
task_name = 'prompt_tuning'
DATA_NAME = "text_graph_arxiv"
if 'pubmed' in DATA_NAME:
    PROMPT_SETTINGS_DICT = PROMPT_SETTINGS['pubmed']
elif 'aids' in DATA_NAME:
    PROMPT_SETTINGS_DICT = PROMPT_SETTINGS['aids']
elif 'arxiv' in DATA_NAME:
    PROMPT_SETTINGS_DICT = PROMPT_SETTINGS['arxiv']
else:
    raise ValueError(f'Currently does not support {DATA_NAME}')
desc, categories, question = PROMPT_SETTINGS_DICT['desc'], PROMPT_SETTINGS_DICT['categories'], PROMPT_SETTINGS_DICT['question']
input_item = Item(
    desc = desc,
    categories = categories,
    question = question
    )
hard_prompt = get_prompt_tuning_prompt(
    task_name = task_name,
    task_item = input_item
)
inputs = tokenizer(hard_prompt,return_tensors="pt")

In [13]:
hard_prompt

'### USER: Question: Which category from the list that the paper most likely belong to? \n\nBelows are 40 potential categories to consider:\nCategory [1](Artificial Intelligence) \nCategory [2](Computation and Language) \nCategory [3](Computational Complexity) \nCategory [4](Computational Engineering, Finance, and Science) \nCategory [5](Computational Geometry) \nCategory [6](Computer Science and Game Theory) \nCategory [7](Computer Vision and Pattern Recognition) \nCategory [8](Computers and Society) \nCategory [9](Cryptography and Security) \nCategory [10](Data Structures and Algorithms) \nCategory [11](Databases) \nCategory [12](Digital Libraries) \nCategory [13](Discrete Mathematics) \nCategory [14](Distributed, Parallel, and Cluster Computing) \nCategory [15](Emerging Technologies) \nCategory [16](Formal Languages and Automata Theory) \nCategory [17](General Literature) \nCategory [18](Graphics) \nCategory [19](Hardware Architecture) \nCategory [20](Human-Computer Interaction) \nC