### Import Packages

In [1]:
import os
import torch
import pandas as pd
from vllm import LLM, SamplingParams
from transformers import  AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from peft import LoraConfig, PeftModel

# Environment setup: expose only GPU index "0" to CUDA for this kernel.
# NOTE(review): this runs after `import torch`; presumably it still takes effect
# because CUDA has not been initialised at this point -- confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

### Merge Base Model and Adaptor into New Model (Then Restart the kernel)

In [2]:
# Local directory of the pretrained base model (downloaded from Hugging Face).
base_model_name = "/home/zhangwei/pretrained_models/llama-2-13b-chat-hf"

# Directory of the fine-tuned adapter checkpoint that is merged into the base model.
adaptor_name = "saved_models/llama2_13b_chat_qlora/train_6163_lora_r64_lr1e-05/checkpoint-3082"

# Where the merged model is written. The adapter path is appended verbatim, so its
# "/" separators make this a nested directory below "saved_models/".
new_model_name = "saved_models/merged_models-" + adaptor_name

# Passed as `device_map` when the base model is loaded.
# NOTE(review): the empty-string key presumably places the whole model on device 0 -- confirm.
device_map = {"": 0}

In [4]:
# Load the base model in FP16, fold the LoRA adapter weights into it, and write the
# merged model together with its tokenizer to `new_model_name`.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Attach the adapter to the base model, then merge it in and drop the PEFT wrapper.
model = PeftModel.from_pretrained(base_model, adaptor_name).merge_and_unload()

# The original note marks safe_serialization=False as very important and points to
# https://github.com/vllm-project/vllm/issues/615
model.save_pretrained(new_model_name, safe_serialization=False)

# The tokenizer comes from the base model; EOS is reused as the padding token and
# padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token, tokenizer.padding_side = tokenizer.eos_token, "right"

# NOTE(review): safe_serialization is presumably ignored by the tokenizer's
# save_pretrained (it is a model-weights option) -- confirm before removing it.
tokenizer.save_pretrained(new_model_name, safe_serialization=False)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 4096}


('saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_6163_lora_r64_lr1e-05/checkpoint-3082/tokenizer_config.json',
 'saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_6163_lora_r64_lr1e-05/checkpoint-3082/special_tokens_map.json',
 'saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_6163_lora_r64_lr1e-05/checkpoint-3082/tokenizer.model',
 'saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_6163_lora_r64_lr1e-05/checkpoint-3082/added_tokens.json',
 'saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_6163_lora_r64_lr1e-05/checkpoint-3082/tokenizer.json')

### Load Merged Model

In [3]:
# Decoding settings: temperature 0 with top_p 1, at most 3072 new tokens, and
# generation stops when the string '!!!' is produced.
# NOTE(review): the engine log below reports max_seq_len=2048, which is smaller than
# max_tokens=3072 -- confirm the intended generation limit.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1,
    max_tokens=3072,
    stop=['!!!'],
)

# Build the vLLM engine from the merged model directory.
llm = LLM(model=new_model_name)

INFO 04-22 07:43:43 llm_engine.py:79] Initializing an LLM engine with config: model='saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_6163_lora_r64_lr1e-05/checkpoint-3082', tokenizer='saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_6163_lora_r64_lr1e-05/checkpoint-3082', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


INFO 04-22 07:44:17 llm_engine.py:337] # GPU blocks: 862, # CPU blocks: 327
INFO 04-22 07:44:33 model_runner.py:666] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-22 07:44:33 model_runner.py:670] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 04-22 07:45:31 model_runner.py:738] Graph capturing finished in 58 secs.


### Preprocess Data

In [7]:
# Data loading and prompt construction: build one "[INST] ... [/INST]" prompt per
# row of the test set.
test_df = pd.read_csv("data/prod/test.csv")
source_text = "input"    # column with the raw paragraph fed to the model
target_text = "output"   # column with the reference text (saved later as 'Actual Text')

# Instruction prepended to every paragraph. (A previous placeholder assignment,
# f"{source_text}2{target_text}: ", was dead code -- it was overwritten immediately --
# and has been removed.)
# NOTE(review): the saved cell output shows "annotate the productions in the paragraph."
# while this string says "products" -- confirm which wording matches fine-tuning.
instruction = "annotate the products in the paragraph. "

# NOTE(review): the literal "<s>" is part of the prompt text; if the inference
# tokenizer also prepends a BOS token the prompt may start with two -- confirm this
# matches the training format.
test_df['text'] = f'<s>[INST] {instruction}' + test_df[source_text] + " [/INST]"
prompts = test_df['text'].tolist()
prompts[:5]

['<s>[INST] annotate the productions in the paragraph. The additional ring which arises from an intramolecular HDA reaction may be useful for the synthesis of triquinanes or other polycyclic compounds. [/INST]',
 '<s>[INST] annotate the productions in the paragraph. The decrease in entropy associated with tethering the two reactive components suggests that the reaction would be significantly more facile than the intermolecular reaction.25 However , this potential rate enhancement is com- promised by the dramatic decrease in rate associated with intermolecular cycloadditions with substituted norbomadienes as described previously. [/INST]',
 '<s>[INST] annotate the productions in the paragraph. There were no reported examples of successful intramolecular HDA reactions in the literature prior to 1992.5d,26 In an intramolecular reaction , there are two possible modes of [ 2n + 227 + 2271 cycloaddition which have to be considered ( Scheme 3 ). [/INST]',
 '<s>[INST] annotate the productions 

### Inference

In [8]:
# Run batched generation over all prompts. `llm.generate` returns a list of
# RequestOutput objects carrying the prompt, the generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Echo every prompt next to its completion (repr form, so whitespace is visible).
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt},\nGenerated text: {generated_text!r}")

# Keep the first completion of each request, with surrounding whitespace removed,
# in the same order as `prompts`.
predictions = [request.outputs[0].text.strip() for request in outputs]

Processed prompts: 100%|██████████| 723/723 [00:29<00:00, 24.38it/s]

Prompt: <s>[INST] annotate the productions in the paragraph. The additional ring which arises from an intramolecular HDA reaction may be useful for the synthesis of triquinanes or other polycyclic compounds. [/INST],
Generated text: ' The additional ring which arises from an intramolecular HDA reaction may be useful for the synthesis of triquinanes or other polycyclic compounds.'
Prompt: <s>[INST] annotate the productions in the paragraph. The decrease in entropy associated with tethering the two reactive components suggests that the reaction would be significantly more facile than the intermolecular reaction.25 However , this potential rate enhancement is com- promised by the dramatic decrease in rate associated with intermolecular cycloadditions with substituted norbomadienes as described previously. [/INST],
Generated text: ' The decrease in entropy associated with tethering the two reactive components suggests that the reaction would be significantly more facile than the intermolec




### Save the Predictions

In [9]:
# Collect predictions next to the reference text, the source paragraph and the BIO
# labels, then write them to a CSV named after the merged model.

# Guard against silent misalignment: every test row must have exactly one prediction.
assert len(predictions) == len(test_df), "expected one prediction per test row"

pred_df = pd.DataFrame({
    'Generated Text': predictions,
    'Actual Text': test_df[target_text],
    'Paragraph': test_df[source_text],
    'BIO Label': test_df['bio_label'],
})

# "/" in the model path is replaced by "-" so the name is a single file name.
output_path = f"results/predictions/prediction_of_{new_model_name.replace('/', '-')}.csv"

# Create the output directory if needed; to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False: do not write the row index as a CSV column.
pred_df.to_csv(output_path, index=False)
pred_df

Unnamed: 0,Generated Text,Actual Text,Paragraph,BIO Label
0,The additional ring which arises from an intra...,The additional ring which arises from an intra...,The additional ring which arises from an intra...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,The decrease in entropy associated with tether...,The decrease in entropy associated with tether...,The decrease in entropy associated with tether...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,There were no reported examples of successful ...,There were no reported examples of successful ...,There were no reported examples of successful ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,The dienophile in the tether can cyclize on Ca...,The dienophile in the tether can cyclize on Ca...,The dienophile in the tether can cyclize on Ca...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Molecular models and MM2 calculations indicate...,Molecular models and MM2 calculations indicate...,Molecular models and MM2 calculations indicate...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
...,...,...,...,...
718,The '*O isotopic data at hand indicate that th...,The '*O isotopic data at hand indicate that th...,The '*O isotopic data at hand indicate that th...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
719,What can be ascertained from the peak position...,What can be ascertained from the peak position...,What can be ascertained from the peak position...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
720,"Thus , for the good resonance amides 5 and 6c ...","Thus , for the good resonance amides 5 and 6c ...","Thus , for the good resonance amides 5 and 6c ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
721,This is in accordance with expectations based ...,This is in accordance with expectations based ...,This is in accordance with expectations based ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


: 