### Import Packages

In [1]:
import os
import torch
import pandas as pd
from vllm import LLM, SamplingParams
from transformers import  AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from peft import LoraConfig, PeftModel

# Restrict this process to the GPU with index 1.
# NOTE(review): this runs after `import torch`; it is presumably still effective
# because nothing has initialised CUDA yet — confirm, or move it above the imports.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

### Load Fine-Tuned Model

In [2]:
# Path of the fine-tuned checkpoint that vLLM loads as the model.
new_model_name = "saved_models/Mistral-7B-Instruct-v0.2/train_599_lr5e-06_bs2/checkpoint-1800"

# Sampling configuration: temperature 0 with top_p 1, at most 3072 new tokens,
# and generation halts when the stop string '!!!' is produced.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1,
    max_tokens=3072,
    stop=['!!!'],
)

# Build the engine without tensor parallelism (tensor_parallel_size=1).
llm = LLM(
    model=new_model_name,
    tensor_parallel_size=1,
)

INFO 04-22 16:58:44 llm_engine.py:79] Initializing an LLM engine with config: model='saved_models/Mistral-7B-Instruct-v0.2/train_599_lr5e-06_bs2/mixed_checkpoint-1800-2100-2400-2700-3000', tokenizer='saved_models/Mistral-7B-Instruct-v0.2/train_599_lr5e-06_bs2/mixed_checkpoint-1800-2100-2400-2700-3000', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 04-22 16:58:55 llm_engine.py:337] # GPU blocks: 9054, # CPU blocks: 2048
INFO 04-22 16:58:57 model_runner.py:666] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-22 16:58:57 model_runner.py:670] CUDA graphs can take additi

### Preprocess Data

In [3]:
# Data loading and prompt construction.
# `source_text` / `target_text` name the CSV columns that the later cells use
# as the input paragraph and the reference annotation.
test_df = pd.read_csv("data/role/test.csv")
source_text = "input"
target_text = "output"

# Instruction prepended to every paragraph. The earlier
# f"{source_text}2{target_text}: " assignment was overwritten before any use,
# so it has been dropped.
instruction = "annotate the reaction roles in the paragraph. "

# Wrap each paragraph in the [INST] ... [/INST] template.
# NOTE(review): the literal '<s>' may duplicate a BOS token added by the
# tokenizer — confirm this matches the format used during fine-tuning.
test_df['text'] = f'<s>[INST] {instruction}' + test_df[source_text] + " [/INST]"
prompts = test_df['text'].tolist()

# Preview the first few prompts as the cell output.
prompts[:5]

['<s>[INST] annotate the reaction roles in the paragraph. Reaction of diphenylacetylene with complex 19A led to only <Prod*cycloheptadienone*Prod> 23A in 30 % yield ; with (phenylcyclopropy1)- carbene complex 19B , cycloheptadienone 25 was produced in 53 % yield. [/INST]',
 '<s>[INST] annotate the reaction roles in the paragraph. Reaction of diphenylacetylene with complex 19A led to only cycloheptadienone <Prod*23A*Prod> in 30 % yield ; with (phenylcyclopropy1)- carbene complex 19B , cycloheptadienone 25 was produced in 53 % yield. [/INST]',
 '<s>[INST] annotate the reaction roles in the paragraph. Reaction of diphenylacetylene with complex 19A led to only cycloheptadienone 23A in 30 % yield ; with (phenylcyclopropy1)- carbene complex 19B , cycloheptadienone <Prod*25*Prod> was produced in 53 % yield. [/INST]',
 '<s>[INST] annotate the reaction roles in the paragraph. We were excited to ﬁnd that , with 2.0 equiv of copper acetate and DMSO as the solvent , <Prod*2-(3-(methylthio)naphthal

### Inference

In [4]:
# Run generation for every prompt in one call.
# The result is a list of RequestOutput objects carrying the prompt, the
# generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Echo each prompt next to the repr of its first completion.
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    print(f"Prompt: {prompt},\nGenerated text: {generated_text!r}")

# Keep the whitespace-stripped first completion of each request, in request order.
predictions = [output.outputs[0].text.strip() for output in outputs]

Processed prompts:   0%|          | 0/111 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 111/111 [00:06<00:00, 16.60it/s]

Prompt: <s>[INST] annotate the reaction roles in the paragraph. Reaction of diphenylacetylene with complex 19A led to only <Prod*cycloheptadienone*Prod> 23A in 30 % yield ; with (phenylcyclopropy1)- carbene complex 19B , cycloheptadienone 25 was produced in 53 % yield. [/INST],
Generated text: ' Reaction of <Reactants*diphenylacetylene*Reactants> with complex <Reactants*19A*Reactants> led to only <Prod*cycloheptadienone*Prod> 23A in <Yield*30 %*Yield> yield ; with (phenylcyclopropy1)- carbene complex 19B , cycloheptadienone 25 was produced in 53 % yield.'
Prompt: <s>[INST] annotate the reaction roles in the paragraph. Reaction of diphenylacetylene with complex 19A led to only cycloheptadienone <Prod*23A*Prod> in 30 % yield ; with (phenylcyclopropy1)- carbene complex 19B , cycloheptadienone 25 was produced in 53 % yield. [/INST],
Generated text: ' Reaction of <Reactants*diphenylacetylene*Reactants> with complex <Reactants*19A*Reactants> led to only cycloheptadienone <Prod*23A*Prod> in <




### Save the Predictions

In [5]:
# Collect the model output next to the reference annotation, the source
# paragraph, and the BIO labels from the test set.
pred_df = pd.DataFrame({'Generated Text': predictions})
pred_df['Actual Text'] = test_df[target_text]
pred_df['Paragraph'] = test_df[source_text]
pred_df['BIO Label'] = test_df['bio_label']

# Create the output directory first so the write does not fail when
# results/predictions does not exist yet.
predictions_dir = "results/predictions"
os.makedirs(predictions_dir, exist_ok=True)

# Slashes in the model path are replaced so it forms a single file name;
# index=False keeps the row index out of the CSV.
pred_df.to_csv(
    f"{predictions_dir}/prediction_of_{new_model_name.replace('/', '-')}.csv",
    index=False,
)
pred_df

Unnamed: 0,Generated Text,Actual Text,Paragraph,BIO Label
0,Reaction of <Reactants*diphenylacetylene*React...,Reaction of <Reactants*diphenylacetylene*React...,Reaction of diphenylacetylene with complex 19A...,"['O', 'O', 'B-Reactants', 'O', 'O', 'B-Reactan..."
1,Reaction of <Reactants*diphenylacetylene*React...,Reaction of <Reactants*diphenylacetylene*React...,Reaction of diphenylacetylene with complex 19A...,"['O', 'O', 'B-Reactants', 'O', 'O', 'B-Reactan..."
2,Reaction of diphenylacetylene with complex 19A...,Reaction of <Reactants*diphenylacetylene*React...,Reaction of diphenylacetylene with complex 19A...,"['O', 'O', 'B-Reactants', 'O', 'O', 'O', 'O', ..."
3,"We were excited to ﬁnd that , with 2.0 equiv o...","We were excited to ﬁnd that , with 2.0 equiv o...","We were excited to ﬁnd that , with 2.0 equiv o...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,"We were excited to ﬁnd that , with 2.0 equiv o...","We were excited to ﬁnd that , with 2.0 equiv o...","We were excited to ﬁnd that , with 2.0 equiv o...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
...,...,...,...,...
106,The metal - mediated conversion of terminal al...,The metal - mediated conversion of terminal al...,The metal - mediated conversion of terminal al...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
107,The metal - mediated conversion of terminal al...,The metal - mediated conversion of terminal al...,The metal - mediated conversion of terminal al...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
108,"For example , <Reaction*enantioselective depro...","For example , <Reaction*enantioselective depro...","For example , enantioselective deproto- nation...","['O', 'O', 'O', 'B-Reaction', 'I-Reaction', 'I..."
109,"For example , <Reaction*enantioselective depro...","For example , <Reaction*enantioselective depro...","For example , enantioselective deproto- nation...","['O', 'O', 'O', 'B-Reaction', 'I-Reaction', 'I..."


: 