### Import Packages

In [1]:
import json
import os
import re

import numpy as np
import pandas as pd
import torch
from peft import LoraConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from vllm import LLM, SamplingParams

# Expose only GPU index 0 to this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

### Merge Base Model and Adapter into a New Model (then restart the kernel)

In [2]:
# Path of the pretrained model downloaded from Hugging Face.
base_model_name = "/home/zhangwei/pretrained_models/llama-2-13b-chat-hf"
# Checkpoint directory of the fine-tuned adapter.
adaptor_name = "saved_models/llama2_13b_chat_qlora/train_200_lora_r64_lr1e-05/checkpoint-600"
# Output directory of the merged model. The adapter path is appended verbatim,
# so its "/" separators become nested sub-directories.
new_model_name = "saved_models/merged_models-" + adaptor_name
# NOTE(review): presumably places the whole model on device 0 -- confirm
# against the Transformers `device_map` documentation.
device_map = {"": 0}

In [4]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Attach the fine-tuned adapter to the base model, merge it into the base
# weights, and write the merged model to `new_model_name`.
model = PeftModel.from_pretrained(base_model, adaptor_name)
model = model.merge_and_unload()
model.save_pretrained(new_model_name, safe_serialization=False)

# Save the base model's tokenizer into the same directory as the merged
# weights. The EOS token is reused as the pad token and padding goes on the
# right.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.save_pretrained(new_model_name, safe_serialization=False)  # safe_serialization=False is very important, see https://github.com/vllm-project/vllm/issues/615

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 4096}


('saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_1060_lora_r64_lr1e-05/checkpoint-3710/tokenizer_config.json',
 'saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_1060_lora_r64_lr1e-05/checkpoint-3710/special_tokens_map.json',
 'saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_1060_lora_r64_lr1e-05/checkpoint-3710/tokenizer.model',
 'saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_1060_lora_r64_lr1e-05/checkpoint-3710/added_tokens.json',
 'saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_1060_lora_r64_lr1e-05/checkpoint-3710/tokenizer.json')

### Load Merged Model

In [3]:
# Decoding settings: temperature 0, top_p 1, max_tokens 3072, stop string '!!!'.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1,
    max_tokens=3072,
    stop=['!!!'],
)
# Build the vLLM engine from the merged checkpoint directory.
llm = LLM(model=new_model_name)

INFO 04-19 12:22:40 llm_engine.py:87] Initializing an LLM engine with config: model='saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_1060_lora_r64_lr1e-05/checkpoint-3710', tokenizer='saved_models/merged_models-saved_models/llama2_13b_chat_qlora/train_1060_lora_r64_lr1e-05/checkpoint-3710', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 04-19 12:23:01 llm_engine.py:357] # GPU blocks: 862, # CPU blocks: 327
INFO 04-19 12:23:02 model_runner.py:684] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-19 12:23:02 model_runner.py:688] CUDA graphs can take a

### Preprocess Data

In [3]:
# Read the test CSV (UTF-8) into a DataFrame.
test_file = "data/data_for_llms/test/test_300.csv"
test_df = pd.read_csv(
    test_file,
    encoding="utf-8",
)

def create_assistant_message(row):
    """Serialise one row's NMR fields into the compact JSON-like target string.

    Args:
        row: Object indexable by the seven field names listed below
            (e.g. a DataFrame row or a dict).

    Returns:
        A string of the form ``{"IUPAC":"...","1H NMR text":"...",...}`` with
        no whitespace between items. Values are interpolated as-is; no JSON
        escaping is applied.
    """
    field_names = (
        "IUPAC",
        "1H NMR text",
        "1H NMR conditions",
        "1H NMR data",
        "13C NMR text",
        "13C NMR conditions",
        "13C NMR data",
    )
    joined_pairs = ",".join(f'"{name}":"{row[name]}"' for name in field_names)
    return "{" + joined_pairs + "}"
    
# Build the reference answer for every row from its ground-truth columns.
test_df['NMRInfo'] = test_df.apply(create_assistant_message, axis=1)

# Column holding the input paragraph and column holding the reference answer.
source_text = "Paragraph"
target_text = "NMRInfo"
# Instruction placed in front of every paragraph. The former
# f'{source_text}2{target_text}: ' assignment was removed: it was a dead store,
# overwritten on the very next line.
instruction = '''Extract the NMR information from the Paragraph: '''
#instruction = '''Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as '2.1.3.', '(HL4)', '(9)', '(4d)'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The contents in the NMR data are only numbers, such as "13C NMR data": "174.0, 157.7", "1H NMR data": "174.0, 157.7". All responses must originate from information extracted from the given text, ensuring that the extracted content has not been modified or fragmented, and that capitalization and punctuation are exactly the same as the given text. Must end with {"IUPAC":"text","1H NMR text":"text","1H NMR conditions":"text","1H NMR data":"text","13C NMR text":"text","13C NMR conditions":"text","13C NMR data":"text"} format reply.'''

# Wrap each paragraph as "<s>[INST] <instruction><paragraph> [/INST]".
prompt_series = f'<s>[INST] {instruction}' + test_df[source_text] + " [/INST]"
test_df['text'] = prompt_series

# Plain Python list of prompt strings; preview the first five.
prompts = test_df['text'].tolist()
prompts[:5]

['<s>[INST] Extract the NMR information from the Paragraph: Synthesis and crystallization A stirred solution of 3,5-bis\xad(tri\xadfluoro\xadmeth\xadyl) aceto\xadphenone (0.5\u2005g, 1.95\u2005mmol) in acetic acid (5\u2005mL) was added dropwise to bromine (0.312\u2005g, 1.95\u2005mmol) in acetic acid. The reaction medium was stirred at room temperature for 5\u2005h. To the resulting mixture, water (5\u2005mL) was added and the mixture was concentrated under reduced pressure. The residue obtained was diluted with ethyl\xadacetate (10\u2005mL), the organic layer washed with water (10\u2005mL) and a sodium bicarbonate solution (5\u2005mL), and filtered through dried sodium sulfate and evaporated to obtain 1-(3,5-bis\xad(tri\xadfluoro\xadmeth\xadyl)phen\xadyl)-2-bromo\xadethanone as a light-yellow solid in 62% yield. m.p: 317–318\u2005K. 1H NMR: (CDCl3, 600\u2005MHz): 8.44 (2H, s), 8.13 (1H, s), 4.48 (2H, s); 13C NMR: (CDCl3, 150\u2005MHz): 188.81, 135.31, 133.06, 132.83, 132.60, 128.99, 1

### Inference

In [None]:
# Generate completions for all prompts.
# The result is a list of RequestOutput objects, each holding the prompt,
# the generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Echo every prompt/completion pair and keep the whitespace-trimmed completion.
predictions = []
for request_output in outputs:
    completion = request_output.outputs[0].text
    print(f"Prompt: {request_output.prompt},\nGenerated text: {completion!r}")
    predictions.append(completion.strip())

### Save the Predictions

In [None]:
# One row per prompt: model output, reference answer, and source paragraph.
pred_df = pd.DataFrame({'Generated Text': predictions}).assign(
    **{
        'Actual Text': test_df[target_text],
        'Paragraph': test_df[source_text],
    }
)

def safe_json_loads(val):
    """Parse a generated answer into a dict of NMR fields, tolerating bad JSON.

    Parsing is attempted in this order:
      1. ``val`` as-is, so valid JSON containing apostrophes (e.g. "2'-deoxy...")
         is not corrupted;
      2. ``val`` with every single quote replaced by a double quote, to accept
         Python-dict-style output;
      3. a regex scan for the seven known ``"key":"value"`` pairs, with any
         missing key filled with ``'N/A'``.

    Args:
        val: The generated text. ``None`` and NaN are treated as "no answer";
            any other value is converted with ``str()`` before parsing.

    Returns:
        dict: ``{}`` for ``None``/NaN; the parsed JSON object when parsing
        yields a dict; otherwise the regex-recovered dict containing all seven
        keys.
    """
    # Missing value: `is np.nan` only matched the singleton, so test by type/value.
    if val is None or (isinstance(val, float) and np.isnan(val)):
        return {}

    text = str(val)

    for candidate in (text, text.replace("'", "\"")):
        try:
            parsed = json.loads(candidate)
        except (ValueError, TypeError):  # json.JSONDecodeError is a ValueError
            continue
        # A bare number/list/string is valid JSON but not a field mapping;
        # fall through to the regex recovery instead of returning it.
        if isinstance(parsed, dict):
            return parsed

    pattern = r'"(IUPAC|1H NMR text|1H NMR conditions|1H NMR data|13C NMR text|13C NMR conditions|13C NMR data)":"(.*?)"'
    result = dict(re.findall(pattern, text))

    keys = ["IUPAC", "1H NMR text", "1H NMR conditions", "1H NMR data", "13C NMR text", "13C NMR conditions", "13C NMR data"]

    for key in keys:
        result.setdefault(key, 'N/A')
    return result

# Parse each generated answer into a dict, expand the dicts into one column per
# field with json_normalize, and concat them with the remaining columns.
parsed_fields = pred_df['Generated Text'].apply(safe_json_loads)
pred_df['Generated Text'] = parsed_fields
pred_df = pd.concat(
    [pred_df.drop(columns='Generated Text'), pd.json_normalize(pred_df['Generated Text'])],
    axis=1,
)
pred_df = pred_df.fillna('N/A')

# Make sure the output directory exists; to_csv does not create it.
output_dir = "results/predictions"
os.makedirs(output_dir, exist_ok=True)
pred_df.to_csv(f"{output_dir}/prediction_of_{new_model_name.replace('/', '-')}.csv", index=False)
pred_df