### Import Packages

In [1]:
import os
import re
import json
import torch
import pandas as pd
from pandas import json_normalize
import numpy as np
from vllm import LLM, SamplingParams
from transformers import  AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from peft import LoraConfig, PeftModel

# Setup environment
# Expose only GPU index 0 to this process.
# NOTE(review): this assignment runs after `import torch` / `import vllm`; it presumably
# only takes effect if CUDA has not been initialized yet — confirm, or move it above the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Load Fine-Tuned Model

In [2]:
# Fine-tuned model checkpoints that have been evaluated with this notebook.
# The trailing note on each entry is the original author's remark about the best checkpoint.
CANDIDATE_CHECKPOINTS = (
    "saved_models/Meta-Llama-3-8B-Instruct/train_200_lr5e-06_bs2/checkpoint-600",  # 400 best
    "saved_models/Mistral-7B-Instruct-v0.2/train_200_lr5e-06_bs2/checkpoint-400",  # 200 best
    "saved_models/Meta-Llama-3-8B-Instruct/train_200_without_prompt_lr5e-06_bs2/checkpoint-400",  # 400 best
    "saved_models/Mistral-7B-Instruct-v0.2/train_200_without_prompt_lr5e-06_bs2/checkpoint-200",  # 200 best
)

# The last entry is the checkpoint actually loaded for this run.
new_model_name = CANDIDATE_CHECKPOINTS[-1]

# temperature=0 with top_p=1 requests deterministic decoding; generation halts at the
# '!!!' stop string or after 4096 new tokens, whichever comes first.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1,
    max_tokens=4096,
    stop=['!!!'],
)

# Single-GPU vLLM engine (no tensor parallelism).
llm = LLM(model=new_model_name, tensor_parallel_size=1)

INFO 04-25 02:53:52 llm_engine.py:79] Initializing an LLM engine with config: model='saved_models/Mistral-7B-Instruct-v0.2/train_200_without_prompt_lr5e-06_bs2/checkpoint-200', tokenizer='saved_models/Mistral-7B-Instruct-v0.2/train_200_without_prompt_lr5e-06_bs2/checkpoint-200', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 04-25 02:54:04 llm_engine.py:337] # GPU blocks: 9054, # CPU blocks: 2048
INFO 04-25 02:54:05 model_runner.py:666] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-25 02:54:05 model_runner.py:670] CUDA graphs can take additional 1~3 GiB memory per

### Preprocess Data (Ensure the instruction is the same as the one used in training)

In [3]:
# Data Loading and Preprocessing
# Read the test split as UTF-8. Later cells read the `Paragraph` column (model input)
# and the IUPAC / 1H NMR / 13C NMR columns (reference annotation) from this frame.
test_file = "data/data_for_llms/test/test_300.csv"
test_df = pd.read_csv(test_file, encoding='utf-8')

def create_assistant_message(row):
    """Serialize one annotated row into the compact JSON-like target string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the seven
            field names listed below.

    Returns:
        A string of the form ``{"IUPAC":"...","1H NMR text":"...",...}`` with the
        fields in a fixed order and no whitespace between entries. Values are
        interpolated as-is (no JSON escaping), so the result is only valid JSON
        when the values contain no double quotes or backslashes.
    """
    fields = (
        "IUPAC",
        "1H NMR text",
        "1H NMR conditions",
        "1H NMR data",
        "13C NMR text",
        "13C NMR conditions",
        "13C NMR data",
    )
    pairs = [f'"{field}":"{row[field]}"' for field in fields]
    return "{" + ",".join(pairs) + "}"
    
# Reference answer for each row, serialized the same way as the training targets.
test_df['NMRInfo'] = test_df.apply(create_assistant_message, axis=1)

# Column holding the model input and column holding the expected output.
source_text, target_text = "Paragraph", "NMRInfo"

# Short instruction prepended to every paragraph. The long, detailed instruction is
# kept below (commented out) as the alternative prompt variant.
instruction = 'Extract the NMR information from the Paragraph: '
#instruction = '''Extract text containing 1H NMR and 13C NMR data, remove interference information such as reactants, raw materials, solvents and other non-final product names based on text semantics, and then extract the name, code or number of the final product. Please delete the IUPAC name Alias, numbers and ordinal numbers before and after fields, such as '2.1.3.', '(HL4)', '(9)', '(4d)'. NMR text should contain complete information, such as instrument power and solvent information, For example, "13C NMR text": "13C NMR (400 MHz, acetone-d6) 174.0 (C), 157.7 (C). Then split the NMR text. The content in NMR conditions is NMR instrument power and solvent information, such as "13C NMR conditions": "400MHz, acetone-d6". The contents in the NMR data are only numbers, such as "13C NMR data": "174.0, 157.7", "1H NMR data": "174.0, 157.7". All responses must originate from information extracted from the given text, ensuring that the extracted content has not been modified or fragmented, and that capitalization and punctuation are exactly the same as the given text. Must end with {"IUPAC":"text","1H NMR text":"text","1H NMR conditions":"text","1H NMR data":"text","13C NMR text":"text","13C NMR conditions":"text","13C NMR data":"text"} format reply.'''

# Wrap instruction + paragraph in the [INST] ... [/INST] chat template.
test_df['text'] = '<s>[INST] ' + instruction + test_df[source_text] + " [/INST]"

prompts = test_df['text'].tolist()
prompts[:5]

['<s>[INST] Extract the NMR information from the Paragraph: Synthesis and crystallization A stirred solution of 3,5-bis\xad(tri\xadfluoro\xadmeth\xadyl) aceto\xadphenone (0.5\u2005g, 1.95\u2005mmol) in acetic acid (5\u2005mL) was added dropwise to bromine (0.312\u2005g, 1.95\u2005mmol) in acetic acid. The reaction medium was stirred at room temperature for 5\u2005h. To the resulting mixture, water (5\u2005mL) was added and the mixture was concentrated under reduced pressure. The residue obtained was diluted with ethyl\xadacetate (10\u2005mL), the organic layer washed with water (10\u2005mL) and a sodium bicarbonate solution (5\u2005mL), and filtered through dried sodium sulfate and evaporated to obtain 1-(3,5-bis\xad(tri\xadfluoro\xadmeth\xadyl)phen\xadyl)-2-bromo\xadethanone as a light-yellow solid in 62% yield. m.p: 317–318\u2005K. 1H NMR: (CDCl3, 600\u2005MHz): 8.44 (2H, s), 8.13 (1H, s), 4.48 (2H, s); 13C NMR: (CDCl3, 150\u2005MHz): 188.81, 135.31, 133.06, 132.83, 132.60, 128.99, 1

### Inference

In [4]:
# Run batched generation over all prompts.
# llm.generate returns one RequestOutput per prompt, carrying the prompt and its completions.
outputs = llm.generate(prompts, sampling_params)

# Echo every prompt together with the raw (repr'd) first completion.
for output in outputs:
    print(f"Prompt: {output.prompt},\nGenerated text: {output.outputs[0].text!r}")

# Keep the whitespace-stripped first completion of each request, in prompt order.
predictions = [output.outputs[0].text.strip() for output in outputs]

Processed prompts: 100%|██████████| 300/300 [01:47<00:00,  2.80it/s]

Prompt: <s>[INST] Extract the NMR information from the Paragraph: Synthesis and crystallization A stirred solution of 3,5-bis­(tri­fluoro­meth­yl) aceto­phenone (0.5 g, 1.95 mmol) in acetic acid (5 mL) was added dropwise to bromine (0.312 g, 1.95 mmol) in acetic acid. The reaction medium was stirred at room temperature for 5 h. To the resulting mixture, water (5 mL) was added and the mixture was concentrated under reduced pressure. The residue obtained was diluted with ethyl­acetate (10 mL), the organic layer washed with water (10 mL) and a sodium bicarbonate solution (5 mL), and filtered through dried sodium sulfate and evaporated to obtain 1-(3,5-bis­(tri­fluoro­meth­yl)phen­yl)-2-bromo­ethanone as a light-yellow solid in 62% yield. m.p: 317–318 K. 1H NMR: (CDCl3, 600 MHz): 8.44 (2H, s), 8.13 (1H, s), 4.48 (2H, s); 13C NMR: (CDCl3, 150 MHz): 188.81, 135.31, 133.06, 132.83, 132.60, 128.99, 127.08, 127.06, 125.42, 123.61, 121.80, 120.00, 29.46.  Refinement Crystal data, data collection




### Save the Predictions

In [5]:
# One row per prompt: raw model output, reference answer, and the source paragraph.
# The two test_df columns are aligned to the predictions by index label.
pred_df = pd.DataFrame({'Generated Text': predictions}).assign(
    **{
        'Actual Text': test_df[target_text],
        'Paragraph': test_df[source_text],
    }
)

def safe_json_loads(val):
    """Best-effort conversion of a generated answer into a dict of NMR fields.

    Parsing strategy, in order:
      1. Missing values (None, NaN, pd.NA) yield an empty dict.
      2. The text is parsed as JSON unchanged, so apostrophes inside values
         (e.g. "2'-deoxy...") survive intact.
      3. If that fails, single quotes are swapped for double quotes and the
         text is parsed again (handles Python-dict-style output).
      4. If no JSON object can be obtained, the seven known fields are pulled
         out with a regex; any field that is not found is set to 'N/A'.

    Args:
        val: The generated text (any object; it is converted with ``str``).

    Returns:
        dict: The parsed JSON object, or the regex-recovered dict containing
        exactly the seven known keys, or ``{}`` for a missing value.
    """
    keys = ["IUPAC", "1H NMR text", "1H NMR conditions", "1H NMR data", "13C NMR text", "13C NMR conditions", "13C NMR data"]

    # Identity comparison with np.nan misses other NaN objects, so test by value.
    if val is None or val is pd.NA or (isinstance(val, float) and np.isnan(val)):
        return {}

    text = str(val)

    # Try the untouched text first; only fall back to quote replacement when needed.
    for candidate in (text, text.replace("'", "\"")):
        try:
            parsed = json.loads(candidate)
        except (ValueError, TypeError):  # json.JSONDecodeError is a ValueError
            continue
        # Scalars or lists are valid JSON but useless downstream; keep looking.
        if isinstance(parsed, dict):
            return parsed

    # Malformed output: salvage whatever "key":"value" pairs are still recognisable.
    pattern = r'"(IUPAC|1H NMR text|1H NMR conditions|1H NMR data|13C NMR text|13C NMR conditions|13C NMR data)"\s*:\s*"(.*?)"'
    result = dict(re.findall(pattern, text))

    for key in keys:
        result.setdefault(key, 'N/A')
    return result

# Parse every generated answer into a dict, then expand the dicts into one column per
# field with json_normalize and attach those columns to the remaining ones.
pred_df['Generated Text'] = pred_df['Generated Text'].apply(safe_json_loads)
pred_df = pd.concat([pred_df.drop('Generated Text', axis=1), pd.json_normalize(pred_df['Generated Text'])], axis=1)

# Fields absent from a given answer become 'N/A' rather than NaN.
pred_df = pred_df.fillna('N/A')

# Create the output directory first so a missing folder cannot discard the
# results of the inference run when to_csv is called.
output_dir = "results/predictions"
os.makedirs(output_dir, exist_ok=True)

# The model path is flattened ('/' -> '-') so it can be used as a file name.
pred_df.to_csv(f"{output_dir}/prediction_of_{new_model_name.replace('/', '-')}.csv", index=False)
pred_df

Unnamed: 0,Actual Text,Paragraph,IUPAC,1H NMR text,1H NMR conditions,1H NMR data,13C NMR text,13C NMR conditions,13C NMR data,19F NMR text,19F NMR conditions,19F NMR data
0,"{""IUPAC"":""1-(3,5-bis(trifluoromethyl)phenyl)-2...",Synthesis and crystallization A stirred soluti...,"1-(3,5-bis-CF3phenyl)-2-bromoethanone","1H NMR: (CDCl3, 600 MHz): 8.44 (2H, s), 8.13 (...","CDCl3, 600 MHz","8.44, 8.13, 4.48","13C NMR: (CDCl3, 150 MHz): 188.81, 135.31, 133...","CDCl3, 150 MHz","188.81, 135.31, 133.06, 132.83, 132.60, 128.99...",,,
1,"{""IUPAC"":""N-(diaminomethylidene)-4-[(E)-(4-hyd...",Synthesis of N-(diaminomethylidene)-4-[(E)-(4-...,N-(diaminomethylidene)-4-[(E)-(4-hydroxyphenyl...,"1H-NMR (500 MHz, DMSO-d6) δ (ppm): 6.77 (4H, b...","500 MHz, DMSO-d6","6.77, 6.9, 7.8, 7.9","13C-NMR (125 MHz, DMSO-d6) δ: 161.6, 158.1, 15...","125 MHz, DMSO-d6","161.6, 158.1, 153.2, 145.1, 126.8, 125.2, 116.0",,,
2,"{""IUPAC"":""1-(3,4-Dimethylphenyl)-5-phenyl-1H-p...","1-(3,4-Dimethylphenyl)-5-phenyl-1H-pyrazole-3,...","1-(3,4-Dimethylphenyl)-5-phenyl-1H-pyrazole-3,...","1H NMR (400 MHz, CDCl3) δ (ppm) 11.6 (br.s, 2H...","400 MHz, CDCl3","11.6, 8.1, 7.9, 7.6, 7.1, 2.1, 1.8","13C NMR (100 MHz, CDCl3) δ (ppm) 171.3, 167.9 ...","100 MHz, CDCl3","171.3, 167.9, 145.2, 142.1, 135.0, 134.0, 132....",,,
3,"{""IUPAC"":""[(M)-d-4]-C12-TEG"",""1H NMR text"":""1H...",Synthesis of [(M)-d-4]-C12-TEG Under an argon ...,[(M)-d-4]-C12-TEG,"1H NMR (400 MHz, CDCl3): δ 0.86 (9H, t, J = 6....","400 MHz, CDCl3","0.86, 1.25–1.52, 1.72–1.89, 1.97, 2.00, 3.37, ...","13C NMR (100 MHz, CDCl3): δ 14.1, 22.6, 23.2, ...","100 MHz, CDCl3","14.1, 22.6, 23.2, 25.96, 26.02, 28.7, 29.2, 29...",,,
4,"{""IUPAC"":""3-Amino-5-(1-methyl-1H-pyrazol-4-yl)...",4.1 3-Amino-5-(1-methyl-1H-pyrazol-4-yl)pyridi...,3-Amino-5-(1-methyl-1H-pyrazol-4-yl)pyridin-2(...,"1H NMR (500 MHz, DMSO-d6) 3.81 (3H, s), 5.09 (...","500 MHz, DMSO-d6","3.81, 5.09, 6.63, 6.83, 7.59, 7.83, 11.38","13C NMR (126 MHz, DMSO-d6) 157.3 (C), 139.2 (C...","126 MHz, DMSO-d6","157.3, 139.2, 135.6, 127.0, 119.7, 115.8, 112....",,,
...,...,...,...,...,...,...,...,...,...,...,...,...
295,"{""IUPAC"":""(S)-5-Benzyl 1-tert-butyl 2-(3-((S)-...","(S)-5-Benzyl 1-tert-butyl 2-(3-((S)-1,5-di-te...","(S)-5-Benzyl 1-tert-butyl 2-(3-((S)-1,5-di-ter...","1H NMR (400 MHz, CDCl3) δ 7.33 (m, 5H), 5.10–5...","400 MHz, CDCl3","7.33, 5.10–5.04, 4.38–4.28, 2.51–2.37, 2.32–2....","13C NMR (100 MHz, CDCl3) δ 172.9, 172.5, 172.0...","100 MHz, CDCl3","172.9, 172.5, 172.0, 171.9, 156.8, 135.8, 128....",,,
296,"{""IUPAC"":""Methyl (E)-2-((1H-imidazol-1-yl)meth...",2.1.1. Synthesis of Methyl (E)-2-((1H-imidazol...,Methyl (E)-2-((1H-imidazol-1-yl)methyl)-3-(4-c...,"1H NMR (500MHz, CDCl3) δ 7.96 (s, 1 H), 7.45 (...","500MHz, CDCl3","7.96, 7.45, 7.39, 7.24, 7.01, 6.84, 4.94, 3.80","13C NMR (125 MHz, CDCl3) δ 166.9, 143.7, 137.2...","125 MHz, CDCl3","166.9, 143.7, 137.2, 136.1, 132.5, 130.4, 129....",,,
297,"{""IUPAC"":""N-(2-((4-(3-(3-Hydroxyphenyl)-1-phen...",N-(2-((4-(3-(3-Hydroxyphenyl)-1-phenyl-1H-pyra...,N-(2-((4-(3-(3-Hydroxyphenyl)-1-phenyl-1H-pyra...,"1H NMR (400 MHz, CD3OD) δ 8.05 (s, 1H, Ar-H), ...","400 MHz, CD3OD","8.05, 7.83, 7.76, 7.54–7.50, 7.37–7.34, 7.29–7...","13C NMR (100 MHz, CD3OD) δ 158.6, 157.6, 149.5...","100 MHz, CD3OD","158.6, 157.6, 149.5, 141.9, 140.3, 139.3, 138....",,,
298,"{""IUPAC"":""(1R,4aS)-2-Hydroxy-3-(4-Methoxypheny...","3.2.7. (1R,4aS)-2-Hydroxy-3-(4-Methoxyphenylam...","(1R,4aS)-2-Hydroxy-3-(4-Methoxyphenylamino)Pro...","1H-NMR (600 MHz, DMSO-d6) δ: 7.15 (s, 1H), 6.9...","600 MHz, DMSO-d6","7.15, 6.98, 6.82, 6.67, 6.55, 5.11, 4.07, 3.96...","13C-NMR (150 MHz, DMSO-d6) δ: 177.6, 151.1, 14...","150 MHz, DMSO-d6","177.6, 151.1, 146.7, 145.2, 134.2, 126.5, 124....",,,


: 