### Import Packages

In [1]:
import os
import re
import json
import torch
import pandas as pd
from pandas import json_normalize
import numpy as np
from vllm import LLM, SamplingParams
from transformers import  AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from peft import LoraConfig, PeftModel

# Environment setup: restrict this process to a single visible GPU.
# NOTE(review): this assignment runs after `torch` and `vllm` have been imported;
# it presumably only takes effect if CUDA has not been initialised yet — consider
# moving it above the imports.
GPU_ID = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = GPU_ID

### Load Fine-Tuned Model

In [2]:
# Checkpoint directory of the fine-tuned model that vLLM will serve.
# Original author's note on this line: "990 best" — presumably checkpoint-990 scored
# best in an earlier comparison; confirm which checkpoint is intended here.
new_model_name = "saved_models/Mistral-7B-Instruct-v0.2/train_329_lr5e-06_bs2/checkpoint-1155"

# Decoding configuration shared by every prompt.
sampling_params = SamplingParams(
    temperature=0,    # zero temperature -> greedy decoding
    top_p=1,          # no nucleus truncation
    max_tokens=4096,  # upper bound on generated tokens per prompt
    stop=['!!!'],     # generation stops when this string is produced
)

# Load the checkpoint into a vLLM engine on a single GPU (no tensor parallelism).
llm = LLM(model=new_model_name, tensor_parallel_size=1)

INFO 04-25 05:26:31 llm_engine.py:79] Initializing an LLM engine with config: model='saved_models/Meta-Llama-3-8B-Instruct/train_329_lr5e-06_bs2/checkpoint-165', tokenizer='saved_models/Meta-Llama-3-8B-Instruct/train_329_lr5e-06_bs2/checkpoint-165', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-25 05:26:57 llm_engine.py:337] # GPU blocks: 9565, # CPU blocks: 2048
INFO 04-25 05:27:13 model_runner.py:666] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-25 05:27:13 model_runner.py:670] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 04-25 05:28:01 model_runner.py:738] Graph capturing finished in 49 secs.


### Preprocess Data

In [3]:
# Data Loading and Preprocessing
# Read the held-out test split and wrap every paragraph in the instruction prompt.
test_file = "data/data_for_llms/test_329.csv"
test_df = pd.read_csv(test_file, encoding='utf-8')

# Column with the input paragraphs and column with the reference extraction.
source_text = "Paragraph"
target_text = "MOFInfo"

# Fail early on missing paragraphs: string concatenation with a missing value
# yields a missing prompt, which would only surface later inside generation.
n_missing = int(test_df[source_text].isna().sum())
if n_missing:
    raise ValueError(f"{n_missing} row(s) in {test_file} have no '{source_text}' value")

# Natural-language instruction placed in front of every paragraph.
instruction = '''Extract the MOF synthesis information from the Paragraph.'''

# NOTE(review): the `<s>[INST] ... [/INST]` wrapper is assumed to match the template
# used when the checkpoint was fine-tuned — confirm before switching model families.
test_df['text'] = f'<s>[INST] {instruction} ' + test_df[source_text] + " [/INST]"

# Plain Python list of prompt strings, in the same row order as `test_df`.
prompts = test_df['text'].tolist()
prompts[:5]

['<s>[INST] Extract the MOF synthesis information from the Paragraph. nMTV-MOF-5-ABCDEF, Zn4O(BDC)0.88(NH2-BDC)0.01(Br-BDC)0.67[(Cl)2-BDC]0.44(NO2-BDC)0.38(C4H4-BDC)0.62,: 0.67 mL of BDCH2 stock solution, 0.67 mL of NH2-BDCH2 stock solution, 0.67 mL of Br-BDCH2 stock solution, 0.67 mL of (Cl)2-BDCH2 stock solution, 0.67 mL of NO2-BDCH2 stock solution, 0.67 mL of C4H4-BDCH2 stock solution, and 5 mL of Zn(OAc)2·2H2O stock solution were added in sequence in to a 20 mL glass vial. Powder precipitates formed immediately upon mixing. The vial was sealed and placed in the center of microwave oven (700 W) and allowed to react for 15 sec. [/INST]',
 '<s>[INST] Extract the MOF synthesis information from the Paragraph. nCo-MOF-74 Co2(DOT) (DOT=2,5-dihydroxyl-1,4-benzenedicarboxylate): 5 mL of DOTH2 stock solution, 5 mL of ethanol, 5 mL of water, and 5 mL of Co(OAc)2·4H2O stock solution were added in sequence to a 75 mL glass reactor. The reactor was placed in a microwave synthesizer and allowed t

### Inference

In [4]:
# Generate texts from the prompts.
# The output is a list of RequestOutput objects that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Keep the first completion of every request, with surrounding whitespace removed.
predictions = [output.outputs[0].text.strip() for output in outputs]

# Every prompt must have produced exactly one prediction, otherwise the
# row-wise pairing with the reference answers later on would be wrong.
assert len(predictions) == len(prompts), "number of predictions does not match number of prompts"

# Preview only the first few generations; printing every prompt/answer pair
# floods the cell output and bloats the saved notebook.
n_preview = 5
for output in outputs[:n_preview]:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt},\nGenerated text: {generated_text!r}")

Processed prompts: 100%|██████████| 329/329 [00:34<00:00,  9.60it/s]

Prompt: <s>[INST] Extract the MOF synthesis information from the Paragraph. nMTV-MOF-5-ABCDEF, Zn4O(BDC)0.88(NH2-BDC)0.01(Br-BDC)0.67[(Cl)2-BDC]0.44(NO2-BDC)0.38(C4H4-BDC)0.62,: 0.67 mL of BDCH2 stock solution, 0.67 mL of NH2-BDCH2 stock solution, 0.67 mL of Br-BDCH2 stock solution, 0.67 mL of (Cl)2-BDCH2 stock solution, 0.67 mL of NO2-BDCH2 stock solution, 0.67 mL of C4H4-BDCH2 stock solution, and 5 mL of Zn(OAc)2·2H2O stock solution were added in sequence in to a 20 mL glass vial. Powder precipitates formed immediately upon mixing. The vial was sealed and placed in the center of microwave oven (700 W) and allowed to react for 15 sec. [/INST],
Generated text: ' [{"compound name": "nMTV-MOF-5-ABCDEF, Zn4O(BDC)0.88(NH2-BDC)0.01(Br-BDC)0.67[(Cl)2-BDC]0.44(NO2-BDC)0.38(C4H4-BDC)0.62", "metal source": "Zn(OAc)2·2H2O", "metal amount": "5 mL", "linker": "BDCH2 stock solution | NH2-BDCH2 stock solution | Br-BDCH2 stock solution | (Cl)2-BDCH2 stock solution | NO2-BDCH2 stock solution | C4H4-BD




### Save the Predictions

In [5]:
# Collect predictions next to the reference answers and the source paragraphs.
# Columns are paired by position (predictions[i] is assumed to belong to row i of
# test_df), so `.to_numpy()` is used to avoid any implicit index alignment.
pred_df = pd.DataFrame({
    'Generated Text': predictions,
    'Actual Text': test_df[target_text].to_numpy(),
    'Paragraph': test_df[source_text].to_numpy(),
})

# One CSV per checkpoint; '/' in the model path is replaced so it is a valid file name.
output_path = f"results/predictions/prediction_of_{new_model_name.replace('/', '-')}.csv"

# Create the target directory on first use so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False leaves the row index out of the file.
pred_df.to_csv(output_path, index=False)
pred_df

Unnamed: 0,Generated Text,Actual Text,Paragraph
0,"[{""compound name"": ""nMTV-MOF-5-ABCDEF, Zn4O(BD...","[{""compound name"": ""nMTV-MOF-5-ABCDEF, Zn4O(BD...","nMTV-MOF-5-ABCDEF, Zn4O(BDC)0.88(NH2-BDC)0.01(..."
1,"[{""compound name"": ""nCo-MOF-74 Co2(DOT)"", ""met...","[{""compound name"": ""nCo-MOF-74 Co2(DOT) (DOT=2...","nCo-MOF-74 Co2(DOT) (DOT=2,5-dihydroxyl-1,4-be..."
2,"[{""compound name"": ""nNi-MOF-74 Ni2(DOT)"", ""met...","[{""compound name"": ""nNi-MOF-74 Ni2(DOT)"", ""met...",nNi-MOF-74 Ni2(DOT): 5 mL of DOTH2 stock solut...
3,"[{""compound name"": ""nM3M-MOF-74, Co0.82Mg0.424...","[{""compound name"": ""nM3M-MOF-74, Co0.82Mg0.424...","nM3M-MOF-74, Co0.82Mg0.424Ni0.76(DOT): 5 mL of..."
4,"[{""compound name"": ""nM5M-MOF-74, Co0.54Mg0.27N...","[{""compound name"": ""nM5M-MOF-74, Co0.54Mg0.27N...","nM5M-MOF-74, Co0.54Mg0.27Ni0.60Zn0.34Mn0.35(DO..."
...,...,...,...
324,"[{""compound name"": ""PPMOF 1"", ""metal source"": ...","[{""compound name"": ""PPMOF 1"", ""metal source"": ...","Synthesis of PPMOF 1 H6L (142 mg, 0.1 mmol) an..."
325,"[{""compound name"": ""[Co(atz)(L1)0.5]·DMF (1)"",...","[{""compound name"": ""[Co(atz)(L1)0.5]·DMF"", ""me...",Synthesis of [Co(atz)(L1)0.5]·DMF (1). A mixtu...
326,"[{""compound name"": ""[Co(atz)(L1)0.5]·DMF (1)"",...","[{""compound name"": ""[Co(atz)(L2)0.5]·DMF"", ""me...",Synthesis of [Co(atz)(L1)0.5]·DMF (1). A mixtu...
327,"[{""compound name"": ""[emi][Mn(btc)] (1)"", ""meta...","[{""compound name"": ""[emi][Mn(btc)]"", ""metal so...",Synthesis of MOF Materials. [emi][Mn(btc)] (1)...
