In [20]:
import os 
import json 
import requests
import pandas as pd 
from typing import List, Dict 
import xml.etree.ElementTree as ET
from pathlib import Path
import time 
import xmltodict

In [21]:
OUTPUT_DIR = Path("../data/")
FINETUNING_DIR = OUTPUT_DIR / "finetuning"
RAG_DIR = OUTPUT_DIR / "rag"

FINETUNING_DIR.mkdir(parents=True, exist_ok=True)
RAG_DIR.mkdir(parents=True, exist_ok=True)

### Dataset For Finetuning : medication QA

In [22]:
def process_mediqationqa(file_path):
	df = pd.read_excel(file_path)
	# convert to finetuning format
	finetuning_data = []
	for _, row in df.iterrows():
		# clean and format 
		question = str(row.ge('Question', '')).strip()
		answer = str(row.get('Answer', '')).strip()
		focus = str(row.get('Focus (Drug)', '')).strip()
		
		if question and answer and question != 'nan' and answer != 'nan':
			finetuning_data.append({
						"instruction" : question,
						"output": answer,
						"focus_drug": focus,
						"source": "MedicationQA"
			})
	
	return finetuning_data

### Dataset for Knowledge Base for simple RAG using FAISS : DailyMed

In [23]:
def fetch_dailymed_drug_info(drug_name):
	search_url = f"https://dailymed.nlm.nih.gov/dailymed/services/v2/spls.json"
	params = {"search": drug_name, "pagesize": 1}
	
	try:
		response = requests.get(search_url, params=params, timeout=10)
		response.raise_for_status()
		data = response.json()

		if not data.get('data'):
			print(f"No data found for {drug_name}")
			return None

		set_id = data['data'][0]['setid']

		if not set_id:
			print(f"No setid found for {drug_name}")
			return None

		xml_url = f"https://dailymed.nlm.nih.gov/dailymed/services/v2/spls/{set_id}.xml"
		xml_response = requests.get(xml_url,timeout=10)
		xml_response.raise_for_status()
		xml_data = xmltodict.parse(xml_response.content)

		print(f"Successfully fetched xml for {drug_name}")
		return xml_data
	
	except Exception as e:
		print(f"Error fetching xml for {drug_name}: {e}")
		return None

In [24]:
data = fetch_dailymed_drug_info("Rivastigmine")

Successfully fetched xml for Rivastigmine


In [25]:
# regex to clean html tag from text 
import re

def strip_tags(xml_string):
	# hapus semua tag <...>
	text = re.sub(r"<[^>]+>", " ", xml_string)
	# normalisasi whitespace
	text = re.sub(r"\s+", " ", text)
	return text.strip()

In [26]:
def extract_drug_sections(xml_dict, drug_name):
	if not xml_dict:
		return []
	
	section_of_interest = {
			'dosage' : ['DOSAGE', 'DOSAGE AND ADMINISTRATION'],
			'contraindications': ['CONTRAINDICATIONS'],
			'side_effects': ['ADVERSE REACTIONS', 'SIDE EFFECTS'],
			'mechanism': ['MECHANISM OF ACTION', 'CLINICAL PHARMACOLOGY'],
			'warnings': ['WARNINGS', 'WARNINGS AND PRECAUTIONS'],
			'indications': ['INDICATIONS', 'INDICATIONS AND USAGE']
	}

	rag_chunks = []

	try:
		components = (
			xml_dict.get("document", {})
							.get("component", {})
							.get("structuredBody", {})
							.get("component", [])
		)

		# fallback if structureBody is not present
		# if not components:
		# 	print("Falling back to direct components")
		# 	components = xml_dict.get("document", {}).get("component", [])

		# components should be a list 
		if isinstance(components, dict):
			components = [components]
		
		for comp in components:
			section = comp.get("section", {})
			if not section:
				continue

			# extract section title 
			code = section.get("code", {})
			title = code.get("@displayName", "").upper()

			# extract raw text (it can be list or dict)
			text = section.get("text", "")
			if isinstance(text, dict):
				# convert HTML-ish XML content to string
				text = xmltodict.unparse({"text": text}, pretty=False)

			if isinstance(text, list):
				text = "\n".join(str(t) for t in text)
			
			text = str(text).strip()

			# clean tags 
			text = strip_tags(text)

			# match section with interest list
			for category, keywords in section_of_interest.items():
				if any(k in title for k in keywords):
					if len(text) > 50:
						rag_chunks.append({
							"drug_name": drug_name,
							"category": category,
							"section_title": title,
							"text": text[:2000],  # limit to first 2000 chars
							"source": "DailyMed"
						})
					break # stop checking other categories once matched
			
		return rag_chunks
	
	except Exception as e:
		print(f"Error extracting sections for {drug_name}: {e}")
		return []
	

In [27]:
xml_dict = fetch_dailymed_drug_info("Acetaminophen")
chunks = extract_drug_sections(xml_dict, "Acetaminophen")

print(len(chunks))
for c in chunks:
	print(c["drug_name"])
	print(c["category"])
	print(c["section_title"])
	print(c["text"])
	print()


Successfully fetched xml for Acetaminophen
5
Acetaminophen
indications
INDICATIONS & USAGE SECTION
Atorvastatin calcium tablets are indicated: • To reduce the risk of: o Myocardial infarction (MI), stroke, revascularization procedures, and angina in adults with multiple risk factors for coronary heart disease (CHD) but without clinically evident CHD o MI and stroke in adults with type 2 diabetes mellitus with multiple risk factors for CHD but without clinically evident CHD o Non-fatal MI, fatal and non-fatal stroke, revascularization procedures, hospitalization for congestive heart failure, and angina in adults with clinically evident CHD • As an adjunct to diet to reduce low-density lipoprotein cholesterol (LDL-C) in: o Adults with primary hyperlipidemia. o Adults and pediatric patients aged 10 years and older with heterozygous familial hypercholesterolemia (HeFH). • As an adjunct to other LDL-C-lowering therapies, or alone if such treatments are unavailable, to reduce LDL-C in adults

In [28]:
TARGET_DRUGS =[
	"Aspirin", "Ibuprofen", "Acetaminophen",
	"Amoxicillin", "Azithromycin", "Ciprofloxacin",
	"Metformin", "Atorvastatin", "Lisinopril",
	"Omeprazole", "Levothyroxine", "Albuterol",
	"Gabapentin", "Sertraline", "Losartan",
	"Vitamin D", "Vitamin B12", "Vitamin C"
]


def build_rag_knowledge_base():
	print("Building RAG Knowledge Base from DailyMed...")

	all_rag_chunks = []
	for drug in TARGET_DRUGS:
		time.sleep(1)  # to respect API rate limits
		# fetch xml data
		xml_dict = fetch_dailymed_drug_info(drug)

		if not xml_dict:
			print(f"No dailymed data for {drug}")
			continue 

		# extract section 
		chunks = extract_drug_sections(xml_dict, drug)
		print(f" ✓ Extracted {len(chunks)} chunks")

		all_rag_chunks.extend(chunks)

		time.sleep(1)  # to respect API rate limits

	print(f"✓ Total RAG chunks collected: {len(all_rag_chunks)}")
	return all_rag_chunks

### Save finetuning dataset and rag dataset
if the two cell below run more than once it will override curren json and jsonl file

In [29]:
def save_finetuning_data(qa_pairs : List[Dict]):
	print(f"Saving finetuning dataset... {len(qa_pairs)} pairs")

	# split train test 90 : 10
	split_idx = int(len(qa_pairs) * 0.9)
	train_data = qa_pairs[:split_idx]
	test_data = qa_pairs[split_idx:]

	# save as JSON 
	with open(FINETUNING_DIR / "train.json", "w", encoding="utf-8") as f:
		json.dump(train_data, f, indent=2, ensure_ascii=False)

	with open(FINETUNING_DIR / "test.json", "w", encoding="utf-8") as f:
		json.dump(test_data, f, indent=2, ensure_ascii=False)
	
	# save as JSONL
	with open(FINETUNING_DIR / "train.jsonl", "w", encoding="utf-8") as f:
		for item in train_data:
			f.write(json.dumps(item, ensure_ascii=False) + "\n")

	with open(FINETUNING_DIR / "test.jsonl", "w", encoding="utf-8") as f:
		for item in test_data:
			f.write(json.dumps(item, ensure_ascii=False) + "\n")
	
	print(f"Train set: {len(train_data)} samples")
	print(f"Test set: {len(test_data)} samples")
	print(f"Data saved to {FINETUNING_DIR}")

In [30]:
def save_rag_data(rag_chunks):
	print("Saving RAG knowledge base...")
	
	# save as JSON
	with open(RAG_DIR / "knowledge_base.json", 'w', encoding='utf-8') as f:
		json.dump(rag_chunks, f, indent=2, ensure_ascii=False)
	
	# save as JSONL
	with open(RAG_DIR / "knowledge_base.jsonl", 'w', encoding='utf-8') as f:
		for chunk in rag_chunks:
			f.write(json.dumps(chunk, ensure_ascii=False) + "\n")
	
	# save statistic 
	stats = {
		'total_chunks': len(rag_chunks),
		'chunks_per_drug': {},
		'chunks_per_category': {}
	}

	for chunk in rag_chunks:
		drug = chunk['drug_name']
		category = chunk['category']
		stats['chunks_per_drug'][drug] = stats['chunks_per_drug'].get(drug, 0) + 1
		stats['chunks_per_category'][category] = stats['chunks_per_category'].get(category, 0) + 1
	
	with open(RAG_DIR / "stats.json", 'w', encoding='utf-8') as f:
		json.dump(stats, f, indent=2)
	
	print(f"Total chunks : {len(rag_chunks)}")
	print(f"Saved to {RAG_DIR}")
	for drug, count in stats['chunks_per_drug'].items():
		print(f" - {drug}: {count} chunks")

### Execute All Function Above 

In [31]:
# finetuning dataset
all_qa_pairs = []
mediqationqa_data = process_mediqationqa("../data/finetuning/MedInfo2019-QA-Medications.xlsx")
all_qa_pairs.extend(mediqationqa_data)

# rag dataset
rag_chunks = build_rag_knowledge_base()

# save dataset 
if all_qa_pairs:
  save_finetuning_data(all_qa_pairs)
  
if rag_chunks:
  save_rag_data(rag_chunks)

print("Preprocessing completed.")

Building RAG Knowledge Base from DailyMed...
Successfully fetched xml for Aspirin
 ✓ Extracted 5 chunks
Successfully fetched xml for Ibuprofen
 ✓ Extracted 5 chunks
Successfully fetched xml for Acetaminophen
 ✓ Extracted 5 chunks
Successfully fetched xml for Amoxicillin
 ✓ Extracted 5 chunks
Successfully fetched xml for Azithromycin
 ✓ Extracted 5 chunks
Successfully fetched xml for Ciprofloxacin
 ✓ Extracted 5 chunks
Successfully fetched xml for Metformin
 ✓ Extracted 5 chunks
Successfully fetched xml for Atorvastatin
 ✓ Extracted 5 chunks
Successfully fetched xml for Lisinopril
 ✓ Extracted 5 chunks
Successfully fetched xml for Omeprazole
 ✓ Extracted 5 chunks
Successfully fetched xml for Levothyroxine
 ✓ Extracted 5 chunks
Successfully fetched xml for Albuterol
 ✓ Extracted 5 chunks
Successfully fetched xml for Gabapentin
 ✓ Extracted 5 chunks
Successfully fetched xml for Sertraline
 ✓ Extracted 5 chunks
Successfully fetched xml for Losartan
 ✓ Extracted 5 chunks
Successfully fetche