In [40]:
import json
import os
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path
from typing import List, Dict
from urllib.parse import quote

import pandas as pd
import requests
import xmltodict

In [41]:
# Root folder for every generated artifact, relative to the notebook's working directory.
OUTPUT_DIR = Path("../data/")
# Sub-folders for the two datasets this notebook produces.
FINETUNING_DIR = OUTPUT_DIR.joinpath("finetuning")
RAG_DIR = OUTPUT_DIR.joinpath("rag")

# Create both folders up front; re-running the cell is harmless because of exist_ok.
Path.mkdir(FINETUNING_DIR, parents=True, exist_ok=True)
Path.mkdir(RAG_DIR, parents=True, exist_ok=True)

### Dataset for Fine-tuning: MedicationQA

In [42]:
def process_mediqationqa(file_path):
	"""Convert the MedicationQA spreadsheet into instruction/output records.

	The workbook is read with ``pd.read_excel`` and the columns ``Question``,
	``Answer`` and ``Focus (Drug)`` are used. A missing column is treated as
	an empty value for every row.

	Args:
		file_path: Path (or any object accepted by ``pd.read_excel``) of the
			Excel workbook.

	Returns:
		A list of dicts with the keys ``instruction``, ``output``,
		``focus_drug`` and ``source`` (always ``"MedicationQA"``). Rows whose
		question or answer is missing or blank are skipped; a missing focus
		drug is stored as an empty string.
	"""
	df = pd.read_excel(file_path)

	def _clean(value):
		"""Return the cell as a stripped string, or '' for a missing cell."""
		if value is None:
			return ""
		# Missing cells arrive as NaN/NA; str() would turn them into the text "nan".
		if not isinstance(value, str) and pd.api.types.is_scalar(value) and pd.isna(value):
			return ""
		return str(value).strip()

	# convert to finetuning format
	finetuning_data = []
	for row in df.to_dict("records"):
		question = _clean(row.get('Question', ''))
		answer = _clean(row.get('Answer', ''))
		focus = _clean(row.get('Focus (Drug)', ''))

		# A usable training pair needs both sides.
		if not question or not answer:
			continue

		finetuning_data.append({
			"instruction": question,
			"output": answer,
			"focus_drug": focus,
			"source": "MedicationQA"
		})

	return finetuning_data

### Knowledge Base Dataset for a Simple FAISS-based RAG: DailyMed

In [43]:
import requests
import xmltodict
from datetime import datetime

def fetch_dailymed_drug_info(drug_name: str, verbose=True):
    """Download the most recently published DailyMed SPL label for a drug.

    Steps:
      1. Query the v1 ``drugname/<name>/human/spls.json`` endpoint.
      2. Pick the row with the latest published date.
      3. Download that SPL's XML from the v2 ``spls/<setid>.xml`` endpoint
         and parse it with ``xmltodict``.

    Args:
        drug_name: Drug name to search for. It is percent-encoded before
            being placed in the URL path, so names containing spaces or
            characters such as ``/`` cannot alter the request path.
        verbose: When true, progress and error messages are printed.

    Returns:
        The parsed XML as returned by ``xmltodict.parse``, or ``None`` when
        the search fails, yields no usable rows, or the XML download/parse
        fails. Request and parsing errors are reported (if ``verbose``)
        rather than raised.
    """

    # -------------------------
    # STEP 1 - query API v1
    # -------------------------
    encoded_name = quote(str(drug_name), safe="")
    url = f"https://dailymed.nlm.nih.gov/dailymed/services/v1/drugname/{encoded_name}/human/spls.json"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        if verbose:
            print(f"‚ùå Error searching SPL for {drug_name}: {e}")
        return None

    rows = data.get("DATA") if isinstance(data, dict) else None
    # rows format:
    # [ SETID, TITLE, SPL_VERSION, PUBLISHED_DATE ]
    # Rows that are too short to index are ignored instead of raising IndexError.
    candidates = [r for r in (rows or []) if isinstance(r, (list, tuple)) and len(r) >= 4]
    if not candidates:
        if verbose:
            print(f"‚ö† No SPL entries found for drug '{drug_name}'")
        return None

    # -------------------------
    # STEP 2 - pick latest SPL
    # -------------------------
    def _published_at(row):
        """Parse the row's published date; unparseable dates sort first."""
        try:
            return datetime.strptime(row[3], "%B %d, %Y")
        except (TypeError, ValueError):
            return datetime.min

    latest_row = max(candidates, key=_published_at)

    setid = latest_row[0]
    title = latest_row[1]
    published = latest_row[3]

    if verbose:
        print(f"‚úÖ Found SPL for {drug_name}")
        print(f"   Title     : {title}")
        print(f"   SETID     : {setid}")
        print(f"   Published : {published}")

    # -------------------------
    # STEP 3 - download XML
    # -------------------------
    xml_url = f"https://dailymed.nlm.nih.gov/dailymed/services/v2/spls/{quote(str(setid), safe='')}.xml"

    try:
        xml_resp = requests.get(xml_url, timeout=15)
        xml_resp.raise_for_status()
        xml_dict = xmltodict.parse(xml_resp.content)
        if verbose:
            print(f"üì¶ Successfully fetched XML for {drug_name}")
        return xml_dict

    except Exception as e:
        if verbose:
            print(f"‚ùå Failed to fetch XML for SETID {setid}: {e}")
        return None


In [44]:
# Smoke test: fetch the latest-published Ibuprofen label (performs live HTTP requests).
data = fetch_dailymed_drug_info("Ibuprofen")

‚úÖ Found SPL for Ibuprofen
   Title     : IBUPROFEN TABLET [APHENA PHARMA SOLUTIONS - TENNESSEE, LLC ]
   SETID     : 4522cb9e-4999-c455-e063-6294a90a294f
   Published : December 05, 2025
üì¶ Successfully fetched XML for Ibuprofen


In [45]:
# regex to clean html tag from text 
import re

def strip_tags(xml_string):
	"""Return ``xml_string`` with markup removed and whitespace collapsed.

	Every ``<...>`` tag is replaced by a single space, runs of whitespace are
	then squeezed to one space, and leading/trailing whitespace is trimmed.
	"""
	tag_pattern = re.compile(r"<[^>]+>")
	whitespace_pattern = re.compile(r"\s+")

	# Replace each tag with a space so adjacent words do not fuse together.
	without_tags = tag_pattern.sub(" ", xml_string)
	# Normalize whitespace, then trim both ends.
	return whitespace_pattern.sub(" ", without_tags).strip()

In [None]:
def extract_drug_sections(xml_dict, drug_name):
	"""Extract the label sections of interest from a parsed SPL document.

	The function walks ``document -> component -> structuredBody -> component``
	and, for every entry that has a ``section``, reads the section title from
	``code/@displayName`` and the body from ``text``. Sections whose title
	contains one of the keywords below (matched as whole words) and whose
	cleaned text is longer than 50 characters become one chunk each.

	Args:
		xml_dict: Parsed XML (as produced by ``xmltodict.parse``); a falsy
			value yields an empty list.
		drug_name: Name stored in each chunk's ``drug_name`` field.

	Returns:
		A list of dicts with the keys ``drug_name``, ``category``,
		``section_title``, ``text`` (at most 2000 characters) and ``source``
		(always ``"DailyMed"``). If an unexpected error occurs the error is
		printed and an empty list is returned.
	"""
	if not xml_dict:
		return []

	section_of_interest = {
			'dosage' : ['DOSAGE', 'DOSAGE AND ADMINISTRATION'],
			'contraindications': ['CONTRAINDICATIONS SECTION', 'CONTRAINDICATIONS'],
			'side_effects': ['ADVERSE REACTIONS', 'SIDE EFFECTS'],
			'mechanism': ['MECHANISM OF ACTION', 'CLINICAL PHARMACOLOGY'],
			'warnings': ['WARNINGS', 'WARNINGS AND PRECAUTIONS'],
			'indications': ['INDICATIONS', 'INDICATIONS AND USAGE'],
			'interactions': ['DRUG INTERACTIONS'],
			'overdosage': ['OVERDOSAGE']
	}

	# Whole-word matching: with plain substring tests "OVERDOSAGE" matched the
	# 'DOSAGE' keyword first, so the 'overdosage' category could never be reached
	# (and "CONTRAINDICATIONS" only avoided 'INDICATIONS' thanks to dict order).
	category_patterns = {
		category: re.compile("|".join(rf"\b{re.escape(keyword)}\b" for keyword in keywords))
		for category, keywords in section_of_interest.items()
	}

	def _as_list(node):
		"""Normalize an xmltodict child (None, single item, or list) to a list."""
		if node is None:
			return []
		if isinstance(node, list):
			return node
		return [node]

	rag_chunks = []

	try:
		components = []
		for outer in _as_list((xml_dict.get("document") or {}).get("component")):
			if isinstance(outer, dict):
				components.extend(_as_list((outer.get("structuredBody") or {}).get("component")))

		for comp in components:
			section = comp.get("section") if isinstance(comp, dict) else None
			if not isinstance(section, dict) or not section:
				continue

			# extract section title; an empty <code/> element is parsed as None
			code = section.get("code")
			title = str(code.get("@displayName") or "").upper() if isinstance(code, dict) else ""

			# extract raw text (it can be a string, a dict, a list, or None)
			text = section.get("text")
			if text is None:
				text = ""
			if isinstance(text, dict):
				# convert HTML-ish XML content back to a string
				text = xmltodict.unparse({"text": text}, pretty=False)
			elif isinstance(text, list):
				text = "\n".join(str(t) for t in text)

			# clean tags
			text = strip_tags(str(text).strip())

			# first category (in declaration order) whose keyword appears in the title
			category = next(
				(name for name, pattern in category_patterns.items() if pattern.search(title)),
				None,
			)
			# very short bodies are skipped
			if category is not None and len(text) > 50:
				rag_chunks.append({
					"drug_name": drug_name,
					"category": category,
					"section_title": title,
					"text": text[:2000],  # limit to first 2000 chars
					"source": "DailyMed"
				})

		return rag_chunks

	except Exception as e:
		print(f"Error extracting sections for {drug_name}: {e}")
		return []
	

In [47]:
# Demo: fetch one label and print every chunk extracted from it.
xml_dict = fetch_dailymed_drug_info("Acetaminophen")
chunks = extract_drug_sections(xml_dict, "Acetaminophen")

print(len(chunks))
for c in chunks:
	# One field per line, followed by a blank separator line.
	print(c["drug_name"], c["category"], c["section_title"], c["text"], "", sep="\n")


‚úÖ Found SPL for Acetaminophen
   Title     : EXTRA STRENGTH PAIN RELIEF (ACETAMINOPHEN) TABLET [GERI-CARE PHARMACEUTICAL CORP]
   SETID     : 7570aaa2-3238-4cd3-b788-915caa970dba
   Published : December 05, 2025
üì¶ Successfully fetched XML for Acetaminophen
3
Acetaminophen
indications
INDICATIONS & USAGE SECTION
temporarily relieves minor aches and pains temporarily reduces fever

Acetaminophen

Acetaminophen
dosage
DOSAGE & ADMINISTRATION SECTION
do not take more than directed adults and children 12 years and over: Take 2 tablets every 6 hours, as needed; not more than 6 tablets in 24 hours. Do not take for more than 10 days unless directed by a doctor. children under 12 years: ask a doctor



In [48]:
# Drug names looked up on DailyMed when building the RAG knowledge base.
TARGET_DRUGS = [
	"Aspirin",
	"Ibuprofen",
	"Acetaminophen",
	"Amoxicillin",
	"Azithromycin",
	"Ciprofloxacin",
	"Metformin",
	"Atorvastatin",
	"Lisinopril",
	"Omeprazole",
	"Levothyroxine",
	"Albuterol",
	"Gabapentin",
	"Sertraline",
	"Losartan",
	"Vitamin D",
	"Vitamin B12",
	"Vitamin C",
]


def build_rag_knowledge_base(drugs=None, delay=1.0):
	"""Fetch DailyMed labels for a list of drugs and collect their RAG chunks.

	For each drug the label is fetched with ``fetch_dailymed_drug_info`` and
	split into chunks with ``extract_drug_sections``. Drugs for which no
	label could be fetched are reported and skipped.

	Args:
		drugs: Iterable of drug names to process. Defaults to ``TARGET_DRUGS``
			when omitted or ``None``.
		delay: Seconds to pause between consecutive drugs so the API is not
			hit in a tight loop. A value of 0 (or less) disables the pause.

	Returns:
		A single list with the chunks of every processed drug, in the order
		the drugs were given.
	"""
	print("Building RAG Knowledge Base from DailyMed...")

	drug_names = list(TARGET_DRUGS if drugs is None else drugs)

	all_rag_chunks = []
	for position, drug in enumerate(drug_names):
		# One pause between drugs is enough; nothing precedes the first request.
		if position > 0 and delay > 0:
			time.sleep(delay)

		# fetch xml data
		xml_dict = fetch_dailymed_drug_info(drug)

		if not xml_dict:
			print(f"No dailymed data for {drug}")
			continue

		# extract sections
		chunks = extract_drug_sections(xml_dict, drug)
		print(f" ‚úì Extracted {len(chunks)} chunks")

		all_rag_chunks.extend(chunks)

	print(f"‚úì Total RAG chunks collected: {len(all_rag_chunks)}")
	return all_rag_chunks

### Save finetuning dataset and rag dataset
Note: re-running the two cells below overwrites the existing JSON and JSONL files.

In [49]:
def save_finetuning_data(qa_pairs : List[Dict]):
	"""Write the QA pairs to disk as a 90/10 train/test split.

	The first 90% of ``qa_pairs`` (by position, rounded down) form the
	training set and the remainder the test set. Each split is written to
	``FINETUNING_DIR`` twice: as an indented JSON array (``*.json``) and as
	one JSON object per line (``*.jsonl``). Existing files are overwritten.

	Args:
		qa_pairs: JSON-serializable records to save.
	"""
	print(f"Saving finetuning dataset... {len(qa_pairs)} pairs")

	# Positional split: no shuffling is performed.
	cutoff = int(len(qa_pairs) * 0.9)
	train_data, test_data = qa_pairs[:cutoff], qa_pairs[cutoff:]

	# JSON arrays first ...
	for filename, records in (("train.json", train_data), ("test.json", test_data)):
		with open(FINETUNING_DIR / filename, "w", encoding="utf-8") as handle:
			json.dump(records, handle, indent=2, ensure_ascii=False)

	# ... then the line-delimited variants.
	for filename, records in (("train.jsonl", train_data), ("test.jsonl", test_data)):
		with open(FINETUNING_DIR / filename, "w", encoding="utf-8") as handle:
			handle.writelines(
				json.dumps(record, ensure_ascii=False) + "\n" for record in records
			)

	print(f"Train set: {len(train_data)} samples")
	print(f"Test set: {len(test_data)} samples")
	print(f"Data saved to {FINETUNING_DIR}")

In [50]:
def save_rag_data(rag_chunks):
	"""Write the RAG chunks and summary statistics to ``RAG_DIR``.

	Three files are produced (existing ones are overwritten):
	``knowledge_base.json`` (indented JSON array), ``knowledge_base.jsonl``
	(one chunk per line) and ``stats.json`` with the total chunk count plus
	counts per ``drug_name`` and per ``category``.

	Args:
		rag_chunks: List of chunk dicts; each must have the keys
			``drug_name`` and ``category``.
	"""
	print("Saving RAG knowledge base...")

	# Full knowledge base as a single JSON array.
	with open(RAG_DIR / "knowledge_base.json", 'w', encoding='utf-8') as handle:
		json.dump(rag_chunks, handle, indent=2, ensure_ascii=False)

	# Same content, line-delimited.
	with open(RAG_DIR / "knowledge_base.jsonl", 'w', encoding='utf-8') as handle:
		handle.writelines(
			json.dumps(entry, ensure_ascii=False) + "\n" for entry in rag_chunks
		)

	# Tally how many chunks each drug and each category contributed.
	per_drug = {}
	per_category = {}
	for entry in rag_chunks:
		drug_key = entry['drug_name']
		category_key = entry['category']
		per_drug[drug_key] = per_drug.get(drug_key, 0) + 1
		per_category[category_key] = per_category.get(category_key, 0) + 1

	stats = {
		'total_chunks': len(rag_chunks),
		'chunks_per_drug': per_drug,
		'chunks_per_category': per_category
	}

	with open(RAG_DIR / "stats.json", 'w', encoding='utf-8') as handle:
		json.dump(stats, handle, indent=2)

	print(f"Total chunks : {len(rag_chunks)}")
	print(f"Saved to {RAG_DIR}")
	for drug_key, count in per_drug.items():
		print(f" - {drug_key}: {count} chunks")

### Execute All Functions Above

In [51]:
# Fine-tuning dataset: gather QA pairs from every source (currently only MedicationQA).
all_qa_pairs = []
mediqationqa_data = process_mediqationqa("../data/finetuning/MedInfo2019-QA-Medications.xlsx")
all_qa_pairs += mediqationqa_data

# RAG dataset: fetched live from DailyMed.
rag_chunks = build_rag_knowledge_base()

# Persist each dataset, skipping any that came back empty.
if all_qa_pairs:
  save_finetuning_data(all_qa_pairs)

if rag_chunks:
  save_rag_data(rag_chunks)

print("Preprocessing completed.")

Building RAG Knowledge Base from DailyMed...
‚úÖ Found SPL for Aspirin
   Title     : BAYER LOW DOSE (ASPIRIN) TABLET [BAYER HEALTHCARE LLC.]
   SETID     : 075b103e-0bb4-4b7a-ac0e-5645bcbd0a07
   Published : December 05, 2025
üì¶ Successfully fetched XML for Aspirin
 ‚úì Extracted 2 chunks
‚úÖ Found SPL for Ibuprofen
   Title     : IBUPROFEN TABLET [APHENA PHARMA SOLUTIONS - TENNESSEE, LLC ]
   SETID     : 4522cb9e-4999-c455-e063-6294a90a294f
   Published : December 05, 2025
üì¶ Successfully fetched XML for Ibuprofen
 ‚úì Extracted 6 chunks
‚úÖ Found SPL for Acetaminophen
   Title     : EXTRA STRENGTH PAIN RELIEF (ACETAMINOPHEN) TABLET [GERI-CARE PHARMACEUTICAL CORP]
   SETID     : 7570aaa2-3238-4cd3-b788-915caa970dba
   Published : December 05, 2025
üì¶ Successfully fetched XML for Acetaminophen
 ‚úì Extracted 3 chunks
‚úÖ Found SPL for Amoxicillin
   Title     : AMOXICILLIN FOR SUSPENSION [NUCARE PHARMACEUTICALS, INC.]
   SETID     : 4528f48d-5bda-b307-e063-6294a90a294c
   Publis