In [1]:
import os 
import pandas as pd
import json 
import requests
import pandas as pd 
from typing import List, Dict 
import xml.etree.ElementTree as ET
from pathlib import Path
import time 
import xmltodict
from datasets import load_dataset 
from tqdm import tqdm
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
OUTPUT_DIR = Path("../data/")
FINETUNING_DIR = OUTPUT_DIR / "finetuning"
RAG_DIR = OUTPUT_DIR / "rag"

FINETUNING_DIR.mkdir(parents=True, exist_ok=True)
RAG_DIR.mkdir(parents=True, exist_ok=True)

## Dataset For Finetuning : load and process

In [17]:
def load_mediqationqa(file_path):
	df = pd.read_excel(file_path)
	# convert to finetuning format
	finetuning_data = []
	for _, row in df.iterrows():
		# clean and format 
		question = str(row['Question']).strip()
		answer = str(row['Answer']).strip()
		
		if question and answer and question != 'nan' and answer != 'nan':
			finetuning_data.append({
						"instruction" : question,
						"output": answer,
			})
	
	return finetuning_data

In [18]:
def load_MedQuAd(file_path="../data/finetuning/mplusdrugs_with_answers.csv"):
	df = pd.read_csv(file_path)
	print("Original samples:", len(df))

	# STEP 1: sample 3500 stratify
	TARGET_SIZE = 3500
	df_sampled, _ = train_test_split(
		df,
		train_size=TARGET_SIZE,
		stratify=df["question_type"],
		random_state=42
	)

	# STEP 2: now split again into train/test (80:20) while keeping stratification
	train_df, test_df = train_test_split(
		df_sampled,
		test_size=0.2,
		stratify=df_sampled["question_type"],
		random_state=42
	)

	print("Train distrib:")
	print(train_df["question_type"].value_counts())
	print("\nTest distrib:")
	print(test_df["question_type"].value_counts())

	# convert to finetuning format
	def convert(df):
		data = []
		for _, row in df.iterrows():
			q = str(row["question"]).strip()
			a = str(row["answer"]).strip()
			if q and a:
				data.append({"instruction": q, "output": a})
		return data

	return convert(train_df), convert(test_df)

In [19]:
def remove_duplicate_questions(finetuning_data):
	seen = set()
	unique_data = []

	for item in finetuning_data:
		q = item["instruction"].strip().lower()

		if q not in seen:
			seen.add(q)
			unique_data.append(item)

	print(f"Removed duplicates. Before: {len(finetuning_data)}, After: {len(unique_data)}")
	return unique_data


## Dataset for Knowledge Base for simple RAG using FAISS : DailyMed

In [3]:
import requests
import xmltodict
from datetime import datetime

def fetch_dailymed_drug_info(drug_name: str, verbose=True):
	"""
	Final version: Fetch DailyMed XML using API v1 with correct drug filtering.
	- Step 1: Search SPLs by exact drug name (v1)
	- Step 2: Select latest SPL version
	- Step 3: Download SPL XML via v2 endpoint
	"""

	# -------------------------
	# STEP 1 ‚Äî QUERY API v1
	# -------------------------
	url = f"https://dailymed.nlm.nih.gov/dailymed/services/v1/drugname/{drug_name}/human/spls.json"
	
	try:
		response = requests.get(url, timeout=10)
		response.raise_for_status()
		data = response.json()
	except Exception as e:
		if verbose:
			print(f"‚ùå Error searching SPL for {drug_name}: {e}")
		return None

	rows = data.get("DATA", [])
	if not rows:
		if verbose:
			print(f"‚ö† No SPL entries found for drug '{drug_name}'")
		return None

	# -------------------------
	# STEP 2 ‚Äî Pick latest SPL
	# -------------------------
	def parse_date(date_str):
		try:
			return datetime.strptime(date_str, "%B %d, %Y")
		except:
			return datetime.min

	# rows format:
	# [ SETID, TITLE, SPL_VERSION, PUBLISHED_DATE ]
	latest_row = max(rows, key=lambda r: parse_date(r[3]))

	setid = latest_row[0]
	title = latest_row[1]
	published = latest_row[3]

	if verbose:
		print(f"‚úÖ Found SPL for {drug_name}")
		print(f"   Title     : {title}")
		print(f"   SETID     : {setid}")
		print(f"   Published : {published}")

	# -------------------------
	# STEP 3 ‚Äî DOWNLOAD XML
	# -------------------------
	xml_url = f"https://dailymed.nlm.nih.gov/dailymed/services/v2/spls/{setid}.xml"

	try:
		xml_resp = requests.get(xml_url, timeout=15)
		xml_resp.raise_for_status()
		xml_dict = xmltodict.parse(xml_resp.content)
		if verbose:
			print(f"üì¶ Successfully fetched XML for {drug_name}")
		return xml_dict

	except Exception as e:
		if verbose:
			print(f"‚ùå Failed to fetch XML for SETID {setid}: {e}")
		return None


In [4]:
data = fetch_dailymed_drug_info("Ibuprofen")

‚úÖ Found SPL for Ibuprofen
   Title     : IBUPROFEN CAPSULE, LIQUID FILLED [CHAIN DRUG MARKETING ASSOCIATION INC.]
   SETID     : 4573753d-117b-a6f2-e063-6394a90ae2f9
   Published : December 10, 2025
üì¶ Successfully fetched XML for Ibuprofen


In [5]:
# regex to clean html tag from text 
import re

def strip_tags(xml_string):
	# hapus semua tag <...>
	text = re.sub(r"<[^>]+>", " ", xml_string)
	# normalisasi whitespace
	text = re.sub(r"\s+", " ", text)
	return text.strip()

In [6]:
def chunk_text(text, max_length=400, overlap=100):
	# breakdown to sentences
	sentences = re.split(r'(?<=[.!?])\s+', text)
	
	chunks = []
	current_chunk = []
	current_length = 0

	for sentence in sentences:
		sentence_len = len(sentence)

		# if adding sentence exceeds the max_length
		if current_length + sentence_len > max_length:
			chunks.append(" ".join(current_chunk))
			# start new chunk with overlap
			overlap_sentences = current_chunk[-3:]  # overlap by last 3 sentences
			current_chunk = overlap_sentences.copy()
			current_length = sum(len(s) for s in current_chunk)
		
		current_chunk.append(sentence)
		current_length += sentence_len
	
	# save last chunk
	if current_chunk:
		chunks.append(" ".join(current_chunk))
	
	return chunks

In [7]:
def extract_drug_sections(xml_dict, drug_name):
	if not xml_dict:
		return []
	
	section_of_interest = {
			'dosage' : ['DOSAGE', 'DOSAGE AND ADMINISTRATION'], 
			'contraindications': ['CONTRAINDICATIONS SECTION', 'CONTRAINDICATIONS'],
			'side_effects': ['ADVERSE REACTIONS', 'SIDE EFFECTS'],
			'mechanism': ['MECHANISM OF ACTION', 'CLINICAL PHARMACOLOGY'],
			'warnings': ['WARNINGS', 'WARNINGS AND PRECAUTIONS'], 
			'indications': ['INDICATIONS', 'INDICATIONS AND USAGE'], 
			'interactions': ['DRUG INTERACTIONS'],
			'overdosage': ['OVERDOSAGE']
	}

	rag_chunks = []

	try:
		components = (
			xml_dict.get("document", {})
							.get("component", {})
							.get("structuredBody", {})
							.get("component", [])
		)

		# fallback if structureBody is not present
		# if not components:
		# 	print("Falling back to direct components")
		# 	components = xml_dict.get("document", {}).get("component", [])

		# components should be a list 
		if isinstance(components, dict):
			components = [components]
		
		for comp in components:
			section = comp.get("section", {})
			if not section:
				continue

			# extract section title 
			code = section.get("code", {})
			title = code.get("@displayName", "").upper()

			# extract raw text (it can be list or dict)
			text = section.get("text", "")
			if isinstance(text, dict):
				# convert HTML-ish XML content to string
				text = xmltodict.unparse({"text": text}, pretty=False)

			if isinstance(text, list):
				text = "\n".join(str(t) for t in text)
			
			text = str(text).strip()

			# clean tags 
			text = strip_tags(text)

			# match section with interest list
			for category, keywords in section_of_interest.items():
				if any(k in title for k in keywords):
					if len(text) > 50:
						chunks = chunk_text(text, max_length=400, overlap=100)
						for i, ch in enumerate(chunks):
							rag_chunks.append({
									"drug_name": drug_name,
									"category": category,
									"section_title": f"{title} (chunk {i+1})",
									"text": ch,
									"source": "DailyMed"
                })
							
					break # stop checking other categories once matched
			
		return rag_chunks
	
	except Exception as e:
		print(f"Error extracting sections for {drug_name}: {e}")
		return []
	

In [8]:
xml_dict = fetch_dailymed_drug_info("Acetaminophen")
chunks = extract_drug_sections(xml_dict, "Acetaminophen")

print(len(chunks))
for c in chunks:
	print(c["drug_name"])
	print(c["category"])
	print(c["section_title"])
	print(c["text"])
	print()


‚úÖ Found SPL for Acetaminophen
   Title     : ACETAMINOPHEN SOLUTION [PAI HOLDINGS, LLC DBA PAI PHARMA]
   SETID     : fac4e0c6-684f-45a1-99f2-de4de4017cc8
   Published : December 10, 2025
üì¶ Successfully fetched XML for Acetaminophen
4
Acetaminophen
indications
INDICATIONS & USAGE SECTION (chunk 1)
‚Ä¢ for the temporary relief of minor aches and pains due to ‚Ä¢ for the minor pain from arthritis ‚Ä¢ and to reduce fever headache muscular aches backache sore throat flu the common cold toothache premenstrual and menstrual cramps

Acetaminophen
dosage
OVERDOSAGE SECTION (chunk 1)

Acetaminophen
dosage
DOSAGE & ADMINISTRATION SECTION (chunk 1)


Acetaminophen
dosage
DOSAGE & ADMINISTRATION SECTION (chunk 2)



In [None]:
TARGET_DRUGS =[
	"Aspirin", "Ibuprofen", "Acetaminophen",
	"Amoxicillin", "Azithromycin", "Ciprofloxacin",
	"Metformin", "Atorvastatin", "Lisinopril",
	"Omeprazole", "Levothyroxine", "Albuterol",
	"Gabapentin", "Sertraline", "Losartan",
	"Vitamin D", "Vitamin B12", "Vitamin C", 
]


def build_rag_knowledge_base():
	print("Building RAG Knowledge Base from DailyMed...")

	all_rag_chunks = []
	for drug in TARGET_DRUGS:
		time.sleep(1)  # to respect API rate limits
		# fetch xml data
		xml_dict = fetch_dailymed_drug_info(drug)

		if not xml_dict:
			print(f"No dailymed data for {drug}")
			continue 

		# extract section 
		chunks = extract_drug_sections(xml_dict, drug)
		print(f" ‚úì Extracted {len(chunks)} chunks")

		all_rag_chunks.extend(chunks)

		time.sleep(1)  # to respect API rate limits

	print(f"‚úì Total RAG chunks collected: {len(all_rag_chunks)}")
	return all_rag_chunks

### Save finetuning dataset and rag dataset
if the two cell below run more than once it will override curren json and jsonl file

In [20]:
def save_finetuning_data(train_data, test_data):
		print(f"Saving dataset...")
		print(f"Train: {len(train_data)} samples")
		print(f"Test : {len(test_data)} samples")

		# Shuffle train only (test tidak perlu)
		train_data = shuffle(train_data, random_state=42)

		# Save JSON
		with open(FINETUNING_DIR/"train.json", "w", encoding="utf-8") as f:
				json.dump(train_data, f, indent=2, ensure_ascii=False)

		with open(FINETUNING_DIR/"test.json", "w", encoding="utf-8") as f:
				json.dump(test_data, f, indent=2, ensure_ascii=False)

		# Save JSONL
		with open(FINETUNING_DIR/"train.jsonl", "w", encoding="utf-8") as f:
				for item in train_data:
						f.write(json.dumps(item, ensure_ascii=False) + "\n")

		with open(FINETUNING_DIR/"test.jsonl", "w", encoding="utf-8") as f:
				for item in test_data:
						f.write(json.dumps(item, ensure_ascii=False) + "\n")

		print(f"Data saved to {FINETUNING_DIR}")

In [10]:
def save_rag_data(rag_chunks):
	print("Saving RAG knowledge base...")
	
	# save as JSON
	with open(RAG_DIR / "knowledge_base.json", 'w', encoding='utf-8') as f:
		json.dump(rag_chunks, f, indent=2, ensure_ascii=False)
	
	# save as JSONL
	with open(RAG_DIR / "knowledge_base.jsonl", 'w', encoding='utf-8') as f:
		for chunk in rag_chunks:
			f.write(json.dumps(chunk, ensure_ascii=False) + "\n")
	
	# save statistic 
	stats = {
		'total_chunks': len(rag_chunks),
		'chunks_per_drug': {},
		'chunks_per_category': {}
	}

	for chunk in rag_chunks:
		drug = chunk['drug_name']
		category = chunk['category']
		stats['chunks_per_drug'][drug] = stats['chunks_per_drug'].get(drug, 0) + 1
		stats['chunks_per_category'][category] = stats['chunks_per_category'].get(category, 0) + 1
	
	with open(RAG_DIR / "stats.json", 'w', encoding='utf-8') as f:
		json.dump(stats, f, indent=2)
	
	print(f"Total chunks : {len(rag_chunks)}")
	print(f"Saved to {RAG_DIR}")
	for drug, count in stats['chunks_per_drug'].items():
		print(f" - {drug}: {count} chunks")

### Execute All Function Above 

In [21]:
# load medinfo2019
medinfo_train = load_mediqationqa("../data/finetuning/MedInfo2019-QA-Medications.xlsx")

# load MedQuAD (stratified)
medquad_train, medquad_test = load_MedQuAd("../data/finetuning/mplusdrugs_with_answers.csv")

# Merge
train_data = medquad_train + medinfo_train
test_data = medquad_test

# Remove duplicates (opsional)
train_data = remove_duplicate_questions(train_data)
test_data = remove_duplicate_questions(test_data)

# save dataset 
save_finetuning_data(train_data, test_data)

print("Preprocessing completed.")

Original samples: 5000
Train distrib:
question_type
precautions              454
side effects             453
indication               453
usage                    442
emergency or overdose    413
storage and disposal     401
severe reaction            9
contraindication           6
Name: count, dtype: int64

Test distrib:
question_type
precautions              114
side effects             113
indication               113
usage                    110
emergency or overdose    104
storage and disposal     100
severe reaction            2
contraindication           2
Name: count, dtype: int64
Removed duplicates. Before: 3489, After: 3449
Removed duplicates. Before: 700, After: 699
Saving dataset...
Train: 3449 samples
Test : 699 samples
Data saved to ..\data\finetuning
Preprocessing completed.


In [12]:
# rag dataset
rag_chunks = build_rag_knowledge_base()
	
if rag_chunks:
	save_rag_data(rag_chunks)

print("Preprocessing completed.")

Building RAG Knowledge Base from DailyMed...
‚úÖ Found SPL for Aspirin
   Title     : ENTERIC COATED ASPIRIN REGULAR STRENGTH (ASPIRIN) TABLET, DELAYED RELEASE [BRYANT RANCH PREPACK]
   SETID     : e05140fa-9766-4f97-bffc-e36a01b9d38f
   Published : December 10, 2025
üì¶ Successfully fetched XML for Aspirin
 ‚úì Extracted 5 chunks
‚úÖ Found SPL for Ibuprofen
   Title     : IBUPROFEN CAPSULE, LIQUID FILLED [CHAIN DRUG MARKETING ASSOCIATION INC.]
   SETID     : 4573753d-117b-a6f2-e063-6394a90ae2f9
   Published : December 10, 2025
üì¶ Successfully fetched XML for Ibuprofen
 ‚úì Extracted 6 chunks
‚úÖ Found SPL for Acetaminophen
   Title     : ACETAMINOPHEN SOLUTION [PAI HOLDINGS, LLC DBA PAI PHARMA]
   SETID     : fac4e0c6-684f-45a1-99f2-de4de4017cc8
   Published : December 10, 2025
üì¶ Successfully fetched XML for Acetaminophen
 ‚úì Extracted 4 chunks
‚úÖ Found SPL for Amoxicillin
   Title     : AMOXICILLIN FOR SUSPENSION [NUCARE PHARMACEUTICALS, INC.]
   SETID     : 4528f48d-5bda-b30