In [33]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [34]:
medquad = pd.read_csv("../data/finetuning/MedQuAD.csv")
# only keep rows where document_source is 'MPlusDrugs'
medquad = medquad[medquad['document_source'] == 'MPlusDrugs']
medquad = medquad[['document_url', 'question_type','question']]
medquad = medquad.dropna()
# format the document_url to valid url medlineplus.gov
medquad['document_url'] = medquad['document_url'].str.replace('www.nlm.nih.gov', 'medlineplus.gov')
medquad['document_url'] = medquad['document_url'].str.replace('/medlineplus/', '/', regex=False)
medquad.head()

Unnamed: 0,document_url,question_type,question
34552,https://medlineplus.gov/druginfo/meds/a602014....,indication,Who should get Glycopyrrolate and why is it pr...
34553,https://medlineplus.gov/druginfo/meds/a602014....,usage,How should Glycopyrrolate be used and what is ...
34554,https://medlineplus.gov/druginfo/meds/a602014....,precautions,Are there safety concerns or special precautio...
34555,https://medlineplus.gov/druginfo/meds/a602014....,dietary,What special dietary instructions should I fol...
34556,https://medlineplus.gov/druginfo/meds/a602014....,forget a dose,What should I do if I forget a dose of Glycopy...


In [35]:
# Filter question type based on knowledge_base.json categories
keywords = ['indication', 'usage', 'side effects', 'important warning', 
            'severe reaction', 'contraindication', 'dose', 'precautions', 
            'mechanism', 'emergency or overdose', 'storage and disposal', 
            'other information']

exclude_keywords = ['dietary', 'forget a dose', 'brand names', 'brand names of combination products',
                    'how can i learn more', 'why get vaccinated', 'information']

include_mask = medquad['question_type'].str.contains('|'.join(keywords), case=False, na=False)
exclude_mask = ~medquad['question_type'].str.contains('|'.join(exclude_keywords), case=False, na=False)
medquad_filtered = medquad[include_mask & exclude_mask]

medquad_filtered.head()

Unnamed: 0,document_url,question_type,question
34552,https://medlineplus.gov/druginfo/meds/a602014....,indication,Who should get Glycopyrrolate and why is it pr...
34553,https://medlineplus.gov/druginfo/meds/a602014....,usage,How should Glycopyrrolate be used and what is ...
34554,https://medlineplus.gov/druginfo/meds/a602014....,precautions,Are there safety concerns or special precautio...
34557,https://medlineplus.gov/druginfo/meds/a602014....,side effects,What are the side effects or risks of Glycopyr...
34558,https://medlineplus.gov/druginfo/meds/a602014....,storage and disposal,What should I know about storage and disposal ...


In [36]:
print("Number of rows after filtering question_type:", len(medquad_filtered))

Number of rows after filtering question_type: 7888


In [37]:
medquad_sampled = medquad_filtered.sample(n=5000, random_state=42)

In [38]:
# export to csv
medquad_sampled.to_csv("../data/finetuning/mplusdrugs_input.csv", index=False)

In [39]:
import re
import html

def clean_text_basic(text: str) -> str:
	# Unescape HTML entities (&nbsp; etc)
	text = html.unescape(text)

	# Remove strange characters common in scraped Medline
	text = text.replace("Â", "").replace("â€“", "-").replace("â€”", "-")
	text = text.replace("•", "- ")  # normalize bullets if needed

	# Remove excessive spaces
	text = re.sub(r"\s+", " ", text)

	# Remove space before punctuation
	text = re.sub(r"\s+([.,!?;:])", r"\1", text)

	return text.strip()

In [40]:
def normalize_list(items: list[str]) -> str:
	items = [clean_text_basic(i) for i in items if i.strip()]
	if not items:
		return ""
	
	if len(items) == 1:
		return items[0]

	# join bullet points to single paragraph
	return " ".join([i.rstrip('.') + '.' for i in items])


In [41]:
def deduplicate_paragraphs(text: str) -> str:
	paragraphs = [p.strip() for p in text.split(". ") if p.strip()]
	seen = set()
	unique = []
	for p in paragraphs:
		if p not in seen:
			unique.append(p)
			seen.add(p)
	return ". ".join(unique)


In [42]:
def clean_answer(text: str) -> str:
	if not text or not isinstance(text, str):
		return ""

	# 1. Basic cleaning
	text = clean_text_basic(text)

	# 2. Deduplicate paragraphs
	text = deduplicate_paragraphs(text)

	# 3. Normalize spacing again
	text = re.sub(r"\s+", " ", text).strip()

	return text


In [43]:
# --------------------------------------
# Mapping section headings
# --------------------------------------
SECTION_MAP = {
	"indication": ["Why is this medication prescribed", "Indications"],
	"usage": ["How should this medication be used", "Usage", "How should this medicine be used?"],
	"precautions": ["What special precautions should I follow", "Precautions"],
	"side effects": ["Side Effects"],
	"storage and disposal": ["Storage and Disposal"],
	"emergency or overdose": ["In Case of Emergency/Overdose", "Emergency"],
	"other information": ["What other information should I know", "Other information", "Other uses for this medicine"],
	"important warning": ["IMPORTANT WARNING"],
	"severe reaction": ["Severe side effects", "Severe reaction"],
	"contraindication": ["Who should not get", "Who should not take", "Contraindications"],
	"dose": ["To take a dose", "Dosage", "Dose"],
}

# --------------------------------------
# Function: extract section from HTML
# --------------------------------------
def extract_section(html, qtype):
	soup = BeautifulSoup(html, "html.parser")

	candidates = SECTION_MAP.get(qtype.lower())
	if not candidates:
		return {"status": "unknown_qtype", "text": None}

	# Find all sections tags
	all_sections = soup.find_all("section")

	for sec in all_sections:
		div_sec = sec.find("div", class_="section")
		if not div_sec:
			continue

		# Find section-title
		header = div_sec.find("div", class_="section-title")
		if not header:
			continue

		h = header.find("h2")
		if not h:
			continue

		heading_text = h.get_text(strip=True)

		# Match with qtype
		if not any(key.lower() in heading_text.lower() for key in candidates):
			continue

		# Found the matching section, extract body
		body = div_sec.find("div", class_="section-body")
		if not body:
			return {"status": "empty_section", "text": None}

		# Extract text from body
		parts = []
		for child in body.find_all(recursive=False):
			parts.append(child.get_text(" ", strip=True))

		content = " ".join(parts).strip()

		if content:
			return {"status": "ok", "text": content}
		else:
			return {"status": "empty_section", "text": None}

	return {"status": "heading_not_found", "text": None}


# --------------------------------------
# Main scraper pipeline
# --------------------------------------
def scrape_mplusdrugs(csv_path, out_path="mplusdrugs_with_answers.csv"):
	df = pd.read_csv(csv_path)

	answers = []
	error_logs = []

	for idx, row in df.iterrows():
		url = row["document_url"]
		qtype = row["question_type"] 

		print(f"[{idx+1}/{len(df)}] {url} | type: {qtype}")

		try:
			resp = requests.get(url, timeout=15)
			html = resp.text

			result = extract_section(html, qtype)
			
			clean = clean_answer(result["text"])

			answers.append(clean or "")

			# If not ok, log the error
			if result["status"] != "ok":
				error_logs.append({
					"index": idx,
					"url": url,
					"qtype": qtype,
					"status": result["status"]
				})

		except Exception as e:
			answers.append("")
			error_logs.append({
				"index": idx,
				"url": url,
				"qtype": qtype,
				"status": str(e)
			})

		time.sleep(0.5)

	df["answer"] = answers
	df.to_csv(out_path, index=False)

	print("\nDone. Saved:", out_path)

	if error_logs:
		print("\n---- ERROR LOGS ----")
		for e in error_logs:
			print(e)


# --------------------------------------
# Run the scraper
# --------------------------------------
scrape_mplusdrugs("../data/finetuning/mplusdrugs_input.csv", out_path="../data/finetuning/mplusdrugs_with_answers.csv")

[1/5000] https://medlineplus.gov/druginfo/meds/a607019.html | type: contraindication
[2/5000] https://medlineplus.gov/druginfo/meds/a695013.html | type: side effects
[3/5000] https://medlineplus.gov/druginfo/meds/a608013.html | type: usage
[4/5000] https://medlineplus.gov/druginfo/meds/a603023.html | type: side effects
[6/5000] https://medlineplus.gov/druginfo/meds/a682708.html | type: usage
[7/5000] https://medlineplus.gov/druginfo/meds/a615005.html | type: usage
[8/5000] https://medlineplus.gov/druginfo/meds/a682826.html | type: side effects
[9/5000] https://medlineplus.gov/druginfo/meds/a609029.html | type: precautions
[10/5000] https://medlineplus.gov/druginfo/meds/a697002.html | type: side effects
[11/5000] https://medlineplus.gov/druginfo/meds/a698021.html | type: indication
[12/5000] https://medlineplus.gov/druginfo/meds/a699059.html | type: storage and disposal
[14/5000] https://medlineplus.gov/druginfo/meds/a601237.html | type: emergency or overdose
[15/5000] https://medlinepl