In [11]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import urljoin


In [12]:
# Base URLs
index_url = "https://medlineplus.gov/healthtopics.html"
base_url = "https://medlineplus.gov/"


In [13]:
# Step 1: Get all category/topic URLs
def get_topic_urls():
    print("Getting topic categories...")
    res = requests.get(index_url)
    soup = BeautifulSoup(res.content, "html.parser")
    topic_links = soup.select("ul.alpha-links li a")
    
    # Fix: use urljoin instead of base_url + a['href']
    topic_urls = [urljoin(base_url, a['href']) for a in topic_links if a['href'].endswith("healthtopics.html")]
    return topic_urls

# Step 2: From each topic, get all condition pages
def get_condition_links(topic_urls):
    condition_links = []
    for url in topic_urls:
        print(f"Scraping topic: {url}")
        try:
            res = requests.get(url)
            soup = BeautifulSoup(res.content, "html.parser")
            links = soup.select("div.section-body li.item a")
            for a in links:
                href = a.get('href')
                if href and href.endswith('.html'):
                    full_url = urljoin(base_url, href)  # ⬅️ Use urljoin here
                    condition_links.append((full_url, a.text.strip()))
            time.sleep(0.5)
        except Exception as e:
            print(f"Failed to fetch topic page: {url}, error: {e}")
    return condition_links

# Step 3: Get description from each condition page
def get_descriptions(condition_links, max_count=None):
    data = []
    for i, (url, name) in enumerate(condition_links):
        if max_count and i >= max_count:
            break
        try:
            print(f"{i+1}. Scraping condition: {name}")
            res = requests.get(url)
            soup = BeautifulSoup(res.content, "html.parser")
            desc_block = soup.select_one("section#topic-summary p")
            description = desc_block.text.strip() if desc_block else ""
            if description:
                data.append({"term": name, "description": description})
            time.sleep(0.5)
        except Exception as e:
            print(f"❌ Failed to fetch: {url}, error: {e}")
    return data

# Step 4: Save to CSV
def save_to_csv(data, filename="medline_conditions.csv"):
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["term", "description"])
        writer.writeheader()
        writer.writerows(data)
    print(f"✅ Saved {len(data)} conditions to {filename}")

In [14]:
# Run full pipeline
if __name__ == "__main__":
    topic_urls = get_topic_urls()
    condition_links = get_condition_links(topic_urls)
    data = get_descriptions(condition_links, max_count=500)  # Adjust count here
    save_to_csv(data)

Getting topic categories...
Scraping topic: https://medlineplus.gov/all_healthtopics.html
Scraping topic: https://medlineplus.gov/all_healthtopics.html
1. Scraping condition: A1C
2. Scraping condition: Aortic Aneurysm
3. Scraping condition: Aortic Aneurysm
4. Scraping condition: Abdominal Pain
5. Scraping condition: Ectopic Pregnancy
6. Scraping condition: Birth Defects
7. Scraping condition: Blood
8. Scraping condition: Abortion
9. Scraping condition: Medicines
10. Scraping condition: Over-the-Counter Medicines
11. Scraping condition: Aspergillosis
12. Scraping condition: Abscess
13. Scraping condition: Child Abuse
14. Scraping condition: Elder Abuse
15. Scraping condition: Intimate Partner Violence
16. Scraping condition: Safety
17. Scraping condition: Falls
18. Scraping condition: First Aid
19. Scraping condition: Wounds and Injuries
20. Scraping condition: Esophagus Disorders
21. Scraping condition: Heel Injuries and Disorders
22. Scraping condition: Dwarfism
23. Scraping condition