In [16]:
###Import libraries

import requests
import time
import random
import csv
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
# Run date formatted as YYYY-MM-DD.
today_date = f"{datetime.today():%Y-%m-%d}"

###Filter specific diseases for Nigeria

# Read the MedlinePlus article index (one row per article: title and URL).
file_path = "/Users/yhlien/Desktop/cs1101_01/medlineplus_articles_link.csv"
df = pd.read_csv(file_path)
# Force the two expected column names regardless of the header in the file.
df.columns = ["Disease", "Link"]
# Lowercased helper copy of the title (missing titles become ""), used for
# case-insensitive matching by filter_diseases.
df["Disease_lower"] = (
    df["Disease"]
    .fillna("")
    .astype(str)
    .str.lower()
)
# Discard articles whose title mentions vaccine, prophylaxis or prevention.
df = df.loc[
    lambda frame: ~frame["Disease"].str.contains(
        'vaccine|prophylaxis|prevention', case=False, na=False, regex=True
    )
]


# Disease names looked up in the article titles. filter_diseases matches each
# entry case-insensitively as a substring of the lowercased title.
tropical_diseases = [
    "Malaria",
    "Dengue fever",
    "Chikungunya",
    "Yellow fever",
    "Zika virus",
    "Rift Valley fever",
    "African trypanosomiasis",
    "Leishmaniasis",
    "Schistosomiasis",
    "Lymphatic filariasis",
    "Onchocerciasis",
    "Loiasis",
    "Dracunculiasis",
    "Buruli ulcer",
    "Yaws",
    "Cholera",
    "Typhoid fever",
    "Amoebiasis",
    "Giardiasis",
    "Cryptosporidiosis",
    "Ebola virus disease",
    "Marburg virus disease",
    "Lassa fever",
    "Rabies",
    "Anthrax",
    "Brucellosis",
    "Leptospirosis",
]

# Second lookup list, matched the same way as the one above.
common_diseases_africa = [
    "Tuberculosis",
    "HIV/AIDS",
    "Pneumonia",
    "Meningitis",
    "COVID-19",
    "Influenza",
    "Measles",
    "Syphilis",
    "Gonorrhea",
    "Chlamydia",
    "HPV",
    "Hepatitis B",
    "Hepatitis C",
    "Diabetes",
    "Hypertension",
    "Sickle Cell Disease",
    "Cancer",
]

# Function to filter diseases from the dataset
def filter_diseases(df, disease_list):
    """Return the rows of ``df`` whose title mentions any name in ``disease_list``.

    Matching is a case-insensitive *substring* test against the
    ``Disease_lower`` column, so a short entry such as "HPV" or "Cancer"
    selects every title that contains it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Disease_lower`` column of lowercase strings.
    disease_list : iterable of str
        Disease names to look for; an empty iterable selects no rows.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with the same columns and original index labels.

    Raises
    ------
    KeyError
        If ``df`` has no ``Disease_lower`` column.
    """
    # Lowercase the search terms once instead of once per row.
    needles = [d.lower() for d in disease_list]
    # astype(bool) keeps the mask boolean even when df has zero rows; without
    # it an empty non-boolean result would be treated as a column-label
    # selection by df[...] and the columns would be lost.
    mask = df["Disease_lower"].apply(
        lambda title: any(needle in title for needle in needles)
    ).astype(bool)
    return df[mask]

# Extract tropical diseases and common diseases in Africa
df_tropical_diseases = filter_diseases(df, tropical_diseases)
df_common_diseases = filter_diseases(df, common_diseases_africa)

# Drop the lowercase helper column
df_tropical_diseases = df_tropical_diseases.drop(columns=["Disease_lower"])
df_common_diseases = df_common_diseases.drop(columns=["Disease_lower"])

# Stack both selections (tropical rows first) and remove articles that were
# picked up by both lists.
specific_disease = pd.concat([df_tropical_diseases, df_common_diseases], ignore_index=True)
# drop_duplicates() keeps the surviving rows' old labels, which leaves gaps in
# the index; renumber so the index is 0..n-1 and can be used as a row position.
specific_disease = specific_disease.drop_duplicates().reset_index(drop=True)



In [17]:
###MedlinePlus Disease Scraper: Extracting Disease Names & Symptoms

df = specific_disease
# Ensure column name is correct
url_column = "Link"  # Adjust if your column name is different

# Seconds to wait for a single page before treating the request as failed.
REQUEST_TIMEOUT_SECONDS = 30


def _extract_disease_symptoms(html):
    """Parse one article page into ``[disease_name, symptoms_str]``.

    Parameters
    ----------
    html : bytes or str
        Raw page content as returned by the HTTP request.

    Returns
    -------
    list or None
        ``[disease_name, symptoms_str]`` where ``disease_name`` is the text of
        the first ``<h1>`` and ``symptoms_str`` is the first paragraph plus the
        items of the first ``<ul>`` of the symptoms block, joined with ", "
        (empty string when the block is absent). ``None`` when the page has no
        ``<h1>``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # The disease name is taken from the page's <h1>; without one there is
    # nothing to label the row with.
    heading = soup.find('h1')
    if heading is None:
        return None
    disease_name = heading.get_text(strip=True)

    # NOTE(review): this assumes the block with id "section-2" is the Symptoms
    # section on every article page -- confirm against the site's markup.
    symptoms_section = soup.find('div', {'class': 'section-body', 'id': 'section-2'})

    symptoms = []
    if symptoms_section:
        # Introductory paragraph, when the section has one.
        intro_paragraph = symptoms_section.find('p')
        if intro_paragraph is not None:
            symptoms.append(intro_paragraph.get_text(strip=True))

        # Items of the first <ul> in the section, when present.
        symptom_list = symptoms_section.find('ul')
        if symptom_list:
            symptoms += [li.get_text(strip=True) for li in symptom_list.find_all('li')]

    # Format symptoms as a comma-separated string
    return [disease_name, ', '.join(symptoms)]


# Initialize a list to store the scraped data
disease_data = []

# Loop through each URL in the DataFrame. The progress counter is positional
# (1..total) so it stays correct even when the index labels are not 0..n-1.
total_pages = len(df)
for position, (_, row) in enumerate(df.iterrows(), start=1):
    url = row[url_column]

    print(f"🔍 Scraping: {url} ({position}/{total_pages})...")

    # Random delay to avoid bot detection
    time.sleep(random.uniform(5, 7))

    # Send a GET request to the URL. A timeout plus exception handling keeps a
    # single unreachable page from aborting the run and discarding the rows
    # collected so far.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f"⚠️ Failed to retrieve: {url} ({exc})")
        continue

    # Check if the request was successful
    if response.status_code != 200:
        print(f"⚠️ Failed to retrieve: {url} (Status code: {response.status_code})")
        continue

    record = _extract_disease_symptoms(response.content)
    if record is None:
        print(f"⚠️ No <h1> title found, skipping: {url}")
        continue

    # Append to results list
    disease_data.append(record)

# Convert list to DataFrame
df_output = pd.DataFrame(disease_data, columns=["Disease", "Symptoms"])

# Save to CSV
output_filename = "/Users/yhlien/Desktop/cs1101_01/medlineplus_disease_symptoms.csv"
df_output.to_csv(output_filename, index=False, encoding='utf-8')

print(f"✅ Data successfully saved to {output_filename} with Disease and Symptoms columns!")


🔍 Scraping: https://medlineplus.gov/ency/article/001325.htm (1/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/003534.htm (2/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/000597.htm (3/212)...
🔍 Scraping: https://medlineplus.gov/ency/patientinstructions/000821.htm (4/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/000303.htm (5/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/001374.htm (6/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/001339.htm (7/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/001386.htm (8/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/001376.htm (9/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/000621.htm (10/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/001334.htm (11/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/001321.htm (12/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/003536.htm (13/212)...
🔍 Scraping: https://medlineplus.gov/ency/article/001332.htm (