## Using disease links file to scrap Health Condition", "Overview", "Diagnosis", "Treatment", "Symptoms", "Drugs Used" from drugs.com (url with /condition/ only)

In [16]:
# Load the DataFrame containing URLs
df_urls = pd.read_csv("/Users/yhlien/Desktop/cs1101_01/drugscom_disease_urls.csv")  #  CSV file path
# Remove duplicates based on "Health Condition"
df_urls = df_urls.drop_duplicates(subset=["Disease"], keep="first")


# Ensure the column name matches your CSV structure
urls = df_urls["link"].tolist()  # Convert column to a list
disease_names = df_urls["Disease"].tolist()  # Disease names

# Keywords for diagnosis detection
diagnostic_keywords = ["genetic testing", "biomarker", "lab test", "neurotransmitter",
                        "enzyme deficiency", "mutation", "biochemical", "diagnosis"]

# Initialize list to store results
data = []

# Loop through each URL in the DataFrame
for disease_name, url in zip(disease_names, urls):
    print(f"Scraping data from: {url}")

    try:
        # Send a GET request
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an error for bad status codes (4xx, 5xx)
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract disease name (removing ".html")
        # disease_name = 
        #disease_name = url.split("/")[-1].replace("-", " ").replace(".html", "").title()

        # Find Overview & Symptoms
        overview_section = soup.find("div", class_="ddc-expand-read-more")

        # Initialize variables
        overview_text = "Overview not available"
        symptoms_text = "Symptoms not available"
        diagnostic_info = "No explicit diagnostic information found."
        treatment_text = "Treatment information not available"
        drugs_used = []

        if overview_section:
            all_paragraphs = overview_section.find_all("p")  

            # Extract overview (1st paragraph) and symptoms (3rd paragraph)
            if len(all_paragraphs) >= 1:
                overview_text = all_paragraphs[0].get_text(strip=True)
            if len(all_paragraphs) >= 3:
                symptoms_text = all_paragraphs[2].get_text(strip=True)

            # Extract diagnostic clues
            diagnostic_clues = [para.get_text(strip=True) for para in all_paragraphs 
                                if any(keyword in para.get_text(strip=True).lower() for keyword in diagnostic_keywords)]
            if diagnostic_clues:
                diagnostic_info = " ".join(diagnostic_clues)

        # Extract Treatment Section
        treatment_section = soup.find("h2", string="Treatment")
        if treatment_section:
            treatment_paragraphs = []
            for sibling in treatment_section.find_next_siblings():
                if sibling.name == "p":
                    treatment_paragraphs.append(sibling.get_text(strip=True))
                elif sibling.name == "h2":  
                    break
            if treatment_paragraphs:
                treatment_text = " ".join(treatment_paragraphs)

        # Extract Drug Information
        generic_name_section = soup.find("dt", string="Generic name:")
        generic_name = generic_name_section.find_next_sibling("dd").get_text(strip=True) if generic_name_section else "Not available"

        brand_name_section = soup.find("dt", string="Brand name:")
        brand_name = brand_name_section.find_next_sibling("dd").find("span").get_text(strip=True) if brand_name_section else "Not available"

        # **Fix: Remove duplicate "Not available" entries**
        drugs_used = list(set([generic_name, brand_name]))  # Remove duplicates
        if "Not available" in drugs_used and len(drugs_used) > 1:
            drugs_used.remove("Not available")

        # Append data to list
        data.append([disease_name, overview_text, diagnostic_info, treatment_text, symptoms_text, ", ".join(drugs_used)])

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        data.append([url, "Error fetching data", "Error fetching data", "Error fetching data", "Error fetching data", "Error fetching data"])

# Convert to DataFrame
df_results = pd.DataFrame(data, columns=["Health Condition", "Overview", "Diagnosis", "Treatment", "Symptoms", "Drugs Used"])

# Save to CSV
csv_filename = "/Users/yhlien/Desktop/cs1101_01/drugscom_condition_overview_diagnosis_treatment_symptoms_drugs.csv"
df_results.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"Data successfully saved to {csv_filename}")


Scraping data from: https://www.drugs.com/condition/adrenal-tuberculosis.html
Scraping data from: https://www.drugs.com/condition/inhalation-bacillus-anthracis.html
Scraping data from: https://www.drugs.com/condition/inhalation-bacillus-anthracis.html
Scraping data from: https://www.drugs.com/condition/inhalation-bacillus-anthracis.html
Scraping data from: https://www.drugs.com/condition/aspergillosis-meningitis.html
Scraping data from: https://www.drugs.com/condition/aspergillosis-meningitis-with-5-fc.html
Scraping data from: https://www.drugs.com/condition/avian-influenza.html
Data successfully saved to /Users/yhlien/Desktop/cs1101_01/drugscom_condition_overview_diagnosis_treatment_symptoms_drugs.csv
