In [23]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
import re
import time
import asyncio
from tqdm import tqdm

In [16]:
base_url = "https://www.webmd.com"
topics_path = "a-to-z-guides/health-topics"

def get_topics(page_query):
    url = f"{base_url}/{topics_path}?pg={page_query}"

    # Make a request to the url with a user agent header
    # to prevent 403 errors
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = bs(r.content, "html.parser")

    # Find the ul with the class az-index-results-group-list
    # and get all the li elements
    topics = soup.find("ul", {"class": "az-index-results-group-list"}).find_all("li")
    topics_df = pd.DataFrame(columns=["topic", "url"])
    for topic in topics:
        # Get the text and href from the a element
        topic_text = topic.find("a").get_text()
        topic_url = topic.find("a").get("href")

        # Add the topic and url to the dataframe
        topics_df = topics_df.append(
            {"topic": topic_text, "url": topic_url}, ignore_index=True
        )
    print(f"Found {len(topics)} topics on page {page_query}")
    return topics_df

In [17]:
# Perform the same operation on every page for a-z topics
topics_df = pd.DataFrame(columns=["topic", "url"])
for page in range(1, 26 + 1):
    page = chr(page + 96)
    print(f"Getting topics from page {page}...")
    topics_df = topics_df.append(get_topics(page), ignore_index=True)
    time.sleep(1)
topics_df


Getting topics from page a...
Found 116 topics on page a
Getting topics from page b...
Found 101 topics on page b
Getting topics from page c...
Found 126 topics on page c
Getting topics from page d...
Found 64 topics on page d
Getting topics from page e...
Found 67 topics on page e
Getting topics from page f...
Found 76 topics on page f
Getting topics from page g...
Found 83 topics on page g
Getting topics from page h...
Found 104 topics on page h
Getting topics from page i...
Found 51 topics on page i
Getting topics from page j...
Found 16 topics on page j
Getting topics from page k...
Found 21 topics on page k
Getting topics from page l...
Found 62 topics on page l
Getting topics from page m...
Found 93 topics on page m
Getting topics from page n...
Found 50 topics on page n
Getting topics from page o...
Found 36 topics on page o
Getting topics from page p...
Found 109 topics on page p
Getting topics from page q...
Found 1 topics on page q
Getting topics from page r...
Found 67 topic

Unnamed: 0,topic,url
0,A1AT Deficiency,https://www.webmd.com/lung/copd/alpha-1-antitr...
1,AAT,https://www.webmd.com/lung/copd/alpha-1-antitr...
2,AAT Deficiency,https://www.webmd.com/lung/copd/alpha-1-antitr...
3,Abdominal Migraine,https://www.webmd.com/migraines-headaches/cycl...
4,Abercrombie Syndrome,https://www.webmd.com/cancer/lymphoma/amyloido...
...,...,...
1599,Zambusch's Disease,https://www.webmd.com/oral-health/oral-lichen-...
1600,Z-E Syndrome,https://www.webmd.com/digestive-disorders/zoll...
1601,ZES,https://www.webmd.com/digestive-disorders/zoll...
1602,Zika,https://www.webmd.com/a-to-z-guides/zika-virus...


In [20]:
topics_df.to_csv("topics.csv", index=False)

In [22]:
# Get all of the topics from topics_df with a url that does
# not end in default.htm
topics_df = pd.read_csv("topics.csv")
topics_df = topics_df[topics_df["url"].str.endswith("default.htm") == False]
topics_df

Unnamed: 0,topic,url
0,A1AT Deficiency,https://www.webmd.com/lung/copd/alpha-1-antitr...
1,AAT,https://www.webmd.com/lung/copd/alpha-1-antitr...
2,AAT Deficiency,https://www.webmd.com/lung/copd/alpha-1-antitr...
3,Abdominal Migraine,https://www.webmd.com/migraines-headaches/cycl...
4,Abercrombie Syndrome,https://www.webmd.com/cancer/lymphoma/amyloido...
...,...,...
1599,Zambusch's Disease,https://www.webmd.com/oral-health/oral-lichen-...
1600,Z-E Syndrome,https://www.webmd.com/digestive-disorders/zoll...
1601,ZES,https://www.webmd.com/digestive-disorders/zoll...
1602,Zika,https://www.webmd.com/a-to-z-guides/zika-virus...


In [89]:
def scrape_page(topic, url):
    # Make a request to the url with a user agent header
    # to prevent 403 errors
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = bs(r.content, "html.parser")

    try:
        # Fint the div with the class "article__body" or "article-body"
        # and get all the section elements
        sections = soup.find("div", {"class": "article__body"}).find_all("section")
        if len(sections) == 0:
            sections = soup.find("div", {"class": "article-body"}).find_all("section")

        data = []
        for section in sections:
            # Get the first h2 element to get the title of the section
            title = section.find("h2").get_text() if section.find("h2") else ""
            body = " ".join([p.get_text() for p in section.find_all("p")])
            data.append({"title": title, "body": body})

        # Print our the a dataframe with the data
        print(f"Found {len(data)} sections for {topic}")
        print(pd.DataFrame(data))
        
        
        # If the section does not have a title, then append the body to the 
        # body of the section above it
        for i in range(len(data)):
            if data[i]["title"] == "":
                data[i - 1]["body"] += data[i]["body"]
                data[i]["body"] = ""
        data = [d for d in data if d["body"] != ""]

        pot_cols = {
            "overview": ["what is", "what are"],
            "symptoms": ["symptom", "signs"],
            "causes": ["cause", "risk"],
            "tests": ["test", "diagnosis"],
            "treatment": ["treat"],
        }
        obj = {col: "" for col in pot_cols.keys()}

        obj["topic"] = topic
        obj["url"] = url

        for d in data:
            # Check if the title of the d contains any of the columns
            # if it does then put it in the dict
            for col_key in pot_cols.keys():
                for col in pot_cols[col_key]:
                    if col in d["title"].lower():
                        obj[col_key] = d["body"]

        # If the value for "what is" is empty, then put the first section
        # in the "what is" column
        if obj["overview"] == "":
            obj["overview"] = data[0]["body"]

        return pd.DataFrame(obj, columns=["topic", "url", *pot_cols.keys()], index=[0])
    except Exception as e:
        return pd.DataFrame(columns=["topic", "url"], data=[[topic, url]])

In [90]:
# Scrape a random sample of 10 topics and concat their results to a df
df = pd.DataFrame()
for i in range(5):
    topic = topics_df.sample(1).iloc[0]
    print(f"Scraping {topic['topic']}...")
    df = df.append(scrape_page(topic["topic"], topic["url"]), ignore_index=True)
    time.sleep(1)

    

Scraping Exercise: Flat Abs Slideshow...
Scraping Fitness: 30-Minute Workout Slideshow...
Scraping Bioterrorism...
Found 1 sections for Bioterrorism
  title                                               body
0        Detection Biological agents are engineered pro...
Scraping GVHD...
Found 5 sections for GVHD
       title                                               body
0             If you've had a bone marrow or stem cell trans...
1   Symptoms  GVHD can show up in several different parts of...
2             There are two main types, based on when sympto...
3     Causes  During chemotherapy, cells inside your bone ma...
4  Treatment  If your GVHD is severe enough to need treatmen...
Scraping Lymphedema...
Found 5 sections for Lymphedema
              title                                               body
0                    Lymphedema is swelling that’s caused by a coll...
1  Causes and Types  If your lymphatic system is damaged or a block...
2          Symptoms  The most common s

In [91]:
df.to_csv("data.csv", index=False)