## Cancer Care Data Extraction

In [1]:
# Import Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re

In [2]:
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} 

In [3]:
def get_sections():
    
    URL = 'https://www.cancer.net/navigating-cancer-care'
    response = requests.get(URL, headers = headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    sections_url = []
    sections = soup.find('article')
    sections = sections.find('div', class_='field-name-field-page-sub-pages').find('div', class_='field-items')
    sections = sections.find_all('article')

    for section in sections:
        sections_url.append('https://www.cancer.net' + section.select('header h3 a')[0]['href'])

    return sections_url

In [4]:
all_sections = get_sections()
print(len(all_sections))
print(all_sections)

15
['https://www.cancer.net/navigating-cancer-care/cancer-basics', 'https://www.cancer.net/navigating-cancer-care/diagnosing-cancer', 'https://www.cancer.net/navigating-cancer-care/managing-your-care', 'https://www.cancer.net/navigating-cancer-care/financial-considerations', 'https://www.cancer.net/navigating-cancer-care/how-cancer-treated', 'https://www.cancer.nethttps://www.cancer.net/coping-with-cancer/physical-emotional-and-social-effects-cancer', 'https://www.cancer.net/navigating-cancer-care/dating-sex-and-reproduction', 'https://www.cancer.net/navigating-cancer-care/advanced-cancer', 'https://www.cancer.net/navigating-cancer-care/when-cancer-not-your-only-health-concern', 'https://www.cancer.net/navigating-cancer-care/children', 'https://www.cancer.net/navigating-cancer-care/young-adults-and-teenagers', 'https://www.cancer.net/navigating-cancer-care/adults-65', 'https://www.cancer.net/navigating-cancer-care/us-veterans', 'https://www.cancer.net/navigating-cancer-care/prevention-

In [5]:
def get_section_topics(section_url):
    response = requests.get(section_url, headers = headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    section_topics_url = []
    section_topics = soup.find('article')
    section_topics = section_topics.find('div',class_='field-name-field-page-sub-pages').find('div',class_='field-items')

    for section_topic in section_topics:
        section_topics_url.append('https://www.cancer.net'+section_topic.select('header h3 a')[0]['href'])

    return section_topics_url

In [6]:
get_section_topics('https://www.cancer.net/navigating-cancer-care/diagnosing-cancer')

['https://www.cancer.net/navigating-cancer-care/diagnosing-cancer/when-doctor-says-%E2%80%9Ccancer%E2%80%9D',
 'https://www.cancer.net/navigating-cancer-care/diagnosing-cancer/questions-ask-your-health-care-team',
 'https://www.cancer.net/navigating-cancer-care/diagnosing-cancer/tests-and-procedures',
 'https://www.cancer.net/navigating-cancer-care/diagnosing-cancer/reports-and-results',
 'https://www.cancer.net/navigating-cancer-care/diagnosing-cancer/stages-cancer']

In [10]:
def get_topic_content(topic_url):

    response = requests.get(topic_url, headers = headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    cancer_care_df  = {
        'question':[],
        'answer':[]
    }

    articles = soup.find_all('article')
    
    if len(articles) > 1:
        section_topics = get_section_topics(topic_url)
        for section_topic_url in section_topics:
            get_topic_content(section_topic_url)
    
    else:
        topic_name = re.sub(r'[^a-zA-Z0-9\s\'-]', '', topic_url.split('/')[-1])
        questions = soup.find_all('h3')
        
        for question in questions:
            cancer_care_df['question'].append(question.text.strip())
            
            answer = ""
            next_element = question.find_next_sibling()
            while next_element and next_element.name != 'h3':
                answer += next_element.text.strip() + " "
                next_element = next_element.find_next_sibling()
            
            cancer_care_df['answer'].append(answer)

        df = pd.DataFrame(cancer_care_df)
        dir_path = 'data/cancer-care'
        os.makedirs(dir_path, exist_ok=True)
        data_path = os.path.join(dir_path,topic_name+'.csv')
        if len(cancer_care_df['question']) > 0 and len(cancer_care_df['answer']) > 0:
            df.to_csv(data_path, index=False)

        return cancer_care_df

In [11]:
get_topic_content('https://www.cancer.net/navigating-cancer-care/cancer-basics/cancer-care-team/oncology-team')

{'question': ['What is an oncologist?',
  'Who else is on my oncology team?',
  'Questions to ask about your health care team',
  'Related Resources',
  'More Information',
  'More in this section'],
 'answer': ['An oncologist is a doctor who specializes in diagnosing and treating cancer. Your oncologist oversees your care from diagnosis throughout the course of the disease. In cancer care, a patient is often treated by a team of oncologists who specialize in different areas of oncology and types of treatments. For instance, a medical oncologist uses medications to treat cancer, a radiation oncologist specializes in radiation therapy to treat cancer, and a surgical oncologist is a cancer surgeon. Learn more about the different types of oncologists. ',
  "In addition to oncologists, there may be other specialists on your cancer care team. Here are descriptions of different providers who may be involved in your care. This may be helpful as you learn the role of each person involved in yo

In [3]:
try:
    sections = get_sections()
    for section_url in sections:
        try:
            section_topics = get_section_topics(section_url)
            for section_topic_url in section_topics:
                try:
                    get_topic_content(section_topic_url)
                except:
                    continue
        except:
            continue

except Exception as e:
    pass