## Extract cancer-type question/answer data from Cancer.Net

In [1]:
# Import libraries
import os
import re
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# HTTP headers passed to every requests.get call in this notebook.
# The User-Agent string identifies the client as a desktop Edge browser.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"
    ),
}

In [3]:
def get_cancer_catalogs(timeout=30):
    """Fetch the Cancer.Net cancer-types page and return its tab-page elements.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    list
        The elements with class ``quicktabs-tabpage`` found inside the
        ``div`` whose id is ``quicktabs-container-cancer_types``. An empty
        list is returned when that container is not present in the page.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx HTTP status.
    """
    URL = 'https://www.cancer.net/cancer-types'
    response = requests.get(URL, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    container = soup.find('div', id='quicktabs-container-cancer_types')
    if container is None:
        # Page layout differs from what is expected: nothing to iterate over.
        return []
    return container.select('div .quicktabs-tabpage')

In [4]:
# Smoke test: fetch the catalog page and display the returned tab-page elements.
get_cancer_catalogs()

[<div class="quicktabs-tabpage" id="quicktabs-tabpage-cancer_types-0"><div class="view view-cancer-types-v2 view-id-cancer_types_v2 view-display-id-panel_pane_6 view-dom-id-43246ee2a744ccf98e0728bce7506856">
 <div class="view-content">
 <div class="item-list"> <ul> <li class="views-row views-row-1 views-row-odd views-row-first">
 <div class="views-field views-field-title"> <span class="field-content" lang="en"><a href="/cancer-types/bladder-cancer">Bladder Cancer</a></span> </div></li>
 <li class="views-row views-row-2 views-row-even">
 <div class="views-field views-field-title"> <span class="field-content" lang="en"><a href="/cancer-types/breast-cancer">Breast Cancer</a></span> </div></li>
 <li class="views-row views-row-3 views-row-odd">
 <div class="views-field views-field-title"> <span class="field-content" lang="en"><a href="/cancer-types/colorectal-cancer">Colorectal Cancer</a></span> </div></li>
 <li class="views-row views-row-4 views-row-even">
 <div class="views-field views-fi

In [5]:
def get_cancer_details_url(cancer_url, timeout=30):
    """Return the URL of the page that holds the full details for a cancer type.

    The page at ``cancer_url`` is downloaded and searched for an ``<a>``
    element whose text is exactly ``'View All Pages'``. When such a link with
    an ``href`` exists, its target is returned as an absolute URL; otherwise
    ``cancer_url`` itself is returned.

    Parameters
    ----------
    cancer_url : str
        Absolute URL of a cancer-type page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        Absolute URL of the "view all" page, or ``cancer_url`` as a fallback.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx HTTP status.
    """
    response = requests.get(cancer_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    view_all_link = soup.find('a', string='View All Pages')
    # .get() tolerates an anchor without an href attribute.
    href = view_all_link.get('href') if view_all_link else None
    if not href:
        return cancer_url

    # urljoin handles root-relative, relative, and already-absolute hrefs,
    # whereas plain host + href concatenation is only right for the first.
    return urljoin(cancer_url, href)


In [6]:
# Smoke test: in the recorded output the input URL comes back unchanged,
# i.e. the fallback branch (no 'View All Pages' link found) was taken.
get_cancer_details_url('https://www.cancer.net/cancer-types/carney-complex')

'https://www.cancer.net/cancer-types/carney-complex'

In [8]:
def _extract_question_answers(soup):
    """Pair every <h3> heading in ``soup`` with the text of the siblings after it.

    For each ``<h3>``, the stripped text of each following sibling element is
    collected until the next sibling ``<h3>`` (or the end of the siblings).
    Returns a dict with parallel lists under ``'question'`` and ``'answer'``.
    """
    qa = {'question': [], 'answer': []}
    for heading in soup.find_all('h3'):
        fragments = []
        sibling = heading.find_next_sibling()
        while sibling is not None and sibling.name != 'h3':
            fragments.append(sibling.text.strip())
            sibling = sibling.find_next_sibling()

        qa['question'].append(heading.text.strip())
        # Every fragment is followed by one space (including the last one),
        # which keeps the stored text identical to the previous output format.
        qa['answer'].append("".join(fragment + " " for fragment in fragments))
    return qa


def get_cancer_details(cancer_url, name, output_dir='data/types-of-cancer', timeout=30):
    """Scrape heading/answer pairs from a cancer page and save them as CSV.

    Parameters
    ----------
    cancer_url : str
        Absolute URL of the page to scrape.
    name : str
        Base name (without extension) of the CSV file to write.
    output_dir : str, optional
        Directory for the CSV file; created if missing
        (default ``'data/types-of-cancer'``).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        ``{'question': [...], 'answer': [...]}`` with one entry per ``<h3>``
        heading found in the page. Both lists always have the same length.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx HTTP status. Raising
        here prevents an HTTP error page from being scraped and saved.

    Notes
    -----
    ``<output_dir>/<name>.csv`` is written only when at least one heading was
    found; an existing file with that name is overwritten.
    """
    response = requests.get(cancer_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    cancer_df = _extract_question_answers(soup)

    os.makedirs(output_dir, exist_ok=True)
    if cancer_df['question']:
        data_path = os.path.join(output_dir, name + '.csv')
        pd.DataFrame(cancer_df).to_csv(data_path, index=False)

    return cancer_df

In [10]:
# Smoke test on a single 'view-all' page. Note the side effect: when headings
# are found this also writes data/types-of-cancer/test.csv.
get_cancer_details('https://www.cancer.net/cancer-types/adenoid-cystic-carcinoma/view-all', 'test')

{'question': ['About the salivary glands',
  'About adenoid cystic carcinoma',
  'How many people are diagnosed with AdCC?',
  'What is the survival rate for AdCC?',
  'How adenoid cystic carcinoma is treated',
  'What is cancer staging?',
  'TNM staging system',
  'Tumor (T)',
  'Node (N)',
  'Metastasis (M)',
  'Stage groups for AdCC',
  'How adenoid cystic carcinoma is treated',
  'Surgery',
  'External-beam radiation therapy',
  'Neutron and proton radiation therapy',
  'Physical, emotional, and social effects of cancer',
  'Metastatic adenoid cystic carcinoma',
  'Therapies using\xa0medication to treat metastatic AdCC',
  'Remission and the chance of recurrence',
  'If treatment does not work',
  'What are clinical trials?',
  'Deciding to join a clinical trial',
  'Patient safety and informed consent',
  'Finding a clinical trial',
  'Looking for More About the Latest Research?',
  'Coping with physical side effects',
  'Coping with emotional and social effects',
  'Coping with t

In [9]:
# Crawl every cancer type linked from the catalog page and save one CSV per
# type. The crawl is best-effort: a failure on one page is reported and
# skipped so the remaining pages are still processed.
try:
    cancer_catalogs = get_cancer_catalogs()
except Exception as e:
    # Without the catalog there is nothing to crawl; say so instead of
    # finishing silently with no output.
    print('Could not load the cancer catalog:', repr(e))
    cancer_catalogs = []

for cancer_catalog in cancer_catalogs:
    cancer_types = cancer_catalog.select('.view .view-content .item-list ul li')

    for cancer_type in cancer_types:
        link = cancer_type.find('a')
        href = link.get('href') if link else None
        if not href:
            # List item without a usable link: nothing to fetch.
            continue

        base_url = urljoin('https://www.cancer.net', href)
        # rstrip guards against a trailing slash producing an empty file name.
        name = base_url.rstrip('/').split('/')[-1]

        # `except Exception` (not a bare `except`) keeps Ctrl-C / kernel
        # interrupt working while still skipping pages that fail.
        try:
            cancer_details_url = get_cancer_details_url(base_url)
            cancer_details = get_cancer_details(cancer_details_url, name)
        except Exception as e:
            print('Skipping', name, base_url, repr(e))
            continue

        # Report pages that yielded no question/answer pairs (no CSV written).
        if (len(cancer_details['question']) == 0) or (len(cancer_details['answer']) == 0):
            print(name, base_url)