## Coping With Cancer Data Extraction

In [1]:
# Import Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os

In [2]:
# Request headers handed to requests.get via `headers=`; the only entry is a
# desktop-browser User-Agent string.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"
    ),
}

In [3]:
def get_sections():
    """Return the absolute URLs of the sections linked from the landing page.

    Downloads the "coping with cancer" landing page, locates the sub-pages
    field inside the first ``<article>`` and collects the ``header h3 a``
    link of every ``<article>`` nested in it.

    Returns:
        list[str]: One absolute URL per section, in page order. Empty when
        the sub-pages container cannot be found on the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    URL = 'https://www.cancer.net/coping-with-cancer'
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(URL, headers=headers, timeout=30)
    # Stop on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Walk down the expected nesting one step at a time so that a missing
    # level yields an empty result instead of an AttributeError on None.
    container = soup.find('article')
    if container is not None:
        container = container.find('div', class_='field-name-field-page-sub-pages')
    if container is not None:
        container = container.find('div', class_='field-items')
    if container is None:
        return []

    sections_url = []
    for section in container.find_all('article'):
        link = section.select_one('header h3 a')
        # Entries without a usable link are skipped rather than aborting.
        if link is None or not link.get('href'):
            continue
        # urljoin handles root-relative, relative and absolute hrefs alike.
        sections_url.append(urljoin(URL, link['href']))

    return sections_url

In [4]:
# Show the section URLs collected from the landing page.
get_sections()

['https://www.cancer.net/coping-with-cancer/managing-emotions',
 'https://www.cancer.net/coping-with-cancer/physical-emotional-and-social-effects-cancer',
 'https://www.cancer.net/coping-with-cancer/talking-with-family-and-friends',
 'https://www.cancer.net/coping-with-cancer/caring-loved-one',
 'https://www.cancer.net/coping-with-cancer/finding-social-support-and-information',
 'https://www.cancer.net/coping-with-cancer/cancer-oncologists-perspective']

In [9]:
def get_section_topics(section_url):
    """Return the absolute URLs of the topics listed on a section page.

    Downloads ``section_url``, locates the sub-pages field inside the first
    ``<article>`` and collects the first ``header h3 a`` link found in each
    direct child element of that field.

    Args:
        section_url: URL of a section (listing) page.

    Returns:
        list[str]: One absolute URL per listed topic, in page order. Empty
        when the sub-pages container cannot be found on the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(section_url, headers=headers, timeout=30)
    # Stop on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Walk down the expected nesting one step at a time so that a missing
    # level yields an empty result instead of an AttributeError on None.
    container = soup.find('article')
    if container is not None:
        container = container.find('div', class_='field-name-field-page-sub-pages')
    if container is not None:
        container = container.find('div', class_='field-items')
    if container is None:
        return []

    section_topics_url = []
    # Iterating the Tag itself would also yield whitespace/text nodes, which
    # have no .select(); restrict the walk to direct child elements.
    for section_topic in container.find_all(True, recursive=False):
        link = section_topic.select_one('header h3 a')
        # Children without a usable link are skipped rather than aborting.
        if link is None or not link.get('href'):
            continue
        # urljoin handles root-relative, relative and absolute hrefs alike.
        section_topics_url.append(urljoin('https://www.cancer.net', link['href']))

    return section_topics_url

In [10]:
# Show the topic URLs listed under the "managing-emotions" section page.
get_section_topics('https://www.cancer.net/coping-with-cancer/managing-emotions')

['https://www.cancer.net/coping-with-cancer/managing-emotions/self-image-and-cancer',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/coping-with-uncertainty',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/managing-stress',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/how-cope-with-anger',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/anxiety',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/depression',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/managing-fear-side-effects-caused-cancer-treatment',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/coping-with-guilt',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/coping-with-metastatic-cancer',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/grief-and-loss',
 'https://www.cancer.net/coping-with-cancer/managing-emotions/post-traumatic-stress-disorder-and-cancer',
 'https://www.cancer.net/coping-with-ca

In [7]:
def get_topic_content(topic_url):
    """Extract question/answer pairs from a topic page and save them as CSV.

    A page containing more than one ``<article>`` is treated as a listing:
    each topic it links to is processed recursively and the results are
    merged. Any other page is treated as a content page: every ``<h3>`` is
    a question and the text of the sibling elements that follow it, up to
    the next ``<h3>``, is its answer. Content pages with at least one
    question are written to ``data/coping-with-cancer/<last-url-segment>.csv``.

    Args:
        topic_url: URL of a topic page or of a page listing further topics.

    Returns:
        dict: ``{'question': [...], 'answer': [...]}`` with two lists of
        equal length. For a listing page this is the concatenation of the
        results of all sub-topics that could be fetched.

    Raises:
        requests.RequestException: If ``topic_url`` itself cannot be
            fetched (connection error, timeout, HTTP error status).
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(topic_url, headers=headers, timeout=30)
    # Stop on 4xx/5xx instead of scraping an error page into a CSV file.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    coping_with_cancer_df = {
        'question': [],
        'answer': []
    }

    if len(soup.find_all('article')) > 1:
        # Listing page: gather the sub-topic results so this branch returns
        # them instead of silently dropping them and returning None.
        for section_topic_url in get_section_topics(topic_url):
            try:
                sub_content = get_topic_content(section_topic_url)
            except requests.RequestException as exc:
                # One unreachable sub-topic must not abort its siblings.
                print('Skipping topic {}: {}'.format(section_topic_url, exc))
                continue
            coping_with_cancer_df['question'].extend(sub_content['question'])
            coping_with_cancer_df['answer'].extend(sub_content['answer'])
        return coping_with_cancer_df

    # A trailing slash would otherwise leave an empty name (file ".csv").
    topic_name = topic_url.rstrip('/').split('/')[-1]

    for question in soup.find_all('h3'):
        coping_with_cancer_df['question'].append(question.text.strip())

        # Collect the text of every following sibling until the next <h3>.
        answer_parts = []
        next_element = question.find_next_sibling()
        while next_element and next_element.name != 'h3':
            answer_parts.append(next_element.text.strip() + " ")
            next_element = next_element.find_next_sibling()

        coping_with_cancer_df['answer'].append("".join(answer_parts))

    dir_path = 'data/coping-with-cancer'
    os.makedirs(dir_path, exist_ok=True)
    # Pages without any <h3> produce no file; the frame is only built when
    # there is something to write.
    if coping_with_cancer_df['question']:
        data_path = os.path.join(dir_path, topic_name + '.csv')
        pd.DataFrame(coping_with_cancer_df).to_csv(data_path, index=False)

    return coping_with_cancer_df

In [22]:
# Run the extractor on the "managing-physical-side-effects" page.
get_topic_content('https://www.cancer.net/coping-with-cancer/physical-emotional-and-social-effects-cancer/managing-physical-side-effects')

In [8]:
# Crawl every section and every topic beneath it. The crawl is best-effort:
# a failing section or topic is reported and skipped so the rest still runs.
# Only `Exception` is caught, so Ctrl-C (KeyboardInterrupt) and SystemExit
# still stop the run.
try:
    sections = get_sections()
except Exception as exc:
    print('Could not load the section list: {}'.format(exc))
    sections = []

for section_url in sections:
    try:
        section_topics = get_section_topics(section_url)
    except Exception as exc:
        print('Skipping section {}: {}'.format(section_url, exc))
        continue

    for section_topic_url in section_topics:
        try:
            get_topic_content(section_topic_url)
        except Exception as exc:
            print('Skipping topic {}: {}'.format(section_topic_url, exc))