## Cancer Types Data Extraction

In [1]:
# Import Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re

In [2]:
# Move the working directory up one level. The relative 'artifacts/data/...'
# output paths and the `webScraping` imports used later are resolved against
# the current directory (presumably the project root -- confirm).
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir('../')
# IPython magic: display the current working directory.
%pwd

'/home/utpal108/dev/Python/Projects/cancer.net-web-scraping'

In [3]:
from webScraping.constants import *
from webScraping.utils import getSectionUrl, getFullUrl

In [12]:
# Components
class DataExtraction:
    """Scrape question/answer content from cancer pages and save it as CSV.

    Every ``<h3>`` heading on a page is treated as a question, and the text
    of the sibling elements that follow it (up to the next ``<h3>``) as its
    answer. Output files are written below ``artifacts/data/<dir_name>``.
    """

    # Seconds to wait for the server before giving up on a request. Without a
    # timeout ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, dir_name):
        """Create the output directory and load the heading skip lists.

        Args:
            dir_name: Sub-directory of ``artifacts/data`` that receives the
                generated CSV files; it is created when missing.
        """
        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
        self.dir_path = os.path.join('artifacts', 'data', dir_name)
        # Both skip lists are module-level names that must already be in scope
        # (presumably provided by ``webScraping.constants`` -- confirm).
        self.skip_contents = skip_contents
        self.partially_skip_contents = partially_skip_contents
        os.makedirs(self.dir_path, exist_ok=True)

    def _fetch_soup(self, url):
        """Download *url* with the instance headers and return the parsed HTML."""
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return BeautifulSoup(response.content, 'html.parser')

    def _build_heading_filter(self):
        """Return a predicate deciding whether a heading text is a question.

        A heading is rejected when it is blank, when it equals an entry of
        ``self.skip_contents`` (case-insensitive), or when it contains an
        entry of ``self.partially_skip_contents`` between word boundaries
        (case-insensitive).
        """
        exact_skips = {content.lower() for content in self.skip_contents}
        # Escape every phrase so characters such as "." or "(" are matched
        # literally instead of being interpreted as regex syntax.
        partial_terms = [
            re.escape(content.lower())
            for content in self.partially_skip_contents
            if content
        ]
        # Compiled once per page instead of once per heading.
        partial_pattern = (
            re.compile(r"\b(?:" + "|".join(partial_terms) + r")\b")
            if partial_terms
            else None
        )

        def keep(heading_text):
            text = heading_text.strip()
            if not text:
                return False
            lowered = text.lower()
            if lowered in exact_skips:
                return False
            return partial_pattern is None or partial_pattern.search(lowered) is None

        return keep

    def get_cancer_catalogs(self, URL):
        """Return the tab-page elements that list the cancer types.

        Args:
            URL: Address of the cancer types overview page.

        Returns:
            The elements matching ``div .quicktabs-tabpage`` inside the
            ``quicktabs-container-cancer_types`` container, or an empty list
            when the page has no such container.
        """
        soup = self._fetch_soup(URL)
        container = soup.find('div', id='quicktabs-container-cancer_types')
        if container is None:
            # Unexpected page layout or an error page: nothing to iterate.
            return []
        return container.select('div .quicktabs-tabpage')

    def get_cancer_details_url(self, cancer_url):
        """Return the URL holding the full content of a cancer type.

        Args:
            cancer_url: Landing page of a single cancer type.

        Returns:
            The target of the page's "View All Pages" link, expanded with
            ``getFullUrl``, or ``cancer_url`` itself when the page has no
            such link.
        """
        soup = self._fetch_soup(cancer_url)
        view_all_link = soup.find('a', string='View All Pages')
        href = view_all_link.get('href') if view_all_link is not None else None
        return getFullUrl(href) if href else cancer_url

    def get_cancer_details(self, cancer_url, cancer_name):
        """Extract question/answer pairs from a page and store them as CSV.

        The file ``<cancer_name>.csv`` (columns ``question`` and ``answer``)
        is written to ``self.dir_path``. No file is written when the page
        yields no question.

        Args:
            cancer_url: Page to scrape.
            cancer_name: Base name (without extension) of the CSV file.
        """
        soup = self._fetch_soup(cancer_url)
        keep_heading = self._build_heading_filter()

        records = {'question': [], 'answer': []}
        for heading in soup.find_all('h3'):
            if not keep_heading(heading.text):
                continue

            # The answer is the text of every sibling element up to the next
            # <h3>, whether or not that next heading is itself kept.
            fragments = []
            sibling = heading.find_next_sibling()
            while sibling is not None and sibling.name != 'h3':
                fragments.append(sibling.text.strip())
                sibling = sibling.find_next_sibling()

            records['question'].append(heading.text.strip())
            # Each fragment is followed by a single space, as before.
            records['answer'].append("".join(fragment + " " for fragment in fragments))

        if records['question']:
            data_path = os.path.join(self.dir_path, cancer_name + '.csv')
            pd.DataFrame(records).to_csv(data_path, index=False)
              

In [13]:
# Crawl every cancer type listed on the overview page and write one CSV per
# type. Errors are not handled here: any failure stops the whole run.
data_extraction = DataExtraction('cancer-types')
cancer_catalogs = data_extraction.get_cancer_catalogs('https://www.cancer.net/cancer-types')
for cancer_catalog in cancer_catalogs:
    cancer_types = cancer_catalog.select('.view .view-content .item-list ul li')
    for cancer_type in cancer_types:
        link = cancer_type.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            # List item without a usable link: nothing to scrape.
            continue
        base_url = getFullUrl(href)
        # Strip a trailing slash first so the last path segment is never
        # empty (which would produce a file named just ".csv").
        cancer_name = base_url.rstrip('/').split('/')[-1]
        cancer_details_url = data_extraction.get_cancer_details_url(base_url)
        cancer_details = data_extraction.get_cancer_details(cancer_details_url, cancer_name)

In [5]:
# Pipeline
# Best-effort crawl: a failing cancer type or catalog tab is logged and
# skipped so the remaining pages are still processed.
# The import is local so this cell does not depend on the import cell.
import logging

logger = logging.getLogger(__name__)

try:
    data_extraction = DataExtraction('cancer-types')
    cancer_catalogs = data_extraction.get_cancer_catalogs('https://www.cancer.net/cancer-types')
    for cancer_catalog in cancer_catalogs:
        # `except Exception` (not a bare `except`) so that Ctrl-C /
        # KeyboardInterrupt can still stop a long-running crawl.
        try:
            cancer_types = cancer_catalog.select('.view .view-content .item-list ul li')
        except Exception:
            logger.exception("Skipping a catalog tab whose entries could not be listed")
            continue

        for cancer_type in cancer_types:
            base_url = None  # kept for the log message if the lookup itself fails
            try:
                base_url = getFullUrl(cancer_type.find('a').get('href'))
                # Strip a trailing slash first so the last path segment is
                # never empty (which would produce a file named ".csv").
                cancer_name = base_url.rstrip('/').split('/')[-1]
                cancer_details_url = data_extraction.get_cancer_details_url(base_url)
                cancer_details = data_extraction.get_cancer_details(cancer_details_url, cancer_name)
            except Exception:
                logger.exception("Skipping cancer type that could not be extracted: %s", base_url)
                continue

except Exception:
    # Setup or catalog download failed; report it instead of hiding it.
    logger.exception("Cancer types extraction aborted")