## Blog Data Extraction

In [2]:
# Import Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re

In [3]:
os.chdir('../')
%pwd

'/home/utpal108/dev/Python/Projects/cancer.net-web-scraping'

In [4]:
from webScraping.constants import *
from webScraping.utils import getSectionUrl, getFullUrl

In [5]:
# Components
class DataExtraction:

    def __init__(self, dir_name):
        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
        self.dir_path = os.path.join('artifacts','data',dir_name)
        self.skip_contents = skip_contents
        self.partially_skip_contents = partially_skip_contents
        os.makedirs(self.dir_path, exist_ok=True)
        
    
    def get_total_pages(self, base_url='https://www.cancer.net/blog', single_page_url='https://www.cancer.net/blog?page='):
        response = requests.get(base_url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        total_pages = []
        pages = soup.find('li', {'class': 'pager-last'})
        if pages is not None:
            total_pages = int(pages.a['href'].split('=')[-1])
            total_pages = [(single_page_url + str(page_no)) for page_no in range(1, (total_pages+1))]

        return total_pages

    def get_blog_posts(self, page_url):
        response = requests.get(page_url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        all_posts = []
        posts = soup.find('div',{'class' : 'view-content'})
        if posts is not None:
            posts = posts.find_all('div', {'class' : 'views-row'})
            for post in posts:
                blog_post_url = post.find('div',{'class' : 'views-field-title'}).a['href']
                all_posts.append(getFullUrl(blog_post_url))
        return all_posts


    def get_topic_content(self, topic_url):

        response = requests.get(topic_url, headers = self.headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        df = {
            'question':[],
            'answer':[]
        }

        articles = soup.find_all('article')
        
        if len(articles) > 1:
            section_topics = self.get_section_topics(topic_url)
            for section_topic_url in section_topics:
                self.get_topic_content(section_topic_url)
        
        topic_name = topic_url.split('/')[-1]
        questions = soup.find_all('h3')
        skip_contents = [content.lower() for content in self.skip_contents]
        partially_skip_contents = [content.lower() for content in self.partially_skip_contents]
        questions = [question for question in questions if question.text.strip() != '' and question.text.strip().lower() not in skip_contents and not any(re.findall(r"\b(" + "|".join(partially_skip_contents) + r")\b", question.text.strip().lower()))]
        
        for question in questions:
            df['question'].append(question.text.strip())
            
            answer = ""
            next_element = question.find_next_sibling()
            while next_element and next_element.name != 'h3':
                answer += next_element.text.strip() + " "
                next_element = next_element.find_next_sibling()
            
            df['answer'].append(answer)

        
        if len(df['question']) > 0 and len(df['answer']) > 0:
            data_path = os.path.join(self.dir_path,topic_name+'.csv')
            df = pd.DataFrame(df)
            df.to_csv(data_path, index=False)

        

In [6]:
data_extraction = DataExtraction('blog')
total_pages = data_extraction.get_total_pages('https://www.cancer.net/blog')
print(total_pages)
# type(total_pages)
# data_extraction.get_page_contents(total_pages)

['https://www.cancer.net/blog?page=1', 'https://www.cancer.net/blog?page=2', 'https://www.cancer.net/blog?page=3', 'https://www.cancer.net/blog?page=4', 'https://www.cancer.net/blog?page=5', 'https://www.cancer.net/blog?page=6', 'https://www.cancer.net/blog?page=7', 'https://www.cancer.net/blog?page=8', 'https://www.cancer.net/blog?page=9', 'https://www.cancer.net/blog?page=10', 'https://www.cancer.net/blog?page=11', 'https://www.cancer.net/blog?page=12', 'https://www.cancer.net/blog?page=13', 'https://www.cancer.net/blog?page=14']


In [7]:
posts = data_extraction.get_blog_posts('https://www.cancer.net/blog?page=1')
posts

['https://www.cancer.net/blog/2024-02/3-steps-making-right-decisions-you-during-cancer-experts-perspective',
 'https://www.cancer.net/blog/2024-02/10-things-helped-me-cope-with-my-2-cancer-diagnoses-survivors-story',
 'https://www.cancer.net/blog/2024-02/2024-february-plenary',
 'https://www.cancer.net/blog/2024-02/4-steps-we-can-all-take-world-cancer-day-reduce-global-cancer-burden',
 'https://www.cancer.net/blog/2024-02/can-being-exposed-wildfire-smoke-affect-your-cancer-risk']

In [9]:
data_extraction.get_topic_content('https://www.cancer.net/blog/2024-02/balancing-hope-with-realism-setting-expectations-when-starting-new-cancer-treatment')

In [10]:
data_extraction = DataExtraction('blog')
pages = data_extraction.get_total_pages(base_url='https://www.cancer.net/blog', single_page_url='https://www.cancer.net/blog?page=')
print(pages)
for page in pages:
    blog_posts = data_extraction.get_blog_posts(page)
    for blog_post_url in blog_posts:
        print(blog_post_url)
        data_extraction.get_topic_content(blog_post_url)

['https://www.cancer.net/blog?page=1', 'https://www.cancer.net/blog?page=2', 'https://www.cancer.net/blog?page=3', 'https://www.cancer.net/blog?page=4', 'https://www.cancer.net/blog?page=5', 'https://www.cancer.net/blog?page=6', 'https://www.cancer.net/blog?page=7', 'https://www.cancer.net/blog?page=8', 'https://www.cancer.net/blog?page=9', 'https://www.cancer.net/blog?page=10', 'https://www.cancer.net/blog?page=11', 'https://www.cancer.net/blog?page=12', 'https://www.cancer.net/blog?page=13', 'https://www.cancer.net/blog?page=14']
https://www.cancer.net/blog/2024-02/3-steps-making-right-decisions-you-during-cancer-experts-perspective
https://www.cancer.net/blog/2024-02/10-things-helped-me-cope-with-my-2-cancer-diagnoses-survivors-story
https://www.cancer.net/blog/2024-02/2024-february-plenary
https://www.cancer.net/blog/2024-02/4-steps-we-can-all-take-world-cancer-day-reduce-global-cancer-burden
https://www.cancer.net/blog/2024-02/can-being-exposed-wildfire-smoke-affect-your-cancer-ri