In [None]:
#######  References  ##########
# https://medium.com/codex/web-scraping-paginated-webpages-with-python-selenium-and-beautifulsoup4-8b415f833132
# YouTube and GitHub (John Watson Rooney) - https://www.youtube.com/c/JohnWatsonRooney ; https://github.com/jhnwr

In [None]:
import os
import re
from html import escape
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

In [None]:
# Browser session shared by every scraping function in this notebook (they all use `driver`).
# Chrome is started in an incognito window that opens maximized.
options = Options()
options.add_argument('--incognito')
options.add_argument('start-maximized')

# Service object docs: https://www.selenium.dev/documentation/webdriver/drivers/service/
service = webdriver.ChromeService()

driver = webdriver.Chrome(service=service, options=options)

In [None]:
# Base URL for the blog. Listing page 1 is fetched from this URL itself;
# later listing pages are fetched from f"{BASE_URL}/p<N>" (see scrape_paginated_website).
BASE_URL = 'https://xxx.ca/blog'

In [None]:
# Function to scrape a single blog page
def scrape_blog_page(blog_url):
    """Load one blog post in the shared Selenium ``driver`` and extract its parts.

    Args:
        blog_url: Absolute URL of the blog post. It is also used as the base
            for resolving relative image and link URLs found on the page.

    Returns:
        A dict with the keys:
            "url": ``blog_url``, unchanged.
            "title": text of the ``<h3>`` inside the first ``div.section-title``,
                or ``"Untitled"`` when either element is missing.
            "image_url": absolute URL of the first ``<img>`` inside the first
                ``div.col-lg-7``, or ``None`` when there is no usable image.
            "content_html": serialized post body (the ``section-text my-5`` div
                inside the first ``div.col-lg-12``) with the social-share block
                removed and presentational attributes stripped, or ``""`` when
                the body is not found.
    """
    driver.get(blog_url)
    soup = bs(driver.page_source, features="html.parser")

    # Extract page title
    title_box = soup.find("div", class_="section-title")
    heading = title_box.find("h3") if title_box else None
    page_title = heading.get_text(strip=True) if heading else "Untitled"

    # Extract image URL
    image_box = soup.find("div", class_="col-lg-7")
    image = image_box.find("img") if image_box else None
    # .get() instead of ["src"]: an <img> without a src attribute must not raise KeyError.
    image_src = image.get("src") if image else None
    # urljoin leaves absolute URLs untouched and resolves root-relative,
    # path-relative and protocol-relative ones against the post URL.
    image_url = urljoin(blog_url, image_src) if image_src else None

    # Extract blog content
    content_box = soup.find("div", class_="col-lg-12")
    blog_content = (
        content_box.find("div", class_="section-text my-5") if content_box else None
    )

    if blog_content is None:
        content_html = ""
    else:
        # Remove the social share div and everything inside it
        social_share = blog_content.find("div", class_="social-share")
        if social_share:
            social_share.decompose()

        # Strip presentational attributes (class, style, id, ...) from every
        # descendant tag, but keep the ones that carry content: blanking href/src
        # would leave dead links and broken images in the saved copy.
        kept_attrs = ("href", "src", "alt")
        for tag in blog_content.find_all(True):
            tag.attrs = {
                name: value for name, value in tag.attrs.items() if name in kept_attrs
            }
            # The file is saved away from the site, so relative targets must be absolute.
            for name in ("href", "src"):
                if name in tag.attrs:
                    tag.attrs[name] = urljoin(blog_url, tag.attrs[name])
        content_html = str(blog_content)

    return {
        "url": blog_url,
        "title": page_title,
        "image_url": image_url,
        "content_html": content_html,
    }

In [None]:
# Function to scrape blog URLs from a single page
def scrape_page_for_links(page_url):
    """Return the absolute blog-post URLs linked from one listing page.

    The page is loaded in the shared Selenium ``driver``; the first
    ``<a href=...>`` inside each ``div.col-md-6`` is taken as a blog link.

    Args:
        page_url: URL of the listing page. Also used as the base for
            resolving relative hrefs.

    Returns:
        A list of absolute URLs in page order, without duplicates.
    """
    driver.get(page_url)
    soup = bs(driver.page_source, features="html.parser")

    links = []
    for box in soup.find_all("div", class_="col-md-6"):
        anchor = box.find("a", href=True)
        if anchor is None:
            # A column without a link: indexing the None result would raise TypeError.
            continue
        # urljoin keeps absolute hrefs as they are and resolves relative ones.
        links.append(urljoin(page_url, anchor["href"]))

    # Drop repeats while preserving first-seen order, so no post is scraped
    # (and its output file rewritten) more than once per page.
    return list(dict.fromkeys(links))

In [None]:
def _blog_filename(blog_url):
    """Derive a filesystem-safe ``<slug>.html`` file name from a blog URL."""
    # Use only the URL path (no ?query / #fragment) and ignore a trailing slash,
    # otherwise the last segment can be empty or contain characters such as '?'.
    slug = urlparse(blog_url).path.rstrip("/").rsplit("/", 1)[-1]
    # Replace characters that are not allowed in file names on common filesystems.
    slug = re.sub(r'[<>:"/\\|?*\x00-\x1f]+', "_", slug)
    return f"{slug or 'index'}.html"


def _render_blog_html(blog_data):
    """Build the standalone HTML document saved for one scraped blog post."""
    # Title and URLs come from scraped pages: escape them so quotes or markup
    # in them cannot break out of the attribute or heading they are placed in.
    link = escape(blog_data["url"])
    title = escape(blog_data["title"])
    parts = [
        # Declare the encoding the file is written in so browsers decode it correctly.
        "<html><head><meta charset='utf-8'></head><body>",
        # Make the H1 tag a hyperlink to the blog URL
        f"<h1><a href='{link}'>{title}</a></h1>",
    ]
    if blog_data["image_url"]:
        parts.append(f"<img src='{escape(blog_data['image_url'])}' alt='Blog Image'>")
    # content_html is already HTML markup and is written as-is.
    parts.append(blog_data["content_html"])
    parts.append("</body></html>")
    return "".join(parts)


# Function to scrape all paginated pages
def scrape_paginated_website(start_page, end_page):
    """Scrape every blog post on listing pages ``start_page``..``end_page`` (inclusive).

    For each listing page a ``BlogPage<N>`` folder is created in the current
    working directory, and each post linked from that page is saved there as
    ``<slug>.html``.

    Args:
        start_page: First listing page number. Page 1 is ``BASE_URL`` itself;
            page N > 1 is ``f"{BASE_URL}/p{N}"``.
        end_page: Last listing page number (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    for page_number in range(start_page, end_page + 1):
        # Construct page URL
        page_url = BASE_URL if page_number == 1 else f"{BASE_URL}/p{page_number}"
        print(f"Scraping page: {page_url}")

        # Create subfolder for the page
        folder_name = f"BlogPage{page_number}"
        os.makedirs(folder_name, exist_ok=True)

        # Visit every blog linked from the current listing page
        for blog_url in scrape_page_for_links(page_url):
            print(f"Scraping blog: {blog_url}")
            blog_data = scrape_blog_page(blog_url)

            # Save blog content to an HTML file in the subfolder
            blog_path = os.path.join(folder_name, _blog_filename(blog_url))
            with open(blog_path, "w", encoding="utf-8") as file:
                file.write(_render_blog_html(blog_data))
    print("All blogs scraped and saved.")

In [None]:
# Main script: crawl the configured range of listing pages, then shut the browser down.
if __name__ == "__main__":
    # Inclusive listing-page range; adjust END_PAGE based on the total number of pages.
    START_PAGE, END_PAGE = 1, 6
    try:
        scrape_paginated_website(START_PAGE, END_PAGE)
        print("Scraping completed! Blog HTML files saved in respective subfolders.")
    finally:
        # Runs whether scraping succeeded or raised, so the driver is always closed.
        driver.quit()