# Download Newsletters from The Talon Conspiracy
 

In [None]:
from utils.selenium_resource import SeleniumResource
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
    NoSuchElementException,
)
from urllib.parse import urlparse
from typing import List, Optional
import logging
import time
from dataclasses import dataclass
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
#  regular imports
# Add path

In [3]:
@dataclass
class IssuuItem:
    issuu_name: str
    issuu_url: str
    issuu_img_src: str


@dataclass
class TTCContent:
    ttc_content_title: str
    ttc_items: List[IssuuItem]


@dataclass
class TTCTag:
    "The Talon Conpiracy - Items by Tag"
    ttc_accessible_name: str
    ttc_tag_name: str
    ttc_tag_url: str
    ttc_tag_content: List[TTCContent]

In [None]:
from datetime import datetime
from logging.handlers import RotatingFileHandler


def setup_logging(log_level=logging.INFO) -> logging.Logger:
    """
    Configure logging to both file and console with rotation and formatting.

    Args:
        log_level: The logging level to use (default: logging.INFO)
    """
    # Create logs directory if it doesn't exist
    log_file = os.path.join(os.getcwd(), 'ttc_scrape.log')

    # Create formatter
    formatter = logging.Formatter(
        '[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Clear any existing handlers
    root_logger.handlers = []

    # Create rotating file handler (10MB max size, keep 5 backup files)
    file_handler = RotatingFileHandler(
        filename=log_file,
        maxBytes=10*1024*1024,  # 10MB
        backupCount=5,
        encoding='utf-8'
    )
    file_handler.setFormatter(formatter)
    file_handler.setLevel(log_level)

    # Create console handler
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler.setLevel(log_level)

    # Add handlers to root logger
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)

    # Log system info at startup
    logger = logging.getLogger("TTC Scrape Logger")
    logger.info('='*50)
    logger.info(f'Logging initiated at {
                datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    logger.info(f'Log file location: {log_file}')
    logger.info('='*50)
    return logger


class WebScraper:
    def __init__(self, max_retries: int = 3, timeout: int = 10):
        self.max_retries = max_retries
        self.timeout = timeout
        self.logger = setup_logging()

    def wait_and_find_element(
            self,
            driver,
            by: By,
            value: str,
            timeout: Optional[int] = None,
    ) -> Optional[any]:
        """Safely wait for and find an element with retries."""
        timeout = timeout or self.timeout
        for attempt in range(self.max_retries):
            try:
                return WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((by, value))
                )
            except StaleElementReferenceException:
                if attempt == self.max_retries - 1:
                    raise
                time.sleep(1)
            except TimeoutException:
                self.logger.warning(
                    f"Timeout waiting for element {by}={value}")
                return None

    def wait_and_find_elements(
        self,
        driver,
        by: By,
        value: str,
        timeout: Optional[int] = None,
    ) -> List[any]:
        """Safely wait for and find elements with retries."""
        timeout = timeout or self.timeout
        for attempt in range(self.max_retries):
            try:
                elements = WebDriverWait(driver, timeout).until(
                    EC.presence_of_all_elements_located((by, value))
                )
                return elements
            except StaleElementReferenceException:
                if attempt == self.max_retries - 1:
                    raise
                time.sleep(1)
            except TimeoutException:
                self.logger.warning(
                    f"Timeout waiting for elements {by}={value}")
                return []

    def get_attribute_safely(self, element, attribute: str) -> Optional[str]:
        """Safely get an attribute from an element with retries."""
        for attempt in range(self.max_retries):
            try:
                return element.get_attribute(attribute)
            except StaleElementReferenceException:
                if attempt == self.max_retries - 1:
                    return None
                time.sleep(1)


def scrape_ttc_content():
    ttc_url = "https://thetalonconspiracy.com/"
    client = SeleniumResource()
    scraper = WebScraper()
    ttc_tags = []

    try:
        client.setup_for_execution()
        driver = client.driver
        driver.get(ttc_url)

        # Main scraping loop
        while True:
            # Wait for sidebar and get links
            sidebar_tag_links = scraper.wait_and_find_elements(
                driver,
                By.XPATH,
                "//div[@id='sidebarleft']//li//a[@href]"
            )
            if not sidebar_tag_links:
                break

            # Process each tag link
            for link in sidebar_tag_links:
                ttc_accessible_name = link.accessible_name
                tag_url = scraper.get_attribute_safely(link, 'href')
                if not tag_url:
                    continue

                parsed_url = urlparse(tag_url).path
                tag_name = parsed_url.strip("/").split("/")[-1]

                # Navigate to tag page
                driver.get(tag_url)

                # Get content results
                content_results = scraper.wait_and_find_elements(
                    driver,
                    By.CLASS_NAME,
                    "results_content"
                )

                ttc_content = []
                for idx, result in enumerate(content_results):
                    try:
                        # Get the Category type. Include only periodicals.

                        category_labels = [category.text for category in result.find_elements(
                            By.CSS_SELECTOR,
                            "h3 a[rel]"
                        )]

                        if "PERIODICALS" not in category_labels:
                            continue

                        # Get title from the current result element
                        result_title = result.find_element(By.TAG_NAME, "h1")
                        if not result_title:
                            logging.warning(
                                f"No title found for result {idx + 1}")
                            continue

                        title_text = result_title.text
                        logging.info(f"Found title for result {
                                     idx + 1}: {title_text}")

                        # Get links with images
                        links_with_images = result.find_elements(
                            By.CSS_SELECTOR,
                            "a:has(img)",
                        )
                        if not links_with_images:
                            logging.warning(f"No links with images {idx + 1}")
                            continue

                        ttc_issuus = []
                        for link_img in links_with_images:
                            issuu_href = scraper.get_attribute_safely(
                                link_img, "href")
                            img_element = link_img.find_element(
                                By.TAG_NAME,
                                "img",
                            )
                            if not img_element:
                                continue

                            issuu_img_src = scraper.get_attribute_safely(
                                img_element, "src")
                            if not issuu_href or not issuu_img_src:
                                continue

                            issu_name = urlparse(issuu_img_src).path.strip(
                                "/").split("/")[-1]
                            issuu_item = IssuuItem(
                                issuu_name=issu_name,
                                issuu_url=issuu_href,
                                issuu_img_src=issuu_img_src
                            )
                            ttc_issuus.append(issuu_item)

                        ttc_content_result = TTCContent(
                            ttc_content_title=result_title.text,
                            ttc_items=ttc_issuus.copy()
                        )
                        ttc_content.append(ttc_content_result)

                    except StaleElementReferenceException:
                        continue
                    except NoSuchElementException:
                        logging.warning(f"Missing h1 tag in result {idx + 1}")
                        continue
                    except StaleElementReferenceException:
                        logging.warning(
                            f"Result {idx + 1} became stale, skipping")
                        continue

                ttc_tag = TTCTag(
                    ttc_accessible_name=ttc_accessible_name,
                    ttc_tag_name=tag_name,
                    ttc_tag_url=tag_url,
                    ttc_tag_content=ttc_content.copy()
                )
                ttc_tags.append(ttc_tag)

                # Navigate back
                driver.get(ttc_url)


            break  # Exit the main loop after processing all tags

        return ttc_tags

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        return []
    finally:
        client.teardown_after_execution()

In [9]:
scrape_ttc_content()

[2024-11-27 05:05:03] INFO [TTC Scrape Logger:50] Logging initiated at 2024-11-27 05:05:03
[2024-11-27 05:05:03] INFO [TTC Scrape Logger:52] Log file location: /app/nb/ttc_scrape.log
[2024-11-27 05:07:29] INFO [root:184] Found title for result 1: NO COMPROMISE #15-17
[2024-11-27 05:20:54] ERROR [root:253] An error occurred: Message: stale element reference: stale element not found
  (Session info: chrome=131.0.6778.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
#0 0x57b53c77f31a <unknown>
#1 0x57b53c2956e0 <unknown>
#2 0x57b53c2a3881 <unknown>
#3 0x57b53c2ea936 <unknown>
#4 0x57b53c2df252 <unknown>
#5 0x57b53c308462 <unknown>
#6 0x57b53c2d8a18 <unknown>
#7 0x57b53c30862e <unknown>
#8 0x57b53c326ed7 <unknown>
#9 0x57b53c308203 <unknown>
#10 0x57b53c2d6cc0 <unknown>
#11 0x57b53c2d7c9e <unknown>
#12 0x57b53c74cd0b <unknown>
#13 0x57b53c750c92 <unknown>
#14 0x57b53c739

[]