# Download Periodicals from The Talon Conspiracy
 
 This is a inteactive development notebook for building utilities to scrape periodicals from The Talon Conspiracy. 

Include Selenium as the core scraping utility.

In [18]:
# Add path
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [19]:
#  regular imports
from utils.selenium_resource import SeleniumResource
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
    NoSuchElementException,
)
from urllib.parse import urlparse
from typing import List, Optional
import logging
import time
import pickle
from dataclasses import dataclass

Dataclasses to store metadata and important dowload link targets.

In [None]:
@dataclass
class IssuuItem:
    issuu_name: str
    issuu_url: str
    issuu_img_src: str


@dataclass
class TTCContent:
    ttc_content_title: str
    ttc_items: List[IssuuItem]


@dataclass
class TTCTag:
    "The Talon Conpiracy - Items by Tag"
    ttc_accessible_name: str
    ttc_tag_name: str
    ttc_tag_url: str
    # content is added after tag scrape
    ttc_tag_content: Optional[List[TTCContent]] = None

In [33]:

from datetime import datetime
from logging.handlers import RotatingFileHandler


def setup_logging(log_level=logging.INFO) -> logging.Logger:
    """
    Configure logging to both file and console with rotation and formatting.

    Args:
        log_level: The logging level to use (default: logging.INFO)
    """
    # Create logs directory if it doesn't exist
    log_file = os.path.join(os.getcwd(), 'ttc_scrape.log')

    # Create formatter
    formatter = logging.Formatter(
        '[%(asctime)s] %(levelname)s [%(name)s:%(lineno)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Clear any existing handlers
    root_logger.handlers = []

    # Create rotating file handler (10MB max size, keep 5 backup files)
    file_handler = RotatingFileHandler(
        filename=log_file,
        maxBytes=10*1024*1024,  # 10MB
        backupCount=5,
        encoding='utf-8'
    )
    file_handler.setFormatter(formatter)
    file_handler.setLevel(log_level)

    # Create console handler
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler.setLevel(log_level)

    # Add handlers to root logger
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)

    # Log system info at startup
    logger = logging.getLogger("TTC Scrape Logger")
    logger.info('='*50)
    logger.info(f'Logging initiated at {
                datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    logger.info(f'Log file location: {log_file}')
    logger.info('='*50)
    return logger


class WebScraper:
    def __init__(self, max_retries: int = 3, timeout: int = 10):
        self.max_retries = max_retries
        self.timeout = timeout
        self.logger = setup_logging()

    def wait_and_find_element(
            self,
            driver,
            by: By,
            value: str,
            timeout: Optional[int] = None,
    ) -> Optional[any]:
        """Safely wait for and find an element with retries."""
        timeout = timeout or self.timeout
        for attempt in range(self.max_retries):
            try:
                return WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((by, value))
                )
            except StaleElementReferenceException:
                if attempt == self.max_retries - 1:
                    raise
                time.sleep(1)
            except TimeoutException:
                self.logger.warning(
                    f"Timeout waiting for element {by}={value}")
                return None

    def wait_and_find_elements(
        self,
        driver,
        by: By,
        value: str,
        timeout: Optional[int] = None,
    ) -> List[any]:
        """Safely wait for and find elements with retries."""
        timeout = timeout or self.timeout
        for attempt in range(self.max_retries):
            try:
                elements = WebDriverWait(driver, timeout).until(
                    EC.presence_of_all_elements_located((by, value))
                )
                return elements
            except StaleElementReferenceException:
                if attempt == self.max_retries - 1:
                    raise
                time.sleep(1)
            except TimeoutException:
                self.logger.warning(
                    f"Timeout waiting for elements {by}={value}")
                return []

    def get_attribute_safely(self, element, attribute: str) -> Optional[str]:
        """Safely get an attribute from an element with retries."""
        for attempt in range(self.max_retries):
            try:
                return element.get_attribute(attribute)
            except StaleElementReferenceException:
                if attempt == self.max_retries - 1:
                    return None
                time.sleep(1)

    def click_download_button(
        self,
        driver,
        timeout: Optional[int] = None,
        wait_time: int = 5
    ) -> bool:
        """
        Click the download button with data-tooltip="Download" and wait for download to complete.

        Args:
            driver: Selenium WebDriver instance
            timeout: Optional timeout override (uses instance timeout if not specified)
            wait_time: Time to wait after clicking for download to complete (default: 5 seconds)

        Returns:
            bool: True if download button was found and clicked successfully, False otherwise
        """
        try:
            # Find download button using data-tooltip attribute
            download_button = self.wait_and_find_element(
                driver,
                By.CSS_SELECTOR,
                '[data-tooltip="Download"]',
                timeout
            )

            if not download_button:
                self.logger.warning("Download button not found")
                return False

            # Check if button is clickable
            if not download_button.is_enabled():
                self.logger.warning("Download button is not enabled")
                return False

            # Scroll button into view to ensure it's clickable
            driver.execute_script(
                "arguments[0].scrollIntoView(true);", download_button)
            time.sleep(1)  # Brief pause after scrolling

            # Click the download button
            download_button.click()
            self.logger.info("Download button clicked successfully")

            # Wait for specified time to allow download to complete
            time.sleep(wait_time)

            return True

        except Exception as e:
            self.logger.error(f"Error during download: {str(e)}")
            return False

In [None]:
def scrape_ttc_tags(
        ttc_url="https://thetalonconspiracy.com/",
        client=SeleniumResource(),
        scraper=WebScraper(),
        ttc_tags=[],
) -> List[TTCTag]:

    try:
        client.setup_for_execution()
        driver = client.driver
        driver.get(ttc_url)

        # Wait for sidebar and get links
        sidebar_tag_links = scraper.wait_and_find_elements(
            driver,
            By.XPATH,
            "//div[@id='sidebarleft']//li//a[@href]"
        )

        # Main scraping loop
        ttc_tags = []
        while True:

            # Process each tag link
            for link in sidebar_tag_links:
                ttc_accessible_name = link.accessible_name
                ttc_tag_url = scraper.get_attribute_safely(link, 'href')
                if not ttc_tag_url:
                    continue

                parsed_url = urlparse(ttc_tag_url).path
                ttc_tag_name = parsed_url.strip("/").split("/")[-1]
                ttc_tags.append(
                    TTCTag(
                        ttc_accessible_name=ttc_accessible_name,
                        ttc_tag_name=ttc_tag_name,
                        ttc_tag_url=ttc_tag_url,
                        ttc_tag_content=None,  # added later
                    )
                )
            break  # Exit the main loop after processing all tags
        scraper.logger.info("Scraping TTC Tags complete.")
        return ttc_tags

    except Exception as e:
        scraper.logger.error(f"An error occurred: {str(e)}")
        return []
    finally:
        client.teardown_after_execution()

#####################################
ttc_tags = scrape_ttc_tags()

[2024-11-27 06:56:56] INFO [TTC Scrape Logger:162] Scraping TTC Tags complete.


In [None]:
def scrape_ttc_content_by_tag(
    ttc_tag: TTCTag,
    client=SeleniumResource(),
    scraper=WebScraper(),
) -> TTCTag:
    try:
        scraper.logger.info(
            f"Scraping content from tag: {
                ttc_tag.ttc_accessible_name!r}")

        client.setup_for_execution()
        driver = client.driver
        # Navigate to tag page

        driver.get(ttc_tag.ttc_tag_url)

        # Get content results
        content_results = scraper.wait_and_find_elements(
            driver,
            By.CLASS_NAME,
            "results_content"
        )

        ttc_content = []
        for result in content_results:
            try:
                # Get the Category type. Include only periodicals.

                category_labels = [category.text for category in result.find_elements(
                    By.CSS_SELECTOR,
                    "h3 a[rel]"
                )]

                if "PERIODICALS" not in category_labels:
                    continue

                # Get title from the current result element
                result_title = result.find_element(By.TAG_NAME, "h1")
                if not result_title:
                    scraper.logger.warning(
                        "No title found for content result")
                    continue

                title_text = result_title.text
                scraper.logger.info(f"Found title for result {title_text}")

                # Get links with images
                links_with_images = result.find_elements(
                    By.CSS_SELECTOR,
                    "a:has(img)",
                )
                if not links_with_images:
                    scraper.logger.warning("No links with images")
                    continue

                ttc_issuus = []
                for link_img in links_with_images:
                    issuu_href = scraper.get_attribute_safely(
                        link_img, "href")
                    img_element = link_img.find_element(
                        By.TAG_NAME,
                        "img",
                    )
                    if not img_element:
                        continue

                    issuu_img_src = scraper.get_attribute_safely(
                        img_element, "src")
                    if not issuu_href or not issuu_img_src:
                        continue

                    issu_name = urlparse(issuu_img_src).path.strip(
                        "/").split("/")[-1]
                    issuu_item = IssuuItem(
                        issuu_name=issu_name,
                        issuu_url=issuu_href,
                        issuu_img_src=issuu_img_src
                    )
                    ttc_issuus.append(issuu_item)

                ttc_content_result = TTCContent(
                    ttc_content_title=result_title.text,
                    ttc_items=ttc_issuus.copy()
                )
                ttc_content.append(ttc_content_result)

            except StaleElementReferenceException:
                continue
            except NoSuchElementException:
                logging.warning("Missing h1 tag in result.")
                continue
            except StaleElementReferenceException:
                logging.warning(
                    "Result became stale, skipping")
                continue
            ttc_tag.ttc_tag_content = ttc_content
            return ttc_tag
    except Exception as e:
        scraper.logger.error(f"An error occurred: {str(e)}")
        return None
    finally:
        client.teardown_after_execution()


############################################
ttc_content = [scrape_ttc_content_by_tag(tag) for tag in ttc_tags]

In [34]:
def scrape_ttc_by_periodical_content(
    url: str,
    client=SeleniumResource(),
    scraper=WebScraper(),
) -> TTCContent:
    try:

        client.setup_for_execution()
        driver = client.driver
        # Navigate to tag page

        driver.get(url)

        # Get content results
        content_results = scraper.wait_and_find_elements(
            driver,
            By.CLASS_NAME,
            "results_content"
        )

        ttc_content = []
        for result in content_results:
            try:
                # Get the Category type. Include only periodicals.

                category_labels = [category.text for category in result.find_elements(
                    By.CSS_SELECTOR,
                    "h3 a[rel]"
                )]

                if "PERIODICALS" not in category_labels:
                    continue

                # Get title from the current result element
                result_title = result.find_element(By.TAG_NAME, "h1")
                if not result_title:
                    scraper.logger.warning(
                        "No title found for content result")
                    continue

                title_text = result_title.text
                scraper.logger.info(f"Found title for result {title_text}")

                # Get links with images
                links_with_images = result.find_elements(
                    By.CSS_SELECTOR,
                    "a:has(img)",
                )
                if not links_with_images:
                    scraper.logger.warning("No links with images")
                    continue

                ttc_issuus = []
                for link_img in links_with_images:
                    issuu_href = scraper.get_attribute_safely(
                        link_img, "href")
                    img_element = link_img.find_element(
                        By.TAG_NAME,
                        "img",
                    )
                    if not img_element:
                        continue

                    issuu_img_src = scraper.get_attribute_safely(
                        img_element, "src")
                    if not issuu_href or not issuu_img_src:
                        continue

                    issu_name = urlparse(issuu_img_src).path.strip(
                        "/").split("/")[-1]
                    issuu_item = IssuuItem(
                        issuu_name=issu_name,
                        issuu_url=issuu_href,
                        issuu_img_src=issuu_img_src
                    )
                    ttc_issuus.append(issuu_item)

                ttc_content_result = TTCContent(
                    ttc_content_title=result_title.text,
                    ttc_items=ttc_issuus.copy()
                )
                ttc_content.append(ttc_content_result)

            except StaleElementReferenceException:
                continue
            except NoSuchElementException:
                logging.warning("Missing h1 tag in result.")
                continue
            except StaleElementReferenceException:
                logging.warning(
                    "Result became stale, skipping")
                continue
        return ttc_content
    except Exception as e:
        scraper.logger.error(f"An error occurred: {str(e)}")
        return None
    finally:
        client.teardown_after_execution()

[2024-12-03 18:27:28] INFO [TTC Scrape Logger:50] Logging initiated at 2024-12-03 18:27:28
[2024-12-03 18:27:28] INFO [TTC Scrape Logger:52] Log file location: /app/nb/ttc_scrape.log


In [28]:
periodical_urls = [f"https://thetalonconspiracy.com/category/periodicals/page/{
    page_num}/"for page_num in range(1, 11)]
periodical_urls

['https://thetalonconspiracy.com/category/periodicals/page/1/',
 'https://thetalonconspiracy.com/category/periodicals/page/2/',
 'https://thetalonconspiracy.com/category/periodicals/page/3/',
 'https://thetalonconspiracy.com/category/periodicals/page/4/',
 'https://thetalonconspiracy.com/category/periodicals/page/5/',
 'https://thetalonconspiracy.com/category/periodicals/page/6/',
 'https://thetalonconspiracy.com/category/periodicals/page/7/',
 'https://thetalonconspiracy.com/category/periodicals/page/8/',
 'https://thetalonconspiracy.com/category/periodicals/page/9/',
 'https://thetalonconspiracy.com/category/periodicals/page/10/']

In [29]:
periodical_content = [scrape_ttc_by_periodical_content(
    url) for url in periodical_urls]

[2024-12-03 18:03:26] INFO [TTC Scrape Logger:42] Found title for result FRONTLINE NEWS – THE COMPLETE SET!
[2024-12-03 18:03:26] INFO [TTC Scrape Logger:42] Found title for result HISTORY AS IT WAS MADE: A TROVE OF ANTI-HUNTINGDON LIFE SCIENCES NEWSLETTERS.
[2024-12-03 18:03:27] INFO [TTC Scrape Logger:42] Found title for result ANIMOSITY #3
[2024-12-03 18:03:27] INFO [TTC Scrape Logger:42] Found title for result BITE BACK (U.K.) #1&2
[2024-12-03 18:03:27] INFO [TTC Scrape Logger:42] Found title for result N.A.L.L. NEWSLETTERS – SUMMER AND SPRING 1985
[2024-12-03 18:03:27] INFO [TTC Scrape Logger:42] Found title for result TURNING POINT #7-8
[2024-12-03 18:03:28] INFO [TTC Scrape Logger:42] Found title for result EARTH FIRST! JOURNAL 1987
[2024-12-03 18:03:28] INFO [TTC Scrape Logger:42] Found title for result WILD ROCKIES REVIEW VOL. 2 #1-2
[2024-12-03 18:03:28] INFO [TTC Scrape Logger:42] Found title for result N.A.L.L. NEWSLETTER #7-8
[2024-12-03 18:03:30] INFO [TTC Scrape Logger:4

In [None]:
def download_content_issuus(
    content: TTCContent,
    client=SeleniumResource(),
    scraper=WebScraper(),
):
    try:
        ...
    except Exception as e:
        scraper.logger.error(f"An error occurred: {str(e)}")
        return None
    finally:
        client.teardown_after_execution()
    
        

[[TTCContent(ttc_content_title='FRONTLINE NEWS – THE COMPLETE SET!', ttc_items=[IssuuItem(issuu_name='FrontLineNews1.jpg', issuu_url='https://issuu.com/conflictgypsy/docs/alffrontline_1?mode=window&viewMode=doublePage', issuu_img_src='http://www.thetalonconspiracy.com/content/FrontLineNews1.jpg'), IssuuItem(issuu_name='frontline2thumb.jpg', issuu_url='https://issuu.com/conflictgypsy/docs/frontline2?mode=window&viewMode=doublePage', issuu_img_src='http://www.thetalonconspiracy.com/images/frontline2thumb.jpg'), IssuuItem(issuu_name='frontline3.jpg', issuu_url='https://issuu.com/conflictgypsy/docs/frontline3?mode=window&viewMode=doublePage', issuu_img_src='http://www.thetalonconspiracy.com/content/frontline3.jpg'), IssuuItem(issuu_name='frontline4.jpg', issuu_url='https://issuu.com/conflictgypsy/docs/frontline-4?mode=window&viewMode=doublePage', issuu_img_src='http://www.thetalonconspiracy.com/content/frontline4.jpg')]),
  TTCContent(ttc_content_title='HISTORY AS IT WAS MADE: A TROVE OF A