A scraper for the Autobiography collection.
Source: garamantas.lv

First, all links to the autobiographies were identified in the collection, using https://garamantas.lv/lv/collection/index?CollectionSearch%5Brepository_id%5D=1115628 as the base URL. The key to finding the autobiography links was the combination of the "/lv/collection/" path and the ID of each collection. The number of links found, however, will exceed the 219 autobiographies actually in the collection.

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import re

def find_unique_collection_pages(base_url):
    """
    Find all unique collection pages linked from the page at ``base_url``.

    Every ``<a href>`` on the page is resolved against ``base_url``; a link
    is kept when its absolute form contains '/lv/collection/'.

    Args:
        base_url: URL of the index page to scan.

    Returns:
        A sorted list of unique absolute URLs (sorted so that repeated runs
        visit the pages in the same order).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    # A timeout keeps an unresponsive server from hanging the run forever.
    response = requests.get(base_url, timeout=30)
    # Fail loudly instead of harvesting links from an HTTP error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    collection_links = set()  # A set guarantees uniqueness
    for anchor in soup.find_all('a', href=True):
        absolute_url = urljoin(base_url, anchor['href'])
        if '/lv/collection/' in absolute_url:
            collection_links.add(absolute_url)

    return sorted(collection_links)

def scrape_and_save_links(collection_links):
    """
    Scrape item links from each collection page and save them to 'links.csv'.

    An href is kept when it starts with '/lv/collection/<7+ digits>/<letters>'.
    Hrefs are stored exactly as they appear in the page (site-relative paths),
    de-duplicated across all pages, and written in sorted order under a
    single 'Link' header so the output file is reproducible.

    Args:
        collection_links: iterable of absolute URLs of the pages to scan.

    Returns:
        None. Writes 'links.csv' in the current working directory and prints
        a one-line summary.

    Raises:
        requests.RequestException: on connection problems or timeout.
    """
    # Compiled once instead of being looked up again for every anchor.
    link_pattern = re.compile(r'/lv/collection/\d{7,}/[a-zA-Z]+')
    unique_links = set()  # Set to store unique links

    # One session re-uses the underlying connection across all page requests.
    with requests.Session() as session:
        for collection_link in collection_links:
            # A timeout keeps one stalled page from hanging the whole run.
            response = session.get(collection_link, timeout=30)
            if not response.ok:
                # Do not parse an HTTP error page as if it were real content.
                print(f"Skipping {collection_link}: HTTP {response.status_code}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            for anchor in soup.find_all('a', href=True):
                href = anchor['href']
                # match() anchors at the start, so only site-relative hrefs
                # with the expected structure are accepted.
                if link_pattern.match(href):
                    unique_links.add(href)

    # Explicit encoding so the file does not depend on the platform default.
    with open('links.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Link'])
        writer.writerows([link] for link in sorted(unique_links))

    # The set holds every matching link, not only autobiographies.
    print(f"{len(unique_links)} unique links saved to links.csv")

# Example usage: run the two-step scrape against the repository index.
# The query string (URL-encoded "CollectionSearch[repository_id]") selects
# repository 1115628.
base_url = 'https://garamantas.lv/lv/collection/index?CollectionSearch%5Brepository_id%5D=1115628'
# Step 1: gather every '/lv/collection/' URL linked from the index page.
collection_pages = find_unique_collection_pages(base_url)
# Step 2: visit each of those pages and write the matching hrefs to links.csv.
scrape_and_save_links(collection_pages)

428 unique links saved to links.csv


The 428 unique links are significantly more than the actual number of 219 autobiographies, but this selection made it possible to analyse the structure of the data in the repository. With a simple function it will be possible to filter out the unnecessary links and reconstitute the remaining ones as full URLs, one for each of the autobiographies.