A scraper for the Autobiography collection.
Source: garamantas.lv

First, all links to the autobiographies in the collection were identified, using https://garamantas.lv/lv/collection/index?CollectionSearch%5Brepository_id%5D=1115628 as the base URL. The key to recognising autobiography links was the combination of "/lv/collection/" and the ID of each collection. The result, however, will contain more links than the 219 autobiographies.

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import re

def find_unique_collection_pages(base_url):
    """Return the distinct collection-page URLs linked from *base_url*.

    The page is downloaded, every ``<a href=...>`` is resolved against
    *base_url*, and only the absolute URLs containing '/lv/collection/'
    are kept. The result is a list without duplicates, in no particular order.
    """
    page = requests.get(base_url, timeout=20)
    parsed = BeautifulSoup(page.content, 'html.parser')

    # Resolve relative hrefs first so the filter always sees absolute URLs
    resolved = (urljoin(base_url, anchor['href']) for anchor in parsed.find_all('a', href=True))

    # A set comprehension removes duplicates before the list is built
    return list({target for target in resolved if '/lv/collection/' in target})

# Sort key: the numeric collection ID that follows '/collection/' in a link
def sorting_key(link):
    """Return the collection ID embedded in *link* as an int.

    The ID is a run of at least seven digits between '/collection/' and the
    next '/'. Links without such an ID get ``float('inf')`` so they sort last.
    """
    found = re.search(r'/collection/(\d{7,})/', link)
    if found is None:
        return float('inf')
    return int(found.group(1))

def scrape_and_save_links(collection_links):
    """Collect autobiography links from the given pages and write them to 'links.csv'.

    Every page in *collection_links* is downloaded and each href that starts
    with '/lv/collection/<7+ digit ID>/<letters>' is kept once. The links are
    sorted by collection ID and written under a 'Link' header.

    The file is written as UTF-8 because the next step reads it back with
    encoding='utf-8'; relying on the platform default could break non-ASCII links.
    """
    link_pattern = re.compile(r'/lv/collection/\d{7,}/[a-zA-Z]+')
    unique_links = set()  # a set keeps each href only once

    for collection_link in collection_links:
        response = requests.get(collection_link, timeout=20)
        soup = BeautifulSoup(response.content, 'html.parser')
        unique_links.update(
            anchor['href']
            for anchor in soup.find_all('a', href=True)
            if link_pattern.match(anchor['href'])
        )

    # Sort by collection ID; the href itself breaks ties so that links sharing
    # an ID come out in the same order on every run (set order is arbitrary).
    unique_links_sorted = sorted(unique_links, key=lambda href: (sorting_key(href), href))

    # Save the unique links to 'links.csv'
    with open('links.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Link'])  # header row
        writer.writerows([link] for link in unique_links_sorted)

# Entry point of the first step: start from the repository index page,
# discover its collection pages, then harvest and store the autobiography links.
base_url = (
    'https://garamantas.lv/lv/collection/index'
    '?CollectionSearch%5Brepository_id%5D=1115628'
)
collection_pages = find_unique_collection_pages(base_url)
scrape_and_save_links(collection_pages)

The more than 400 unique links are significantly more than the actual 219 autobiographies, but this selection made it possible to analyse the structure of the data in the repository. With a simple function it is possible to filter out the unnecessary links and rebuild the remaining ones as full URLs for each autobiography.

In [None]:
# Build the list of unique diary URLs from 'links.csv' and save it to 'diaries.csv':
# each stored link is cut at the first '?' (dropping the query string) and then
# prefixed with the site base URL.

import csv

base_url = 'https://garamantas.lv'

# Read through the csv module (not raw lines) so quoting and line endings are
# handled by the same format that produced the file.
with open('links.csv', mode='r', encoding='utf-8', newline='') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # skip the header row
    # partition() returns the whole string when there is no '?'. The previous
    # find()-based test treated the -1 "not found" result as a position and
    # sliced off the last character of the line.
    links = [row[0].partition('?')[0] for row in reader if row]

# De-duplicate, then sort by collection ID; the URL itself breaks ties so the
# order is the same on every run.
new_links = sorted(
    set(base_url + link for link in links),
    key=lambda full_url: (sorting_key(full_url), full_url),
)

with open('diaries.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Link'])
    writer.writerows([link] for link in new_links)

print(f"There are {len(new_links)} diaries saved to diaries.csv")

The next task is to find all verified transcripts that exist for the diaries. A study of the page structure indicated that the texts to be saved are on the file pages, which have to be located first.

In [25]:
# Next task is the finding of all verified transcripts existing for the diaries.
# The study of the structure indicated that the texts that have to be saved are in file-pages,
# which are found through the links that carry a data-page attribute inside li tags.

def scrape_page_links(url, link_criteria):
    """Return the set of absolute link targets found in matching elements of *url*.

    *link_criteria* is passed straight to ``soup.find_all``; for every element
    it selects, the href of the first ``<a>`` inside it is resolved against *url*.

    Best-effort: any failure (network error, HTTP error status, a selected
    element without a usable link) is printed and an empty set is returned.
    """
    try:
        # A timeout keeps one unresponsive page from hanging the whole run;
        # 20 s matches the other requests in this notebook.
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        rows = soup.find_all(link_criteria)
        return {urljoin(url, row.find('a')['href']) for row in rows}
    except requests.RequestException as e:
        print(f"Error fetching the content: {e}")
        return set()
    except Exception as e:
        print(f"An error occurred: {e}")
        return set()

def scrape_autobiography_collection(url):
    """Return the page links of one autobiography collection at *url*.

    A link qualifies when it is the first ``<a>`` of an ``<li>``, carries a
    'data-page' attribute, and its href does not contain 'Illustration'.
    """
    def is_page_item(tag):
        # Only list items are candidates
        if tag.name != 'li':
            return False
        anchor = tag.find('a')
        if not anchor or 'data-page' not in anchor.attrs:
            return False
        # Hrefs containing 'Illustration' are excluded
        return 'Illustration' not in anchor['href']

    return scrape_page_links(url, is_page_item)

def scrape_manuscript_pages(url):
    """Return the set of absolute manuscript file-page URLs listed on *url*.

    A table row counts as a manuscript when it has a 'data-key' attribute and
    contains a cell whose text is exactly 'Manuskripts'; the href of the row's
    first link is resolved against https://garamantas.lv.

    Best-effort: any failure is printed and an empty set is returned.
    """
    try:
        # A timeout keeps one unresponsive page from hanging the whole run;
        # 20 s matches the other requests in this notebook.
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        rows = soup.find_all(
            lambda tag: tag.name == 'tr'
            and 'data-key' in tag.attrs
            and tag.find('td', string='Manuskripts')
        )
        return {urljoin('https://garamantas.lv', row.find('a')['href']) for row in rows}
    except requests.RequestException as e:
        print(f"Error fetching the content: {e}")
        return set()
    except Exception as e:
        print(f"An error occurred: {e}")
        return set()

def scrape_text_from_html(html):
    """Extract the transcription text from a file page, if it is finished and verified.

    The page must contain a 'tab-content' div whose table rows mark both
    'Pabeigts' and 'Pārbaudīts' with 'Jā'. Only then is the text of the
    'file-trascription-text' cell returned (stripped, with nested elements
    separated by newlines). In every other case, including any error while
    parsing, the result is None.
    """
    def row_confirms(row, label):
        # A row confirms a status when its header is the label and a cell says 'Jā'
        return row.find('th', string=label) and row.find('td', string='Jā')

    try:
        document = BeautifulSoup(html, 'html.parser')

        status_panel = document.find('div', class_='tab-content')
        if not status_panel:
            print("Could not find the tab-content div.")
            return None

        status_rows = status_panel.find_all('tr')
        is_finished = any(row_confirms(row, 'Pabeigts') for row in status_rows)
        is_verified = any(row_confirms(row, 'Pārbaudīts') for row in status_rows)
        if not (is_finished and is_verified):
            return None

        text_cell = document.find('td', class_='file-trascription-text')
        if not text_cell:
            return None
        return text_cell.get_text(separator='\n').strip()

    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    

In [33]:
import os
import pandas as pd

# Output locations: per-collection CSV files go to Data, text files to Data/Corpus
data_folder = 'Data'
corpus_folder = 'Data/Corpus'

# Create each output folder unless it is already there
for folder in (data_folder, corpus_folder):
    if not os.path.exists(folder):
        os.makedirs(folder)

# Define functions to save files in the respective folders
def save_to_corpus_folder(filename, content):
    """Write *content* to *filename* inside the corpus folder as UTF-8, replacing any existing file."""
    target = os.path.join(corpus_folder, filename)
    with open(target, 'w', encoding='utf-8') as handle:
        handle.write(content)

def save_csv_to_data(filename, df):
    """Save the DataFrame *df* as *filename* in the data folder (UTF-8 with BOM, no index column)."""
    df.to_csv(
        os.path.join(data_folder, filename),
        index=False,
        encoding='utf-8-sig',
    )

def get_file_name_from_url(url):
    """Build a file name from the path segments that follow 'collection' in *url*.

    The segments after the first 'collection' segment are joined with '_'.
    Raises ValueError when *url* has no 'collection' segment.
    """
    segments = url.split('/')
    first_kept = segments.index('collection') + 1
    tail = segments[first_kept:]
    return '_'.join(tail)

def get_file_id_from_url(url):
    """Build a file ID from the path segments that follow 'file' in *url*.

    The segments after the first 'file' segment are joined with '_'.
    Raises ValueError when *url* has no 'file' segment.
    """
    segments = url.split('/')
    first_kept = segments.index('file') + 1
    tail = segments[first_kept:]
    return '_'.join(tail)

def save_txt_to_corpus(filename, texts):
    """Write every item of *texts* on its own line to *filename* in the corpus folder (UTF-8)."""
    target = os.path.join(corpus_folder, filename)
    with open(target, 'w', encoding='utf-8') as handle:
        # Each text is followed by a newline, including the last one
        handle.writelines(entry + '\n' for entry in texts)

def scrape_and_save_texts(url, links):
    """Scrape the verified transcriptions of one collection into a CSV and a text file.

    *url* is the collection URL and only determines the output file name.
    *links* are the collection's page URLs; each is searched for manuscript
    file pages, and every file page with a finished and verified transcription
    adds one ('File-ID', 'Transcribed Text') row to the CSV in the data folder.
    The CSV is then converted to a text file and the totals are printed.

    A file page that cannot be downloaded is reported and skipped, in the same
    way the other scraping helpers report fetch errors, so one failed request
    does not abort the whole collection.
    """
    text_count = 0  # transcriptions written to the CSV
    manuscript_pages_count = 0  # manuscript file pages discovered
    csv_filename = get_file_name_from_url(url) + '.csv'
    with open(os.path.join(data_folder, csv_filename), 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['File-ID', 'Transcribed Text'])  # Write headers
        # Both collections are sets, so iterate them sorted to get the same row
        # order on every run. The order is lexicographic by URL and is not
        # guaranteed to be the reading order of the pages.
        for link in sorted(links):
            manuscript_pages = scrape_manuscript_pages(link)
            manuscript_pages_count += len(manuscript_pages)
            for manuscript_page in sorted(manuscript_pages):
                file_id = get_file_id_from_url(manuscript_page)
                try:
                    response = requests.get(manuscript_page, timeout=20)
                    # Do not parse an HTTP error page as if it were a transcription
                    response.raise_for_status()
                except requests.RequestException as e:
                    print(f"Error fetching the content: {e}")
                    continue
                text = scrape_text_from_html(response.content)
                if text:
                    writer.writerow([file_id, text])
                    text_count += 1
    # The CSV is closed at this point, so it can be read back safely
    csv_to_txt(csv_filename)
    print(f"Total manuscript pages extracted: {manuscript_pages_count}")
    print(f"Total texts extracted: {text_count}")

def csv_to_txt(csv_file):
    """Convert the 'Transcribed Text' column of a data-folder CSV into a corpus text file.

    *csv_file* is a file name inside the data folder. The text file gets the
    same name with a '.txt' extension and is only created when the CSV holds
    at least one row. Success or failure is reported with a printed message.
    """
    # dtype=str and keep_default_na=False keep every transcription as literal
    # text. With pandas' default inference a column of purely numeric texts
    # becomes numbers and texts such as 'NA' become NaN, and neither can be
    # concatenated with '\n' when the text file is written.
    df = pd.read_csv(
        os.path.join(data_folder, csv_file),
        dtype=str,
        keep_default_na=False,
        encoding='utf-8-sig',  # the CSV is written with a BOM
    )
    texts = df['Transcribed Text']
    # Swap only the extension; str.replace would rewrite every '.csv' in the name
    txt_filename = os.path.splitext(csv_file)[0] + '.txt'
    if texts.empty:
        print(f"No transcribed and verified texts found. Text file '{txt_filename}' not created.")
        return
    try:
        save_txt_to_corpus(txt_filename, texts)
        print(f"Text file '{txt_filename}' successfully created.")
    except Exception as e:
        print(f"Failed to create text file '{txt_filename}': {e}")


In [29]:
# Trial run: take the first diary link from diaries.csv (the header row is
# consumed by read_csv) and push it through the whole pipeline.

diaries_file = pd.read_csv('diaries.csv')
autobiography_collection_url = diaries_file.loc[0, 'Link']
manuscript_hrefs = scrape_autobiography_collection(autobiography_collection_url)
scrape_and_save_texts(autobiography_collection_url, manuscript_hrefs)
 

Text file '1115688_Karla-Valguma-dienasgramatas.txt' successfully created.
 Manuscript pages extracted:  20
Texts extracted:  56


In [34]:
# With the trial run done, process every remaining link in diaries.csv;
# the first one is skipped because it has already been processed.

for link in diaries_file['Link'].iloc[1:]:
    manuscript_hrefs = scrape_autobiography_collection(link)
    scrape_and_save_texts(link, manuscript_hrefs)

# Report how many files ended up in the corpus folder
print(f"Number of files in the corpus folder: {len(os.listdir(corpus_folder))}")

No transcribed and verified texts found. Text file '1115691_Ilzes-Spergas-skolas-dienasgramatas.txt' not created.
Total manuscript pages extracted: 216
Total texts extracted: 0
Text file '1115696_Edgara-Reinsona-dienasgramata.txt' successfully created.
Total manuscript pages extracted: 73
Total texts extracted: 72
No transcribed and verified texts found. Text file '1116161_Rutas-Luses-atminas.txt' not created.
Total manuscript pages extracted: 0
Total texts extracted: 0
Text file '1117599_Irmas-Viksninas-dienasgramata.txt' successfully created.
Total manuscript pages extracted: 62
Total texts extracted: 60
Text file '1117599_nezinamas-rigas-skolnieces-dienasgramata.txt' successfully created.
Total manuscript pages extracted: 62
Total texts extracted: 60
Text file '1117608_Dainas-Rozenblates-Atminu-zimejumi.txt' successfully created.
Total manuscript pages extracted: 162
Total texts extracted: 118
Text file '1119253_Adama-Purmala-dzivesstasts.txt' successfully created.
Total manuscript 