In [None]:
# Import the required libraries
import re
import httpx
from selectolax.parser import HTMLParser
from dataclasses import dataclass, asdict, fields
import csv
from urllib.parse import urljoin, urlsplit

# Root address of the site being scraped
site_url: str = "https://scotlandsplaces.gov.uk"

# Index page of the county whose name-book volumes you want to scrape
main_url: str = "https://scotlandsplaces.gov.uk/digital-volumes/ordnance-survey-name-books/lanarkshire-os-name-books-1858-1861"


# Create a dataclass to store the information
@dataclass
class placename_information:
    page_title: str | None
    placename: str = "No table on page"
    various_spellings: str = "No table on page"
    authority: str = "No table on page"
    situation: str = "No table on page"
    description: str = "No table on page"


def get_volumes(url):
    """Collect the volume URLs linked from the page at ``url``.

    Every anchor's ``href`` is searched for a Lanarkshire volume path.  The
    matched portion is resolved against ``site_url`` and given a trailing
    slash.

    Returns a list of the distinct volume URLs, in no particular order.
    """
    volume_path = re.compile(
        r"/digital-volumes/ordnance-survey-name-books/lanarkshire-os-name-books-1858-1861/lanarkshire-volume-\d+"
    )
    document = HTMLParser(httpx.get(url).text)

    distinct_volumes = set()
    for anchor in document.css("a"):
        target = anchor.attributes.get("href")
        if not target:
            continue
        found = volume_path.search(target)
        if not found:
            continue
        # Keep only the matched part, so links to pages inside a volume
        # collapse onto the volume itself.
        distinct_volumes.add(urljoin(site_url, found.group()) + "/")
    return list(distinct_volumes)


# Create a function to get the page numbers
def get_page_numbers(volume_url):
    """Return the page numbers linked from a volume's index page.

    Only links of the form ``<volume path>/<digits>`` count as pages.
    Accepting any href that merely ends in digits would also pick up
    unrelated links such as the county index (``...-1858-1861``) or volume
    links (``...-volume-01``) and report page numbers that do not exist.

    Args:
        volume_url: URL of the volume index page.  Relative hrefs found on
            the page are resolved against it.

    Returns:
        The distinct page numbers as a list of ints in ascending order.
    """
    response = httpx.get(volume_url)
    html = HTMLParser(response.text)

    volume_path = urlsplit(volume_url).path.rstrip("/")
    page_path = re.compile(re.escape(volume_path) + r"/(\d+)/?")

    page_numbers = set()
    for link in html.css("a"):
        href = link.attributes.get("href")
        if not href:
            continue
        try:
            # Compare paths only, so absolute, root-relative and relative
            # hrefs are treated alike and query strings/fragments are ignored.
            link_path = urlsplit(urljoin(volume_url, href)).path
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); cannot be a page link.
            continue
        match = page_path.fullmatch(link_path)
        if match:
            page_numbers.add(int(match.group(1)))

    # Ascending order lets callers that iterate the result visit pages in order.
    return sorted(page_numbers)


# Fetch a single page of a volume and parse it
def get_html(volume_url, page):
    """Download one page of a volume and return it as a parsed HTML tree.

    The address requested is ``volume_url`` immediately followed by ``page``;
    no separator is inserted between the two.
    """
    page_response = httpx.get("{}{}".format(volume_url, page))
    return HTMLParser(page_response.text)


# Create a function to parse the placename information
def parse_placename(html):
    """Extract the place-name rows from a parsed page.

    Args:
        html: Parsed page offering ``css_first`` and ``css`` lookups (the
            object returned by ``get_html``).

    Returns:
        A list of dicts, one per table row inside ``div.well`` that has
        exactly five ``<td>`` cells.  Each dict carries the fields of
        ``placename_information``; ``page_title`` is ``None`` when the page
        has no ``h1.page-header``.  A page without such rows yields an empty
        list.
    """
    # css_first returns None when nothing matches; calling .text() on it
    # unconditionally would raise AttributeError for pages without a header.
    header_node = html.css_first("h1.page-header")
    page_header = header_node.text() if header_node is not None else None

    def pipe_joined(cell):
        # Trim before replacing, otherwise leading/trailing newlines turn
        # into stray "|" characters that strip() can no longer remove.
        return cell.text().strip().replace("\n", "|")

    # NOTE(review): the "No table on page" defaults on placename_information
    # suggest a placeholder row may have been intended for pages without a
    # table; no such row is produced here - confirm the intended behaviour.
    results = []
    for row in html.css("div.well tr"):  # each table row of the table
        cells = row.css("td")
        if len(cells) != 5:  # skips rows such as the table headers
            continue
        entry = placename_information(
            page_title=page_header,
            placename=cells[0].text().strip(),
            various_spellings=pipe_joined(cells[1]),
            authority=pipe_joined(cells[2]),
            situation=cells[3].text().strip(),
            description=cells[4].text().strip(),
        )
        results.append(asdict(entry))
    return results


# Create a function to write the results to a csv
def to_csv(res, filename, encoding="utf-8"):
    """Append rows to ``<filename>.csv``, writing the header for an empty file.

    Args:
        res: Iterable of dicts keyed by the field names of
            ``placename_information``.
        filename: Output path without the ``.csv`` suffix.
        encoding: Text encoding of the file.  Defaults to UTF-8 so that
            non-ASCII place names are written the same way on every
            platform instead of depending on the locale's default encoding
            (which can raise ``UnicodeEncodeError``).
    """
    fieldnames = [field.name for field in fields(placename_information)]
    with open(f"{filename}.csv", "a", newline="", encoding=encoding) as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        # The file is opened for appending, so position 0 means it is
        # new or empty and still needs its header row.
        if file.tell() == 0:
            writer.writeheader()
        writer.writerows(res)


# Entry point: scrape every volume and write one CSV per volume
def main():
    """Scrape all volumes reachable from ``main_url`` into CSV files.

    For each volume the page numbers are discovered, every page is fetched
    and parsed, and the rows are appended to a CSV named after the volume.
    Intermediate results are printed as progress output.
    """
    volume_urls = get_volumes(main_url)
    print(volume_urls)
    for volume_url in volume_urls:
        # Volume URLs end with "/", so the volume's own name is the
        # second-to-last path segment.
        path_segments = urlsplit(volume_url).path.split("/")
        csv_stem = path_segments[-2]
        pages = get_page_numbers(volume_url)
        print(pages)
        for page_number in pages:
            rows = parse_placename(get_html(volume_url, page_number))
            print(rows)
            to_csv(rows, csv_stem)

# Start the scrape only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()
