# Scraping chapter-only texts from public domain sources

https://docs.google.com/document/d/1mu6Fu-XArUpYg7UqwZazfRrFdL_0H328U_uoCKa9Yj4/edit#

In [1]:
import os
import re
import requests
import lxml.html as lh
import logging as log

# Logger named after this module; it accepts records of DEBUG severity and above.
logger = log.getLogger(__name__)
logger.setLevel("DEBUG")

In [3]:
from texts import refs

In [None]:
# Sanity check: print every record whose "chapters" list does not hold
# exactly 20 entries, followed by the number of entries it does hold.
for ref in refs:
    chapter_count = len(ref["chapters"])
    if chapter_count == 20:
        continue
    print(f"{ref} {chapter_count}")

In [None]:
# Fetch each source page once (a copy is cached on disk) and write one text
# file per chapter.
#
# Keys read from every record in `refs`:
#   name          - sub-directory of stories/ that receives the output files
#   url           - page to download when no cached copy exists yet
#   locator       - %-style XPath template selecting a chapter's body nodes
#   title_locator - %-style XPath template selecting a chapter's title text
#   chapters      - sequence of values substituted into the two templates
for record in refs:
    name = record["name"]
    url = record["url"]
    locator = record["locator"]  # xpath template
    title_locator = record["title_locator"]
    parts = record["chapters"]

    story_dir = f"stories/{name}"
    page_path = f"{story_dir}/page.html"

    # makedirs also creates a missing top-level stories/ directory and does
    # not fail when the target already exists.
    os.makedirs(story_dir, exist_ok=True)
    if not os.path.exists(page_path):
        logger.debug("%s, %s, %s", name, url, parts)
        response = requests.get(url)
        if response.status_code >= 400:
            logger.error(
                "Request for %s returned with error code %s",
                url,
                response.status_code,
            )
            continue
        # The body is read only after a successful request for THIS record;
        # reading it earlier used an undefined (or stale) response.
        with open(page_path, "w", encoding="utf-8") as fout:
            fout.write(response.text)

    logger.debug(page_path)
    doc = lh.parse(page_path)
    for rid in parts:
        logger.debug(rid)
        # Fill the %-style XPath template with this chapter's values.
        query = locator % rid
        entries = doc.xpath(query)

        logger.debug(entries)
        if not entries:
            logger.error("Unable to find: %s", query)
        fulltext = "\n".join(e.text_content().strip() for e in entries).strip()
        # "â€‹" is how the UTF-8 bytes of a zero-width space (U+200B) read
        # when decoded as Windows-1252; strip that residue from the text.
        fulltext = fulltext.replace("â€‹", "")

        # Only the first chapter value is used to locate the title.
        title_query = title_locator % rid[0]
        title_parts = doc.xpath(title_query)
        if not title_parts:
            # NOTE(review): the chapter is still written in this case, under
            # the empty name ".txt" - confirm whether it should be skipped.
            logger.error("Unable to find title: %s", title_query)
        fname = "".join(e.strip() for e in title_parts).strip()
        # Collapse every run of non-alphanumeric characters into one "_"
        # so the title is usable as a file name.
        fname = re.sub("[^A-Za-z0-9]+", "_", fname)
        print(f"{name}/{fname}.txt")
        with open(f"{story_dir}/{fname}.txt", "w", encoding="utf-8") as f:
            f.write(fulltext)

In [None]:
# TODO: cleanup

# for italy:
# 1. remove [Pg XXX]
# 2. remove references [XX], with X being a number
# (maybe) 3. remove notes: [Z] and lines starting with [Z]

# Matches page markers ("[Pg 12]") as well as numeric references ("[12]").
# Raw string: "\[" and "\d" are invalid escape sequences in a plain literal.
note_regex = r"\[(Pg )?\d+\]"
# Matches page markers only, with a one- to three-digit page number.
page_regex = r"\[Pg \d{1,3}]"
# Lettered footnotes seem to contain content-specific text, so they are possibly part of the corpus
# note_regex = r"\[[A-Z]\]"