In [1]:
import requests
from bs4 import BeautifulSoup
import os
import re

import nlp_project_functions as functions

import logging

logfile = "./logs/preprocessing.log"
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

file_handler = logging.FileHandler(logfile)
file_handler.setFormatter(formatter)
file_handler.setLevel(logging.INFO)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
stream_handler.setLevel(logging.INFO)

logger = logging.getLogger("preprocessing.log")
logger.setLevel(logging.INFO)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [2]:
generic_names = ["König", "Königs", "Pfarrkirche", 
                 "Kirche", "Kirchen", "Kirch", 
                 "Land", "Mantua", "Dom",
                 "Tempel", "Stifter", "Stadtkirche",
                 "Stifters"]

In [3]:
for i in range(1,112):
    nr = str(i).rjust(4, '0')
    logger.info(f"Processing orgelpredigt_text E00{nr}")
    
    # fetch web page content and parse into beautiful soup object
    url = f"https://orgelpredigt.ur.de/E00{nr}txt"
    orgel_response = requests.get(url)
    orgel_soup = BeautifulSoup(orgel_response.text, 'html.parser')
    
    text = orgel_soup.find("div", {"class": "edition"})
    
    # initial clean up
    for tooltip in text.findAll("span", {"class": "tip"}):
        tooltip.decompose()
    for note in text.findAll("span", {"class": "note"}):
        note.decompose()
    for icon in text.findAll("a", {"class": "icon"}):
        icon.decompose()
    for pagebreak in text.findAll("span", {"class": "pagebreak"}):
        pagebreak.replace_with(" ")
    for supplied in text.findAll("span", {"class": "supplied"}):
        supplied.decompose()

    # filter out orgelpredigt_texts that are not edited
    if len(text.find_all("div", {'class': 'titel'})) == 0 and len(text.find_all("h3")) == 0:
        logger.info(f"Skipping orgelpredigt_text E00{nr}")
        continue
    else:
        # 1. Get list of normalised names
        names_list = text.find_all("div", {"class": "inhalt"})
        
        # skip orgelpredigt_texts that lack semantic markup
        if len(names_list[1].contents) < 6:
            logger.info("No semantic markup")
            continue
        
        # work around for some specific lists of entities
        if "Orgelpredigten" in names_list[1].contents[5].contents[7].text:
            names_list = names_list[1].contents[7].contents[9].contents[1]
        else:
            names_list = names_list[1].contents[5].contents[9].contents[1]
        
        if len(names_list) == 0:
            logger.warning("Names list is weirdly empty")

        # remove entity lists
        for x in text.findAll(["ol", "ul", "h2", "sup"]):
            x.decompose()
        for div in text.findAll("div", {"class": "inhalt"}):
            div.decompose()
        for div in text.findAll("div", {"class": "overlay"}):
            div.decompose()
        for div in text.findAll("div", {"class": "zitierlink"}):
            div.decompose()

        # remove line breaks
        for br in text.select("br"):
            br.replace_with(" ")

        # throw out table cells that have no additional markup
        for x in text.findAll("td"):
            if not x.find("span"):
                x.string = " "
                x.unwrap()

        orgelpredigt_text = functions.extract_names(text)

        # make entities simpler
        orgelpredigt_text = re.sub(r'<a href=\"E01[^>]*>([^<]*)</a>', r"<PERSON>\1</PERSON>", orgelpredigt_text)
        orgelpredigt_text = re.sub(r'<a href=\"E03[^>]*>([^<]*)</a>', r"<LOCATION>\1</LOCATION>", orgelpredigt_text)

        # add names that have no markup
        orgelpredigt_text = re.sub(r'(Herr\s|Herrn\s)([A-Z][a-zäöüß]+\s[A-Z][a-zäöüß]+)(?![^<P]*</PERSON>)', r'\1<PERSON>\2</PERSON>', orgelpredigt_text)

        # add in missing entities markup
        names_list = functions.create_name_list(orgelpredigt_text)

        for name in names_list[0]:
            if name not in generic_names:
                orgelpredigt_text = re.sub(r'[^>]' + name + r'\s', f"<PERSON>{name}</PERSON>", orgelpredigt_text)

        for name in names_list[1]:
            if name not in generic_names:
                orgelpredigt_text = re.sub(r'[^>]' + name + r'\s', f"<LOCATION>{name}</LOCATION>", orgelpredigt_text)

        #orgelpredigt_text = functions.sermon_cleanup(orgelpredigt_text)

        # add spaces around tags
        orgelpredigt_text = re.sub(r'</(PERSON|LOCATION)>(\S)', r'</\1> \2', orgelpredigt_text)
        orgelpredigt_text = re.sub(r'(\S)<(PERSON|LOCATION)>', r'\1 <\2>', orgelpredigt_text)

        with open(f"data/data_preprocessed/E00{nr}_text.txt", 'w', encoding='utf8') as f:
            f.write(orgelpredigt_text)
            logger.info(f"Saved text for orgelpredigt_text E00{nr}")

2024-03-07 18:51:44,765 - preprocessing.log - INFO - Processing orgelpredigt_text E000001
2024-03-07 18:51:45,252 - preprocessing.log - INFO - Saved text for orgelpredigt_text E000001
2024-03-07 18:51:45,252 - preprocessing.log - INFO - Processing orgelpredigt_text E000002
2024-03-07 18:51:45,744 - preprocessing.log - INFO - Saved text for orgelpredigt_text E000002
2024-03-07 18:51:45,744 - preprocessing.log - INFO - Processing orgelpredigt_text E000003
2024-03-07 18:51:46,536 - preprocessing.log - INFO - Saved text for orgelpredigt_text E000003
2024-03-07 18:51:46,537 - preprocessing.log - INFO - Processing orgelpredigt_text E000004
2024-03-07 18:51:46,654 - preprocessing.log - INFO - Skipping orgelpredigt_text E000004
2024-03-07 18:51:46,654 - preprocessing.log - INFO - Processing orgelpredigt_text E000005
2024-03-07 18:51:46,776 - preprocessing.log - INFO - Skipping orgelpredigt_text E000005
2024-03-07 18:51:46,777 - preprocessing.log - INFO - Processing orgelpredigt_text E000006
20

In [4]:
for file in os.listdir('data/abacus_download'):
    if file.endswith("xml"):
        logger.inf0(f"Processing {file}")
        
        with open(f"data/abacus_download/{file}") as f:
            text = f.read()
        
        aba_soup = BeautifulSoup(text, 'xml')

        #get content tags
        dedication = aba_soup.find("div", {"type": "dedication"})
        body = aba_soup.find("body")
        back = aba_soup.find("back")

        aba_soup.body.wrap(aba_soup.new_tag("wrapper"))

        if not dedication == None:
            body.insert_before(dedication)
        if not back == None:
            body.insert_after(back)

        wrapper = aba_soup.find("wrapper")

        # remove page numbers, ornaments and catches
        for item in wrapper.findAll("fw", {"type": "header"}):
            item.decompose()
        for item in wrapper.findAll("fw", {"type": "catch"}):
            item.decompose()
        for item in wrapper.findAll("seg", {"type": "ornament"}):
            item.decompose()

        aba_text = functions.extract_names_aba(wrapper)
        aba_text = re.sub(r'\s+', ' ', aba_text)

        # make entities simpler
        aba_text = re.sub(r'<persName (.*?)>\s*(.*?)\s*<\/persName>', r"<PERSON>\2</PERSON>", aba_text)
        aba_text = re.sub(r'<placeName (.*?)>\s*(.*?)\s*<\/placeName>', r"<LOCATION>\2</LOCATION>", aba_text)

        # remove / and whitespace around punctuation
        aba_text = re.sub(r' ([\.,;:!?])', r"\1", aba_text)
        aba_text = re.sub(r'(?<!<)/', '', aba_text)
        aba_text = aba_text.replace('<wrapper>', '').replace('</wrapper>', '')

        with open(f'data/data_preprocessed/{file[:-4]}.txt', 'w', encoding='utf8') as f:
            f.write(aba_text)

Processing Abraham-Loesch_Wienn.xml
Processing Abraham-Mercks_Wien.xml
Processing Abraham-Todten_Capelle.xml
Processing Abraham-Augustini_feuriges_Hertz.xml
Processing Abraham-Todten_Bruderschaft.xml
