In [102]:
from bs4 import BeautifulSoup

import pathlib
import re

import pandas as pd

import db_connection
import os
import json

In [103]:
# Connect to the database. get_connection() comes from the project-local
# db_connection module; its result is unpacked into the cursor used for the
# queries below and the connection used for commits.
cursor, connection = db_connection.get_connection()

In [104]:
# Absolute path of the current working directory, as a string
# (pathlib.Path() is the relative path ".", resolve() makes it absolute).
data_path = str(pathlib.Path().resolve())

In [120]:
# Fetch the ID and the full text of every row in e00_orgelpredigten.
# `sermons` holds the fetchall() result; later cells read element 0 of each
# row as the sermon ID and element 1 as the raw marked-up text.
cursor.execute("""SELECT  e00id, e00volltext
                    FROM e00_orgelpredigten""")
sermons = cursor.fetchall()

In [106]:
# Base names of the files in both output folders.
# The slices drop the last 4 / 5 characters of each file name -- presumably a
# three-letter extension for the tables and ".json" for the chunked sermons;
# TODO confirm that no other files live in these folders.
tables = [file_name[:-4] for file_name in os.listdir("sermon_tables")]
jsons = [file_name[:-5] for file_name in os.listdir("sermons_chunked")]

In [107]:
# Compare the two folders: which IDs exist only as a table, which only as JSON?
table_ids = set(tables)
json_ids = set(jsons)

tables_unique = list(table_ids - json_ids)
jsons_unique = list(json_ids - table_ids)

print("Tables unique:", tables_unique)
print("JSONs unique:", jsons_unique)

Tables unique: ['E000042', 'E000020', 'E000039', 'E000038', 'E000061', 'E000036', 'E000046', 'E000024', 'E000015', 'E000027', 'E000045', 'E000048']
JSONs unique: []


In [108]:
def amend_database_entry(cursor, connection, id, amended_text):
    """Overwrite the stored full text of one sermon and commit the change.

    The values are passed to the driver as query parameters instead of being
    pasted into the SQL string, so sermon text containing apostrophes (or any
    other SQL metacharacter) can no longer break or alter the statement.

    Args:
        cursor: DB-API cursor used to run the UPDATE.
        connection: DB-API connection the cursor belongs to; it is committed
            after the statement ran.
        id: Value of ``e00id`` identifying the row to update.
        amended_text: New content for the ``e00volltext`` column.
    """
    # NOTE(review): "%s" is the placeholder of the "format" paramstyle (e.g.
    # MySQL and PostgreSQL drivers). If db_connection hands out a driver with a
    # different paramstyle (sqlite3 uses "?"), adjust the placeholders.
    sql = "UPDATE e00_orgelpredigten SET e00volltext = %s WHERE e00id = %s"
    cursor.execute(sql, (amended_text, id))
    connection.commit()
    print(cursor.rowcount, "record(s) affected")

In [109]:
def remove_whitespace_in_id(match):
    """Replacement callback for ``re.sub``: rebuild an ``id="..."`` attribute
    from group 1 of the match with every whitespace character removed.

    Args:
        match: Match object whose first group is the raw attribute value.

    Returns:
        str: The attribute in the form ``id="<value without whitespace>"``.
    """
    pieces = match.group(1).split()          # split() without arguments drops all whitespace
    return 'id="{}"'.format("".join(pieces))

In [110]:
def cleanup_sermon(text: str) -> str:
    """Clean up organ sermon markup and return bare-bones XML.

    Editorial and layout information is stripped. Entity tags (persons,
    places, organs, artworks, events) are unwrapped, headers, tables, notes
    and editorial additions are deleted, and ``<titel>`` elements are renamed
    to ``<quelle>``. According to the original description the tags that are
    meant to survive are:

        * musikwerk: a direct quote from a song
        * quelle: a quote from an authority
        * literatur: a quote from literature
        * bibel: a quote from the bible
        * quote: general quoted passages
        * orgelpredigt: a quote from another organ sermon

    The result is wrapped in ``<xml> ... </xml>`` so that it hopefully is
    well-formed XML.

    Fix compared to the previous version: punctuation directly after a tag
    (``</bibel>,``) is now kept and separated by a space. The replacement used
    to be the non-raw string ``'> \\1'``, i.e. the control character ``\\x01``
    instead of a backreference, so the punctuation mark was silently dropped.

    Args:
        text (str): The raw marked-up sermon text.

    Returns:
        str: The cleaned-up text inside ``<xml></xml>`` tags.
    """

    # Literal snippets that are swapped out in a single pass.
    # Newlines are deliberately not listed: they are handled by the
    # whitespace collapse further down.
    rep = {
        "<lb />": "",
        "\r": "",
        "ᵜ": " ",
        "ʬ": "&",
        "<sic>": "",
        "</sic>": "",
        "<choice>": "",
        "</choice>": "",
        "</span>": "",
        "<div>": "",
        "</div>": "",
        "<err>": "",
        "</err>": "",
        "<fn></fn>": "",
        "<fn />": "",
        'typ="real"': "",
        '<?xml version="1.0" encoding="UTF-8"?>': "",
    }
    literal_pattern = re.compile("|".join(re.escape(snippet) for snippet in rep))
    text = literal_pattern.sub(lambda m: rep[m.group(0)], text)

    # Regex substitutions; they are applied strictly in this order because
    # later rules rely on what earlier ones removed.
    substitutions = (
        (r'<pb page="[^"]+" ?\/?>(<\/pb>)?', ""),                                   # remove page breaks
        (r'<h[123456]>([\S\s]+?)<\/h[123456]>', ""),                               # remove headers incl. content

        (r'<x?person id="[^"]*">([\S\s]+?(?=<))<\/x?person>', r"\1"),              # unwrap persons
        (r'<x?ort id="[^"]*">([\S\s]+?(?=<))<\/x?ort>', r"\1"),                    # unwrap places
        (r'<x?orgel id="[^"]*">([\S\s]+?(?=<))<\/x?orgel>', r"\1"),                # unwrap organs
        (r'<kunstwerk id="[^"]*">([\S\s]+?(?=<))<\/kunstwerk>', r'\1'),            # unwrap kunstwerk
        (r'<ereignis id="[^"]*">([\S\s]+?(?=<))<\/ereignis>', r'\1'),              # unwrap ereignis

        # rename <titel> to <quelle>, keeping the id
        (r'<titel id="([^"]*)"( typ="real")? *>([\S\s]+?(?=<))</titel>', r'<quelle id="\1">\3</quelle>'),

        (r'<supplied>[\S\s]+?<\/supplied>', ""),                                   # remove editorial additions
        (r'<corr>[\S\s]+?<\/corr>', ""),                                           # remove editorial corrections
        (r'<ref typ="trl">[\S\s]+?<\/ref>', ""),                                   # remove translations
        (r'<ref typ="anm">([\S\s]+?(?=<\/ref>))<\/ref>', ""),                      # remove editorial comments
        (r'<hi lang="[a-z]+">([\S\s]+?)<\/hi>', r"\1"),                            # remove typographical markup
        (r'(<hi rend="[a-z\-]+">)+([\S\s]+?)(<\/hi>)+', r"\2"),                    # remove typographical markup
        (r'<\/hi>', ""),                                                           # leftover closing </hi>
        (r'<ref typ="ofn" symbol="\(\S+\)">', ''),                                 # opening footnote references
        (r'<\/ref>', ""),                                                          # leftover closing </ref>

        (r'<table([\S\s]+?(?=<))<\/table>', ""),                                   # remove tables

        (r'<div class="[a-z]+">', ""),                                             # unwrap all <div> tags
        (r'=\s+', ""),                                                             # undo hyphenations
        (r'<note>[\S\s]+?<\/note>', ""),                                           # remove marginal notes
        (r'<span[^>]+>', ""),                                                      # opening <span ...> tags

        (r'[\s\n]+', ' '),                                                         # collapse all whitespace
        (r'(["\'\(\)»«›‹\.,\;])<', r'\1 <'),                                       # space between punctuation and a tag
        (r'>(["\'\(\)»«›‹\.,\;])', r'> \1'),                                       # space between a tag and punctuation
        (r'id="([^"]+)"', remove_whitespace_in_id),                                # remove whitespace in IDs
        (r' </', '</'),                                                            # remove whitespace before closing tags
        (r'\x01', ''),                                                             # drop stray \x01 control characters
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return "<xml> " + text + " </xml>"

In [111]:
def remove_quotes(soup):
    """Unwrap ``<quote>`` tags that contain other tags or are nested in one.

    Every ``<quote>`` found below ``soup`` has ``unwrap()`` called on it when
    at least one of its direct children is a tag, or when its parent is not
    named ``"root"``. The object passed in is modified in place and returned.

    NOTE(review): in this notebook the function is called with single ``<p>``
    elements and the cleaned text is wrapped in ``<xml>``, not ``<root>``, so
    the parent check looks like it is always true -- confirm whether the
    comparison should be against the element that was passed in.

    Args:
        soup: Parsed element to search (needs ``find_all``).

    Returns:
        The same object that was passed in.
    """
    for quote_tag in soup.find_all("quote"):
        # text nodes have no tag name, so only real child tags count here
        contains_tags = False
        for child in quote_tag.children:
            if child.name:
                contains_tags = True
                break

        outside_root = quote_tag.parent.name != "root"

        if contains_tags or outside_root:
            quote_tag.unwrap()

    return soup

In [112]:
def remove_punctuation(word):
    """Return ``word`` without the characters ``= * ( ) [ ]``.

    Sentence punctuation such as ``.`` or ``,`` is left untouched.
    """
    unwanted = re.compile(r'[=*\(\)\[\]]')
    return unwanted.sub('', word)

In [113]:
def soup_to_table(text: str) -> list:
    """Takes an xml string and turns it into a table of words and attributes

    The string is split on runs of ``>`` and spaces. Every resulting token is
    either an opening tag (``<bibel``), an ``id="..."`` attribute, a word with
    a closing tag glued to it (``wort</bibel``) or a plain word. Tags and ids
    only update the running state; everything else becomes one row.

    Args:
        text (str): The text to be searched

    Returns:
        list: A table (list of tuples) with the columns 'word', 'types', and
        'reference'. 'types' is a string with the currently open quotation
        types, each preceded by a space (e.g. ``" bibel quote"``);
        'reference' is the list of ids collected from the open tags.
    """

    # tag name as it appears in the markup -> name used in the 'types' column
    # NOTE(review): sermons are matched via "<predigt"; confirm that this is
    # the tag name used in the data (an "<orgelpredigt" tag would not match).
    tag_types = {
        "bibel": "bibel",
        "quote": "quote",
        "quelle": "quelle",
        "musikwerk": "musikwerk",
        "literatur": "literatur",
        "predigt": "orgelpredigt",
    }
    status = dict.fromkeys(tag_types.values(), False)

    openingtag = re.compile('<[^/]')
    irrelevantid = re.compile('id="E0[1234567][0-9]+"')

    return_table = []
    reference = []

    for word in re.split(r'[> ]+', text):
        # The marker is built before the token is processed, so the word
        # glued to a closing tag still carries the type that is being closed.
        typemarker = "".join(" " + name for name, is_open in status.items() if is_open)

        if openingtag.match(word):
            # opening tag: switch the matching type on; <xml>, <p> and any
            # other tag fragment is ignored
            for tag, name in tag_types.items():
                if word.startswith("<" + tag):
                    status[name] = True
                    break
            continue

        if "</" in word:
            for tag, name in tag_types.items():
                closing = "</" + tag
                if word.endswith(closing):
                    status[name] = False
                    # the row still gets the reference list as it was before the pop
                    return_table.append((word[:-len(closing)], typemarker, reference))
                    # NOTE(review): this drops the last id even when the tag
                    # being closed never contributed one -- confirm intended.
                    reference = reference[:-1]
                    break
            else:
                # closing tag of an untracked element: keep only the word part
                return_table.append((word.split("</")[0], typemarker, reference))
            continue

        if word.startswith("id="):
            # ids matching `irrelevantid` are skipped; all others are
            # appended to the running reference list
            if not irrelevantid.match(word):
                refs = re.findall(r'"(\S*?)"', word)   # raw string: "\S" is a regex class, not a str escape
                if refs:
                    reference = reference + refs
            continue

        return_table.append((word, typemarker, reference))

    return return_table

  refs = re.findall('"(\S*?)"', word)


In [119]:
# Spot check of a single fetched row (index 41); the recorded output shows
# ID 'E000042' with an empty full text.
sermons[41]

('E000042', '')

In [121]:
# --- settings for the chunking run -------------------------------------------
SKIPPED_SERMON_IDS = ("E000031", "E000018")     # note: E000031 is excepted since there is no markup!
MIN_SERMON_CHARS = 500                          # sermons whose plain text is not longer than this are skipped
MIN_SENTENCE_WORDS = 4                          # a delimiter only closes a chunk that has at least this many words
SENTENCE_DELIMITERS = ("/", ".", ",", ":", "!", "?", ";")


def _paragraph_to_rows(par):
    """Turn one <p> element into cleaned (word, types, reference) rows.

    Quotes are unwrapped via remove_quotes(), the markup is tokenised by
    soup_to_table(), words are stripped of the characters removed by
    remove_punctuation() and lowercased; rows whose word ends up empty are
    dropped.
    """
    par = remove_quotes(par)
    table = soup_to_table(re.sub(r"\x01", "", str(par)))    # turn markup into table
    rows = []
    for word, types, reference in table:
        word = remove_punctuation(word).lower()
        if word:
            rows.append((word, types, reference))
    return rows


def _split_into_sentences(rows):
    """Group the rows of one paragraph into sentence-like chunks.

    A chunk is closed after a word containing one of SENTENCE_DELIMITERS, but
    only if the chunk has at least MIN_SENTENCE_WORDS words by then; whatever
    is open when the paragraph ends is closed regardless of its length.

    Returns:
        list: One dict per chunk with the parallel lists 'words', 'types' and
        'references'.
    """
    sentences = []
    current = {"words": [], "types": [], "references": []}
    last_index = len(rows) - 1

    for index, (word, types, reference) in enumerate(rows):
        current["words"].append(word)
        current["types"].append(types)
        current["references"].append(reference)

        closes_chunk = (
            len(current["words"]) >= MIN_SENTENCE_WORDS
            and any(mark in word for mark in SENTENCE_DELIMITERS)
        )
        if index == last_index or closes_chunk:
            sentences.append(current)
            current = {"words": [], "types": [], "references": []}

    return sentences


# --- main loop: one JSON file per sermon ------------------------------------
inbetween_text = ""        # running concatenation of every processed paragraph's words
for k, sermon in enumerate(sermons):                    # iterate over sermons to create tables
    predigtid = sermon[0]
    predigttext = sermon[1] or ""                       # treat a missing (NULL) text like an empty one
    if predigtid in SKIPPED_SERMON_IDS:
        continue
    print(f"Starting. Nr: {k}, ID: {predigtid}")

    predigttext_cleaned = cleanup_sermon(predigttext)       # clean up raw markup
    soup = BeautifulSoup(predigttext_cleaned, 'html.parser')
    if len(soup.text) <= MIN_SERMON_CHARS:                  # only long enough sermons are written
        continue

    all_paragraphs = []
    for par in soup.find_all("p"):
        if par.find("p"):                                   # only handle innermost paragraphs
            continue

        rows = _paragraph_to_rows(par)
        if not rows:                                        # nothing left after cleaning
            continue

        inbetween_text += " ".join(word for word, _, _ in rows)
        all_paragraphs.append(_split_into_sentences(rows))

    # ensure_ascii=False writes the characters as they are, so the file
    # encoding is fixed explicitly instead of relying on the platform default.
    with open(f"sermons_chunked/{predigtid}.json", "w", encoding="utf-8") as f:
        json.dump(all_paragraphs, f, ensure_ascii=False)
    print(f"Saved {predigtid}")


Starting. Nr: 0, ID: E000001
Saved E000001
Starting. Nr: 1, ID: E000002
Saved E000002
Starting. Nr: 2, ID: E000003
Saved E000003
Starting. Nr: 3, ID: E000004
Starting. Nr: 4, ID: E000005
Starting. Nr: 5, ID: E000006
Starting. Nr: 6, ID: E000007
Saved E000007
Starting. Nr: 7, ID: E000008
Saved E000008
Starting. Nr: 8, ID: E000009
Saved E000009
Starting. Nr: 9, ID: E000010
Starting. Nr: 10, ID: E000011
Starting. Nr: 11, ID: E000012
Starting. Nr: 12, ID: E000013
Starting. Nr: 13, ID: E000014
Saved E000014
Starting. Nr: 14, ID: E000015
Saved E000015
Starting. Nr: 15, ID: E000016
Saved E000016
Starting. Nr: 16, ID: E000017
Starting. Nr: 18, ID: E000019
Starting. Nr: 19, ID: E000020
Saved E000020
Starting. Nr: 20, ID: E000021
Saved E000021
Starting. Nr: 21, ID: E000022
Starting. Nr: 22, ID: E000023
Saved E000023
Starting. Nr: 23, ID: E000024
Saved E000024
Starting. Nr: 24, ID: E000025
Starting. Nr: 25, ID: E000026
Starting. Nr: 26, ID: E000027
Saved E000027
Starting. Nr: 27, ID: E000028
Star