Aufteilen und Bereinigen des Volltexts in die einzelnen Sätze

In [None]:
# -*- coding: utf-8 -*-
import re
import sqlite3

# Regex fragments used by split_into_sentences to recognise periods that do
# not end a sentence. They are raw strings so that sequences such as `\s`
# reach the regex engine unchanged instead of being parsed (and warned about)
# as string-literal escapes.
alphabets = r"([A-Za-z])"  # a single ASCII letter (initials, dotted acronyms)
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"  # titles followed by a period
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"  # company/name suffixes (period added at use site)
# Words that are treated as the start of a new sentence after an acronym/suffix.
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"  # two or three dotted capitals, e.g. "U.S." / "U.S.A."
websites = r"[.](com|net|org|io|gov|edu|me)"  # period in front of a common top-level domain
digits = r"([0-9])"  # a single digit (used on both sides of a decimal point)
multiple_dots = r'\.{2,}'  # run of two or more periods

# Open the SQLite database file and get a cursor for all statements below.
conn = sqlite3.connect('sentiment.db')
cursor = conn.cursor()

# Schema of the per-sentence table: one row per sentence, linked to its
# source row in `sentiment` through text_id. The *_result / *_score and
# final_sentiment columns are declared here but not filled by this script.
_CREATE_SENTENCES_TABLE = '''
    CREATE TABLE IF NOT EXISTS sentences (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        text_id INTEGER,
        sentence_text TEXT,
        sequence INTEGER,
        finbert_result TEXT,
        finbert_score REAL,
        dffnsa_result TEXT,
        dffnsa_score REAL,
        fsa_result TEXT,
        fsa_score REAL,
        final_sentiment TEXT,
        FOREIGN KEY (text_id) REFERENCES sentiment(id)
    )
'''
cursor.execute(_CREATE_SENTENCES_TABLE)

# Load every (id, text) pair from the `sentiment` table into memory.
text = cursor.execute('SELECT id, text FROM sentiment').fetchall()


def remove_author_notes(text):
    """
    Strip editorial boilerplate (update tags, bylines, datelines, credits).

    Four substitutions are applied in order:

    1. a parenthesised note at the very start, e.g. ``(Adds details) ...``;
    2. a byline from ``By`` up to and including the dash that follows a
       parenthesised tag, e.g. ``By Jane Doe LONDON (Agency) - ``;
    3. a dateline ``<word> <1-2 digits> (<tag>) - ``, e.g.
       ``March 5 (Agency) - ``;
    4. a trailing ``(Reporting by ...; additional reporting by ...;
       Editing by ...)`` credit (case-insensitive).

    :param text: text (typically a single sentence) to clean
    :type text: str

    :return: the text with the matched boilerplate removed
    :rtype: str
    """
    # Leading parenthesised update/remark plus the whitespace after it.
    text = re.sub(r'^\([^\)]+\)\s*', '', text)

    # Author and location up to the dash (hyphen or en dash).
    text = re.sub(r'By\s.+?\s\(.+?\)\s*[-–]\s*', '', text)

    # Month and day. `\b\w{3,}` consumes the whole month word; a bare
    # `\w{3}` would match only the last three letters of "March" and leave
    # "Ma" behind. The dash accepts the same variants as the byline pattern.
    text = re.sub(r'\b\w{3,}\s\d{1,2}\s\([^)]*\)\s*[-–]\s*', '', text)

    # Reporter/editor credits at the very end of the text.
    text = re.sub(r'\(Reporting by [\w\s,]+;\s*additional reporting by [\w\s,]+;\s*Editing by [\w\s,]+\)$', '', text, flags=re.IGNORECASE)

    return text

# https://stackoverflow.com/a/31505798 (source)
def split_into_sentences(text: str) -> list[str]:
    """
    Split the text into sentences.

    Periods that do not end a sentence (titles, domains, decimals, common
    abbreviations, initials, acronyms, company suffixes) are temporarily
    replaced by the marker "<prd>". Every remaining ".", "?" and "!" is then
    followed by "<stop>", and the text is split on that marker.

    If the text contains substrings "<prd>" or "<stop>", they are treated as
    these markers and therefore influence the splitting. The placeholders
    "<ellipsis>", "<qst>" and "<exc>", if present in the input, are turned
    back into "...", "?" and "!" without creating a sentence break.

    :param text: text to be split into sentences
    :type text: str

    :return: list of sentences, each stripped of surrounding whitespace
    :rtype: list[str]
    """
    # Pad so that patterns anchored on a neighbouring space also match at the
    # very start and end of the text; newlines count as plain spaces.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods after titles, before top-level domains and in decimals.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    # A run of dots keeps its length but ends the sentence exactly once.
    text = re.sub(multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text)

    # Fixed abbreviations whose periods never end a sentence. The order is
    # kept as-is; str.replace is a no-op when the abbreviation is absent.
    abbreviations = (
        "Ph.D.", "No.", "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.",
        "Aug.", "Sep.", "Sept.", "Oct.", "Nov.", "Dec.", "Corp.", "Ltd.",
        "vs.", "e.g.", "i.e.", "Sen.", "Calif.", "Gov.",
    )
    for abbreviation in abbreviations:
        text = text.replace(abbreviation, abbreviation.replace(".", "<prd>"))

    # Single-letter initials, dotted acronyms and company suffixes. An
    # acronym or suffix directly followed by a typical sentence starter is
    # treated as a sentence end; otherwise its period is protected.
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminal punctuation outside a closing quote so the quote mark
    # stays attached to the sentence it closes.
    for inside, outside in ((".”", "”."), (".\"", "\"."), ("!\"", "\"!"), ("?\"", "\"?")):
        text = text.replace(inside, outside)

    # Every terminator that is still present marks a sentence boundary.
    for terminator in ".?!":
        text = text.replace(terminator, terminator + "<stop>")

    # Restore the protected characters; this happens after the boundaries
    # were marked, so the restored characters do not create new breaks.
    for placeholder, original in (("<prd>", "."), ('<ellipsis>', '...'), ('<qst>', '?'), ('<exc>', '!')):
        text = text.replace(placeholder, original)

    sentences = [s.strip() for s in text.split("<stop>")]
    # The padding after the last terminator yields one empty trailing entry.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences

def remove_parenthetical_sentences(sentences):
    """
    Drop every sentence that is entirely enclosed in parentheses.

    :param sentences: iterable of sentence strings
    :return: new list with the remaining sentences in their original order
    :rtype: list
    """
    def is_fully_parenthesised(candidate):
        # True when the sentence opens with "(" and closes with ")".
        return candidate.startswith('(') and candidate.endswith(')')

    return [candidate for candidate in sentences if not is_fully_parenthesised(candidate)]

def merge_parenthetical_statements(text):
    """
    Replace periods inside parentheses with the placeholder "<prd>".

    Only non-empty ``( ... )`` groups without an inner ")" are rewritten;
    all text outside of them is returned unchanged.

    :param text: text to process
    :type text: str
    :return: text with periods inside parentheses replaced
    :rtype: str
    """
    def protect_periods(match):
        # The enclosing parentheses contain no period themselves, so the
        # whole match can be rewritten in one go.
        return match.group(0).replace('.', '<prd>')

    return re.sub(r'\(([^)]+)\)', protect_periods, text)

def replace_ellipses(text):
    """
    Replace each run of three periods with the placeholder "<ellipsis>".

    :param text: text to process
    :type text: str
    :return: text with every '...' replaced
    :rtype: str
    """
    three_dots = re.compile(r'\.\.\.')
    return three_dots.sub('<ellipsis>', text)

def replace_sentence_stops_in_quotes(text):
    """
    Replace sentence-ending punctuation inside quoted passages by placeholders.

    Within each span that starts and ends with a straight or curly double
    quote, "." becomes "<prd>", "?" becomes "<qst>" and "!" becomes "<exc>".
    Text outside such spans is left untouched.

    :param text: text to process
    :type text: str
    :return: text with punctuation inside quotes replaced
    :rtype: str
    """
    # Punctuation mark -> placeholder, applied in this order.
    placeholders = (('.', '<prd>'), ('?', '<qst>'), ('!', '<exc>'))

    def mask_punctuation(match):
        # Rewrite the whole quoted span, quote characters included.
        quoted = match.group(0)
        for mark, token in placeholders:
            quoted = quoted.replace(mark, token)
        return quoted

    # Shortest possible span between two double-quote characters.
    return re.sub(r'["“”](.*?)["“”]', mask_punctuation, text, flags=re.UNICODE)


def _store_sentences(db_cursor, text_id, full_text):
    """
    Split one full text into cleaned sentences and insert them.

    :param db_cursor: cursor used to run the INSERT statements
    :param text_id: id of the source row; stored in ``sentences.text_id``
    :param full_text: the full text to split; NULL/empty texts are skipped
    """
    if not full_text:
        # A NULL or empty text contains no sentences; skipping it avoids a
        # TypeError from the regex helpers on None.
        return

    # Mask punctuation that must not split a sentence (inside quotes,
    # ellipses, inside parentheses) before the actual splitting.
    masked = replace_sentence_stops_in_quotes(full_text)
    masked = replace_ellipses(masked)
    masked = merge_parenthetical_statements(masked)
    split_text = split_into_sentences(masked)

    # Drop fully parenthesised sentences, then strip author notes from the rest.
    sentences = [
        remove_author_notes(sentence)
        for sentence in remove_parenthetical_sentences(split_text)
    ]

    # Insert each sentence into the database. `sequence` is the position
    # among all cleaned sentences, so skipped sentences leave gaps.
    for sequence, sentence in enumerate(sentences):
        # Skip sentences shorter than 20 or longer than 1100 characters.
        if not 20 <= len(sentence) <= 1100:
            continue
        db_cursor.execute('''
            INSERT INTO sentences (text_id, sentence_text, sequence)
            VALUES (?, ?, ?)
        ''', (text_id, sentence, sequence))


try:
    for text_id, full_text in text:
        _store_sentences(cursor, text_id, full_text)
    # Commit once, after all texts were processed successfully.
    conn.commit()
finally:
    # Release the connection even if processing a text raised.
    conn.close()