In [None]:
import json
import re
import core.utils as oa
from rapidfuzz import fuzz
import pandas as pd
import statistics

import io
import datetime
from numpyencoder import NumpyEncoder

In [2]:
def fuzzy_match(a, b, threshold=80):
    similarity_score = fuzz.ratio(a, b)
    return similarity_score

In [3]:
def is_song_in_book(id):
    match = re.findall(r'E10[0-9]{4}', id)[0]
    
    with open('songs_to_pages_mapping.json') as f:
        songbook_pages = json.load(f)
    if songbook_pages[match]["pages"] == '':
        return False
    else:
        return True

In [4]:
def song_page(id): 
    match = re.findall(r'E10[0-9]{4}', id)[0]
    with open('songs_to_pages_mapping.json') as f:
        songbook_pages = json.load(f)
    page = songbook_pages[match]["pages"]
    
    return [int(page) + 42, int(page) + 43, int(page) + 44]

In [5]:

def flatten(xss):
    return [x for xs in xss for x in xs]

In [6]:
def is_consecutive(L):
    return all(n-i==L[0] for i,n in enumerate(L))

In [7]:
def is_equal(L):
    return all(n == L[0] for n in L)

In [8]:
def check_page_proxy(numbers):
  """
  Checks if a list of numbers are either all the same or have a maximum difference of 1 between any two numbers.

  Args:
    numbers: A list of numbers.

  Returns:
    True if the numbers meet the criteria, False otherwise.  Returns False if the list is empty.
  """

  if not numbers:
    return False  # Handle empty list case

  first_number = numbers[0]
  all_same = True
  max_diff_one = True

  for number in numbers:
    if number != first_number:
      all_same = False
    if abs(number - first_number) > 1:
      max_diff_one = False

  return all_same or max_diff_one

In [9]:
def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Removes duplicate matches for sentences in quote classification

    Args:
        df (pd.DataFrame): The Dataframe containing the sentence matches

    Returns:
        pd.DataFrame: The Dataframe with duplicates removed
    """
    def find_duplicate_satz(df):
        duplicate_indices = {}
        for par_satz, indices in df.groupby(['Paragraph', 'Satz']).groups.items():
            if len(indices) > 1:
                satz_id = str(par_satz[0]) +  "-" + str(par_satz[1])
                duplicate_indices[satz_id] = list(indices)  # Convert indices to a list
        return duplicate_indices

    for satz_id, indices in find_duplicate_satz(df.copy()).items():
        satz = [int(x) for x in satz_id.split("-")]
        if indices[0]-1 in df.index:
            check = df["Liederbuch"][indices[0]-1]
        else:
            check = df["Liederbuch"][indices[-1]+1]
        matches = df.query(f"Paragraph == {satz[0]} and Satz == {satz[1]} and Liederbuch == {check}").index
        if len(matches):
            match_index = matches[0]
            for i in indices:
                if i != match_index:
                    df.drop([i], inplace=True)

    df.sort_values(by=["Ähnlichkeit"], inplace=True)
    df.drop_duplicates(subset=['Paragraph','Satz'], keep='last', inplace=True)
    df.sort_index(inplace=True)
    df.reset_index(drop=True)

    return df

In [10]:
def reconsider_match(sent, pages):
    highest_match = 0
    matches = {}
    for page in pages:
        with open(f"source_texts/praxis_pietatis_verses/{page}.json") as f:
            verses = json.load(f)
        
        for verse in verses:
            sim_score = fuzz.ratio(sent, verse)
            if sim_score > highest_match:
                highest_match = sim_score
                matches[sim_score] = [verse, page]

    if highest_match > 0:
        return [matches[highest_match], highest_match]
    else:
        return [["no match", 0], 0.0]

In [11]:
def add_inferred_matches(guessed_hits: pd.DataFrame, sermon: oa.Sermon) -> pd.DataFrame:
    for n in range(3):
        additional_matches = []
        sent_add = lambda x : [x+2,x+3,x+4]
        for i in range(0, len(guessed_hits) - 2):
            chunk = guessed_hits.iloc[i:i+2]
            pages = chunk["Liederbuch"].to_list()
            pars = chunk["Paragraph"].to_list()
            sents = chunk["Satz"].to_list()
            if all(x==pars[0] for x in pars):   # abort if paragraphs change
                if sents[1] in sent_add(sents[0]):
                    missing_sent = " ".join(sermon.chunked[pars[0]][sents[0]+1]["words"])
                    match, sim_score = reconsider_match(missing_sent, [pages[0], pages[1]])
                    verse = match[0]
                    page = match[1]
                    additional_matches.append([missing_sent, 
                                            pars[0],
                                            sents[0]+1, 
                                            page, 
                                            verse, 
                                            float(f"{sim_score:.2f}"), 
                                            False])
                    
        new_matches = pd.DataFrame(additional_matches, columns=["Predigt", "Paragraph", "Satz", 
                                                        "Liederbuch", "Liedvers", 
                                                        "Ähnlichkeit", "Dopplung"])

        guessed_hits = pd.concat([guessed_hits, new_matches])
        guessed_hits.sort_values(["Paragraph", "Satz"], ascending=True, inplace=True)
        guessed_hits.reset_index(drop=True)
    
    return guessed_hits

In [12]:
def correct_inbetween_matches(df: pd.DataFrame) -> pd.DataFrame:
    for i in range(0, len(df) - 3):
        chunk = df.iloc[i:i+3]
        pages = chunk["Liederbuch"].to_list()
        pars = chunk["Paragraph"].to_list()
        sents = chunk["Satz"].to_list()
        if (all(x==pars[0] for x in pars) and not is_equal(pages)):   # abort if paragraphs change or pages are already the same
            if pages[0] == pages[2]:
                missing_sent = chunk["Predigt"][chunk.index[1]]
                print(missing_sent)
                match, sim_score = reconsider_match(missing_sent, [pages[0]])
                if sim_score > 60:
                    verse = match[0]
                    page = match[1]
                    new_data = [missing_sent, pars[1], sents[1], page, verse, float(f"{sim_score:.2f}"), False]
                    df.loc[(df['Paragraph'] == pars[1]) & (df["Satz"] == sents[1])] = new_data
                    #df.iloc[i] = new_data

    return df

In [13]:
relevant_page_texts = []
#for n in page_nrs:
for n in range(41, 1291):
    with open(f"source_texts/praxis_pietatis_verses/{n}.json") as f:
        page = json.load(f)
    page_info = {}
    page_info[n] = page
    relevant_page_texts.append(page_info)

In [14]:
with open("sermons_with_most_music.json", "r") as f:
    testsermons = json.load(f)

In [15]:
fuzziness = 80

In [17]:
#for id in testsermons:
id = "E000061"
print(f"Starting with {id}")
sermon = oa.Sermon(id)

# perform classification
hits = []
for i in range(len(sermon.chunked)):                # for each paragraph
    for j in range(len(sermon.chunked[i])):         # for each sentence
        if " bibel" in sermon.chunked[i][j]["types"]:
            continue
        else:
            query = " ".join(sermon.chunked[i][j]["words"])
            query = re.sub(r'[/.,;:?!]', '', query)
            for page in relevant_page_texts:
                for pagenr, verses in page.items():
                    for verse in verses:
                        sim_score = fuzz.ratio(query, verse)
                        if sim_score >= fuzziness:
                            hits.append([query, i, j, pagenr, verse, float(f"{sim_score:.2f}")])

guessed_hits = pd.DataFrame(hits, columns=["Predigt", "Paragraph", "Satz", "Liederbuch", "Liedvers", "Ähnlichkeit"])     # create dataframe
guessed_hits['Dopplung'] = guessed_hits.groupby('Satz')['Satz'].transform(lambda x: x.duplicated())

guessed_hits = remove_duplicates(guessed_hits).reset_index(drop=True)

guessed_hits = add_inferred_matches(guessed_hits, sermon)
guessed_hits = correct_inbetween_matches(guessed_hits)

guessed_hits.sort_values("Satz", ascending=True, inplace=True)
guessed_hits.reset_index(drop=True)

Starting with E000061
Query executed for E081026, but no data found.
Query executed for E081028, but no data found.
Query executed for E100159, but no data found.
Query executed for E100157, but no data found.
Query executed for E100160, but no data found.
Query executed for E100158, but no data found.
Query executed for E081029, but no data found.


  guessed_hits = pd.concat([guessed_hits, new_matches])


Unnamed: 0,Predigt,Paragraph,Satz,Liederbuch,Liedvers,Ähnlichkeit,Dopplung
0,wann dort herr jesu,48,0,473,wann dort herr jesu,100.0,True
1,herr gott vater mein starcker held etc,13,0,741,herr gott vater mein starcker held,94.44,False
2,und dergleichen stimmwercke mehr,19,0,299,und dergleichen trebern mehr,80.0,True
3,schickt das hertze da hinein,43,0,591,schickt das hertze dahinein,98.18,True
4,herr gott dich loben wir,72,0,969,herr gott dich loben wir,100.0,True
5,er hilft aus noht,36,0,165,du hilfft aus noth,80.0,True
6,"wird vor deinem throne,",48,1,473,wer doch gewesen die die person,48.15,False
7,den 5. 6. und 7.,13,1,741,daß mein schatz ist das a und d,42.55,False
8,wo ihr ewig wünscht zu seyn,43,1,591,wo ihr ewig wunscht zu feyn,92.59,False
9,"der fromme gott, und züchtiget mit massen;",36,1,875,er wird mich noch zu seiner zeit nicht lassen,52.87,False


In [None]:
validation = []
for i in range(len(sermon.chunked)):                # for each paragraph
    for j in range(len(sermon.chunked[i])):         # for each sentence
        if " musikwerk" in sermon.chunked[i][j]["types"]:
            line = " ".join(sermon.chunked[i][j]["words"])
            refs = ", ".join(set(flatten(sermon.chunked[i][j]["references"])))
            validation.append([line, i, j, refs])

known_hits = pd.DataFrame(validation, columns=["Predigt", "Paragraph", "Satz", "Referenz"])
known_hits = known_hits[known_hits['Referenz'].apply(is_song_in_book)]
known_hits["Ref_Seite"] = known_hits['Referenz'].apply(song_page)

converged_df = pd.merge(known_hits, guessed_hits, on=['Paragraph','Satz'], how='inner')
converged_df["in_page_list"]  = converged_df.apply(lambda row: row['Liederbuch'] in row['Ref_Seite'], axis=1)

# analysis
val_hits = len(known_hits)

merged_df = pd.merge(guessed_hits, known_hits, on=['Paragraph', 'Satz'], how='left', indicator=True)


Unnamed: 0,Predigt_x,Paragraph,Satz,Referenz,Ref_Seite,Predigt_y,Liederbuch,Liedvers,Ähnlichkeit,Dopplung,in_page_list
0,herr gott vater mein starcker held etc.,13,0,E100022,"[740, 741, 742]",herr gott vater mein starcker held etc,741,herr gott vater mein starcker held,94.44,False,True
1,wie schön leuchtet der morgenstern.,13,3,E100022,"[740, 741, 742]",wie schön leuchtet der morgenstern,71,wie schone leuchtt der morgenstern,94.12,True,False
2,"er hilft aus noht,",36,0,E100157,"[771, 772, 773]",er hilft aus noht,165,du hilfft aus noth,80.0,True,False
3,"der fromme gott, und züchtiget mit massen;",36,1,E100157,"[771, 772, 773]","der fromme gott, und züchtiget mit massen;",875,er wird mich noch zu seiner zeit nicht lassen,52.87,False,False
4,"wer gott vertraut, fest auf jhn baut,",36,2,E100157,"[771, 772, 773]",wer gott vertraut fest auf jhn baut,875,wer gott vertraut vest darauff baut,85.71,False,False
5,den wil er nicht verlassen.,36,3,E100157,"[771, 772, 773]",den wil er nicht verlassen,771,den wird er nicht verlassen,94.34,True,True
6,"schickt das hertze da hinein,",43,0,E100158,"[590, 591, 592]",schickt das hertze da hinein,591,schickt das hertze dahinein,98.18,True,True
7,wo ihr ewig wünscht zu seyn.,43,1,E100158,"[590, 591, 592]",wo ihr ewig wünscht zu seyn,591,wo ihr ewig wunscht zu feyn,92.59,False,True
8,"wann dort, herr jesu,",48,0,E100159,"[472, 473, 474]",wann dort herr jesu,473,wann dort herr jesu,100.0,True,True
9,"wird vor deinem throne,",48,1,E100159,"[472, 473, 474]","wird vor deinem throne,",473,wer doch gewesen die die person,48.15,False,True


In [20]:
merged_df

Unnamed: 0,Predigt_x,Paragraph,Satz,Liederbuch,Liedvers,Ähnlichkeit,Dopplung,Predigt_y,Referenz,Ref_Seite,_merge
0,wann dort herr jesu,48,0,473,wann dort herr jesu,100.0,True,"wann dort, herr jesu,",E100159,"[472, 473, 474]",both
1,herr gott vater mein starcker held etc,13,0,741,herr gott vater mein starcker held,94.44,False,herr gott vater mein starcker held etc.,E100022,"[740, 741, 742]",both
2,und dergleichen stimmwercke mehr,19,0,299,und dergleichen trebern mehr,80.0,True,,,,left_only
3,schickt das hertze da hinein,43,0,591,schickt das hertze dahinein,98.18,True,"schickt das hertze da hinein,",E100158,"[590, 591, 592]",both
4,herr gott dich loben wir,72,0,969,herr gott dich loben wir,100.0,True,"herr gott dich loben wir,",E100078,"[642, 643, 644]",both
5,er hilft aus noht,36,0,165,du hilfft aus noth,80.0,True,"er hilft aus noht,",E100157,"[771, 772, 773]",both
6,"wird vor deinem throne,",48,1,473,wer doch gewesen die die person,48.15,False,"wird vor deinem throne,",E100159,"[472, 473, 474]",both
7,den 5. 6. und 7.,13,1,741,daß mein schatz ist das a und d,42.55,False,,,,left_only
8,wo ihr ewig wünscht zu seyn,43,1,591,wo ihr ewig wunscht zu feyn,92.59,False,wo ihr ewig wünscht zu seyn.,E100158,"[590, 591, 592]",both
9,"der fromme gott, und züchtiget mit massen;",36,1,875,er wird mich noch zu seiner zeit nicht lassen,52.87,False,"der fromme gott, und züchtiget mit massen;",E100157,"[771, 772, 773]",both


In [None]:
test_score = {}

test_score["type"] = "similarity"
test_score["fuzziness"] = fuzziness
test_score["date"] = datetime.datetime.now()
test_score["sermons"] = []

for id in testsermons:
    print(f"Starting with {id}")
    sermon = oa.Sermon(id)

    # perform classification
    hits = []
    for i in range(len(sermon.chunked)):                # for each paragraph
        for j in range(len(sermon.chunked[i])):         # for each sentence
            if " bibel" in sermon.chunked[i][j]["types"]:
                continue
            else:
                query = " ".join(sermon.chunked[i][j]["words"])
                query = re.sub(r'[/.,;:?!]', '', query)
                for page in relevant_page_texts:
                    for pagenr, verses in page.items():
                        for verse in verses:
                            sim_score = fuzz.ratio(query, verse)
                            if sim_score >= fuzziness:
                                hits.append([query, i, j, pagenr, verse, float(f"{sim_score:.2f}")])
    
    guessed_hits = pd.DataFrame(hits, columns=["Predigt", "Paragraph", "Satz", "Liederbuch", "Liedvers", "Ähnlichkeit"])     # create dataframe
    guessed_hits['Dopplung'] = guessed_hits.groupby('Satz')['Satz'].transform(lambda x: x.duplicated())

    guessed_hits = remove_duplicates(guessed_hits).reset_index(drop=True)

    guessed_hits = add_inferred_matches(guessed_hits, sermon)
    guessed_hits = correct_inbetween_matches(guessed_hits)
    
    guessed_hits.sort_values("Satz", ascending=True, inplace=True)
    guessed_hits.reset_index(drop=True)

    # create validation set
    validation = []
    for i in range(len(sermon.chunked)):                # for each paragraph
        for j in range(len(sermon.chunked[i])):         # for each sentence
            if " musikwerk" in sermon.chunked[i][j]["types"]:
                line = " ".join(sermon.chunked[i][j]["words"])
                refs = ", ".join(set(flatten(sermon.chunked[i][j]["references"])))
                validation.append([line, i, j, refs])

    known_hits = pd.DataFrame(validation, columns=["Predigt", "Paragraph", "Satz", "Referenz"])
    known_hits = known_hits[known_hits['Referenz'].apply(is_song_in_book)]
    known_hits["Ref_Seite"] = known_hits['Referenz'].apply(song_page)

    converged_df = pd.merge(known_hits, guessed_hits, on=['Paragraph','Satz'], how='inner')
    converged_df["in_page_list"]  = converged_df.apply(lambda row: row['Liederbuch'] in row['Ref_Seite'], axis=1)

    # analysis
    val_hits = len(known_hits)
    
    merged_df = pd.merge(guessed_hits, known_hits, on=['Paragraph', 'Satz'], how='left', indicator=True)
    hits_not_in_val = len(merged_df[merged_df['_merge'] == 'left_only'].drop('_merge', axis=1))
    
    agreed_hits = converged_df["in_page_list"].value_counts()[True]
    divergent_hits = len(converged_df) - agreed_hits
    missed_hits = len(known_hits) - (agreed_hits + divergent_hits)
    avg_certainty = guessed_hits["Ähnlichkeit"].mean()

    precision = agreed_hits / (agreed_hits + divergent_hits + hits_not_in_val)
    recall = agreed_hits / val_hits

    f1 = (2 * precision * recall) / (precision + recall)

    results = {}

    results["id"] = id
    results["agreed_hits"] = agreed_hits
    results["divergent_hits"] = divergent_hits
    results["new_hits"] = hits_not_in_val
    results["missed_hits"] = missed_hits
    results["avg_certainty"] = avg_certainty

    results["precision"] = precision
    results["recall"] = recall
    results["f1-score"] = f1

    test_score["sermons"].append(results)

all_precision = [x["precision"] for x in test_score["sermons"]]
all_recall = [x["recall"] for x in test_score["sermons"]]
all_f1 = [x["f1-score"] for x in test_score["sermons"]]
all_avg_cert = [x["avg_certainty"] for x in test_score["sermons"]]

test_score["overall_precision"] = statistics.mean(all_precision)
test_score["overall_recall"] = statistics.mean(all_recall)
test_score["overall_f1"] = statistics.mean(all_f1)
test_score["overall_certainty"] = statistics.mean(all_avg_cert)

print(test_score)

Starting with E000036
Query executed for E100181, but no data found.
Query executed for E081074, but no data found.
Query executed for E081059, but no data found.
Query executed for E081062, but no data found.
Query executed for E081071, but no data found.
Query executed for E081061, but no data found.
Query executed for E081063, but no data found.
Query executed for E080981, but no data found.
Query executed for E100177, but no data found.
Query executed for E100185, but no data found.
Query executed for E081069, but no data found.
Query executed for E081060, but no data found.
Query executed for E081065, but no data found.
Query executed for E100180, but no data found.
Query executed for E081067, but no data found.
Query executed for E080921, but no data found.
Query executed for E081072, but no data found.
Query executed for E100182, but no data found.
Query executed for E081064, but no data found.
Query executed for E100158, but no data found.
Query executed for E080938, but no dat

  guessed_hits = pd.concat([guessed_hits, new_matches])


jst mein herr jesus christ
kein ohr hat je gehört
du solt seyn meines hertzenslicht
Starting with E000072
Starting with E000070


  guessed_hits = pd.concat([guessed_hits, new_matches])
  guessed_hits = pd.concat([guessed_hits, new_matches])


dem thu ich mich ergeben
Starting with E000055


  guessed_hits = pd.concat([guessed_hits, new_matches])
  guessed_hits = pd.concat([guessed_hits, new_matches])


Starting with E000061
Query executed for E081026, but no data found.
Query executed for E081028, but no data found.
Query executed for E100158, but no data found.
Query executed for E100160, but no data found.
Query executed for E100159, but no data found.
Query executed for E081029, but no data found.
Query executed for E100157, but no data found.
{'type': 'similarity', 'fuzziness': 80, 'date': datetime.datetime(2025, 8, 18, 0, 32, 11, 577579), 'sermons': [{'id': 'E000036', 'agreed_hits': 90, 'divergent_hits': 7, 'new_hits': 25, 'missed_hits': 29, 'avg_certainty': 87.71073770491805, 'precision': 0.7377049180327869, 'recall': 0.7142857142857143, 'f1-score': 0.7258064516129032}, {'id': 'E000072', 'agreed_hits': 29, 'divergent_hits': 5, 'new_hits': 13, 'missed_hits': 21, 'avg_certainty': 85.10021276595744, 'precision': 0.6170212765957447, 'recall': 0.5272727272727272, 'f1-score': 0.5686274509803921}, {'id': 'E000070', 'agreed_hits': 31, 'divergent_hits': 12, 'new_hits': 16, 'missed_hits'

  guessed_hits = pd.concat([guessed_hits, new_matches])


In [17]:
for key, value in test_score.items():
    print((key, value))

('type', 'similarity')
('fuzziness', 80)
('date', datetime.datetime(2025, 8, 18, 0, 32, 11, 577579))
('sermons', [{'id': 'E000036', 'agreed_hits': 90, 'divergent_hits': 7, 'new_hits': 25, 'missed_hits': 29, 'avg_certainty': 87.71073770491805, 'precision': 0.7377049180327869, 'recall': 0.7142857142857143, 'f1-score': 0.7258064516129032}, {'id': 'E000072', 'agreed_hits': 29, 'divergent_hits': 5, 'new_hits': 13, 'missed_hits': 21, 'avg_certainty': 85.10021276595744, 'precision': 0.6170212765957447, 'recall': 0.5272727272727272, 'f1-score': 0.5686274509803921}, {'id': 'E000070', 'agreed_hits': 31, 'divergent_hits': 12, 'new_hits': 16, 'missed_hits': 14, 'avg_certainty': 91.39457627118645, 'precision': 0.5254237288135594, 'recall': 0.543859649122807, 'f1-score': 0.5344827586206897}, {'id': 'E000055', 'agreed_hits': 12, 'divergent_hits': 8, 'new_hits': 16, 'missed_hits': 6, 'avg_certainty': 86.30555555555556, 'precision': 0.3333333333333333, 'recall': 0.46153846153846156, 'f1-score': 0.38709

In [None]:
test_score["date"] = test_score["date"].isoformat()


'2025-08-18T00:32:11.577579'

In [29]:
with open("test_results.json", "r") as f:
    test_results = json.load(f)

test_results.append(test_score)

with open("test_results.json", "w") as f:
    json.dump(test_results, f, ensure_ascii=False, cls=NumpyEncoder)