In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import db_connection
import ast
import re

from mysql.connector import Error
from core.utils import Sermon, is_id, get_short_info

from pathlib import Path
import os

import folium
import json
import os
from collections import Counter

from itertools import chain
from pathlib import Path

# root directory path
# `__file__` only exists when this code runs as a script/module; a Jupyter
# kernel does not define it and the lookup raises NameError. In that case fall
# back to the working directory.
# NOTE(review): the fallback assumes the kernel's working directory is the
# folder containing this notebook, so one level fewer is climbed — confirm.
try:
    ROOT = Path(__file__).resolve().parents[2]
except NameError:
    ROOT = Path.cwd().resolve().parents[1]

Counting the rows in the database yields:
- 1122 entries for sources
- 194 entries for music
- 1129 entries for literature

In [None]:
def list_all_sources():
    """Collect the references stored in every table under ``ROOT / "sermon_tables"``.

    Every regular file in that directory is read as a tab-separated table and
    its ``reference`` column is parsed cell by cell while loading. Falsy cells
    are dropped and the remaining cells are flattened into one list per file.

    Files are processed in sorted path order and the per-file de-duplication
    keeps first-seen order, so both returned lists are identical from run to
    run (``os.scandir`` yields entries in arbitrary order and ``set`` iteration
    order is not defined). Directory entries that are not regular files are
    skipped because they cannot be read as a table.

    Returns:
        tuple[list, list]: ``(all_sources, source_occurrences)``.
        ``all_sources`` holds every reference of every row, repetitions
        included. ``source_occurrences`` holds each distinct reference once
        per file in which it appears.
    """
    all_sources = []
    source_occurrences = []

    # Materialise the listing first so the directory handle is released
    # before any table is parsed.
    with os.scandir(ROOT / "sermon_tables") as it:
        table_paths = sorted(entry.path for entry in it if entry.is_file())

    for table_path in table_paths:
        # NOTE(review): pd.eval evaluates each cell as an expression. That is
        # acceptable only while the tables are trusted, project-generated
        # files; consider ast.literal_eval if they can come from elsewhere.
        df = pd.read_csv(table_path,
                         sep="\t",
                         converters={'reference': pd.eval})
        sources = df.reference.values.tolist()
        # Drop falsy cells (e.g. empty lists) before flattening.
        filtered_sources = [source for source in sources if source]
        flat_sources = list(chain.from_iterable(filtered_sources))
        # dict.fromkeys de-duplicates like a set but keeps first-seen order.
        unique_sources = dict.fromkeys(flat_sources)
        all_sources.extend(flat_sources)
        source_occurrences.extend(unique_sources)
    return all_sources, source_occurrences

In [3]:
# Read all tables once (see list_all_sources) and keep both result lists.
all_sources, source_occurrences = list_all_sources()

In [4]:
# Report the size of both lists, one number per line.
print(len(all_sources), len(source_occurrences), sep="\n")

112247
5722


In [None]:
# Write the per-file occurrences next to the project root as indented JSON.
with (ROOT / 'source_occurrences.json').open('w') as out_file:
    out_file.write(json.dumps(source_occurrences, indent=4))

In [None]:
# Write the full (repetitions included) reference list as indented JSON.
with (ROOT / 'all_sources.json').open('w') as out_file:
    out_file.write(json.dumps(all_sources, indent=4))

In [7]:
# Keep only the entries that contain an ID of the form "E", then 0 or 1,
# then five digits (searched anywhere in the string).
pattern = r'E[01][0-9]{5}'
all_source_ids = list(filter(lambda ref: re.search(pattern, ref), all_sources))
all_occ_ids = list(filter(lambda ref: re.search(pattern, ref), source_occurrences))

In [8]:
# Tally how often each ID appears across all rows, then keep the 20 IDs
# with the highest tallies (most common first).
id_counts = Counter()
id_counts.update(all_source_ids)
longest_quotes = [source_id for source_id, _ in id_counts.most_common(20)]

In [9]:
# Tally in how many files each ID appears, then keep the 20 IDs with the
# highest tallies (most common first).
id_occ_counts = Counter()
id_occ_counts.update(all_occ_ids)
most_frequent_quotes = [source_id for source_id, _ in id_occ_counts.most_common(20)]

In [10]:
# Scratch check on the second entry of longest_quotes: print its three-letter
# prefix and report whether that prefix is "E08" or "E09".
x = longest_quotes[1]
print(x[:3])
# `"E08" or "E09"` evaluates to "E08" alone, so comparing against it never
# matched "E09". str.startswith with a tuple tests both prefixes.
if x.startswith(("E08", "E09")):
    print("True")

E08
True


In [11]:
# Check the tuple form of str.startswith on a literal ID. The sample is bound
# to `sample_id` rather than `id` so the builtin `id()` stays usable in the
# notebook namespace.
sample_id = "E090024"
sample_id.startswith(("E08", "E09"))

True

In [12]:
# Print a short description for every ID in longest_quotes.
# The loop variable is not named `id`: a notebook-level loop variable outlives
# the loop and would shadow the builtin `id()` for every later cell.
for source_id in longest_quotes:
    print(f"{source_id} -- {get_short_info(source_id)}")

E000003 -- Conrad Dieterich, 1575/01/09 (Gemünden (Wohra))-1639/03/22 (Ulm): Vlmische Orgel Predigt (Ulm 1624)
E080193 -- Mithob, Hector: Psalmodia Christiana (1665)
E090778 -- Plautus, M. Accius: Marci Accii Plauti Comoediae (1874)
E080223 -- Praetorius, Michael: Syntagmatis Musici Michaelis Praetorii C. Tomus Secundus De Organographia (1619)
E000079 -- Johann Ludwig Hartmann, 1640/02/03 (Rothenburg ob der Tauber)-1684/07/18 (Rothenburg ob der Tauber): Denck- und Danck-Säule (Rothenburg ob der Tauber [1673])
E080732 -- Vockerodt, Gottfried: Mißbrauch der freyen Künste/ insonderheit Der Music (1697)
E080378 -- Zwinger, Theodor: Theatrvm Hvmanae Vitae Theodori Zuingeri Bas[isliensis] Tertiatione ([1586])
E080672 -- Scriver, Christian: Seelen=Schatzes Vierdter Theil (1687)
E000031 -- Christoph Frick, 1577 (Burgdorf)-1640/04/09 (): Musica Christiana (Leipzig 1615)
E000005 -- Conrad Dieterich, 1575/01/09 (Gemünden (Wohra))-1639/03/22 (Ulm): Kirchweih= oder Orgel=Predigt (Leipzig 1632)
E080

In [14]:
# Print a short description for every ID in most_frequent_quotes.
# The loop variable is not named `id`: a notebook-level loop variable outlives
# the loop and would shadow the builtin `id()` for every later cell.
for source_id in most_frequent_quotes:
    print(f"{source_id} -- {get_short_info(source_id)}")

E080223 -- Praetorius, Michael: Syntagmatis Musici Michaelis Praetorii C. Tomus Secundus De Organographia (1619)
E100017 -- anonym: In dulci jubilo
E080192 -- Gratianus de Clusio: Decretum Gratiani emendatum & notationibus illustratum, una cum glossis (1604)
E100022 -- Nicolai, Philipp: Wie schön leuchtet der Morgenstern
E100033 -- Anonym: Ach Gott wie manches Herzeleid
E090560 -- Augustinus, Aurelius ; Thimme, Wilhelm (Übers.): Confessiones. Bekenntnisse (2004)
Query executed for E100155, but no data found.
E100155 -- no_composer: no_title
E000003 -- Conrad Dieterich, 1575/01/09 (Gemünden (Wohra))-1639/03/22 (Ulm): Vlmische Orgel Predigt (Ulm 1624)
E100078 -- N.N.: Herr Gott, dich loben wir
E100125 -- Crüger, Johann: Sei Lob und Ehr dem höchsten Gut
E080378 -- Zwinger, Theodor: Theatrvm Hvmanae Vitae Theodori Zuingeri Bas[isliensis] Tertiatione ([1586])
E080322 -- Luther, Martin ; Sagittarius, Johann Christfried (Hrsg.): Der Achte Teil aller teutschen Bücher und Schrifften des theuren

In [18]:
# See the overlap between the sources with the most quoted words and the
# most frequently occurring sources.
sources_overlapping = set(longest_quotes) & set(most_frequent_quotes)
# A set has no defined iteration order, so sort the IDs to print the overlap
# in the same order on every run.
for source_id in sorted(sources_overlapping):
    print(f"{source_id} -- {get_short_info(source_id)}")

E000003 -- Conrad Dieterich, 1575/01/09 (Gemünden (Wohra))-1639/03/22 (Ulm): Vlmische Orgel Predigt (Ulm 1624)
E080378 -- Zwinger, Theodor: Theatrvm Hvmanae Vitae Theodori Zuingeri Bas[isliensis] Tertiatione ([1586])
E080223 -- Praetorius, Michael: Syntagmatis Musici Michaelis Praetorii C. Tomus Secundus De Organographia (1619)
E090570 -- Luther, Martin: D. Martin Luthers Werke (1969)
