In [None]:
import spacy
from spacy_layout import spaCyLayout
from pipeline_merge_pdf import process_pdf_folder
import os
import time

# Folder that holds the PDFs, and the expected location of the merged file.
pdf_folder = "../data/DSA/Der_blaue_Bruder"
merged_pdf_path = os.path.join(pdf_folder, "_combined.pdf")

# When no merged file is on disk yet, hand the folder to the merge pipeline and
# use the doc it returns; otherwise parse the existing file with spaCy-layout.
if not os.path.exists(merged_pdf_path):
    print("Die Datei _combined.pdf existiert noch nicht. Merge-Pipeline wird durchgeführt..")
    doc = process_pdf_folder(pdf_folder)
else:
    print("Die Datei _combined.pdf existiert bereits unter folgendem Pfad:",merged_pdf_path)
    print("Lade die Datei lokal..\n")
    time.sleep(0.5)
    # Blank German pipeline; spaCyLayout wraps it to turn the PDF into a Doc.
    nlp = spacy.blank("de")
    layout = spaCyLayout(nlp)
    doc = layout(merged_pdf_path)

# Report how many spans are stored under the "layout" key of the Doc.
print("\nAnzahl Layout-Spans:", len(doc.spans["layout"]))


In [None]:
from pipeline_preproc_pdf import process_doc_into_posts

# Extract the posts from the parsed document (cleaning happens inside the helper,
# per the original note — see pipeline_preproc_pdf for details).
posts = process_doc_into_posts(doc)

# Print every post under a 1-based heading.
for number, text in enumerate(posts, start=1):
    print(f"--- Post {number} ---\n{text}\n")

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer with default settings (no stop-word removal) on the
# posts. `tfidf` and `result` are reused by the following inspection cells.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(posts)

In [None]:
# List every vocabulary term next to its learned inverse document frequency.
print('\nidf values:')
idf_by_term = zip(tfidf.get_feature_names_out(), tfidf.idf_)
for term, weight in idf_by_term:
    print(term, ':', weight)

In [None]:
# Dump the term -> column mapping, the sparse TF-IDF matrix and its dense form.
# Each item goes on its own line, exactly as separate print() calls would do.
# NOTE: result.toarray() materialises the full dense matrix.
print(
    '\nWord indexes:',
    tfidf.vocabulary_,
    '\ntf-idf value:',
    result,
    '\ntf-idf values in matrix form:',
    result.toarray(),
    sep='\n',
)

In [None]:
# Installation hint (only needed when the packages are missing):
# !pip install nltk seaborn matplotlib scikit-learn

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# German stop words that the vectorizer should ignore.
german_stopwords = stopwords.words('german')

# Fit TF-IDF on the posts with the stop words removed.
vectorizer = TfidfVectorizer(stop_words=german_stopwords)
X = vectorizer.fit_transform(posts)

# Dense table for readability: one row per post, one column per term.
post_labels = [f"Post {number}" for number in range(1, len(posts) + 1)]
df_tfidf = pd.DataFrame(
    X.toarray(),
    index=post_labels,
    columns=vectorizer.get_feature_names_out(),
)

df_tfidf


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

# ------------------------------
# 1. Compute TF-IDF per post
# ------------------------------
# NOTE: `stopwords` is not imported in this cell; it must already be in scope
# from an earlier cell (from nltk.corpus import stopwords).
german_stopwords = stopwords.words('german') # load the German stop words from NLTK

# Fit on the posts; X holds the TF-IDF weights, feature_names the terms as a
# NumPy array so it can be indexed with index arrays further down.
vectorizer = TfidfVectorizer(stop_words=german_stopwords)
X = vectorizer.fit_transform(posts)
feature_names = np.array(vectorizer.get_feature_names_out())

# ------------------------------
# 2. Relevante Wörter pro Post
# ------------------------------
def top_words_per_post(tfidf_vector, features, top_k=5):
    """Return the highest-weighted terms of a single TF-IDF row.

    Parameters
    ----------
    tfidf_vector : sparse matrix with a single row
        Must provide ``toarray()``; only the first row is read.
    features : numpy.ndarray
        Term for each column of ``tfidf_vector`` (indexed with an index array).
    top_k : int, default 5
        Maximum number of terms to return.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The selected terms and their weights, ordered by descending weight.
        Terms with a weight of zero are never returned, so fewer than
        ``top_k`` entries come back when the row has fewer non-zero weights
        (both arrays are empty for an all-zero row).
    """
    # Densify once and reuse the row for ranking and for the returned scores.
    weights = tfidf_vector.toarray()[0]
    ranked = weights.argsort()[::-1][:top_k]
    # A zero weight means the term does not occur in this post; without this
    # filter short posts would be padded with arbitrary vocabulary entries.
    ranked = ranked[weights[ranked] > 0]
    return features[ranked], weights[ranked]

# Top-weighted words of every post, one array per post in post order.
top_words = [
    top_words_per_post(X[i], feature_names)[0]
    for i in range(len(posts))
]

# Sorted, de-duplicated set of all top words across the posts.
flat_words = np.unique(np.concatenate(top_words))

# ------------------------------
# 3. Cluster the words
# ------------------------------

# Describe each top word by its TF-IDF weights across the posts (one row per
# word, one column per post). Passing the bare words through
# vectorizer.transform() would instead give one-hot rows that are all equally
# far apart, leaving KMeans nothing meaningful to separate.
word_columns = [vectorizer.vocabulary_[word] for word in flat_words]
word_vectors = X[:, word_columns].T.tocsr()

# KMeans needs at least as many samples as clusters, so cap k for tiny inputs.
k = min(3, len(flat_words))
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(word_vectors)

# Cluster id -> words assigned to it
clusters = {}
for word, label in zip(flat_words, labels):
    clusters.setdefault(label, []).append(word)

# ------------------------------
# 4. Name the clusters
# ------------------------------
# Placeholder: the first word of each cluster is used as its name. Because
# flat_words is sorted, that is the alphabetically first word, not the most
# frequent one. TODO: pick a representative word instead.
cluster_labels = {}
for cid, words in clusters.items():
    cluster_labels[cid] = words[0]

# ------------------------------
# 5. Frequency: how often does each cluster occur among a post's top words?
# ------------------------------
# Direct word -> cluster lookup instead of a linear search per word.
word_to_cluster = dict(zip(flat_words, labels))

cluster_counts = []
for words in top_words:
    counts = {cid: 0 for cid in clusters}
    for w in words:
        counts[word_to_cluster[w]] += 1
    cluster_counts.append(counts)

df_cluster = pd.DataFrame(
    cluster_counts,
    index=[f"Post {i+1}" for i in range(len(posts))],
)

print("Cluster -> Wörter:")
for cid, words in clusters.items():
    print(f"{cid}: {words}")

print("\nHäufigkeiten je Post:")
print(df_cluster)
