In [8]:
import spacy
from spacy.pipeline import Sentencizer
import pandas as pd
import csv
from collections import Counter
import lexicalrichness

In [4]:
nlp = spacy.load('de_core_news_sm')


In [5]:
def add_UPOS(df):
    if 'UPOS' not in df.columns:
        df['UPOS'] = None
    for _, row in df['words'].items():
        doc = nlp(str(row))
        for token in doc:
            df.loc[df['words'] == row, 'UPOS'] = token.pos_


In [14]:
def get_mattr_and_mtld(key):
    with open(f'exports/{key}_full.txt', 'r') as f:
        text = f.read()
    text = '\n'.join(text.split('--------------------NEW-TEXT------------------'))
    lex = lexicalrichness.LexicalRichness(text)
    return [lex.mattr(window_size=400), lex.mtld(threshold=0.72)]



In [26]:
def get_avg_sentence_lenght(key):
    with open(f'exports/{key}_full.txt', 'r') as f:
        text = f.read()
    text = '\n'.join(text.split('--------------------NEW-TEXT------------------'))
    print('nlp start')
    doc = nlp(text)
    print('nlp end')
    
    doc = nlp(text)

    saetze = list(doc.sents)
    anzahl_saetze = len(saetze)
    
    if anzahl_saetze == 0:
        return 0.0  # Vermeidung einer Division durch Null
    
    # Berechne die Wortanzahl pro Satz (ignoriere Satzzeichen und Leerzeichen)
    wortanzahlen = [sum(1 for token in satz if not token.is_punct) for satz in saetze]
    durchschnitt = sum(wortanzahlen) / anzahl_saetze
    
    return durchschnitt
    


In [36]:



def get_cvs_sentance_lenght(key, output_csv):
    with open(f'exports/{key}_full.txt', 'r') as f:
        text = f.read()
    text = '\n'.join(text.split('--------------------NEW-TEXT------------------'))
    # Lade das SpaCy-Sprachmodell
    nlp = spacy.load("de_core_news_sm")
    
    # Verarbeite den Text
    doc = nlp(text)
    
    # Extrahiere Sätze und berechne die Wortanzahl jedes Satzes (ohne Satzzeichen)
    wortanzahlen = [sum(1 for token in satz if not token.is_punct) for satz in doc.sents]
    gesamt_saetze = len(wortanzahlen)
    
    if gesamt_saetze == 0:
        print("Der Text enthält keine Sätze.")
        return
    
    # Berechne die absolute und relative Häufigkeit
    haeufigkeiten = Counter(wortanzahlen)
    relative_haeufigkeiten = {l: f / gesamt_saetze for l, f in haeufigkeiten.items()}
    
    # Speichere die Daten in einer CSV-Datei
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Satzlänge', 'Absolute Häufigkeit', 'Relative Häufigkeit'])
        for satzlaenge in sorted(haeufigkeiten.keys()):
            writer.writerow([satzlaenge, haeufigkeiten[satzlaenge], relative_haeufigkeiten[satzlaenge]])

    print(f"Die relativen Häufigkeiten wurden in '{output_csv}' gespeichert.")




In [12]:
test_df = pd.DataFrame({'words': ['Hund', 'gehen', 'schön', 'Haus', 'sein', 'Sucht', 'Flugzeug', 'Schiff', 'spielen', 'Bus', 'Auto', 'Fahrrad', 'Motorrad', 'Zug', 'U-Bahn'],
                        'amount': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]})

#add_UPOS(test_df)
#test_df.head()
print(get_mattr_and_mtld('gpt4o'))

[0.9492209834191255, 191.0044121726575]


In [6]:
gpt4o_df        = pd.read_csv('exports/GPT4o-2025-01-06_15:01:12.csv')      # GPT4o
gpt35t_df       = pd.read_csv('exports/GPT35t-2025-01-06_15:01:12.csv')     # GPT35t 
perplexity_df   = pd.read_csv('exports/PERPLEXITY-2025-01-06_15:01:12.csv') # PERPLEXITY
claude_df       = pd.read_csv('exports/CLAUDE-2025-01-06_15:01:12.csv')     # CLAUDE
human_df        = pd.read_csv('exports/HUMAN-2025-01-06_15:01:12.csv')      # HUMAN

In [37]:
print('GPT4o:', get_avg_sentence_lenght('gpt4o'))
print('GPT35t:', get_avg_sentence_lenght('gpt35t'))
print('PERPLEXITY:', get_avg_sentence_lenght('perplexity'))
print('CLAUDE:', get_avg_sentence_lenght('clde'))
print('HUMAN:', get_avg_sentence_lenght('human'))
get_cvs_sentance_lenght('gpt4o', 'includes/sentence_gpt4o.csv')
get_cvs_sentance_lenght('gpt35t', 'includes/sentence_gpt35t.csv')
get_cvs_sentance_lenght('perplexity', 'includes/sentence_perplexity.csv')
get_cvs_sentance_lenght('clde', 'includes/sentence_clde.csv')
get_cvs_sentance_lenght('human', 'includes/sentence_human.csv')

nlp start
nlp end
GPT4o: 17.136150234741784
nlp start
nlp end
GPT35t: 18.26618705035971
nlp start
nlp end
PERPLEXITY: 17.20806100217865
nlp start
nlp end
CLAUDE: 18.078212290502794
nlp start
nlp end
HUMAN: 15.709055876685934
Die relativen Häufigkeiten wurden in 'includes/sentence_gpt4o.csv' gespeichert.
Die relativen Häufigkeiten wurden in 'includes/sentence_gpt35t.csv' gespeichert.
Die relativen Häufigkeiten wurden in 'includes/sentence_perplexity.csv' gespeichert.
Die relativen Häufigkeiten wurden in 'includes/sentence_clde.csv' gespeichert.
Die relativen Häufigkeiten wurden in 'includes/sentence_human.csv' gespeichert.


In [7]:
add_UPOS(gpt4o_df)
print('1')
add_UPOS(gpt35t_df)
print('2')
add_UPOS(perplexity_df)
print('3')
add_UPOS(claude_df)
print('4')
add_UPOS(human_df)
print('5')

1
2
3
4
5


In [15]:
print('GPT4o', get_mattr_and_mtld('gpt4o'))
print('GPT35t', get_mattr_and_mtld('gpt35t'))
print('PERPLEXITY', get_mattr_and_mtld('perplexity'))
print('CLAUDE', get_mattr_and_mtld('clde'))
print('HUMAN', get_mattr_and_mtld('human'))

GPT4o [0.6318911622098246, 191.0044121726575]
GPT35t [0.6061037825810514, 173.20946654539702]
PERPLEXITY [0.5742907288870497, 126.1358580733073]
CLAUDE [0.6386005160421939, 217.49843817550925]
HUMAN [0.6245039142838209, 182.09963138421662]


In [22]:
dataframes = [(gpt4o_df, 'gpt4o'), (gpt35t_df, 'gpt35t'), (perplexity_df, 'perplexity'), (claude_df, 'claude'), (human_df, 'human')]

# Ergebnisse speichern
final_result = pd.DataFrame()

# Prozess für jeden DataFrame
def calculate_weighted_percentages(df):
    # Gruppiere nach UPOS-tag und summiere die Amounts
    grouped = df.groupby('UPOS')['amount'].sum().reset_index()

    # Berechne den Gesamtbetrag
    total_amount = grouped['amount'].sum()

    # Füge die Prozentzahlen hinzu
    grouped['percentage'] = (grouped['amount'] / total_amount) * 100

    # Nur die relevanten Spalten behalten
    return grouped[['UPOS', 'percentage']]

# Wende die Funktion auf jeden DataFrame an
for df, name in dataframes:
    result = calculate_weighted_percentages(df)
    result.rename(columns={'percentage': name}, inplace=True)

    # Zusammenführen mit dem finalen Ergebnis
    if final_result.empty:
        final_result = result
    else:
        final_result = pd.merge(final_result, result, on='UPOS', how='outer')

# Fehlende Werte mit 0 ersetzen
final_result.fillna(0, inplace=True)

# Prozentsätze formatieren
for col in final_result.columns[1:]:
    final_result[col] = final_result[col].apply(lambda x: f"{x:.1f}%")

# Spaltenreihenfolge anpassen
final_result = final_result[['UPOS'] + [name for _, name in dataframes]]

# Spaltennamen anpassen
final_result.rename(columns={'UPOS': 'UPOS'}, inplace=True)

# Speichere die Ergebnisse in einer CSV-Datei
final_result.to_csv('upos_percentages.csv', index=False)

print("Die Ergebnisse wurden in 'upos_percentages.csv' gespeichert.")


Die Ergebnisse wurden in 'upos_percentages.csv' gespeichert.


In [23]:
def analyze_grammatical_genders(key):
    
    with open(f'exports/{key}_full.txt', 'r') as f:
        text = f.read()
    text = '\n'.join(text.split('--------------------NEW-TEXT------------------'))
    print('nlp start')
    doc = nlp(text)
    print('nlp end')

    # Process the text
    doc = nlp(text)

    # Dictionary to store counts
    gender_counts = {"masculine": 0, "feminine": 0, "neuter": 0, "other": 0}

    # Iterate through tokens and count genders
    for token in doc:
        if token.morph.get("Gender"):
            gender = token.morph.get("Gender")[0]
            if gender == "Masc":
                gender_counts["masculine"] += 1
            elif gender == "Fem":
                gender_counts["feminine"] += 1
            elif gender == "Neut":
                gender_counts["neuter"] += 1
        else:
            gender_counts["other"] += 1

    # Calculate total and relative frequencies
    total = sum(gender_counts.values())
    relative_frequencies = {key: (value / total) * 100 if total > 0 else 0 for key, value in gender_counts.items()}

    # Create DataFrame
    df = pd.DataFrame({
        "Gender": gender_counts.keys(),
        "Absolute Frequency": gender_counts.values(),
        "Relative Frequency (%)": relative_frequencies.values()
    })

    return df

In [None]:
df = pd.DataFrame({
        "Gender": [],
        "Absolute [],
        "Relative []
    })