In [13]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import spacy
import pandas as pd

In [14]:
# Lade das Modell
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

In [15]:

def get_first_three_sentences(key: str):
    with open(f'exports/{key}_full.txt', 'r') as f:
        full_text = f.read()
    texts = full_text.split('--------------------NEW-TEXT------------------')

    nlp = spacy.load('de_core_news_sm')
    
    results = []
    
    for text in texts:
        # Verarbeiten des Textes mit SpaCy
        doc = nlp(text)
        
        # Filter, um Sätze zu bereinigen und sicherzustellen, dass sie nicht leer sind
        valid_sentences = [
            str(sentence).strip()
            for sentence in doc.sents
            if str(sentence).strip() not in ['', '\n', '\n\n', ' ']
        ]
        
        # Extrahieren der ersten drei gültigen Sätze
        first_three_sentences = valid_sentences[:3]
        
        # Kombinieren der Sätze in einen String und zur Ergebnisliste hinzufügen
        results.append(first_three_sentences)
    
    return results

In [27]:
def calculate_similarity_to_human(key='test2'):
    
    
    ai_first_sentences = get_first_three_sentences(key)
    human_first_sentences = get_first_three_sentences('human')
    
    ai_dict = {
    "first" : [text[0] for text in ai_first_sentences],
    "second" : [text[1] for text in ai_first_sentences],
    "third" : [text[2] for text in ai_first_sentences]
    }
    
    human_dict = {
    "first" : [text[0] for text in human_first_sentences],
    "second" : [text[1] for text in human_first_sentences],
    "third" : [text[2] for text in human_first_sentences]
    }
    
    results = {"first": [], "second": [], "third": []}
    
    for key in ai_dict.keys():
        ai_sentences = ai_dict[key]
        human_sentences = human_dict[key]
        result_list = results[key]
        
        for ai_sentence in ai_sentences:
            for human_sentence in human_sentences:
                
                ai_embedding = model.encode(ai_sentence).reshape(1, -1)
                human_embedding = model.encode(human_sentence).reshape(1, -1)
                cos_similarity = cosine_similarity(ai_embedding, human_embedding)
                result_list.append(cos_similarity)

        results[key] = result_list
        
    print(results.values())

    averages = [np.mean(item) for item in results.values()]
    standard_deviation = [np.std(item) for item in results.values()]


    
    return averages, standard_deviation
    

In [28]:
print(calculate_similarity_to_human('gpt4o'))
print(calculate_similarity_to_human('gpt35t'))
print(calculate_similarity_to_human('perplexity'))
print(calculate_similarity_to_human('clde'))

dict_values([[array([[0.3454457]], dtype=float32), array([[0.10351469]], dtype=float32), array([[0.46812463]], dtype=float32), array([[0.24579711]], dtype=float32), array([[0.23578785]], dtype=float32), array([[0.2613373]], dtype=float32), array([[0.20016104]], dtype=float32), array([[0.36073923]], dtype=float32), array([[0.09727599]], dtype=float32), array([[0.11003191]], dtype=float32), array([[0.33277202]], dtype=float32), array([[0.1900624]], dtype=float32), array([[0.07336093]], dtype=float32), array([[0.12359803]], dtype=float32), array([[0.3034734]], dtype=float32), array([[0.11347261]], dtype=float32), array([[0.36073923]], dtype=float32), array([[0.19449827]], dtype=float32), array([[0.27764958]], dtype=float32), array([[0.24908687]], dtype=float32), array([[0.0053731]], dtype=float32), array([[0.6615232]], dtype=float32), array([[0.12456976]], dtype=float32), array([[0.6917345]], dtype=float32), array([[0.36082953]], dtype=float32), array([[0.45662236]], dtype=float32), array