In [1]:
import json
from os import path

In [2]:
from collections import defaultdict
from itertools import chain

def flatten_csl(record, name):
    attributes = []
    for attr in record.get(name, "") or record.get("SVO_relationships", []):
        if isinstance(attr, str):
            attributes.extend(s.strip() for s in attr.split(",") if s.strip())
        elif isinstance(attr, dict):
            for key, value in attr.items():
                if isinstance(value, str):
                    attributes.extend(s.strip() for s in value.split(",") if s.strip())
    return attributes

def common_with_reference(reference, compare_func):
    def wrapper(datasets):
        common = defaultdict(int)
        datasets_to_compare = list(datasets)
        mapped_dataset = {name: {tuple(int(i) for i in k.split(",") if i.isdigit()): v for k, v in data.items()} for name, data in datasets.items()}

        for i in mapped_dataset[reference]:
            reference_record = mapped_dataset[reference][i]
            for name in datasets_to_compare:
                target_record = mapped_dataset[name].get(i)
                if target_record:
                    common[name] += compare_func(target_record, reference_record)

        return dict(common)

    return wrapper


def common_in_csl(property_name):
    def compare_func(target_record, reference_record):
        reference_attributes = flatten_csl(reference_record, property_name)
        attributes = flatten_csl(target_record, property_name)
        return sum(1 for attr in attributes if attr in reference_attributes)

    return compare_func

def total_svos(datasets):
    return {dataset: sum(data["total_SVOs"] for data in data_list.values()) for dataset, data_list in datasets.items()}

common_subject = common_with_reference("GPT-4", common_in_csl("subject"))
common_verbs = common_with_reference("GPT-4", common_in_csl("verb"))
common_stem = common_with_reference("GPT-4", common_in_csl("stem"))
common_objects = common_with_reference("GPT-4", common_in_csl("object"))


In [16]:
dataset_names = [
    "financial.json",
    "literature.json",
    "medical.json",
    "movies.json",
    "news.json",
]
models = ["GPT-4", "SpaCy", "NLTK", "LLAMA3"]
results = {}
directory = "./myResult/"

for dataset_name in dataset_names:
    loaded_data = {}    
    print("dataset_name = " + dataset_name)
    with open(path.join(directory, 'Using_' + "GPT-4", dataset_name), "r",encoding="utf-8") as f:
        print("GPT-4")
        loaded_data["GPT-4"] = json.load(f)

    with open(path.join(directory, 'Using_' + "LLAMA3", dataset_name), "r",encoding="utf-8") as f:
        print("LLAMA3")        
        loaded_data["LLAMA3"] = json.load(f)

    with open(path.join(directory, 'Using_' + "NLTK", dataset_name), "r",encoding="utf-8") as f:
        print("NLTK")        
        loaded_data["NLTK"] = json.load(f)

    with open(path.join(directory, 'Using_' + "Spacy", dataset_name), "r",encoding="utf-8") as f:
        print("Spacy")        
        loaded_data["Spacy"] = json.load(f)  
        
    if 'totalSVOs' not in results:   
        results['totalSVOs'] = {}
    if 'commonSubject' not in results:
        results['commonSubject'] = {}
    if 'commonVerbs' not in results:
        results['commonVerbs'] = {}
    if 'commonObject' not in results:
        results['commonObject'] = {}
    results['totalSVOs'][dataset_name]=total_svos(loaded_data)
    results['commonSubject'][dataset_name]=common_subject(loaded_data)       
    results['commonVerbs'][dataset_name]=common_verbs(loaded_data)
    results['commonObject'][dataset_name]=common_objects(loaded_data)
    print(results)

with open(path.join(directory, "ComparisonResults.json"), "w") as outfile:
    json.dump(results, outfile, indent=2)

print("Comparison complete")

dataset_name = financial.json
GPT-4
LLAMA3
NLTK
Spacy
{'totalSVOs': {'financial.json': {'GPT-4': 370, 'LLAMA3': 230, 'NLTK': 65, 'Spacy': 240}}, 'commonSubject': {'financial.json': {'GPT-4': 10, 'LLAMA3': 8, 'NLTK': 0, 'Spacy': 0}}, 'commonVerbs': {'financial.json': {'GPT-4': 10, 'LLAMA3': 8, 'NLTK': 0, 'Spacy': 0}}, 'commonObject': {'financial.json': {'GPT-4': 10, 'LLAMA3': 8, 'NLTK': 0, 'Spacy': 0}}}
dataset_name = literature.json
GPT-4
LLAMA3
NLTK
Spacy
{'totalSVOs': {'financial.json': {'GPT-4': 370, 'LLAMA3': 230, 'NLTK': 65, 'Spacy': 240}, 'literature.json': {'GPT-4': 870, 'LLAMA3': 408, 'NLTK': 78, 'Spacy': 279}}, 'commonSubject': {'financial.json': {'GPT-4': 10, 'LLAMA3': 8, 'NLTK': 0, 'Spacy': 0}, 'literature.json': {'GPT-4': 38, 'LLAMA3': 0, 'NLTK': 0, 'Spacy': 0}}, 'commonVerbs': {'financial.json': {'GPT-4': 10, 'LLAMA3': 8, 'NLTK': 0, 'Spacy': 0}, 'literature.json': {'GPT-4': 38, 'LLAMA3': 0, 'NLTK': 0, 'Spacy': 0}}, 'commonObject': {'financial.json': {'GPT-4': 10, 'LLAMA3':