In [1]:
import os
import json
import csv
import numpy as np
from collections import defaultdict

In [4]:
def load_ontologies(ontology_dir):
    """Load every ``*.json`` ontology file found directly inside a directory.

    Files are read in sorted filename order so the returned list (and anything
    derived from its order) is the same on every run; ``os.listdir`` on its own
    returns entries in arbitrary order.

    Args:
        ontology_dir: Path of the directory to scan (sub-directories are not
            descended into).

    Returns:
        list: The parsed JSON documents, one per file, ordered by filename.

    Raises:
        KeyError: If a document lacks the ``title`` or ``id`` key used in the
            progress message.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    print("Loading ontologies...")
    ontologies = []
    for file_name in sorted(os.listdir(ontology_dir)):
        if not file_name.endswith('.json'):
            continue
        # Explicit UTF-8 so parsing does not depend on the platform's locale.
        with open(os.path.join(ontology_dir, file_name), 'r', encoding='utf-8') as f:
            ont = json.load(f)
        ontologies.append(ont)
        print(f"Loaded {ont['title']} ({ont['id']})")
    return ontologies

def process_field(value):
    """Split a raw TSV cell into its individual values.

    A cell may carry several values separated by ``|``. A falsy cell, or one
    that is empty or just ``-`` once surrounding whitespace is removed, yields
    an empty list. Every returned part has its surrounding whitespace stripped;
    empty parts between separators are kept as empty strings.
    """
    if not value:
        return []
    if value.strip() in ('-', ''):
        return []

    parts = []
    for piece in value.split('|'):
        parts.append(piece.strip())
    return parts

def generate_triples(row, relation):
    """Build subject/relation/object triples for one relation of one row.

    The relation's ``domain`` and ``range`` entries name the row columns that
    hold the subject and object values; ``label`` becomes the ``rel`` of every
    triple. Both columns are split with ``process_field``.

    Pairing rules:
      * either side empty -> no triples;
      * exactly one value on one side and several on the other -> the single
        value is paired with each value of the other side;
      * otherwise values are paired by position, stopping at the shorter list.

    Returns:
        list: Dicts with the keys ``sub``, ``rel`` and ``obj``.
    """
    subject_column = relation['domain']
    object_column = relation['range']
    predicate = relation['label']

    subjects = process_field(row.get(subject_column, ''))
    objects = process_field(row.get(object_column, ''))

    if not (subjects and objects):
        return []

    # Repeat a lone value so it lines up with every value on the other side.
    if len(objects) == 1 and len(subjects) > 1:
        objects = objects * len(subjects)
    elif len(subjects) == 1 and len(objects) > 1:
        subjects = subjects * len(objects)

    # zip pairs by position and drops the surplus of the longer list.
    return [
        {'sub': subject, 'rel': predicate, 'obj': obj}
        for subject, obj in zip(subjects, objects)
    ]

def process_tsv(tsv_path, ontologies):
    """Extract per-ontology sentence/triple records from a tab-separated file.

    Each data row is expected to have a ``Sentence`` column. Rows whose
    sentence is missing or blank are counted but otherwise skipped. For every
    remaining row and every ontology, the triples of all the ontology's
    relations are collected with ``generate_triples``; a record is stored only
    when at least one triple was produced.

    Args:
        tsv_path: Path of the UTF-8 encoded TSV file (first line is the header).
        ontologies: Iterable of ontology dicts, each providing ``id`` and a
            ``relations`` list.

    Returns:
        defaultdict: Maps ontology id to ``{'sents': [...]}``, where each item
        is ``{'sent': <sentence>, 'triples': [<triple dict>, ...]}``.
    """
    print(f"\nProcessing TSV file: {tsv_path}")
    ontology_data = defaultdict(lambda: {'sents': []})
    total_rows = 0

    # newline='' is what the csv module asks for so it can handle line endings
    # (including newlines embedded in quoted fields) itself.
    with open(tsv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            total_rows += 1
            # DictReader fills columns missing from a short row with None, so
            # a default passed to .get() is not enough to guarantee a string.
            sentence = (row.get('Sentence') or '').strip()
            if not sentence:
                continue

            for ontology in ontologies:
                ontology_triples = []
                for relation in ontology['relations']:
                    ontology_triples.extend(generate_triples(row, relation))

                if ontology_triples:
                    ontology_data[ontology['id']]['sents'].append({
                        'sent': sentence,
                        'triples': ontology_triples
                    })

    print(f"Processed {total_rows} TSV rows")
    return ontology_data

def _split_sentences_by_subject(sents, train_fraction, rng):
    """Partition sentence records into train/test lists by triple subject.

    The unique subjects are sorted (so the starting order does not depend on
    set iteration order), shuffled with ``rng`` and cut at
    ``int(train_fraction * n_subjects)``. A sentence goes to the test list as
    soon as any of its triples has a test subject; all others go to train.

    Returns:
        tuple: ``(train_sents, test_sents, n_unique_subjects)``.
    """
    subjects = sorted({triple['sub'] for sent in sents for triple in sent['triples']})
    rng.shuffle(subjects)
    split_idx = int(train_fraction * len(subjects))
    test_subjects = set(subjects[split_idx:])

    train_sents = []
    test_sents = []
    for sent in sents:
        if any(triple['sub'] in test_subjects for triple in sent['triples']):
            test_sents.append(sent)
        else:
            train_sents.append(sent)
    return train_sents, test_sents, len(subjects)


def split_and_save_data(ontology_data, output_dir='./data/odeuropa', train_fraction=0.8, seed=42):
    """Split each ontology's sentences into train/test sets and write JSONL files.

    The split is made on triple subjects rather than on sentences: subjects
    are divided ``train_fraction`` / remainder, and any sentence mentioning a
    test subject is placed in the test set. Train sentences therefore never
    contain a test subject, while test sentences may still contain train
    subjects.

    For every ontology id three files are written below ``output_dir``:
      * ``train/<id>_train.jsonl`` - id, sentence and triples;
      * ``test/<id>_test.jsonl`` - id and sentence only;
      * ``ground_truth/<id>_ground_truth.jsonl`` - the test entries with
        their triples, using the same ids as the test file.

    Args:
        ontology_data: Mapping of ontology id to ``{'sents': [...]}`` where
            each item has ``sent`` and ``triples`` (dicts with a ``sub`` key).
            Subjects must be mutually sortable (they are strings when produced
            from TSV cells).
        output_dir: Root directory for the three output folders.
        train_fraction: Share of unique subjects assigned to the train side.
        seed: Seed for the shuffle. Each ontology is shuffled with its own
            generator created from this seed, so its split depends only on
            its own subjects. Pass ``None`` for a non-reproducible split.

    Returns:
        None. Progress and summary statistics are printed.
    """
    print("\nSplitting data and saving files:")
    total_all_sents = 0
    total_train = 0
    total_test = 0

    # Output folders are the same for every ontology, so create them once.
    for subdir in ('train', 'test', 'ground_truth'):
        os.makedirs(f'{output_dir}/{subdir}', exist_ok=True)

    for ontology_id, data in ontology_data.items():
        sents = data['sents']
        rng = np.random.default_rng(seed)
        train_sents, test_sents, subject_count = _split_sentences_by_subject(
            sents, train_fraction, rng
        )

        # Print statistics for this ontology
        total = len(sents)
        train_count = len(train_sents)
        test_count = len(test_sents)
        print(f"\n{ontology_id}:")
        print(f"  Total sentences: {total}")
        print(f"  Train sentences: {train_count}")
        print(f"  Test sentences: {test_count}")
        print(f"  Unique subjects: {subject_count}")

        # Update global totals
        total_all_sents += total
        total_train += train_count
        total_test += test_count

        # Save training data
        train_path = f'{output_dir}/train/{ontology_id}_train.jsonl'
        with open(train_path, 'w', encoding='utf-8') as f:
            for idx, sent in enumerate(train_sents, 1):
                entry = {
                    'id': f'{ontology_id}_train_{idx}',
                    'sent': sent['sent'],
                    'triples': sent['triples']
                }
                f.write(json.dumps(entry) + '\n')

        # Save test and ground truth; both files share ids so they can be joined.
        test_path = f'{output_dir}/test/{ontology_id}_test.jsonl'
        gt_path = f'{output_dir}/ground_truth/{ontology_id}_ground_truth.jsonl'
        with open(test_path, 'w', encoding='utf-8') as test_f, \
                open(gt_path, 'w', encoding='utf-8') as gt_f:
            for idx, sent in enumerate(test_sents, 1):
                entry_id = f'{ontology_id}_test_{idx}'
                test_entry = {
                    'id': entry_id,
                    'sent': sent['sent']
                }
                gt_entry = {
                    'id': entry_id,
                    'sent': sent['sent'],
                    'triples': sent['triples']
                }
                test_f.write(json.dumps(test_entry) + '\n')
                gt_f.write(json.dumps(gt_entry) + '\n')

        print(f"  Saved to:")
        print(f"  - {train_path}")
        print(f"  - {test_path}")
        print(f"  - {gt_path}")

    print("\nFinal statistics:")
    print(f"Total sentences processed: {total_all_sents}")
    print(f"Total train sentences: {total_train}")
    print(f"Total test sentences: {total_test}")
    print(f"Total ontologies processed: {len(ontology_data)}")

In [5]:
# Main execution: configure the inputs, then run load -> extract -> split/save.
tsv_path = '../../odeuropa_subset_british-library_1890-1899/BritishLibrary-1890_1899-frames.tsv'
ontology_dir = './data/odeuropa/ontologies'

print("Starting Odeuropa dataset processing...")

# Step 1: read every ontology definition found in the ontology directory
ontologies = load_ontologies(ontology_dir)

# Step 2: build per-ontology sentence/triple records from the TSV frames
ontology_data = process_tsv(tsv_path, ontologies)

# Step 3: write the train / test / ground-truth JSONL files for each ontology
split_and_save_data(ontology_data)

print("\nProcessing complete!")

Starting Odeuropa dataset processing...
Loading ontologies...
Loaded Smell Emission Ontology (ont_l12_smell_emission)
Loaded Olfactory Experience Ontology (ont_l13_olfactory_experience)

Processing TSV file: ../../odeuropa_subset_british-library_1890-1899/BritishLibrary-1890_1899-frames.tsv
Processed 1000 TSV rows

Splitting data and saving files:

ont_l12_smell_emission:
  Total sentences: 742
  Train sentences: 557
  Test sentences: 185
  Unique subjects: 889
  Saved to:
  - ./data/odeuropa/train/ont_l12_smell_emission_train.jsonl
  - ./data/odeuropa/test/ont_l12_smell_emission_test.jsonl
  - ./data/odeuropa/ground_truth/ont_l12_smell_emission_ground_truth.jsonl

ont_l13_olfactory_experience:
  Total sentences: 583
  Train sentences: 450
  Test sentences: 133
  Unique subjects: 297
  Saved to:
  - ./data/odeuropa/train/ont_l13_olfactory_experience_train.jsonl
  - ./data/odeuropa/test/ont_l13_olfactory_experience_test.jsonl
  - ./data/odeuropa/ground_truth/ont_l13_olfactory_experience