In [1]:
import json
from collections import defaultdict
import os

def split_json_by_label(input_file, output_dir):
    """Split a JSON array of records into one JSON file per label.

    Args:
        input_file: Path of a UTF-8 JSON file whose top level is iterated
            record by record; each record must offer ``.get``.
        output_dir: Directory receiving the ``<label>.json`` files. It is
            created (including parents) before the input is read.

    Records are bucketed by their ``"label"`` value, falling back to
    ``"unlabeled"`` when the key is absent. Each bucket is written as an
    indented, non-ASCII-preserving JSON array, and a one-line summary is
    printed at the end. Returns ``None``.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as source:
        records = json.load(source)

    # A plain dict keeps labels in first-seen order, which is also the order
    # in which the per-label files are written.
    buckets = {}
    for record in records:
        bucket_key = record.get("label", "unlabeled")
        buckets.setdefault(bucket_key, []).append(record)

    for bucket_key, members in buckets.items():
        target = os.path.join(output_dir, f"{bucket_key}.json")
        with open(target, 'w', encoding='utf-8') as sink:
            json.dump(members, sink, ensure_ascii=False, indent=2)

    print(f"Split complete. Files saved in: {output_dir}")


In [2]:
# Split GMF_1.json into one "<label>.json" file per distinct label. The output
# directory is the same data/samples folder that holds the input file.
split_json_by_label("data/samples/GMF_1.json", "data/samples")

Split complete. Files saved in: data/samples


In [3]:
import json
import os

def split_nested_json_by_top_level_key(input_file, output_dir):
    """Write one sentence-pair JSON file per top-level key of a JSON object.

    Args:
        input_file: Path of a UTF-8 JSON file whose top level is an object
            mapping a label to an iterable of pair records.
        output_dir: Directory receiving the ``<label>.json`` files. It is
            created (including parents) before the input is read.

    Every pair record is converted to ``{"sentence1", "sentence2", "label"}``,
    taking ``sentence1`` from ``"query"`` and ``sentence2`` from
    ``"candidate"`` (empty string when a key is missing) and using the
    top-level key as the label. A summary naming all labels is printed at the
    end. Returns ``None``.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as source:
        payload = json.load(source)

    for group_name, raw_pairs in payload.items():
        converted = [
            {
                "sentence1": raw.get("query", ""),
                "sentence2": raw.get("candidate", ""),
                "label": group_name,
            }
            for raw in raw_pairs
        ]

        # The file is named after the top-level key it was built from.
        destination = os.path.join(output_dir, f"{group_name}.json")
        with open(destination, 'w', encoding='utf-8') as sink:
            json.dump(converted, sink, ensure_ascii=False, indent=2)

    print(f"Files written to '{output_dir}' for labels: {', '.join(payload.keys())}")


In [6]:
# Convert each top-level key of Comp_Wild_Zwingli.json into its own
# "<label>.json" file of sentence pairs under data/samples/MF.
split_nested_json_by_top_level_key("data/samples/Comp_Wild_Zwingli.json", "data/samples/MF")

Files written to 'data/samples/MF' for labels: quote, fuzzy_quote, paraphrase, similar_sentence, irrelevant


In [9]:
import json
import random
from pathlib import Path

# Seed the module-level RNG once, when this cell runs. The sampling and
# shuffling functions below draw from this global generator, so results are
# repeatable only for the same inputs and the same order of calls.
random.seed(42)

# Two-letter prefix -> label string. The prefix is used in output file names
# and as the lookup key in load_labeled_pairs; the label string is stored in
# every pair and is also the stem of the per-category source file
# (e.g. "quote.json"). Insertion order is the order categories are processed.
LABEL = {
    'Qu': 'quote',
    'Fu': 'fuzzy_quote',
    'Pa': 'paraphrase',
    'Si': 'similar_sentence'
}

# Number of labeled pairs of each category that go into one "general"
# evaluation block (1 + 3 + 2 + 4 = 10 labeled pairs per block).
GENERAL_DISTRIBUTION = {
    'Qu': 1,
    'Fu': 3,
    'Pa': 2,
    'Si': 4
}


def load_labeled_pairs(file_path, label):
    """Load sentence pairs from a JSON file and stamp them with a label.

    Args:
        file_path: Path of a UTF-8 JSON file whose top level is iterated pair
            by pair; each pair must provide ``'sentence1'`` and
            ``'sentence2'``.
        label: Key into the module-level ``LABEL`` mapping (e.g. ``'Qu'``).

    Returns:
        A new list of ``{'sentence1', 'sentence2', 'label'}`` dicts in file
        order, where ``'label'`` is ``LABEL[label]``. Any other keys present
        in the source pairs are dropped.

    Raises:
        KeyError: if a pair lacks a sentence key, or ``label`` is not a key of
            ``LABEL`` (only detected when the file contains at least one pair).
    """
    with open(file_path, encoding='utf-8') as source:
        raw_pairs = json.load(source)

    labeled = []
    for raw in raw_pairs:
        labeled.append({
            'sentence1': raw['sentence1'],
            'sentence2': raw['sentence2'],
            'label': LABEL[label],
        })
    return labeled


def load_corpus_sentences(corpus_path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        corpus_path: Path of a UTF-8 text file holding one JSON document per
            line.

    Returns:
        The decoded value of every non-blank line, in file order. Lines that
        are empty or whitespace-only are skipped; surrounding whitespace is
        stripped before decoding.
    """
    records = []
    with open(corpus_path, encoding='utf-8') as source:
        for raw_line in source:
            payload = raw_line.strip()
            if not payload:
                continue
            records.append(json.loads(payload))
    return records


def sample_irrelevant_pairs(lines, num_samples):
    """Draw distinct random sentence pairs and label them 'irrelevant'.

    Args:
        lines: Sequence of records, each providing a ``'sentence'`` entry.
        num_samples: Number of pairs to return. Zero or a negative number
            yields an empty list.

    Returns:
        A list of ``num_samples`` dicts with keys ``'sentence1'``,
        ``'sentence2'`` and ``'label'`` (always ``'irrelevant'``). The two
        sentences of a pair come from two different positions in ``lines``
        and differ as values. No ordered index pair ``(i, j)`` is used twice;
        ``(i, j)`` and ``(j, i)`` count as different pairs.

    Raises:
        ValueError: if ``lines`` cannot supply ``num_samples`` eligible
            pairs. Previously this case looped forever.

    The global ``random`` generator is consumed exactly as before for every
    request that can be satisfied, so seeded runs produce unchanged output.
    """
    from collections import Counter

    if num_samples <= 0:
        return []

    n = len(lines)
    # Upper bound on distinct ordered index pairs, before duplicate sentences
    # are taken into account. Checking it is O(1) and catches tiny corpora.
    max_ordered_pairs = n * (n - 1)
    if num_samples > max_ordered_pairs:
        raise ValueError(
            f"Cannot sample {num_samples} distinct pairs from {n} corpus lines "
            f"(at most {max_ordered_pairs} ordered pairs exist)."
        )

    # Number of consecutive rejected draws tolerated before paying for an
    # exact O(n) feasibility count.
    stall_limit = 10_000

    pairs = []
    seen = set()
    stalled = 0
    feasibility_checked = False
    while len(pairs) < num_samples:
        i, j = random.sample(range(n), 2)
        s1, s2 = lines[i]['sentence'], lines[j]['sentence']
        if s1 != s2 and (i, j) not in seen:
            seen.add((i, j))
            pairs.append({
                'sentence1': s1,
                'sentence2': s2,
                'label': 'irrelevant'
            })
            stalled = 0
            continue

        stalled += 1
        if stalled >= stall_limit and not feasibility_checked:
            # Many rejections in a row: the corpus may hold too many repeated
            # sentences to ever reach num_samples. Count the eligible pairs
            # exactly once; this draws no random numbers.
            feasibility_checked = True
            try:
                occurrences = Counter(line['sentence'] for line in lines)
            except TypeError:
                # Unhashable sentence values cannot be counted; keep sampling.
                continue
            same_sentence_pairs = sum(c * (c - 1) for c in occurrences.values())
            eligible = max_ordered_pairs - same_sentence_pairs
            if num_samples > eligible:
                raise ValueError(
                    f"Cannot sample {num_samples} pairs of differing sentences: "
                    f"only {eligible} eligible ordered pairs exist in {n} corpus lines."
                )
    return pairs


def save_eval_file(pairs, output_root, prefix, folder_label, index):
    """Serialize one evaluation set to ``<prefix><folder_label>_<index>.json``.

    Args:
        pairs: JSON-serializable content to write.
        output_root: ``pathlib.Path`` of the target directory; it is created
            (including parents) when missing.
        prefix: Leading part of the file name.
        folder_label: Second part of the file name, placed directly after
            ``prefix``.
        index: Integer rendered zero-padded to at least two digits.

    The file is written as UTF-8 with two-space indentation and non-ASCII
    characters left unescaped. Returns ``None``.
    """
    output_root.mkdir(parents=True, exist_ok=True)
    target = output_root / f"{prefix}{folder_label}_{index:02d}.json"
    with target.open('w', encoding='utf-8') as sink:
        json.dump(pairs, sink, indent=2, ensure_ascii=False)


def generate_category_files(folder_path, corpus_lines, output_path,
                            chunk_size=10, num_irrelevant=990):
    """Build single-category evaluation files for every label in ``LABEL``.

    For each ``prefix -> label`` entry of ``LABEL`` the file
    ``folder_path / "<label>.json"`` is loaded and cut into consecutive
    chunks of ``chunk_size`` labeled pairs. Each full chunk is combined with
    ``num_irrelevant`` freshly sampled 'irrelevant' pairs, shuffled, and
    saved through ``save_eval_file`` using the prefix, the name of
    ``folder_path`` and the chunk index.

    Args:
        folder_path: ``pathlib.Path`` of the folder holding the per-label
            source files; its ``name`` becomes part of the output file names.
        corpus_lines: Corpus records handed to ``sample_irrelevant_pairs``.
        output_path: Directory passed to ``save_eval_file``.
        chunk_size: Labeled pairs per output file (default 10, the value that
            was previously hard-coded).
        num_irrelevant: Sampled irrelevant pairs per output file (default
            990, the value that was previously hard-coded).

    Raises:
        ValueError: if ``chunk_size`` is not positive.

    Labeled pairs left over after the last full chunk are not written.
    With the default arguments the sequence of random draws is unchanged.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    folder_label = folder_path.name
    for prefix, full_label in LABEL.items():
        labeled_pairs = load_labeled_pairs(folder_path / f"{full_label}.json", prefix)

        # Integer division: a trailing partial chunk is skipped.
        num_chunks = len(labeled_pairs) // chunk_size
        for i in range(num_chunks):
            chunk_labeled = labeled_pairs[i * chunk_size:(i + 1) * chunk_size]
            sampled_irrelevant = sample_irrelevant_pairs(corpus_lines, num_irrelevant)
            full = chunk_labeled + sampled_irrelevant
            random.shuffle(full)
            save_eval_file(full, output_path, prefix, folder_label, i)


def generate_general_files(folder_path, corpus_lines, output_path, num_irrelevant=990):
    """Build mixed-category ('Ge') evaluation files.

    All per-label source files under ``folder_path`` are loaded. Set ``i``
    takes, for every category in ``GENERAL_DISTRIBUTION``, the ``i``-th
    slice of ``count`` labeled pairs, adds ``num_irrelevant`` sampled
    'irrelevant' pairs, shuffles the result and saves it through
    ``save_eval_file`` with the prefix ``'Ge'``.

    Args:
        folder_path: ``pathlib.Path`` of the folder holding the per-label
            source files; its ``name`` becomes part of the output file names.
        corpus_lines: Corpus records handed to ``sample_irrelevant_pairs``.
        output_path: Directory passed to ``save_eval_file``.
        num_irrelevant: Sampled irrelevant pairs per output file (default
            990, the value that was previously hard-coded).

    The number of sets is the largest value for which every category can
    still fill its share. It is derived from ``GENERAL_DISTRIBUTION`` itself,
    so changing the distribution keeps the set count and the slicing in
    agreement; the divisors were previously repeated as literals.
    """
    folder_label = folder_path.name
    all_labeled = {
        key: load_labeled_pairs(folder_path / f"{fname}.json", key)
        for key, fname in LABEL.items()
    }

    # Categories with a non-positive count take no pairs and therefore do not
    # limit the number of sets (this also avoids dividing by zero).
    num_sets = min(
        (
            len(all_labeled[key]) // count
            for key, count in GENERAL_DISTRIBUTION.items()
            if count > 0
        ),
        default=0,
    )

    for i in range(num_sets):
        block = []
        for key, count in GENERAL_DISTRIBUTION.items():
            start = i * count
            block.extend(all_labeled[key][start:start + count])
        sampled_irrelevant = sample_irrelevant_pairs(corpus_lines, num_irrelevant)
        full = block + sampled_irrelevant
        random.shuffle(full)
        save_eval_file(full, output_path, 'Ge', folder_label, i)


In [10]:
# Load the corpus once; both generators sample their 'irrelevant' pairs from it.
corpus_lines = load_corpus_sentences(Path("data/corpus/corpus/corpus.jsonl"))

# Build the evaluation files for the "S" source folder: first the
# per-category files, then the mixed "Ge" files. Both draw from the shared
# global RNG, so the order of these two calls affects the generated content.
generate_category_files(Path("data/evaluation/eval-task-sources/S"), corpus_lines, Path("data/evaluation/eval-tasks"))
generate_general_files(Path("data/evaluation/eval-task-sources/S"), corpus_lines, Path("data/evaluation/eval-tasks"))

# Same two steps for the "M" source folder; currently disabled.
#generate_category_files(Path("data/evaluation/eval-task-sources/M"), corpus_lines, Path("data/evaluation/eval-tasks"))
#generate_general_files(Path("data/evaluation/eval-task-sources/M"), corpus_lines, Path("data/evaluation/eval-tasks"))
