In [1]:
import json
from collections import defaultdict
import os

def split_json_by_label(input_file, output_dir):
    """Split a JSON array of records into one JSON file per ``"label"`` value.

    Args:
        input_file: Path to a UTF-8 JSON file whose top level is an iterable
            of dict-like records.
        output_dir: Directory that receives ``<label>.json`` files; created
            if it does not exist yet.

    Records without a ``"label"`` key are collected under ``"unlabeled"``.
    Each output file holds the records of one label in their input order.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as source:
        records = json.load(source)

    # Bucket the records; dict insertion order keeps labels in first-seen order.
    buckets = {}
    for record in records:
        buckets.setdefault(record.get("label", "unlabeled"), []).append(record)

    # One output file per bucket, named after the label itself.
    for bucket_name in buckets:
        target = os.path.join(output_dir, f"{bucket_name}.json")
        with open(target, 'w', encoding='utf-8') as sink:
            json.dump(buckets[bucket_name], sink, ensure_ascii=False, indent=2)

    print(f"Split complete. Files saved in: {output_dir}")


In [None]:
# Split GMF_1.json into one <label>.json file per label value.
# NOTE: the output directory is the directory that contains the input file, so
# a record labeled "GMF_1" would overwrite the input itself.
split_json_by_label("../data/evaluation/GMF_1.json", "../data/evaluation")

In [None]:
import json
import os

def split_nested_json_by_top_level_key(input_file, output_dir):
    """Write one sentence-pair file per top-level key of a nested JSON object.

    Args:
        input_file: Path to a UTF-8 JSON file whose top level is an object
            mapping a label name to a list of dict-like pair records.
        output_dir: Directory that receives ``<label>.json`` files; created
            if it does not exist yet.

    Every pair record is converted to
    ``{"sentence1": <query>, "sentence2": <candidate>, "label": <key>}``;
    a missing ``"query"`` or ``"candidate"`` becomes an empty string.
    """
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as source:
        groups = json.load(source)

    for group_name in groups:
        # The top-level key doubles as the label and as the output file stem.
        rows = [
            {
                "sentence1": item.get("query", ""),
                "sentence2": item.get("candidate", ""),
                "label": group_name,
            }
            for item in groups[group_name]
        ]

        target = os.path.join(output_dir, f"{group_name}.json")
        with open(target, 'w', encoding='utf-8') as sink:
            json.dump(rows, sink, ensure_ascii=False, indent=2)

    print(f"Files written to '{output_dir}' for labels: {', '.join(groups.keys())}")


In [None]:
# Convert each top-level key of Comp_Wild_Zwingli.json into its own
# sentence1/sentence2/label file inside ../data/evaluation/MF.
split_nested_json_by_top_level_key("../data/evaluation/Comp_Wild_Zwingli.json", "../data/evaluation/MF")

In [None]:
import json
import random
from pathlib import Path

# Seed Python's global RNG once. Every random.sample / random.shuffle call in
# the helpers below draws from this single stream, so the generated files
# depend on the order in which the generator cells run after this point.
random.seed(42)

# Two-letter file prefix -> label name. The label name is written into each
# pair's 'label' field and is also the stem of the source file that is read
# for that label (e.g. 'Qu' -> quote.json).
LABEL = {
    'Qu': 'quote',
    'Fu': 'fuzzy_quote',
    'Pa': 'paraphrase',
    'Si': 'similar_sentence'
}

# Number of labeled pairs of each kind that generate_general_files places in
# one "general" evaluation file (1 + 3 + 2 + 4 = 10 labeled pairs per file).
GENERAL_DISTRIBUTION = {
    'Qu': 1,
    'Fu': 3,
    'Pa': 2,
    'Si': 4
}


def load_labeled_pairs(file_path, label):
    """Load sentence pairs from a JSON file and tag them with a label name.

    Args:
        file_path: Path to a UTF-8 JSON file containing an iterable of records
            that each provide ``'sentence1'`` and ``'sentence2'``.
        label: Key into the module-level ``LABEL`` mapping (e.g. ``'Qu'``).

    Returns:
        A list of new dicts with exactly the keys ``'sentence1'``,
        ``'sentence2'`` and ``'label'``; any other field of the input records
        is dropped, and ``'label'`` is set to ``LABEL[label]``.
    """
    with open(file_path, encoding='utf-8') as handle:
        raw_pairs = json.load(handle)

    normalized = []
    for item in raw_pairs:
        normalized.append({
            'sentence1': item['sentence1'],
            'sentence2': item['sentence2'],
            # Looked up per record, so an empty file never touches LABEL.
            'label': LABEL[label],
        })
    return normalized


def load_corpus_sentences(corpus_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        corpus_path: Path to a UTF-8 file with one JSON document per line.

    Returns:
        The parsed JSON value of every non-blank line, in file order.
        Lines that are empty or whitespace-only are skipped.
    """
    records = []
    with open(corpus_path, encoding='utf-8') as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if not payload:
                continue
            records.append(json.loads(payload))
    return records


def _count_distinct_sentence_pairs(lines):
    """Count ordered index pairs (i, j), i != j, whose sentences differ."""
    tally = {}
    for line in lines:
        sentence = line['sentence']
        tally[sentence] = tally.get(sentence, 0) + 1
    total = len(lines)
    # total**2 ordered pairs overall, minus those sharing a sentence
    # (the latter includes every i == j pair).
    return total * total - sum(count * count for count in tally.values())


def sample_irrelevant_pairs(lines, num_samples):
    """Draw random sentence pairs from the corpus, labeled ``'irrelevant'``.

    Pairs are produced by rejection sampling on the global ``random`` stream:
    two distinct indices are drawn, and the pair is kept when the two
    sentences differ and this ordered index pair was not used before in this
    call. ``(i, j)`` and ``(j, i)`` count as different pairs.

    Args:
        lines: Sequence of records that each provide a ``'sentence'`` value.
        num_samples: Number of pairs to return; ``0`` or less yields ``[]``.

    Returns:
        A list of ``num_samples`` dicts with the keys ``'sentence1'``,
        ``'sentence2'`` and ``'label'`` (always ``'irrelevant'``).

    Raises:
        ValueError: If ``lines`` has fewer than two entries (raised by
            ``random.sample``), or if the corpus cannot supply
            ``num_samples`` distinct pairs of differing sentences. The
            original loop would spin forever in the latter case.
    """
    # After this many rejected draws in a row, verify once that the request
    # is satisfiable at all. The check is O(len(lines)), so it is deferred
    # until it is actually needed and the normal path pays nothing for it.
    rejection_streak_limit = 1000

    pairs = []
    seen = set()
    n = len(lines)
    rejection_streak = 0
    feasibility_checked = False
    while len(pairs) < num_samples:
        i, j = random.sample(range(n), 2)
        s1, s2 = lines[i]['sentence'], lines[j]['sentence']
        if s1 != s2 and (i, j) not in seen:
            seen.add((i, j))
            pairs.append({
                'sentence1': s1,
                'sentence2': s2,
                'label': 'irrelevant'
            })
            rejection_streak = 0
            continue

        rejection_streak += 1
        if rejection_streak >= rejection_streak_limit and not feasibility_checked:
            available = _count_distinct_sentence_pairs(lines)
            if available < num_samples:
                raise ValueError(
                    f"Cannot sample {num_samples} irrelevant pairs: the corpus "
                    f"only provides {available} distinct pairs of differing sentences."
                )
            # Feasible, just sparse: keep sampling without re-checking.
            feasibility_checked = True
    return pairs


def save_eval_file(pairs, output_root, prefix, folder_label, index):
    """Write one evaluation file as indented UTF-8 JSON.

    Args:
        pairs: JSON-serializable content of the file.
        output_root: ``pathlib.Path`` of the target directory; created
            (including parents) if it does not exist.
        prefix: Leading part of the file name (e.g. ``'Qu'``).
        folder_label: Second part of the file name.
        index: Integer rendered with at least two digits.

    The file is named ``{prefix}{folder_label}_{index:02d}.json``.
    """
    output_root.mkdir(parents=True, exist_ok=True)
    target = output_root / f"{prefix}{folder_label}_{index:02d}.json"
    with target.open('w', encoding='utf-8') as sink:
        json.dump(pairs, sink, indent=2, ensure_ascii=False)


def generate_category_files(folder_path, corpus_lines, output_path,
                            chunk_size=10, num_irrelevant=90):
    """Create per-label evaluation files mixing labeled and irrelevant pairs.

    For every entry of ``LABEL`` the file ``<label name>.json`` is read from
    ``folder_path``. Its pairs are cut into consecutive chunks of
    ``chunk_size``; each chunk is combined with ``num_irrelevant`` freshly
    sampled irrelevant pairs, shuffled, and written by ``save_eval_file`` as
    ``{prefix}{folder name}_{chunk index:02d}.json``.

    Args:
        folder_path: ``pathlib.Path`` of the folder holding the labeled source
            files; its ``name`` becomes part of every output file name.
        corpus_lines: Corpus records handed to ``sample_irrelevant_pairs``.
        output_path: ``pathlib.Path`` of the output directory.
        chunk_size: Labeled pairs per output file (default 10).
        num_irrelevant: Irrelevant pairs per output file (default 90).

    Raises:
        ValueError: If ``chunk_size`` is not positive.

    Note:
        Only complete chunks are written; a trailing remainder of fewer than
        ``chunk_size`` labeled pairs is left out.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    folder_label = folder_path.name
    for prefix, full_label in LABEL.items():
        labeled_pairs = load_labeled_pairs(folder_path / f"{full_label}.json", prefix)

        for index in range(len(labeled_pairs) // chunk_size):
            start = index * chunk_size
            chunk_labeled = labeled_pairs[start:start + chunk_size]
            sampled_irrelevant = sample_irrelevant_pairs(corpus_lines, num_irrelevant)
            full = chunk_labeled + sampled_irrelevant
            random.shuffle(full)
            save_eval_file(full, output_path, prefix, folder_label, index)


def generate_general_files(folder_path, corpus_lines, output_path,
                           num_irrelevant=990):
    """Create "general" evaluation files that mix all labels in fixed ratios.

    Each output file takes, for every label in ``GENERAL_DISTRIBUTION``, the
    next ``count`` labeled pairs of that label, adds ``num_irrelevant``
    sampled irrelevant pairs, shuffles the result, and is written by
    ``save_eval_file`` as ``Ge{folder name}_{index:02d}.json``.

    Args:
        folder_path: ``pathlib.Path`` of the folder holding one
            ``<label name>.json`` per entry of ``LABEL``; its ``name`` becomes
            part of every output file name.
        corpus_lines: Corpus records handed to ``sample_irrelevant_pairs``.
        output_path: ``pathlib.Path`` of the output directory.
        num_irrelevant: Irrelevant pairs per output file (default 990).

    The number of files is limited by the label that runs out first.
    """
    folder_label = folder_path.name
    all_labeled = {
        key: load_labeled_pairs(folder_path / f"{fname}.json", key)
        for key, fname in LABEL.items()
    }

    # Derive the limit from GENERAL_DISTRIBUTION itself, so it cannot drift
    # from the per-label counts used for slicing below. Labels with a count of
    # zero contribute nothing and therefore do not limit the number of sets.
    num_sets = min(
        (
            len(all_labeled[key]) // count
            for key, count in GENERAL_DISTRIBUTION.items()
            if count > 0
        ),
        default=0,
    )

    for i in range(num_sets):
        block = []
        for key, count in GENERAL_DISTRIBUTION.items():
            start = i * count
            block.extend(all_labeled[key][start:start + count])
        sampled_irrelevant = sample_irrelevant_pairs(corpus_lines, num_irrelevant)
        full = block + sampled_irrelevant
        random.shuffle(full)
        save_eval_file(full, output_path, 'Ge', folder_label, i)

def generate_custom_files(folder_path, corpus_lines, output_path):
    """Create the "Fu2" and "Pa2" mixed-label evaluation files.

    Each configuration maps a label prefix to the number of pairs of that
    label per output file; the special key ``'Ir'`` is the number of sampled
    irrelevant pairs. For every configuration as many files are written as
    the scarcest label allows, via ``save_eval_file``, named
    ``{configuration name}{folder name}_{index:02d}.json``.

    Args:
        folder_path: ``pathlib.Path`` of the folder holding the labeled source
            files (``<label name>.json``); its ``name`` becomes part of every
            output file name.
        corpus_lines: Corpus records handed to ``sample_irrelevant_pairs``.
        output_path: ``pathlib.Path`` of the output directory.
    """
    folder_label = folder_path.name

    # Output file prefix -> pairs per file for each label ('Ir' = irrelevant).
    configurations = {
        'Fu2': {'Fu': 10, 'Pa': 10, 'Si': 10, 'Ir': 70},
        'Pa2': {'Pa': 10, 'Si': 10, 'Ir': 80},
    }

    # Only read the label files that some configuration actually uses.
    needed_labels = {
        key
        for counts in configurations.values()
        for key in counts
        if key != 'Ir'
    }
    all_labeled = {
        key: load_labeled_pairs(folder_path / f"{fname}.json", key)
        for key, fname in LABEL.items()
        if key in needed_labels
    }

    def write_configuration(prefix, counts):
        """Write every complete file for one configuration."""
        labeled_counts = {k: v for k, v in counts.items() if k != 'Ir'}
        num_irrelevant = counts.get('Ir', 0)
        num_sets = min(
            len(all_labeled[k]) // v for k, v in labeled_counts.items()
        )
        for i in range(num_sets):
            block = []
            for k, v in labeled_counts.items():
                start = i * v
                block.extend(all_labeled[k][start:start + v])
            block.extend(sample_irrelevant_pairs(corpus_lines, num_irrelevant))
            random.shuffle(block)
            save_eval_file(block, output_path, prefix, folder_label, i)

    # Dict order is insertion order, so 'Fu2' is generated before 'Pa2' and
    # the random stream is consumed in the same order as before.
    for prefix, counts in configurations.items():
        write_configuration(prefix, counts)


In [None]:
# Build the mixed-label "Fu2"/"Pa2" evaluation files for source folder "M".
# The global RNG is seeded only once (in the definitions cell), so the sampled
# irrelevant pairs depend on which generator cells ran before this one.
corpus_lines = load_corpus_sentences(Path("../data/corpus/corpus/corpus.jsonl"))
generate_custom_files(Path("../data/evaluation/eval-task-sources/M"), corpus_lines, Path("../data/evaluation/eval-tasks-M2"))

In [None]:
# Build the mixed-label "Fu2"/"Pa2" evaluation files for source folder "S".
# The corpus is re-read from disk here so the cell can run on its own.
corpus_lines = load_corpus_sentences(Path("../data/corpus/corpus/corpus.jsonl"))
generate_custom_files(Path("../data/evaluation/eval-task-sources/S"), corpus_lines, Path("../data/evaluation/eval-tasks-S2"))

In [None]:
# Build the per-label evaluation files (10 labeled + 90 irrelevant pairs each)
# for source folder "S".
corpus_lines = load_corpus_sentences(Path("../data/corpus/corpus/corpus.jsonl"))

generate_category_files(Path("../data/evaluation/eval-task-sources/S"), corpus_lines, Path("../data/evaluation/eval-tasks-S1"))
# Disabled "general" run. NOTE: its paths lack the "../" prefix used by every
# other call in this notebook; adjust before re-enabling.
#generate_general_files(Path("data/evaluation/eval-task-sources/S"), corpus_lines, Path("data/evaluation/eval-tasks"))


In [None]:
# Build the per-label evaluation files (10 labeled + 90 irrelevant pairs each)
# for source folder "M".
corpus_lines = load_corpus_sentences(Path("../data/corpus/corpus/corpus.jsonl"))

generate_category_files(Path("../data/evaluation/eval-task-sources/M"), corpus_lines, Path("../data/evaluation/eval-tasks-M1"))
# Disabled "general" run for source folder "M".
#generate_general_files(Path("../data/evaluation/eval-task-sources/M"), corpus_lines, Path("../data/evaluation/eval-tasks-M"))

In [None]:
import os
import json
import spacy
from tqdm.notebook import tqdm

def analyze_jsonl_folder(
    folder_path: str,
    text_key: str = "sentence",
    model: str = "la_core_web_lg"
) -> None:
    """
    Analyzes .jsonl files in a folder for sentence and token counts,
    printing results at the end in JSON format.

    Every JSON object line counts as one sentence; its tokens are the
    non-whitespace tokens spaCy produces for the value stored under
    ``text_key`` (an absent key is treated as an empty text).

    Args:
        folder_path: Folder whose ``*.jsonl`` files are analyzed
            (not recursive).
        text_key: Key of the text field inside each JSON line.
        model: Name of the spaCy pipeline to load; its ``parser`` and ``ner``
            components are disabled.

    The report is printed to stdout as
    ``{"files": [{"file", "sentences", "tokens"}, ...], "total": {...}}``.
    Files that cannot be read, and lines that are blank, malformed JSON or
    not a JSON object, are skipped and reported on stderr instead of being
    dropped silently.
    """
    import sys  # local import: only needed to route diagnostics to stderr

    nlp = spacy.load(model, disable=["parser", "ner"])
    total_sentences = 0
    total_tokens = 0
    file_stats = []

    # Sorted so the report order is deterministic; os.listdir order is arbitrary.
    jsonl_files = sorted(
        f for f in os.listdir(folder_path) if f.endswith(".jsonl")
    )

    for filename in tqdm(jsonl_files, desc="Files", unit="file"):
        file_path = os.path.join(folder_path, filename)
        sentence_count = 0
        token_count = 0
        skipped_lines = 0

        # First pass only counts lines so the inner progress bar has a total.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                line_count = sum(1 for _ in f)
        except (OSError, UnicodeError) as exc:
            # Best effort: keep going with the other files, but say so.
            tqdm.write(f"Skipping file {filename}: {exc}", file=sys.stderr)
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            for line in tqdm(f, total=line_count,
                             desc=filename, unit="line", leave=False):
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    skipped_lines += 1
                    continue
                if not isinstance(data, dict):
                    # Valid JSON but not an object: there is no text field to read.
                    skipped_lines += 1
                    continue

                doc = nlp(data.get(text_key, ""))
                sentence_count += 1
                token_count += sum(1 for t in doc if not t.is_space)

        if skipped_lines:
            tqdm.write(
                f"{filename}: skipped {skipped_lines} blank or malformed line(s)",
                file=sys.stderr,
            )

        file_stats.append({
            "file": filename,
            "sentences": sentence_count,
            "tokens": token_count
        })

        total_sentences += sentence_count
        total_tokens += token_count

    result = {
        "files": file_stats,
        "total": {
            "sentences": total_sentences,
            "tokens": total_tokens
        }
    }

    print(json.dumps(result, indent=2))


In [None]:
# Report sentence and token counts for every .jsonl file in the documents
# folder, using the default text key ("sentence") and spaCy model.
analyze_jsonl_folder("../data/corpus/documents/")