In [1]:
from collections import defaultdict
import os
import re

import pandas as pd

# Initialize central mapping.
# Nested structure: work ID -> source key (e.g. 'GRETIL') -> set of link values.
# Both levels are defaultdicts, so indexing a missing work ID or source key
# creates the entry on the fly; using a set de-duplicates repeated links.
work_id_mapping = defaultdict(lambda: defaultdict(set))

In [2]:
# Read GRETIL CSV
df = pd.read_csv("2025-02-26-gretil-etext-data.csv")

# Populate mapping with GRETIL links.
#   i counts the rows that carry both a Work ID and a GRETIL link text;
#   j counts the individual (work ID, link) pairs that were recorded.
i = j = 0
for _, row in df.iterrows():
    raw_ids = row['Work ID']
    link_text = row['GRETIL link text']
    if pd.isna(raw_ids) or raw_ids == "" or pd.isna(link_text) or link_text == "":
        continue
    i += 1
    # A single cell may list several work IDs separated by commas or line breaks.
    for work_id in (wid.strip() for wid in re.split(r'[,\r\n]+', str(raw_ids))):
        # Skip the "..." placeholder, and also the empty tokens that a leading or
        # trailing separator (e.g. "A1,") leaves behind; otherwise a bogus ""
        # key would be added to the mapping and inflate j.
        if not work_id or work_id == "...":
            continue
        j += 1
        work_id_mapping[work_id]['GRETIL'].add(link_text)

print(i, j)

887 754


In [3]:
# Function to extend the mapping with a new dataset
def extend_mapping(csv_path, source_key, mapping=None):
    """Merge the links of another source's CSV into the work-ID mapping.

    The CSV must provide a 'Work ID' column and a '<source_key> link' column
    (for example 'SARIT link' when source_key is "SARIT"). Rows in which
    either value is missing or empty are skipped. A 'Work ID' cell may hold
    several IDs separated by commas or line breaks; the row's link is recorded
    under every one of them.

    Args:
        csv_path: Path or file-like object, passed straight to pd.read_csv.
        source_key: Name of the source; used both as the key under each work
            ID and as the prefix of the link column name.
        mapping: Nested mapping to update in place. It must create missing
            entries on access, like defaultdict(lambda: defaultdict(set)).
            Defaults to the notebook-level work_id_mapping.

    Returns:
        None. The mapping is modified in place.
    """
    target = work_id_mapping if mapping is None else mapping
    link_column = f'{source_key} link'  # built once instead of once per access
    df_new = pd.read_csv(csv_path)
    for _, row in df_new.iterrows():
        raw_ids = row['Work ID']
        link = row[link_column]
        if pd.isna(raw_ids) or raw_ids == "" or pd.isna(link) or link == "":
            continue
        for work_id in (wid.strip() for wid in re.split(r'[,\r\n]+', str(raw_ids))):
            # Mirror the GRETIL ingestion: ignore the "..." placeholder, and
            # ignore empty tokens produced by a leading/trailing separator so
            # that no "" key ends up in the mapping.
            if not work_id or work_id == "...":
                continue
            target[work_id][source_key].add(link)

# Example of extending the mapping with SARIT data
# extend_mapping("sarit_pandit_grounding.csv", "SARIT")

In [4]:
import json

# Convert defaultdict(set) to regular dict with lists for JSON serialization
def convert_to_serializable(d):
    """Return a plain-dict copy of a two-level mapping whose leaves are sets.

    Args:
        d: Mapping of key -> mapping of sub-key -> iterable of values
            (here: work ID -> source key -> set of links).

    Returns:
        A dict of dicts in which every leaf collection is a list. The lists
        are sorted so the serialized output is identical from run to run;
        plain list(set) ordering of strings varies between interpreter
        sessions because of hash randomization. Sorting uses key=str so a
        leaf with mixed value types cannot raise TypeError.
    """
    return {
        outer_key: {inner_key: sorted(values, key=str) for inner_key, values in inner.items()}
        for outer_key, inner in d.items()
    }

output_filename = "2025-02-26-gretil-link-data.json"
# os.path.join() with a single argument returns it unchanged, so the path is
# simply the file name (relative to the current working directory).
output_json_path = output_filename
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly; the platform default encoding may be unable to
# encode them (UnicodeEncodeError) or may produce a non-UTF-8 file.
with open(output_json_path, 'w', encoding='utf-8') as jsonfile:
    json.dump(convert_to_serializable(work_id_mapping), jsonfile, indent=4, ensure_ascii=False)
