In [29]:
from collections import defaultdict
import os
import re

import pandas as pd

# Initialize mapping
work_id_mapping = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))

In [30]:
# Read SETI CSV
df = pd.read_csv("2025-03-18-seti.csv")

In [31]:
pd.options.display.max_colwidth = 200

In [32]:
df[df['Collection']=='SARIT']['Link 1 (main)']

1037           https://sarit.indology.info/arunadatta-sarvangasundara.xml
1038                 https://sarit.indology.info/astangahrdayasamhita.xml
1039                      https://sarit.indology.info/astangasangraha.xml
1040                        https://sarit.indology.info/astavakragita.xml
1041               https://sarit.indology.info/asvaghosa-buddhacarita.xml
                                      ...                                
1115    https://sarit.indology.info/vagbhata-rasaratnasamuccaya-comms.xml
1116               https://sarit.indology.info/vatsyayana-nyayabhasya.xml
1117          https://sarit.indology.info/vidhiviveka-and-nyayakanika.xml
1118                                                                  NaN
1119                                                                  NaN
Name: Link 1 (main), Length: 83, dtype: object

In [42]:
import pandas as pd
import re
from collections import defaultdict

# Define the renaming mapping for each collection
labels = {
    'SARIT': ('web HTML', 'GitHub XML'),
    'DCS': ('web HTML', 'GitHub (1) CoNLL-U', 'GitHub (2) TXT'),
    'Sanskrit Library / TITUS': ('Skt Lib web HTML', 'TITUS web HTML'),
}

# Initialize mapping: work_id -> collection -> subtype (optional) -> **set** of links (to avoid duplicates)
work_id_mapping = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))

# Define possible link types
link_types = {
    'main': 'Link 1 (main)',
    'underlying': 'Link 2 (underlying)',
    'extract': 'Link 3 (extract)',
}

# Process each row in the DataFrame
for row in df.to_dict(orient="records"):
    collection_name = row['Collection']
    
    if pd.isna(row['Work ID']) or row['Work ID'] == "":
        continue

    work_ids = [wid.strip() for wid in re.split(r'[,\r\n]+', str(row['Work ID']))]

    # Determine the label mapping for this collection
    mapped_labels = labels.get(collection_name, list(link_types.keys()))

    # Iterate over each link column
    for link_type, col_name in link_types.items():
        if col_name in row and pd.notna(row[col_name]) and row[col_name].strip():
            link = row[col_name].strip()
            subtype = mapped_labels[list(link_types.keys()).index(link_type)] if collection_name in labels else link_type
            
            # Add to work_id_mapping using **set** to prevent duplicates
            for work_id in work_ids:
                work_id_mapping[work_id][collection_name][subtype].add(link)

# ✅ **New Step: Flatten and Sort Collections**
for work_id, collections in work_id_mapping.items():
    for collection_name, subtypes in list(collections.items()):  # Convert to list to allow modification
        # Convert all sets to sorted lists
        for subtype, links in subtypes.items():
            work_id_mapping[work_id][collection_name][subtype] = sorted(links)

        # If only one subtype exists, move directly under collection_name
        if len(subtypes) == 1:
            work_id_mapping[work_id][collection_name] = next(iter(subtypes.values()))  # Extract the only list


In [43]:
# test mbh case
work_id_mapping['42078']

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'GRETIL': ['http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_01_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_02_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_03_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_04_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_05_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_06_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_07_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_08_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_09_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_10_u.htm'

In [44]:
# test more specific mbh case: udyogaparvan
work_id_mapping['112267']


defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'GRETIL': ['http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_05_u.htm']})

In [45]:
import json

# Convert defaultdict(set) to regular dict with lists for JSON serialization
def convert_to_serializable(d):
    result = {}
    for work_id, collections in d.items():
        result[work_id] = {}
        for collection_name, links in collections.items():
            if isinstance(links, str):  # Direct mapping (single link)
                result[work_id][collection_name] = links
            elif isinstance(links, set):  # Single link stored as a set
                result[work_id][collection_name] = list(links)[0] if len(links) == 1 else list(links)
            elif isinstance(links, dict):  # Multiple link types (convert sets to lists)
                result[work_id][collection_name] = {k: list(v) for k, v in links.items()}
            else:
                result[work_id][collection_name] = links  # Already in the right format
    return result

output_filename = "2025-03-18-etext-link-data.json"
output_json_path = os.path.join(output_filename)
with open(output_json_path, 'w') as jsonfile:
    json.dump(convert_to_serializable(work_id_mapping), jsonfile, indent=4, ensure_ascii=False)


In [27]:
# Define the renaming mapping for each collection
labels = {
    # 'GRETIL': ('web'),
    'SARIT': ('web HTML', 'GitHub XML'),
    # 'Pramāṇa NLP': ('GitHub'),
    'DCS': ('web HTML', 'GitHub (1) CoNLL-U', 'GitHub (2) TXT'),
    # 'MB KSTS': ('web'),
    'Sanskrit Library / TITUS': ('Skt Lib web HTML', 'TITUS web HTML'),
}

for row in df.to_dict(orient="records"):
    collection_name = row['Collection']
    
    if pd.isna(row['Work ID']) or row['Work ID'] == "":
        continue
    
    # Define possible link types
    link_types = {
        'main': 'Link 1 (main)',
        'underlying': 'Link 2 (underlying)',
        'extract': 'Link 3 (extract)',
    }

    # Extract valid links
    links = {
        link_type: {row[col_name].strip()} for link_type, col_name in link_types.items()
        if col_name in row and pd.notna(row[col_name]) and row[col_name].strip()
    }

    # If no valid links, skip
    if not links:
        continue

    # Apply renaming mapping if applicable
    if collection_name in labels:
        mapped_labels = labels[collection_name]
        # Ensure values in `link_mapping` are always sets
        link_mapping = {label: (value if isinstance(value, set) else {value}) 
                        for label, value in zip(mapped_labels, links.values())}
    else:
        link_mapping = {k: (v if isinstance(v, set) else {v}) for k, v in links.items()}  # Ensure default case is also handled

    work_ids = [wid.strip() for wid in re.split(r'[,\r\n]+', str(row['Work ID']))]
    
    for work_id in work_ids:
        if work_id == "...":
            continue
        
        if len(link_mapping) == 1:
            # Single link type: Merge instead of overwrite
            single_link_type = next(iter(link_mapping.keys()))
            if isinstance(work_id_mapping[work_id][collection_name], set):
                work_id_mapping[work_id][collection_name].update(link_mapping[single_link_type])
            else:
                work_id_mapping[work_id][collection_name] = link_mapping[single_link_type]
        else:
            # Multiple link types: Merge category-wise
            for link_type, link_set in link_mapping.items():
                work_id_mapping[work_id][collection_name][link_type].update(link_set)


TypeError: 'set' object is not subscriptable

In [26]:
work_id_mapping['42078']

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'GRETIL': {'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_01_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_02_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_03_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_04_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_05_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_06_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_07_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_08_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_09_u.htm',
              'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/2_epic/mbh/mbh_10_u.htm'

In [10]:
import json

# Convert defaultdict(set) to regular dict with lists for JSON serialization
def convert_to_serializable(d):
    result = {}
    for work_id, collections in d.items():
        result[work_id] = {}
        for collection_name, links in collections.items():
            if isinstance(links, str):  # Direct mapping (single link)
                result[work_id][collection_name] = links
            elif isinstance(links, set):  # Single link stored as a set
                result[work_id][collection_name] = list(links)[0] if len(links) == 1 else list(links)
            elif isinstance(links, dict):  # Multiple link types (convert sets to lists)
                result[work_id][collection_name] = {k: list(v) for k, v in links.items()}
            else:
                result[work_id][collection_name] = links  # Already in the right format
    return result

output_filename = "2025-03-18-etext-link-data.json"
output_json_path = os.path.join(output_filename)
with open(output_json_path, 'w') as jsonfile:
    json.dump(convert_to_serializable(work_id_mapping), jsonfile, indent=4, ensure_ascii=False)


In [4]:
# Populate mapping with SETI links

"Link 1 (main)	Link 2 (underlying)	Link 3 (extract)"

i = j = 0
for _, row in df.iterrows():
    collection_name = row['Collection']
    if pd.isna(row['Work ID']) or row['Work ID'] == "" or pd.isna(row['Link 1 (main)']) or row['Link 1 (main)'] == "":
        continue
    i += 1
    work_ids = [wid.strip() for wid in re.split(r'[,\r\n]+', str(row['Work ID']))]
    for work_id in work_ids:
        if work_id == "...":
            continue
        j += 1
        work_id_mapping[work_id][collection_name].add(row['Link 1 (main)'])

print(i, j)

1461 1198


In [7]:
# Read GRETIL CSV
df = pd.read_csv("2025-02-26-gretil-etext-data.csv")

# Populate mapping with GRETIL links
i = j = 0
for _, row in df.iterrows():
    if pd.isna(row['Work ID']) or row['Work ID'] == "" or pd.isna(row['GRETIL link text']) or row['GRETIL link text'] == "":
        continue
    i += 1
    work_ids = [wid.strip() for wid in re.split(r'[,\r\n]+', str(row['Work ID']))]
    for work_id in work_ids:
        if work_id == "...":
            continue
        j += 1
        work_id_mapping[work_id]['GRETIL'].add(row['GRETIL link text'])

print(i, j)

887 754


In [9]:
import json

# Convert defaultdict(set) to regular dict with lists for JSON serialization
def convert_to_serializable(d):
    return {k: {sk: list(sv) for sk, sv in v.items()} for k, v in d.items()}

output_filename = "2025-03-18-etext-link-data.json"
output_json_path = os.path.join(output_filename)
with open(output_json_path, 'w') as jsonfile:
    json.dump(convert_to_serializable(work_id_mapping), jsonfile, indent=4, ensure_ascii=False)
