In [1]:
import sys
import os

# Get the absolute path of the parent directory of 'data' (i.e. one level above
# the notebook's current working directory). Presumably this is where the
# project-local packages imported later in the notebook live.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add the parent directory to sys.path, but only once: re-running this cell
# must not accumulate duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import pandas as pd
from collections import defaultdict
import re

# Central registry shared by the cells below: work ID -> source name -> set of
# links. Unknown work IDs and unknown sources are created on first access.
def _new_source_links():
    """Return an empty per-work container mapping a source name to a set."""
    return defaultdict(set)


work_id_mapping = defaultdict(_new_source_links)

In [3]:
# Read GRETIL CSV.
# 'Work ID' is forced to str so IDs are never parsed as numbers; a numeric
# column containing blanks would otherwise be read as float and produce keys
# such as "41242.0".
df = pd.read_csv("gretil_pandit_grounding.csv", dtype={"Work ID": str})

# Populate mapping with GRETIL links.
# `i` counts the rows that contributed at least one work ID.
i = 0
for _, row in df.iterrows():
    raw_ids, link = row['Work ID'], row['GRETIL link text']
    if pd.isna(raw_ids) or pd.isna(link) or link == "":
        continue
    # A cell may hold several IDs separated by commas and/or line breaks.
    # Blank fragments (trailing separator, whitespace-only cell) and the "..."
    # placeholder are dropped so they never become keys in the mapping.
    work_ids = [
        fragment.strip()
        for fragment in re.split(r'[,\r\n]+', str(raw_ids))
        if fragment.strip() not in ("", "...")
    ]
    if not work_ids:
        continue
    i += 1
    for work_id in work_ids:
        work_id_mapping[work_id]['GRETIL'].add(link)

print(i)

733


In [4]:
# Function to extend the mapping with a new dataset
def extend_mapping(csv_path, source_key, mapping=None):
    """Add the links found in one grounding CSV to a work-ID mapping.

    The CSV is expected to have a 'Work ID' column and a '<source_key> link'
    column. A 'Work ID' cell may list several IDs separated by commas and/or
    line breaks; the row's link is recorded under every one of them. Rows with
    a missing/empty work ID or link are skipped, as are blank ID fragments and
    the "..." placeholder.

    Args:
        csv_path: Path of the CSV file to read.
        source_key: Source name (e.g. "SARIT"). Used as the second-level key in
            the mapping and as the prefix of the link column name.
        mapping: Optional nested mapping (work ID -> source -> set of links)
            to update. It must create missing entries on access, like a
            defaultdict of defaultdict(set). Defaults to the notebook-level
            ``work_id_mapping``.

    Returns:
        None. The mapping is updated in place.

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
    """
    target = work_id_mapping if mapping is None else mapping
    link_column = f'{source_key} link'
    # Read IDs as text: a numeric column containing blanks would otherwise be
    # parsed as float and yield keys such as "41242.0".
    df_new = pd.read_csv(csv_path, dtype={'Work ID': str})
    for _, row in df_new.iterrows():
        raw_ids, link = row['Work ID'], row[link_column]
        if pd.isna(raw_ids) or pd.isna(link) or link == "":
            continue
        for fragment in re.split(r'[,\r\n]+', str(raw_ids)):
            work_id = fragment.strip()
            # Skip empty fragments (e.g. from a trailing comma) and placeholders
            # so that they never become keys.
            if work_id in ("", "..."):
                continue
            target[work_id][source_key].add(link)

# Example of extending the mapping with SARIT data
# extend_mapping("sarit_pandit_grounding.csv", "SARIT")

In [5]:
# Snapshot the work-ID keys once, then peek at the first entry as a quick
# sanity check of what was loaded.
ks = list(work_id_mapping)
k0 = ks[0]
print(k0, work_id_mapping[k0])

41242 defaultdict(<class 'set'>, {'GRETIL': {'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/5_poetry/1_alam/mamkprku.htm', 'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/5_poetry/1_alam/mamkpb_u.htm', 'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/5_poetry/1_alam/mamkavpu.htm'}})


In [6]:
# Number of distinct work-ID keys currently held in the mapping.
len(work_id_mapping)

409

In [7]:
# Display the list of work-ID keys captured in `ks`.
# NOTE(review): this shows every key; a slice such as `ks[:10]` would keep the
# cell output short.
ks

['41242',
 '41312',
 '41324',
 '41327',
 '41387',
 '41414',
 '41458',
 '41478',
 '41500',
 '41506',
 '41541',
 '41543',
 '41544',
 '41663',
 '41668',
 '41893',
 '41899',
 '41906',
 '41914',
 '41990',
 '42030',
 '42036',
 '42120',
 '42161',
 '42214',
 '42357',
 '42361',
 '42380',
 '42401',
 '42524',
 '44744',
 '85232',
 '88015',
 '88036',
 '88037',
 '88042',
 '88043',
 '88055',
 '88072',
 '88074',
 '88078',
 '88079',
 '88080',
 '88081',
 '88082',
 '88083',
 '88087',
 '88089',
 '88091',
 '88092',
 '88093',
 '88094',
 '88096',
 '88100',
 '88101',
 '88102',
 '88120',
 '88121',
 '88132',
 '88139',
 '88148',
 '88153',
 '88154',
 '88176',
 '88183',
 '88191',
 '88193',
 '88201',
 '88212',
 '88213',
 '88230',
 '88236',
 '88238',
 '88240',
 '88243',
 '88249',
 '88250',
 '88257',
 '88258',
 '88262',
 '88341',
 '88353',
 '88354',
 '88356',
 '88371',
 '88373',
 '88376',
 '88418',
 '88422',
 '88451',
 '88455',
 '88515',
 '88531',
 '88543',
 '88548',
 '88550',
 '88559',
 '88565',
 '88574',
 '88576',


In [8]:
# Scratch check: dict() turns the outer defaultdict into a plain dict.
# The conversion is shallow, so the per-work values remain defaultdicts.
type(dict(work_id_mapping))

dict

In [9]:
import json

# Convert defaultdict(set) to regular dict with lists for JSON serialization
def convert_to_serializable(d):
    """Return a JSON-serializable copy of a two-level mapping of sets.

    Args:
        d: Mapping of outer key -> inner mapping of key -> iterable of values
            (sets, in this notebook).

    Returns:
        A plain dict of plain dicts whose leaf values are lists. Each list is
        sorted (by string form, so mixed value types cannot raise) to make the
        output deterministic; iterating a set directly would give an order
        that can differ between runs and cause spurious diffs in the JSON file.
        The input is not modified.
    """
    return {
        outer_key: {
            inner_key: sorted(values, key=str)
            for inner_key, values in inner.items()
        }
        for outer_key, inner in d.items()
    }

# Write the mapping next to the notebook (relative to the current working
# directory).
output_filename = "gretil_link_data.json"
output_json_path = output_filename
# ensure_ascii=False writes non-ASCII characters verbatim, so the file is opened
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(output_json_path, 'w', encoding='utf-8') as jsonfile:
    json.dump(convert_to_serializable(work_id_mapping), jsonfile, indent=4, ensure_ascii=False)


In [10]:
# `utils` is a project-local package; this import presumably relies on the
# parent directory having been added to sys.path at the top of the notebook.
from utils.transform import create_entities
# NOTE(review): the output recorded for this cell shows execution stopping at a
# `pdb.set_trace()` inside utils/transform.py (create_entities, line 33).
# Remove that breakpoint so the notebook can run non-interactively.
result = create_entities()

> [0;32m/Users/tyler/Git/panditya/utils/transform.py[0m(34)[0;36mcreate_entities[0;34m()[0m
[0;32m     32 [0;31m            [0;32mif[0m [0mwork_id[0m [0;32min[0m [0mlink_data[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     33 [0;31m                [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 34 [0;31m            [0mwork_name[0m [0;34m=[0m [0mrow[0m[0;34m[[0m[0;34m"Title"[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     35 [0;31m            [0mauthor_ids[0m [0;34m=[0m [0;34m[[0m[0mid[0m[0;34m.[0m[0mstrip[0m[0;34m([0m[0;34m)[0m [0;32mfor[0m [0mid[0m [0;32min[0m [0;34m([0m[0mrow[0m[0;34m[[0m[0;34m"Authors (IDs)"[0m[0;34m][0m [0;32mor[0m [0;34m""[0m[0;34m)[0m[0;34m.[0m[0msplit[0m[0;34m([0m[0;34m","[0m[0;34m)[0m [0;32mif[0m [0mid[0m[0;34m.[0m[0mstrip[0m[0;34m([0m[0;34m)[0m[0;34m][0m

ipdb>  work_id in link_data


True


ipdb>  link_data[work_id]


{'GRETIL': ['http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/6_sastra/3_phil/buddh/vakobhku.htm', 'http://gretil.sub.uni-goettingen.de/gretil/1_sanskr/6_sastra/3_phil/buddh/vakobhau.htm']}


ipdb>  q
