In [1]:
import concurrent.futures
import json
import requests
import os
import s3fs
from rdflib import Graph
s3 = s3fs.S3FileSystem(anon=True)

API = "https://api.dandiarchive.org/api/dandisets"
headers_dandi = {"Accept": "application/json"}

def replace_quotes(chunk):
    return chunk.replace("'", '"')

def _parallel_replace_quotes(text, chunk_size=100000, num_threads=4):
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        result_chunks = list(executor.map(replace_quotes, chunks))
    return ''.join(result_chunks)

def _get_s3_bucket_url(jsondata): 
    s3_urls = []
    if "results" not in jsondata:
        return []
    
    for i in range(0,len(jsondata["results"])):
        _id = jsondata["results"][i]["identifier"]
        _ver = jsondata["results"][i]["draft_version"]["version"] if jsondata["results"][i]["most_recent_published_version"] == None else jsondata["results"][i]["most_recent_published_version"]["version"]
        url = f's3://dandiarchive/dandisets/{_id}/{_ver}/'
        s3_urls.append(url)
    return s3_urls


def _get_jsonld_file_list_s3(s3url):
    try:
        return list(filter(lambda x: x.endswith('.jsonld'), s3.ls(s3url)))
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

def _convert_to_turtle(jsonlddata, export=False, name="data"):
    if export:
        filename = f"{name}.ttl"
        Graph().parse(data=jsonlddata, format='json-ld').serialize(format="turtle", destination=filename)
        return "File exported into turtle format successfully!"
    else:
        return Graph().parse(data=jsonlddata, format='json-ld').serialize(format="turtle")

def _get_export_file_name(file_str):
    split_filestr = file_str.split("/")
    return f"{split_filestr[4].split('.')[0]}_{split_filestr[2]}_{split_filestr[3]}"
    
def _get_and_convert_turtle(s3_files):
    for file in s3_files:
        print("*"*100)
        export_file_name = _get_export_file_name(file)
        with s3.open(file, 'rb') as f:
            data_js = str(json.load(f))
            modified_text = _parallel_replace_quotes(data_js)
            turtle_data = _convert_to_turtle(modified_text, True, export_file_name)
            print(turtle_data)
        print("*"*100)
     

            
    

def get_jsonld_and_upload(total_pages):
    total_pages = 2 if total_pages <= 1 else total_pages
    for page_num in range(1,total_pages):
        url_dandi_datasets = f"{API}/?pages={page_num}"
        dandi_dataset_info_response = requests.get(url_dandi_datasets)
        if dandi_dataset_info_response.status_code ==200:
            for s3url in (_get_s3_bucket_url(json.loads(dandi_dataset_info_response.text))):
                try:
                    jsonld_files = _get_jsonld_file_list_s3(s3url)
                    print(jsonld_files)
                    if len(jsonld_files)>0:
                        _get_and_convert_turtle(jsonld_files)
                except:
                    continue
            else:
                printf("Unable to get the data")
        
    

In [None]:
get_jsonld_and_upload(1)


['dandiarchive/dandisets/000003/0.230629.1955/assets.jsonld', 'dandiarchive/dandisets/000003/0.230629.1955/collection.jsonld', 'dandiarchive/dandisets/000003/0.230629.1955/dandiset.jsonld']
****************************************************************************************************
File exported into turtle format successfully!
****************************************************************************************************
****************************************************************************************************
File exported into turtle format successfully!
****************************************************************************************************
****************************************************************************************************
['dandiarchive/dandisets/000004/0.220126.1852/assets.jsonld', 'dandiarchive/dandisets/000004/0.220126.1852/collection.jsonld', 'dandiarchive/dandisets/000004/0.220126.1852/dandiset.jsonld']
*********************