In [None]:
from pkgutil import read_code

from mlcroissant import RecordSet

from cryoet_data_portal_croissant.gen import generate_mlcroissant
import mlcroissant as mlc
import json

# Generate Croissant metadata for portal dataset 10000; the result is indexable.
ds = generate_mlcroissant(dataset_ids=[10000])

# Persist the first metadata entry as pretty-printed JSON with a trailing newline.
with open("test.json", "w") as f:
    serialized = json.dumps(ds[0].to_json(), indent=4, default=str)
    print(serialized, file=f)

In [None]:
# Wrap the first generated metadata entry in an mlcroissant Dataset.
dataset = mlc.Dataset.from_metadata(ds[0])
dataset.debug = True
# Request the records of the "runs" record set.
records = dataset.records(record_set="runs")

# Print every record of that record set.
for record in records:
    print(record)


In [None]:
from linkml_runtime.utils.schemaview import SchemaView

# Open the schema stored in the local file `schema.yaml`.
sv = SchemaView('schema.yaml')

# List what is available on the "Run" class definition object.
print(dir(sv.all_classes()["Run"]))

# Show the definition of the `tiltseries` attribute of "Run" as the cell result.
sv.all_classes()["Run"].attributes['tiltseries']

In [None]:
# Maps slot range names to mlcroissant data types. class_to_recordset uses it
# to decide whether a slot becomes a plain typed field (range is a key here)
# or a reference to another record set (any other range).
# Both "date" and "datetime" map to DataType.DATE.
# NOTE(review): LinkML's built-in types are named "integer" and "uri" rather
# than "int" and "url" — confirm these keys match the ranges used in schema.yaml.
TYPE_MAP = {
    "string": mlc.DataType.TEXT,
    "int": mlc.DataType.INTEGER,
    "float": mlc.DataType.FLOAT,
    "boolean": mlc.DataType.BOOL,
    "date": mlc.DataType.DATE,
    "datetime": mlc.DataType.DATE,
    "url": mlc.DataType.URL,
}

import re


def camel_to_snake(name):
    """Convert a CamelCase identifier to snake_case.

    An underscore is inserted in front of every ASCII capital letter except
    one in the very first position, then the whole string is lower-cased.
    Consecutive capitals are each split off (``"HTTPServer"`` becomes
    ``"h_t_t_p_server"``).
    """
    pieces = []
    for index, char in enumerate(name):
        # Only A-Z counts as a word boundary, and never at index 0.
        if index > 0 and "A" <= char <= "Z":
            pieces.append("_")
        pieces.append(char)
    return "".join(pieces).lower()


def class_to_recordset(linkml_class):
    """
    Convert a LinkML class definition to a Croissant RecordSet.

    Every attribute of the class becomes one field whose id is
    ``<snake_case_class_name>/<slot_name>``. A slot whose range is a key of
    ``TYPE_MAP`` gets the mapped data type. Any other range is treated as the
    name of another class, and the field references that class's ``id`` field.

    Args:
        linkml_class: Class definition to convert, e.g. an entry of
            ``SchemaView.all_classes()``. It must expose ``name`` and
            ``attributes`` (a mapping of slot name to slot definition).

    Returns:
        mlc.RecordSet: The converted RecordSet. Its id and name are the
        snake_case form of the class name.
    """
    clsname = camel_to_snake(linkml_class.name)

    fields = []
    for slot_def in linkml_class.attributes.values():
        # Arguments shared by typed fields and reference fields.
        common = dict(
            id=f"{clsname}/{slot_def.name}",
            name=slot_def.name,
            description=slot_def.description,
        )

        if slot_def.range in TYPE_MAP:
            fld = mlc.Field(data_types=[TYPE_MAP[slot_def.range]], **common)
        else:
            # Record sets produced by this function are identified by the
            # snake_case class name, so the referenced class must be converted
            # the same way or the reference can never resolve (e.g.
            # "TiltSeries/id" vs. the actual "tilt_series/id").
            # str() keeps slots without an explicit range from raising here.
            # NOTE(review): assumes every referenced class has an `id` slot.
            target = camel_to_snake(str(slot_def.range))
            fld = mlc.Field(references=mlc.Source(id=f"{target}/id"), **common)

        fields.append(fld)

    return mlc.RecordSet(
        id=clsname,
        name=clsname,
        fields=fields,
    )


# Try the converter on the "Run" class and show the resulting JSON as the cell result.
class_to_recordset(sv.all_classes()["Run"]).to_json()

In [None]:
# Scratch: construct a standalone text Field with an `array_shape` to try out
# the constructor. All string values are placeholders.
# NOTE(review): confirm the string format mlcroissant expects for
# `array_shape` (tuple-style "(-1,)" vs. comma-separated integers).
Field = mlc.Field(
    id="field",
    name="field",
    description="field",
    data_types=[mlc.DataType.TEXT],
    array_shape="(-1,)",
)

In [None]:
# Scratch FileObject used to explore the mlcroissant API; every value is a placeholder.
obj = mlc.FileObject(
    id="id",
    name="name",
    description="description",
    content_url="https://example.com",
    encoding_formats=["image/OME-Zarr"],
    same_as=["same_as"],
    contained_in=["contained_in"],
)
# Show the constructed object as the cell result. A bare `obj.` (left over from
# tab completion) is a SyntaxError and prevents the cell from running.
obj

In [None]:
import cryoet_data_portal as cdp
import hashlib

# Fetch all tomograms whose run belongs to dataset 10000 and convert them to dicts.
client = cdp.Client()
tomos = cdp.Tomogram.find(client, [cdp.Tomogram.run.dataset_id == 10000])
jtomos = [tomo.to_dict() for tomo in tomos]

# Write them as pretty-printed JSON; values json cannot encode are stringified.
with open("tomos.json", "w") as f:
    f.write(json.dumps(jtomos, indent=4, default=str) + '\n')

# Hash the bytes actually written to disk (hashlib.file_digest needs Python 3.11+).
with open("tomos.json", "rb") as f:
    csum = hashlib.file_digest(f, hashlib.sha256)

# Print the SHA-256 hex digest of tomos.json.
print(csum.hexdigest())

In [None]:
# Describe the tomos.json dump, served over HTTP on port 8000, as a FileObject.
# The sha256 is a fixed literal and has to match the served file.
obj = mlc.FileObject(
    id="tomogramsfile",
    name="name",
    description="description",
    content_url="http://0.0.0.0:8000/tomos.json",
    encoding_formats=["application/json"],
    sha256="234b14620d93934980f6131d41251eec918e86b07738434b1554867c506b8077",
)


def _tomograms_json_source(json_path):
    """Build a Source that extracts `json_path` from the "tomogramsfile" FileObject."""
    return mlc.Source(
        file_object="tomogramsfile",
        extract=mlc.Extract(json_path=json_path),
    )


# Two columns: the tomogram id, and a "name" column filled from `s3_omezarr_dir`.
tomogram_fields = [
    mlc.Field(
        id="tomograms/id",
        data_types=[mlc.DataType.INTEGER],
        description="ID of the tomogram.",
        source=_tomograms_json_source("$[*].id"),
    ),
    mlc.Field(
        id="tomograms/name",
        data_types=[mlc.DataType.TEXT],
        description="Name of the tomogram.",
        source=_tomograms_json_source("$[*].s3_omezarr_dir"),
    ),
]

rs = mlc.RecordSet(
    id="tomograms",
    name="tomograms",
    description="Tomograms",
    key=["tomograms/id"],
    fields=tomogram_fields,
)

# Minimal placeholder dataset-level metadata around the single file and record set.
md = mlc.Metadata(
    name="test",
    description="test",
    creators=[mlc.Person(name="Test Person")],
    license=["https://creativecommons.org/public-domain/cc0/"],
    url="https://example.com",
    distribution=[obj],
    record_sets=[rs],
)

ds = mlc.Dataset.from_metadata(md)

# Print every record of the "tomograms" record set.
tomogram_records = ds.records(record_set="tomograms")
for record in tomogram_records:
    print(record)


In [None]:
# Show the raw class docstring of cdp.Dataset as the cell result.
cdp.Dataset.__doc__

In [None]:
from griffe import Docstring
from typing import Type
from pydantic import BaseModel, create_model

# Union of the portal model classes (as class objects, not instances) that
# _portal_to_recordset is annotated to accept.
_PORTAL_TYPES = (
        Type[cdp.Annotation]
        | Type[cdp.AnnotationShape]
        | Type[cdp.AnnotationFile]
        | Type[cdp.Tomogram]
        | Type[cdp.TiltSeries]
        | Type[cdp.Dataset]
        | Type[cdp.Run]
        | Type[cdp.Alignment]
)

# Maps annotation type names (compared as strings) to mlcroissant data types.
# _portal_to_recordset only looks up "int", "float", "str" and "bool"; the
# remaining keys are not reached by the code in this cell.
# Both "date" and "datetime" map to DataType.DATE.
_TYPE_MAP = {
    "str": mlc.DataType.TEXT,
    "int": mlc.DataType.INTEGER,
    "float": mlc.DataType.FLOAT,
    "bool": mlc.DataType.BOOL,
    "date": mlc.DataType.DATE,
    "datetime": mlc.DataType.DATE,
    "url": mlc.DataType.URL,
}


def _get_descriptions(cls: Type[cdp.Dataset]) -> tuple[str, dict[str, str]]:
    """
    Get the description and attributes of a class from its google docstring.

    Sections are located by their kind rather than by position, so a class
    without a docstring, without leading text, or without an ``Attributes:``
    section yields empty results instead of raising ``IndexError``.

    Args:
        cls: The class to extract the docstring from.
    Returns:
        tuple: A tuple containing the description (the first text section, or
        ``""`` if there is none) and a dictionary of attributes and their
        descriptions (empty if there is no attributes section).
    """
    description = ""
    attributes: dict[str, str] = {}

    # `__doc__` is None for undocumented classes; parse an empty string then.
    for section in Docstring(cls.__doc__ or "").parse("google"):
        kind = section.kind.value
        if kind == "text" and not description:
            description = section.value
        elif kind == "attributes":
            attributes.update({a.name: a.description for a in section.value})

    return description, attributes


def _portal_to_recordset(clz: _PORTAL_TYPES) -> mlc.RecordSet:
    """Create a Croissant RecordSet describing a CryoET Data Portal model class.

    One field is generated for every annotation of ``clz`` whose type is
    ``int``, ``float``, ``str`` or ``bool`` and whose name does not start with
    an underscore (``neuroglancer_config`` is skipped). Each field reads its
    values from the file object ``<clz._gql_root_field>.json`` using the
    JSONPath ``$[*].<name>``. Attributes whose name ends in ``_id`` are also
    marked as references to the field with that id.

    Args:
        clz: The portal model class to describe.

    Returns:
        mlc.RecordSet: Record set whose id and name are
        ``clz._gql_type.lower()``, keyed on ``<id>_id``.
    """
    description, attr_docs = _get_descriptions(clz)
    clz_name = clz._gql_type.lower()
    filename = f"{clz._gql_root_field}.json"

    fields = []
    for name, typ in clz.__annotations__.items():
        if name == "neuroglancer_config" or name.startswith("_"):
            continue
        # Annotations are compared as strings; anything else is not exported.
        if typ not in ("int", "float", "str", "bool"):
            continue

        data_type = _TYPE_MAP[typ]
        # NOTE(review): `name` is the attribute name, not its value, so only
        # the "http" test can match here (e.g. `https_...` attributes); the
        # "s3://" test is kept unchanged but never fires for `s3_...` names.
        if typ == "str" and ("http" in name or "s3://" in name):
            data_type = mlc.DataType.URL

        field_kwargs = dict(
            id=f"{clz_name}_{name}",
            name=name,
            data_types=[data_type],
            description=attr_docs.get(name, None),
            source=mlc.Source(
                file_object=filename,
                extract=mlc.Extract(json_path=f"$[*].{name}"),
            ),
        )

        # Foreign keys are the `<other>_id` attributes. A plain substring test
        # for "id" would also catch unrelated names such as `grid_preparation`.
        if name.endswith("_id"):
            field_kwargs["references"] = mlc.Source(id=name)

        fields.append(mlc.Field(**field_kwargs))

    return mlc.RecordSet(
        id=clz_name,
        name=clz_name,
        description=description,
        fields=fields,
        key=[f"{clz_name}_id"],
    )


# Try the converter on the Dataset model class.
rss = _portal_to_recordset(cdp.Dataset)

# Show the resulting record set as JSON.
rss.to_json()

In [None]:
import cryoet_data_portal as cdp
import hashlib


def _dump_portal(
    dataset_id: int,
    output_dir: str = ".",
    base_url: str = "http://0.0.0.0:8000/",
) -> tuple[list[mlc.FileObject], list[mlc.RecordSet]]:
    """
    Dump the portal records of one dataset to JSON files and describe them.

    For each of the portal classes listed below, all records belonging to
    ``dataset_id`` are queried and written to
    ``<output_dir>/<clz._gql_root_field>.json``. The file's SHA-256 is
    computed, and a matching ``mlc.FileObject`` and ``mlc.RecordSet`` are
    created. The checksum and the record set JSON are printed for each class.

    Args:
        dataset_id: ID of the portal dataset whose records are dumped.
        output_dir: Directory the JSON files are written to (created if
            missing). Defaults to the current working directory.
        base_url: URL prefix under which the dumped files are served; used to
            build each FileObject's ``content_url``.

    Returns:
        tuple: ``(objects, recordsets)`` — the list of FileObjects and the
        list of RecordSets, in the same order as the classes were dumped.
    """
    from pathlib import Path

    # (portal class, query selecting that class's records for this dataset)
    to_dump = [
        (cdp.Annotation, [cdp.Annotation.run.dataset_id == dataset_id]),
        (cdp.AnnotationShape, [cdp.AnnotationShape.annotation.run.dataset_id == dataset_id]),
        (cdp.AnnotationFile, [cdp.AnnotationFile.tomogram_voxel_spacing.run.dataset_id == dataset_id]),
        (cdp.Tomogram, [cdp.Tomogram.run.dataset_id == dataset_id]),
        (cdp.TiltSeries, [cdp.TiltSeries.run.dataset_id == dataset_id]),
        (cdp.Dataset, [cdp.Dataset.id == dataset_id]),
        (cdp.Run, [cdp.Run.dataset_id == dataset_id]),
        (cdp.Alignment, [cdp.Alignment.run.dataset_id == dataset_id]),
    ]

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    url_root = base_url.rstrip("/")

    # A single client is shared by all queries instead of one per class.
    client = cdp.Client()

    objects = []
    recordsets = []

    for clz, query in to_dump:
        # The bare filename doubles as the FileObject id that the fields
        # built by _portal_to_recordset point at.
        filename = f"{clz._gql_root_field}.json"
        path = out_dir / filename

        jitems = [item.to_dict() for item in clz.find(client, query)]
        text = json.dumps(jitems, indent=4, default=str) + '\n'

        with open(path, "w") as f:
            f.write(text)

        # Hash the bytes as written to disk.
        with open(path, "rb") as f:
            checksum = hashlib.file_digest(f, hashlib.sha256).hexdigest()

        print(checksum)

        objects.append(
            mlc.FileObject(
                id=filename,
                name=filename,
                description="description",
                content_url=f"{url_root}/{filename}",
                encoding_formats=["application/json"],
                sha256=checksum,
            )
        )

        recordsets.append(_portal_to_recordset(clz))
        print(json.dumps(recordsets[-1].to_json(), indent=4, default=str))

    return objects, recordsets


# Dump the portal records for dataset 10000; returns the file objects and record sets.
objs, recordsets = _dump_portal(10000)
#rs = _portal_to_recordset(cdp.Dataset)


# Assemble placeholder dataset-level metadata around the dumped files.
md = mlc.Metadata(
    name="test",
    description="test",
    creators=[mlc.Person(name="Test Person")],
    license=["https://creativecommons.org/public-domain/cc0/"],
    url="https://example.com",
    distribution=objs,
    record_sets=recordsets,
)

ds = mlc.Dataset.from_metadata(md)

# for record in ds.records(record_set="tomogram"):
#     print(record)

# Materialize every "tomogram" record into a list, then print only the first one.
print(list(ds.records(record_set="tomogram"))[0])

In [None]:
# Show the `objs` list (the first value returned by _dump_portal) as the cell result.
objs

In [None]:
from cryoet_data_portal_croissant.gen import generate_mlcroissant

# Generate the Croissant metadata for dataset 10000 with the packaged generator.
metadata = generate_mlcroissant([10000])

# Disabled: load the first metadata entry and print the "runs" records.
# dataset = mlc.Dataset.from_metadata(metadata[0])
# dataset.debug = True
# records = dataset.records(record_set="runs")
#
# for record in records:
#     print(record)

In [None]:
import json

# Show the first metadata entry as a pretty-printed JSON string; values json
# cannot encode are stringified via default=str.
json.dumps(metadata[0].to_json(), indent=4, default=str)

In [2]:
from cryoet_data_portal_croissant._generators._dump_portal import _dump_portal
import cryoet_data_portal as cdp
import json

client = cdp.Client()
# Fetch the Dataset record with id 10000 (not used further in this cell).
dataset = cdp.Dataset.get_by_id(client, 10000)

# Dump the portal metadata to croissant
distribution, recordsets = _dump_portal(10000, 'testdata', "http://[::]:8000/")

# Print each record set as JSON.
for r in recordsets:
    print(json.dumps(r.to_json(), indent=4, default=str))

# Print each distribution entry as JSON.
for d in distribution:
    print(json.dumps(d.to_json(), indent=4, default=str))


{
    "@type": "cr:RecordSet",
    "@id": "annotation",
    "name": "annotation",
    "description": "Metadata for an annotation",
    "key": {
        "@id": "annotation_id"
    },
    "field": [
        {
            "@type": "cr:Field",
            "@id": "annotation_id",
            "name": "id",
            "description": "Numeric identifier (May change!)",
            "dataType": "sc:Integer",
            "source": {
                "fileObject": {
                    "@id": "annotations.json"
                },
                "extract": {
                    "jsonPath": "$[*].id"
                }
            }
        },
        {
            "@type": "cr:Field",
            "@id": "annotation_run_id",
            "name": "run_id",
            "description": "None",
            "dataType": "sc:Integer",
            "references": {
                "@id": "run_id"
            },
            "source": {
                "fileObject": {
                    "@id": "annotations.json"

In [7]:
from cryoet_data_portal_croissant.gen import generate_mlcroissant
import mlcroissant as mlc
import json
import shutil

from pathlib import Path

# Remove ~/.cache/croissant (presumably mlcroissant's download cache — confirm)
# so the files are fetched again. The path is derived from the current user's
# home directory instead of a hard-coded user name; a missing directory is ignored.
shutil.rmtree(Path.home() / ".cache" / "croissant", ignore_errors=True)

# Generate the Croissant metadata for dataset 10000.
metadata = generate_mlcroissant([10000])

# Persist the first metadata entry as pretty-printed JSON with a trailing newline.
with open("test.json", "w") as f:
    f.write(json.dumps(metadata[0].to_json(), indent=4, default=str) + '\n')

# Load the metadata as a Dataset and print the "tomogram_segmentation" records.
dataset = mlc.Dataset.from_metadata(metadata[0])
dataset.debug = True
records = dataset.records(record_set="tomogram_segmentation")

for record in records:
    print(record)


  -  [Metadata(S. pombe cells with defocus)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.
  -  [Metadata(S. pombe cells with defocus)] Property "https://schema.org/version" is recommended, but does not exist.
Downloading http://0.0.0.0:8000/annotations.json...: 100%|██████████| 127k/127k [00:00<00:00, 92.6MiB/s]
Downloading http://0.0.0.0:8000/tomograms.json...: 100%|██████████| 135k/135k [00:00<00:00, 97.3MiB/s]

{'tomogram_segmentation/tomogram_id': 629, 'tomogram_segmentation/annotation_id': 629, 'tomogram_segmentation/tomogram_s3_omezarr_dir': b's3://cryoet-data-portal-public/10000/TS_043/Reconstructions/VoxelSpacing13.480/Tomograms/100/TS_043.zarr'}
{'tomogram_segmentation/tomogram_id': 621, 'tomogram_segmentation/annotation_id': 621, 'tomogram_segmentation/tomogram_s3_omezarr_dir': b's3://cryoet-data-portal-public/10000/TS_026/Reconstructions/VoxelSpacing13.480/Tomograms/100/TS_026.zarr'}
{'tomogram_segmentation/tomogram_id': 622, 'tomogram_segmentation/annotation_id': 622, 'tomogram_segmentation/tomogram_s3_omezarr_dir': b's3://cryoet-data-portal-public/10000/TS_027/Reconstructions/VoxelSpacing13.480/Tomograms/100/TS_027.zarr'}
{'tomogram_segmentation/tomogram_id': 623, 'tomogram_segmentation/annotation_id': 623, 'tomogram_segmentation/tomogram_s3_omezarr_dir': b's3://cryoet-data-portal-public/10000/TS_028/Reconstructions/VoxelSpacing13.480/Tomograms/100/TS_028.zarr'}
{'tomogram_segmentat




In [5]:
import cryoet_data_portal as cdp

client = cdp.Client()
# Look up the tomogram whose own id is 10000 (Tomogram.get_by_id, not a dataset id).
tomo = cdp.Tomogram.get_by_id(client, 10000)
# Show its `s3_omezarr_dir` attribute as the cell result.
tomo.s3_omezarr_dir

's3://cryoet-data-portal-public/10216/mba2010-08-26-19/Reconstructions/VoxelSpacing10.240/Tomograms/100/mba2010-08-26-19.zarr'

In [7]:
# Scratch check of the URL-detection condition.
typ = "str"
# Here `name` holds the attribute *value* of `tomo.s3_omezarr_dir`, not an
# attribute name, so the substring tests run against that value.
name = tomo.s3_omezarr_dir
if typ == "str" and ("http" in name or "s3://" in name):
    print("yes")

yes
