# inferring linked data from `IPython` run times.

automatically exporting rdf data during interactive computing.

we'll modify `IPython`s display formatter to include a system that describes
python objects as `json` linked data.

In [1]:
    %pip install traitlets IPython jinja2 pandas pyld
    from functools import singledispatch, singledispatchmethod
    import types, gc, re, sys
    from IPython.core.formatters import DisplayFormatter, BaseFormatter, catch_format_error, JSONFormatter
    from traitlets import ObjectName, Unicode, Instance, List, Any
    from IPython import get_ipython
    from pathlib import Path
    
    # json-ld keyword strings, named once so they can be used as dictionary keys.
    TYPE, ID, GRAPH, CONTAINER, NEST, CONTEXT = "@type @id @graph @container @nest @context".split()
    # true when this code runs as the `__main__` module.
    MAIN = __name__ == "__main__"
    # additionally requires that no `__file__` name is defined here.
    # NOTE(review): presumably this separates an interactive session from a script run - confirm.
    ACTIVE = "__file__" not in locals() and MAIN
    # whatever `get_ipython()` returns at definition time; used by the ACTIVE cells.
    shell = get_ipython()

a `MetadataFormatter` for including linked data in `IPython` reprs.
this class carries machinery to generate:

* ids for python types and objects as urns `MetadataFormatter.get_id`
* linked data representations of python objects with `MetadataFormatter.get_graph`

`MetadataFormatter.for_type, MetadataFormatter.get_id.register, MetadataFormatter.get_graph.register` extend the expression of the linked data graphs.

## custom metadata formatter

In [2]:
    class MetadataFormatter(BaseFormatter):
        """describe python objects as json linked data nodes.

        calling the formatter with an object returns a list of nodes. each node
        carries the current cell and session ids, an "@type", and an "@id" when one
        can be found. `get_id` and `get_graph` are single dispatch methods, so both
        can be extended per type with `.register`.
        """

        # `graph` accumulates nodes until the next call drains it.
        graph, format_type = List(), Unicode('application/ld+json')
        # an object may supply its own nodes from a `_repr_metadata_` method.
        _return_type, print_method = (list, dict), ObjectName('_repr_metadata_')

        @singledispatchmethod
        def get_id(self, object):
            """return the first name found for `object`, or None when there is none."""
            return next(self.get_object(object), None)

        @singledispatchmethod
        def get_graph(self, object):
            """return the linked data node, or list of nodes, describing `object`.

            the node always has an "@type" (the id of the object's type) and gains
            an "@id" when `get_id` returns something truthy. when the id is a list,
            one node is returned per id.
            """
            data = {TYPE: self.get_id(type(object))}
            identifier = self.get_id(object)
            if not identifier:
                return data
            if isinstance(identifier, list):
                # the per-item id goes last so nothing in `data` can overwrite it.
                return [{**data, ID: item} for item in identifier]
            data.setdefault(ID, identifier)
            return data

        def get_object(self, object, filter=None):
            """yield names for `object` from every dict that currently refers to it.

            `filter` may be a compiled pattern or a pattern string; when given, only
            names it matches from the start are yielded.
            """
            if isinstance(filter, str):
                filter = re.compile(filter)
            for referrer in gc.get_referrers(object):
                if isinstance(referrer, dict):
                    yield from self.get_object_from_ns(referrer, object, filter=filter)

        def get_object_from_ns(self, ns, object, filter=None):
            """yield `parent#key` names for each public key in `ns` bound to `object`.

            the parent comes from the namespace itself: the id of the owning class
            when `ns` has a `__weakref__` entry, otherwise its `__module__` or
            `__name__` followed by ":". when no parent is found the literal
            'noparent' is used.
            """
            weakref = ns.get("__weakref__")
            if weakref:
                # NOTE(review): presumably `ns` is a class namespace here and
                # `__objclass__` is the class that owns it - confirm.
                parent = self.get_id(weakref.__objclass__)
            else:
                parent = ns.get("__module__", ns.get("__name__"))
                if parent:
                    parent += ":"

            if not parent and ns is sys.modules:
                # a bare `return value` inside a generator yields nothing, so the
                # name has to be yielded explicitly before stopping.
                name = object.__name__
                if filter is None or filter.match(name):
                    yield name
                return

            for key, value in ns.items():
                # referrer dicts are arbitrary, so keys are not guaranteed to be strings.
                if value is not object or not isinstance(key, str) or key.startswith("_"):
                    continue
                name = F"{parent or 'noparent'}#{key}"
                if filter is not None and not filter.match(name):
                    continue
                yield name

        def get_session_cell_id(self):
            """return `(cell id, session id)` read from the kernel's parent message.

            NOTE(review): relies on the message carrying `metadata.cellId` and
            `header.session`; a missing key raises KeyError - confirm which
            frontends provide them.
            """
            data = get_ipython().kernel.get_parent()
            return data["metadata"]["cellId"], data["header"]["session"]

        def set_metadata(self, object=None, **kwargs):
            """append nodes for `object` to the pending graph.

            every node is merged with the current cell and session ids and with
            `kwargs`; later sources win on key collisions. without an object a
            single node made of the ids and `kwargs` is appended.
            """
            ids = dict(zip(("cell:id", "session:id"), self.get_session_cell_id()))
            nodes = {} if object is None else self.get_graph(object)
            if isinstance(nodes, dict):
                nodes = [nodes]
            for node in nodes:
                self.graph.append({**ids, **node, **kwargs})

        def __call__(self, object):
            """return the pending graph for `object` and reset it.

            metadata the object states explicitly (through the base formatter) is
            recorded as given: a dict is one node, a list is one node per entry.
            otherwise the node is inferred with `get_graph`. nodes appended earlier
            through `set_metadata` are returned as well.
            """
            explicit = super().__call__(object)
            if explicit:
                entries = [explicit] if isinstance(explicit, dict) else explicit
                for entry in entries:
                    self.set_metadata(**entry)
            else:
                self.set_metadata(object)
            try:
                return self.graph[:]
            finally:
                # the graph is per display call; never let nodes leak into the next one.
                self.graph.clear()

## custom display formatter

the `LinkedDataFormatter` customizes how `IPython`s normal `DisplayFormatter` expresses metadata.

In [3]:
    class LinkedDataFormatter(DisplayFormatter):
        """a display formatter that adds a linked data graph to the output metadata."""

        metadata_formatter = Instance(MetadataFormatter, args=())

        def format(self, object, include=None, exclude=None):
            """format `object` as usual, then store its graph under "@graph".

            the metadata is left untouched when the metadata formatter returns
            nothing truthy.
            """
            bundle, metadata = super().format(object, include, exclude)
            graph = self.metadata_formatter(object)
            if graph:
                metadata[GRAPH] = graph
            return bundle, metadata
        
    def load_ipython_extension(shell=get_ipython()):
        """swap the shell's display formatter for a `LinkedDataFormatter`.

        the replacement is built from the trait values of the current formatter,
        and its `set_metadata` method is exposed in the user namespace.
        """
        replacement = LinkedDataFormatter(**shell.display_formatter._trait_values)
        shell.display_formatter = replacement
        helper = shell.display_formatter.metadata_formatter.set_metadata
        shell.user_ns["set_metadata"] = helper
            
    def unload_ipython_extension(shell=get_ipython()):
        """restore a plain `DisplayFormatter` and remove the `set_metadata` helper.

        only trait values that `DisplayFormatter` declares are carried over, so
        the `metadata_formatter` trait of the linked formatter is not passed to
        a class that has no such trait. the user namespace entry is removed only
        when it is still bound to the formatter being discarded.
        """
        current = shell.display_formatter
        metadata_formatter = getattr(current, "metadata_formatter", None)
        known = set(DisplayFormatter.class_trait_names())
        values = {k: v for k, v in current._trait_values.items() if k in known}
        shell.display_formatter = DisplayFormatter(**values)

        helper = shell.user_ns.get("set_metadata")
        # identity check on the bound instance: leave a user's own `set_metadata` alone.
        if metadata_formatter is not None and getattr(helper, "__self__", None) is metadata_formatter:
            del shell.user_ns["set_metadata"]


extend how the graph is generated for tuples and strings as examples.

In [4]:
    @MetadataFormatter.get_graph.register(tuple)
    def get_graph_tuple(self, object):
        """describe a tuple as the list of graphs of its items, in order."""
        return [self.get_graph(item) for item in object]

register a different id for modules. we use their namespaces for expansion later.

In [5]:
    @MetadataFormatter.get_id.register(types.ModuleType)
    def get_name(self, object):
        """identify a module by its `__name__`."""
        module_name = object.__name__
        return module_name

activate the display formatter

In [6]:
    # install the linked data formatter only when ACTIVE is true; otherwise the call is skipped.
    ACTIVE and load_ipython_extension()

### some data for the graph

#### dataframes

create a custom graph expression for `pandas.DataFrame`s

In [7]:
    import pandas

    if ACTIVE:
        # register a graph for dataframes through the shell's metadata formatter:
        # the node holds the frame's id, the id of its type, and its shape as a list.
        shell.display_formatter.metadata_formatter.get_graph.register(pandas.DataFrame
        )(lambda s, x: {ID: s.get_id(x), TYPE: s.get_id(type(x)), "pandas.DataFrame:shape": list(x.shape)})

In [8]:
    if ACTIVE:
        import pandas
        df = pandas.DataFrame()
        # a tuple is displayed so each item gets its own graph:
        # an empty frame, a module, and a class.
        display((df, pandas, pandas.DataFrame))

(Empty DataFrame
 Columns: []
 Index: [],
 <module 'pandas' from '/home/tbone/mambaforge/lib/python3.9/site-packages/pandas/__init__.py'>,
 pandas.core.frame.DataFrame)

#### string or url

if there is a url hidden in a string we can elevate that as metadata
thereby linking it to a cell.

for example, this work revisits https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1 so we'll include that in the graph.

In [9]:
    @MetadataFormatter.get_id.register(str)
    def get_graph_str(self, object):
        """use a string as its own id when it parses with a url scheme.

        returns None for strings without a scheme.
        """
        from urllib.parse import urlparse

        has_scheme = bool(urlparse(object).scheme)
        return object if has_scheme else None
    "https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1"

'https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1'

## looking at the metadata graph

our choice of "@id" and "@type" are jsonld conventions.
through these conventions we can surface the metadata by creating
a `jsonld` context. we can ensure a consistent structure of the notebook
and thereby context.

In [10]:
    # json-ld context that maps the notebook's json keys onto compact names.
    ctx = {
        # the notebook's "cells" become an ordered "nb:cell" list.
        "cells": {
            ID: "nb:cell", CONTAINER: "@list", 
            # scoped context for the keys found inside each cell.
            CONTEXT: {
                "outputs": {CONTEXT: {"metadata": NEST}, ID: "cell:metadata"},
                # the cell "id" value is itself treated as an "@id".
                "id": {ID: "cell:id", TYPE: ID},
                "cell_type": "cell:type",
                # cell metadata is held as a graph; its "tags" map to "rdf:name".
                "metadata": {
                    ID: "cell:metadata",
                    CONTAINER: GRAPH,
                    CONTEXT: {
                        "tags": "rdf:name"
                    }
                },
                
            }
        },
        "@version": 1.1}

In [11]:
    if ACTIVE:
        # read this notebook from disk and parse it as json.
        file = Path("2022-10-29-metadata-formatter.ipynb")
        # notebook files are utf-8 json; do not depend on the platform default encoding.
        data = __import__("json").loads(file.read_text(encoding="utf-8"))

In [12]:
    if ACTIVE:
        from pyld import jsonld
        # NOTE(review): `JSON` is imported but not used in this cell.
        from IPython.display import JSON
        # compact the parsed notebook against an empty context, using `ctx` as the expand context.
        from_local = jsonld.compact(data, {}, options=dict(expandContext=ctx))
        # `set_metadata` is the helper `load_ipython_extension` puts in the user namespace;
        # the keyword becomes an extra key on the recorded node.
        set_metadata(from_local, **{"rdf:description": "all of the things we can expand from the notebook metadata."})
        display(from_local)

{'nb:cell': {'@list': [{'cell:id': {'@id': 'b89d004a-233f-452e-9da7-84604f291174'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '1734421e-0762-47e6-8f5f-6f3468bf7b2f'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '2f97b27e-f363-4dce-a2d5-45927c92feba'},
    'cell:metadata': {'@graph': {'rdf:name': ['imports', 'constants']}},
    'cell:type': 'code'},
   {'cell:id': {'@id': 'd2ed9bc9-c426-4bb4-98fb-bf4f3d26c299'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '1ea5f698-7263-4dc2-b24f-232e9f1dcfcd'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'cec19c6c-affe-4378-99d9-c927f8d2e726'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '55478faa-bcc6-4447-9e28-607b2b6bafa6'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '58291cbb-ce

when the post is published we can condense the notation.

In [13]:
    # url of the published copy of this notebook; the trailing expression echoes it as the cell output.
    remote = "https://raw.githubusercontent.com/tonyfast/tonyfast/main/tonyfast/xxii/oct/2022-10-29-metadata-formatter.ipynb"; remote

'https://raw.githubusercontent.com/tonyfast/tonyfast/main/tonyfast/xxii/oct/2022-10-29-metadata-formatter.ipynb'

In [14]:
    if ACTIVE:
        # same compaction as the local copy, but given the url string instead of parsed json.
        # NOTE(review): presumably pyld fetches the document from the url - confirm.
        # `jsonld` comes from the earlier ACTIVE cell that imports it.
        from_remote = jsonld.compact(
            remote, {},
            options=dict(expandContext=ctx)
        )
        display(from_remote)

{'nb:cell': {'@list': [{'cell:id': {'@id': 'b89d004a-233f-452e-9da7-84604f291174'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '1734421e-0762-47e6-8f5f-6f3468bf7b2f'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '2f97b27e-f363-4dce-a2d5-45927c92feba'},
    'cell:metadata': {'@graph': {'rdf:name': ['imports', 'constants']}},
    'cell:type': 'code'},
   {'cell:id': {'@id': 'd2ed9bc9-c426-4bb4-98fb-bf4f3d26c299'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '1ea5f698-7263-4dc2-b24f-232e9f1dcfcd'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'cec19c6c-affe-4378-99d9-c927f8d2e726'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '55478faa-bcc6-4447-9e28-607b2b6bafa6'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '58291cbb-ce

this notebook is certified to have metadata

## things we capture

in this proof of concept we don't capture much, but we do expose machinery to test this concept further and extend.

we capture:

* kernel session id which can verify the outputs are generated in the same session
* each cell id, which makes it possible to link back to the source cells.
* some python variable information.

## things we can capture in the graph.

* annotations are type to id mappings.
* we could trace function calls
* we could encode imports
* we could capture variable assignment