In [None]:
# install aimmdb package to get tiled client extensions for interacting with aimm server
# !pip install git+https://git@github.com/AI-multimodal/aimmdb.git@dev

In [None]:
import copy
import pathlib

import aimmdb
import numpy as np
import pandas as pd
import tiled
from aimmdb.queries import RawMongo
from tiled.client import from_uri
from tiled.examples.xdi import read_xdi

In [None]:
# create tiled client object
# because we have installed the aimmdb package, tiled automatically constructs an AIMMCatalog instance
# this is a subclass of the generic tiled client providing a more expressive repr and methods for uploading data to the AIMM database
c = from_uri("https://aimm-staging.lbl.gov/api")
c

In [None]:
# login to gain authenticated access
c.login()

In [None]:
c["sample"]

In [None]:
# delete everything so we start fresh
# DANGER
for k in list(c["uid"]):
    del c["uid"][k]

In [None]:
# the catalog is initially empty
c["uid"]

In [None]:
# the server now supports writing array data using the write_array method
# when submitting the server generates a unique key which can be used to refer to and fetch the data
x = np.random.rand(100, 100)
metadata = {"dataset" : "sandbox", "foo" : "bar"}
key = c["uid"].write_array(x, metadata)
key

In [None]:
c["uid"]

In [None]:
node = c["uid"][key]
node

In [None]:
node.metadata

In [None]:
node.read()

In [None]:
# it is also possible to write dataframes
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "sandbox", "foo" : "bar"}
key = c["uid"].write_dataframe(df, metadata)
key

In [None]:
c["uid"]

In [None]:
node = c["uid"][key]
node

In [None]:
node.metadata

In [None]:
node.read()

In [None]:
# when submitting data you MUST include a 'dataset' key in the metadata
# this is used to allow providing access control and enforcement of schemas
# the write below is expected to be rejected; the error is caught and printed
# (instead of left as a traceback) so that "Restart Kernel -> Run All" keeps going
metadata = {"foo" : "bar"}
try:
    key = c["uid"].write_array(x, metadata)
except Exception as err:
    print(f"write rejected -> {type(err).__name__}: {err}")
else:
    print(f"unexpectedly accepted: {key=}")

In [None]:
# the server can be configured to apply custom validation to data in a particular dataset
# we have configured a 'newville' dataset which requires that the metadata conform to a particular schema which provides element and edge information
# submitting arbitrary data to this dataset will be rejected by the server

In [None]:
# arbitrary data submitted to the 'newville' dataset: expected to be rejected
# the error is caught and printed so that "Restart Kernel -> Run All" keeps going
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "newville", "foo" : "bar"}
try:
    key = c["uid"].write_dataframe(df, metadata)
except Exception as err:
    print(f"write rejected -> {type(err).__name__}: {err}")
else:
    print(f"unexpectedly accepted: {key=}")

In [None]:
# declaring the XAS spec is not enough: the metadata still has no element/edge information
# expected to be rejected; the error is caught and printed so that "Restart Kernel -> Run All" keeps going
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "newville", "foo" : "bar"}
try:
    key = c["uid"].write_dataframe(df, metadata, specs=["XAS"])
except Exception as err:
    print(f"write rejected -> {type(err).__name__}: {err}")
else:
    print(f"unexpectedly accepted: {key=}")

In [None]:
# same attempt with a plain array and the XAS spec: expected to be rejected as well
# the error is caught and printed so that "Restart Kernel -> Run All" keeps going
x = np.random.rand(100, 100)
metadata = {"dataset" : "newville", "foo" : "bar"}
try:
    key = c["uid"].write_array(x, metadata, specs=["XAS"])
except Exception as err:
    print(f"write rejected -> {type(err).__name__}: {err}")
else:
    print(f"unexpectedly accepted: {key=}")

In [None]:
# with the correct metadata we can write to the server
# NOTE this doesn't prevent you from writing garbage but does help
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "newville", "foo" : "bar", "element" : {"symbol" : "Au", "edge" : "K"}}
key = c["uid"].write_dataframe(df, metadata, specs=["XAS"])
key

In [None]:
c["uid"]

In [None]:
c["dataset"]

In [None]:
c["dataset"]["newville"]["uid"]

In [None]:
c["uid"][key]

In [None]:
# since we just submitted some garbage let's delete it
del c["uid"][key]

In [None]:
c["dataset"]["newville"]["uid"]

In [None]:
# so far we have been using write methods from the generic tiled client
# aimmdb provides specialized methods for writing XAS data
# NOTE now the validation error happens on the client rather than the server
# the metadata below has no element/edge information, so this write is expected to fail;
# the error is caught and printed so that "Restart Kernel -> Run All" keeps going
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "sandbox"}
try:
    key = c["uid"].write_xas(df, metadata)
except Exception as err:
    print(f"write rejected -> {type(err).__name__}: {err}")
else:
    print(f"{key=}")

In [None]:
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "sandbox", "element" : {"symbol" : "Au", "edge" : "K"}}
key = c["uid"].write_xas(df, metadata)
key

In [None]:
c["uid"][key]

In [None]:
# delete the garbage again
del c["uid"][key]

In [None]:
# aimmdb also provides functionality for associating metadata and measurements with samples
# use write_sample to write some metadata to the sample database and get a unique id
sample_metadata = {"name" : "NiO", "description" : "Jos's Nickle Oxide Sample"}
sample_key = c.write_sample(sample_metadata)
sample_key

In [None]:
# we can include sample_id in the metadata when submitting measurements
# one measurement is uploaded per absorption edge, all attached to the same sample

for edge in ("K", "L"):
    df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
    metadata = {"dataset" : "sandbox", "element" : {"symbol" : "Ni", "edge" : edge}, "sample_id" : sample_key}
    key = c["uid"].write_xas(df, metadata)
    print(f"{key=}")

In [None]:
c["sample"]

In [None]:
# we can then retrieve all measurements associated with that sample_id
c["sample"][sample_key]["uid"]

In [None]:
node = c["sample"][sample_key]["uid"].values_indexer[0]
node

In [None]:
# the metadata from the sample database is merged into the measurement metadata
node.metadata

In [None]:
# samples can also be deleted
sample_key = c.write_sample({"name" : "garbage"})
c.delete_sample(sample_key)

In [None]:
# now we will ingest the newville dataset
# first download and unpack the raw data
!curl -L https://github.com/XraySpectroscopy/XASDataLibrary/archive/master.tar.gz | tar xz

In [None]:
def load_newville(data_path):
    """
    Load newville dataset into a dataframe parsing sample information from metadata

    Parameters
    ----------
    data_path : pathlib.Path
        Directory that is searched recursively for ``*.xdi`` files.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``name`` (file stem), ``file`` (path as a
        string), ``sample.name``, ``sample.prep`` (``None`` when the file has no prep
        entry) and ``metadata`` (the remaining metadata dict). The columns are present
        even when no file is found.

    Raises
    ------
    KeyError
        If a file's metadata has no ``fields`` entry, no ``sample`` field or no
        sample ``name``.
    """
    columns = ["name", "file", "sample.name", "sample.prep", "metadata"]

    # sorted so that the row order does not depend on filesystem traversal order
    files = sorted(data_path.rglob("*.xdi"))
    print(f"found {len(files)} xdi files to ingest")

    data_list = []

    for f in files:
        # only the metadata is used here; the first return value is discarded
        _, metadata = read_xdi(str(f))

        # hoist the entries of "fields" to the top level, with lower-cased keys
        fields = metadata.pop("fields")
        metadata.update({k.lower(): v for k, v in fields.items()})

        # name and prep are removed from the stored metadata and kept as their own columns
        sample = metadata["sample"]
        sample_name = sample.pop("name")
        sample_prep = sample.pop("prep", None)

        data_list.append(
            {
                "name": f.stem,
                "file": str(f),
                "sample.name": sample_name,
                "sample.prep": sample_prep,
                "metadata": metadata,
            }
        )

    # explicit columns keep the frame well-formed when data_list is empty
    return pd.DataFrame(data_list, columns=columns)

In [None]:
# read through all the files and extract some metadata
df = load_newville(pathlib.Path("./XASDataLibrary-master/data/"))
df

In [None]:
def ingest_newville(c, df, verbose=False):
    """
    Upload the newville dataset to database

    One sample is written per distinct (sample.name, sample.prep) pair and every
    measurement of that pair is uploaded with the resulting sample id.

    Parameters
    ----------
    c : client object
        Must provide ``write_sample(metadata)`` and ``c["uid"].write_xas(df, metadata)``.
    df : pandas.DataFrame
        Needs the columns ``sample.name``, ``sample.prep``, ``file`` and ``metadata``.
        It is not modified.
    verbose : bool
        When true, print one line per sample with its name, prep and number of files.
    """
    # dropna=False keeps the rows whose "sample.prep" is missing; by default groupby
    # silently drops every group whose key contains a missing value, so those
    # measurements would never be uploaded
    grouped = df.groupby(["sample.name", "sample.prep"], dropna=False)

    for (name, prep), g in grouped:
        # a missing prep comes back from groupby as a NaN-like value; store it as None
        # NOTE(review): assumes the server accepts a null "prep" on a sample -- confirm
        if pd.isna(prep):
            prep = None

        if verbose:
            print(f"{name}: {prep}, {len(g)}")

        sample_id = c.write_sample({"name" : name, "prep" : prep})

        for _, row in g.iterrows():
            # the file is read again here; its metadata was already extracted into df
            xas_df, _unused_metadata = read_xdi(row["file"])
            # build a new dict instead of mutating the one stored inside df
            metadata = {**row["metadata"], "dataset": "newville", "sample_id": sample_id}
            c["uid"].write_xas(xas_df, metadata)

In [None]:
# read and upload all files
print("starting ingestion...")
ingest_newville(c, df, verbose=True)
print("finished.")

In [None]:
# now we can see the newville data on the server
# NOTE (values/keys/items)_indexer are getting a better interface soon!
c["dataset"]["newville"]["uid"].items_indexer[:10]

In [None]:
# because we have the aimmdb package installed, tiled automatically constructs an XASClient instance when we select a single measurement
# this is a subclass of the generic tiled dataframe client providing a more expressive repr which shows the sample name and edge information for the measurement
x = c["dataset"]["newville"]["uid"].values_indexer[0]
x

In [None]:
# the measurement uid can be accessed as a property
x.uid

In [None]:
# the measurement itself is stored in a dataframe which can be obtained using the .read() method
df = x.read()
df

In [None]:
# once the data is ingested it can be searched
# queries can be interactively built up by drilling down into the tree structure

In [None]:
c

In [None]:
c["dataset"]

In [None]:
c["dataset"]["newville"]

In [None]:
# measurements can be grouped by absorbing element (as in the original file tree)
c["dataset"]["newville"]["element"]

In [None]:
c["dataset"]["newville"]["element"]["Ni"]

In [None]:
c["dataset"]["newville"]["element"]["Ni"]["uid"]

In [None]:
node = c["dataset"]["newville"]["element"]["Ni"]["uid"].values_indexer[0]
node

In [None]:
# ... or by edge
c["dataset"]["newville"]["edge"]

In [None]:
c["dataset"]["newville"]["edge"]["L3"]["uid"]

In [None]:
node = c["dataset"]["newville"]["edge"]["L3"]["uid"].values_indexer[0]
node

In [None]:
# ... and these can be combined/nested

# this query gets all absorbing elements for which we have an L3 edge measurement
c["dataset"]["newville"]["edge"]["L3"]["element"]

In [None]:
node = c["dataset"]["newville"]["edge"]["L3"]["element"]["Pt"]["uid"].values_indexer[0]
node

In [None]:
# if you prefer you can also directly search on the metadata
result = c["uid"].search(RawMongo({"metadata.dataset" : "newville", "metadata.element.symbol" : "Pt", "metadata.element.edge" : "L3"}))
result

In [None]:
node = result.values_indexer[0]
node

In [None]:
node.read()

In [None]:
# the newville dataset is quite clean and well curated but it is still in need of some post-processing
# most obviously, not all measurements have the actual absorption mu computed
# we will now demonstrate using aimmdb tools to perform this first basic post processing step

In [None]:
# first read through all the data on the server and print the column names to get an idea of what we are dealing with
for k, v in c["dataset"]["newville"]["uid"].items():
    sample_name = v.metadata["sample"]["name"]
    df = v.read()
    print(f"{sample_name:>40} ({k}): {list(df)}")

In [None]:
# define functions to compute the most basic calculation of the absorption from transmission or fluorescence measurements

def compute_mutrans(df):
    """
    Compute the absorption of a transmission measurement as ``log(i0 / itrans)``.

    ``df`` must contain the columns ``energy``, ``i0`` and ``itrans`` (checked with
    assertions). Returns a new dataframe with the two columns ``energy`` and ``mu``.
    """
    for required in ("energy", "i0", "itrans"):
        assert required in df

    absorption = np.log(df["i0"] / df["itrans"])
    return pd.DataFrame({"energy": df["energy"], "mu": absorption})


def compute_mufluor(df):
    """
    Compute the absorption of a fluorescence measurement as ``ifluor / i0``.

    ``df`` must contain the columns ``energy``, ``i0`` and ``ifluor`` (checked with
    assertions). Returns a new dataframe with the two columns ``energy`` and ``mu``.
    """
    for required in ("energy", "i0", "ifluor"):
        assert required in df

    absorption = df["ifluor"] / df["i0"]
    return pd.DataFrame({"energy": df["energy"], "mu": absorption})

In [None]:
def process_metadata(metadata):
    """Return a deep copy of ``metadata`` so that the caller can modify it freely."""
    return copy.deepcopy(metadata)

def run_postprocessing(c, uids):
    """
    Derive energy/mu dataframes from the measurements named by ``uids`` and write them back.

    For each uid the node ``c[uid]`` is read. Every recipe whose required columns are
    all present produces one new measurement, written with ``c.write_xas``. Its
    metadata is a copy of the node's metadata plus a ``postprocessing`` entry holding
    the operator name and the parent uid. One node can therefore yield several
    derived measurements.

    Parameters
    ----------
    c : client object
        Indexable by uid and providing ``write_xas(df, metadata)``.
    uids : iterable
        Keys of the measurements to process.
    """
    # (required columns, operator name, builder), applied in this order;
    # lambdas keep the helper names unresolved until a recipe actually matches
    recipes = (
        (("energy", "i0", "itrans"), "compute_mutrans", lambda frame: compute_mutrans(frame)),
        (("energy", "i0", "ifluor"), "compute_mufluor", lambda frame: compute_mufluor(frame)),
        (
            ("energy", "mutrans"),
            "copy_mutrans",
            lambda frame: pd.DataFrame({"energy": frame["energy"], "mu": frame["mutrans"]}),
        ),
        (
            ("energy", "mufluor"),
            "copy_mufluor",
            lambda frame: pd.DataFrame({"energy": frame["energy"], "mu": frame["mufluor"]}),
        ),
    )

    for i, uid in enumerate(uids):
        node = c[uid]
        sample_name = node.metadata["sample"]["name"]
        print(f"{i}: {sample_name}")

        measured = node.read()
        base_metadata = dict(node.metadata)
        available = set(measured)

        for required, operator, build in recipes:
            if not available.issuperset(required):
                continue

            derived = build(measured)
            # each derived measurement gets its own copy of the metadata
            derived_metadata = process_metadata(base_metadata)
            derived_metadata["postprocessing"] = {
                "operator": operator,
                "parents": [node.uid],
            }
            c.write_xas(derived, derived_metadata)

In [None]:
# get a list of the uids for non postprocessed data
# NOTE we use list(...) to extract the list of uids ahead of time because it is not safe to iterate a collection that you are simultaneously modifying
raw_uids = list(c["dataset"]["newville"]["uid"].search(RawMongo({"metadata.postprocessing" : None})))

In [None]:
run_postprocessing(c["uid"], raw_uids)

In [None]:
# we can use search to select only the postprocessed data
result = c["dataset"]["newville"]["uid"].search(RawMongo({"metadata.postprocessing" : {"$exists" : True}}))
result

In [None]:
# information about the postprocessing steps is stored in the metadata
node = result.values_indexer[0]
node.metadata

In [None]:
# we can use search to select data that has been postprocessed in a particular way
result = c["dataset"]["newville"]["uid"].search(RawMongo({"metadata.postprocessing.operator" : "compute_mutrans"}))
result

In [None]:
# we can use search to select data that has been postprocessed in a particular way
result = c["dataset"]["newville"]["uid"].search(RawMongo({"metadata.postprocessing.operator" : "compute_mufluor"}))
result

In [None]:
# our postprocessed data is guaranteed to be a dataframe with two columns: energy and mu
df = node.read()
df

In [None]:
df.plot("energy", "mu", title=node.describe())

In [None]:
# the postprocessed metadata include a list of 'parent' uids which were used as input enabling basic provenance tracking
node.metadata["postprocessing"]

In [None]:
key = node.metadata["postprocessing"]["parents"][0]
parent_node = c["uid"][key]
parent_node

In [None]:
parent_node.read()

In [None]:
# while logged in both the 'newville' and 'sandbox' datasets are visible
c["dataset"]

In [None]:
c.logout()

In [None]:
# upon logout the sandbox dataset is no longer visible but the newville dataset is still visible
c["dataset"]

In [None]:
# public users are allowed to read data from specified datasets
key = c["dataset"]["newville"]["uid"].keys_indexer[0]
node = c["uid"][key]
node.read()

In [None]:
# but are not allowed any write access
# the delete below is expected to be refused; the error is caught and printed
# (instead of left as a traceback) so that "Restart Kernel -> Run All" finishes cleanly
try:
    del c["uid"][key]
except Exception as err:
    print(f"delete rejected -> {type(err).__name__}: {err}")
else:
    print(f"unexpectedly deleted: {key=}")