# AIMM Data Ingestion

This notebook contains code to ingest the data used by the AIMM project into a MongoDB database.

The data is stored in three collections: `measurements`, `samples`, and `tree`.

The `measurements` collection stores the individual XAS measurements.
Documents in this collection follow the model `aimmdb.models.XASMeasurement`.
Each measurement contains a `sample_id` field which indexes into the `samples` collection.

The `samples` collection stores data about the samples that the measurements are performed on (e.g. the composition).
Documents in this collection follow the model `aimmdb.models.Sample`.
Note that one sample can have many measurements associated with it.
As mentioned above, this one-to-many association is modeled via the `sample_id` field of the documents in the `measurements` collection.

The `tree` collection stores a hierarchical layout of the data suitable for browsing via tiled.
Documents in this collection follow the model `aimmdb.models.Node`.
The tree structure is modeled using the [materialized paths](https://docs.mongodb.com/manual/tutorial/model-tree-structures-with-materialized-paths/) mongo pattern.
For example, to query the direct children of the `/core` node one can use `db.tree.find({"path": {"$regex": "^/core/[^/]*$"}})`; relaxing the pattern to `"^/core/"` matches every node below `/core`.
The leaf nodes in the tree carry a `data_id` key which indexes into the `measurements` collection where the actual data is stored.

In [None]:
import getpass
import itertools
import pathlib
import re
import shutil
from collections import defaultdict
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
from bson.objectid import ObjectId
from pymongo import MongoClient
from tiled.examples.xdi import read_xdi

In [None]:
import aimmdb
from aimm_adapters.heald_labview import (
    mangle_dup_names,
    normalize_dataframe,
    parse_heald_labview,
)

In [None]:
# Connection string for a MongoDB server on localhost. The URI names the `aimm`
# database, while `authSource=admin` makes the server look the credentials up in
# the `admin` database.
mongo_uri = "mongodb://localhost:27017/aimm?authSource=admin"
# Prompt for the password interactively so that it is not stored in the notebook.
mongo_pass = getpass.getpass()

In [None]:
# Name of the database that receives everything this notebook ingests.
db_name = "aimm"
# Authenticate as `root` using the password prompted for in the previous cell.
client = MongoClient(mongo_uri, username="root", password=mongo_pass)
# Handle used by all ingest functions below.
db = client[db_name]

## Ingest AIMM NCM data

In [None]:
# Root directory of the NCM data set (machine-specific absolute path). The NCM
# ingest cells below build their input paths relative to it.
data_path = pathlib.Path("/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/NCM/")

In [None]:
# Metadata for each NCM sample, keyed by sample name. ingest_samples() stores each
# entry as the `metadata` of one document in the `samples` collection.
# NOTE(review): "icp_oes" presumably holds the elemental composition measured by
# ICP-OES; the units are not stated here -- confirm with the data owners.
ncm_samples = {
    "BM_NCM622": {
        "icp_oes": {
            "Li": 0.99,
            "Ni": 0.62,
            "Co": 0.19,
            "Mn": 0.19,
            "Al": 0.0,
            "Zr": 0.004,
            "B": 0.004,
        }
    },
    "BM_NCM712": {
        "icp_oes": {
            "Li": 0.98,
            "Ni": 0.73,
            "Co": 0.09,
            "Mn": 0.19,
            "Al": 0.007,
            "Zr": 0.002,
            "B": 0.003,
        }
    },
    "BM_NCMA": {
        "icp_oes": {
            "Li": 0.98,
            "Ni": 0.89,
            "Co": 0.05,
            "Mn": 0.07,
            "Al": 0.019,
            "Zr": 0.003,
            "B": 0.005,
        }
    },
}

# Electrochemical states that were measured, as (cycle, voltage, charge state)
# tuples. The ingest functions label an entry with cycle == 0 as "pristine".
# NOTE(review): "C" / "DC" presumably stand for charged / discharged -- confirm.
values = [
    (0, 0, "DC"),
    (1, 4.3, "C"),
    (1, 4.8, "C"),
    (1, 3.0, "DC"),
    (2, 4.3, "C"),
    (2, 4.8, "C"),
    (10, 4.8, "C"),
    (10, 3.0, "DC"),
]
# Field names for the tuples above; also read by parse_filename().
keys = ["cycle", "voltage", "state"]
# The same states as dicts, e.g. {"cycle": 1, "voltage": 4.3, "state": "C"}.
# ingest_chenjun() reads this module-level list.
params = [dict(zip(keys, v)) for v in values]

In [None]:
def ingest_samples(db, ncm_samples):
    """Insert one document per NCM sample into the ``samples`` collection.

    Each sample is tagged with ``provenance == {"source_id": "aimm_ncm"}`` so
    that it can be found again by querying
    ``{"metadata.provenance.source_id": "aimm_ncm"}``.

    Args:
        db: database handle; documents are inserted into ``db.samples``.
        ncm_samples: mapping of sample name to a metadata dict. Neither the
            mapping nor its metadata dicts are modified.
    """
    for name, metadata in ncm_samples.items():
        # Build a new dict instead of writing the provenance into the caller's
        # metadata, so the input data is not changed as a side effect.
        tagged_metadata = {**metadata, "provenance": {"source_id": "aimm_ncm"}}
        sample = aimmdb.models.Sample(name=name, metadata=tagged_metadata)
        db.samples.insert_one(sample.dict())


# Write the NCM samples defined above into `db.samples`.
ingest_samples(db, ncm_samples)

In [None]:
# Map sample name -> `_id` of the stored sample document. The measurement ingest
# functions use this to fill in `sample_id`.
sample_ids = {}
for doc in db.samples.find({"metadata.provenance.source_id": "aimm_ncm"}):
    # If several documents share a name, the last one returned wins.
    sample_ids[doc["name"]] = doc["_id"]
pprint(sample_ids)

In [None]:
def ingest_chenjun(db, sample_ids, data_path, atom, n):
    """Ingest the first ``n`` numbered K-edge scans of ``atom`` from ``chenjun/``.

    Scan number ``k`` (starting at 1) is read from
    ``data_path / "chenjun" / f"NCMBM24{atom}.{k:04d}"``. The scans are paired,
    in order, with the repeating sequence
    ``itertools.product(params, sample_ids.items())``, where ``params`` is the
    module-level list of charge states; that pairing decides which sample and
    charge state each scan is recorded under.

    Args:
        db: database handle; documents are inserted into ``db.measurements``.
        sample_ids: mapping of sample name to the ``_id`` of its sample document.
        data_path: ``pathlib.Path`` of the directory that contains ``chenjun/``.
        atom: element symbol, used in the file name and for the XDI element.
        n: number of scans to ingest.
    """
    occurrences = defaultdict(int)
    # Endless stream of (charge, (sample name, sample id)) pairs. zip() below
    # stops after n scans because the range is exhausted first.
    assignments = itertools.cycle(itertools.product(params, sample_ids.items()))

    for scan_number, (charge, sample) in zip(range(1, n + 1), assignments):
        path = data_path / "chenjun" / f"NCMBM24{atom}.{scan_number:04d}"
        fname = path.name
        with open(path) as f:
            print(fname)
            df, file_metadata = parse_heald_labview(f)

        df, translation = normalize_dataframe(df, standardize=True)
        file_metadata["translation"] = translation
        # Derived signals: log of incident intensity over the transmitted and
        # the reference-channel intensity.
        df["mutrans"] = np.log(df["i0"] / df["itrans"])
        df["murefer"] = np.log(df["i0"] / df["irefer"])
        file_metadata["charge"] = charge
        file_metadata["fname"] = fname

        element = aimmdb.models.XDIElement(symbol=atom, edge="K")
        data = aimmdb.models.DataFrameData.from_pandas(df)

        is_cycled = bool(charge) and charge["cycle"] > 0
        if is_cycled:
            cycle = charge["cycle"]
            voltage = charge["voltage"]
            state = charge["state"]
            name = f"{element.symbol}-{element.edge}-cycle{cycle:d}-{voltage:0.1f}V-{state}"
        else:
            name = f"{element.symbol}-{element.edge}-pristine"

        # Repeated scans of the same sample/state get a running suffix -1, -2, ...
        key = (sample, name)
        occurrences[key] += 1
        name += f"-{occurrences[key]}"

        print(sample[0], name)

        xas_metadata = aimmdb.models.XASMetadata(
            name=name,
            element=element,
            provenance={"source_id": "aimm_ncm_chenjun"},
            **file_metadata,
        )
        measurement = aimmdb.models.XASMeasurement(
            structure_family="dataframe",
            metadata=xas_metadata,
            data=data,
            sample_id=sample[1],
        )
        db.measurements.insert_one(measurement.dict())

In [None]:
# ingest_chenjun(db, sample_ids, data_path, "Ni", 48)
# ingest_chenjun(db, sample_ids, data_path, "Co", 34)
# ingest_chenjun(db, sample_ids, data_path, "Mn", 24)

In [None]:
def read_header(f):
    """Advance ``f`` to the column-header line and return the column names.

    Lines are consumed from ``f`` until one that starts with ``"Time (s)"`` is
    found. After a successful call ``f`` is positioned on the first line after
    the header.

    Args:
        f: iterable of text lines (e.g. an open text file).

    Returns:
        The tab-separated column names of the header line as a list of
        strings, or ``None`` if no header line is found.
    """
    for line in f:
        if line.startswith("Time (s)"):
            # Strip the line terminator before splitting; otherwise it stays
            # attached to the last column name.
            return line.rstrip("\r\n").split("\t")
    return None


def read_wanli(f):
    """Read a tab-separated scan file into a DataFrame of standardized columns.

    The column names come from the header line located by ``read_header`` (with
    duplicates disambiguated by ``mangle_dup_names``); the rest of ``f`` is
    parsed as tab-separated data.

    Args:
        f: open text file positioned before the header line.

    Returns:
        DataFrame with the columns ``energy``, ``i0``, ``tey``, ``tfy`` and
        ``i0_alt`` (renamed from the instrument's column names), plus
        ``mu_tfy = tfy / i0`` and ``mu_tey = tey / i0``.
    """
    column_names = mangle_dup_names(read_header(f))
    raw = pd.read_csv(f, sep="\t", names=column_names)

    # Instrument column name -> standardized name; only these columns are kept.
    translation = {
        "Mono Energy": "energy",
        "Counter 3": "i0",
        "Counter 1": "tey",
        "Counter 2": "tfy",
        "Counter 0": "i0_alt",
    }
    kept_columns = list(translation.values())
    df = raw.rename(columns=translation)[kept_columns]

    # Normalize both yield signals by the incident intensity.
    df["mu_tfy"] = df["tfy"] / df["i0"]
    df["mu_tey"] = df["tey"] / df["i0"]

    return df


# NOTE this hardcodes BM prefix
def parse_filename(name):
    """Derive the sample name and charge state encoded in a data file name.

    The sample is recognised from the substrings ``622``, ``NCMA`` or ``712``
    (checked in that order). A name containing ``Pristine`` maps to cycle 0;
    otherwise the cycle is taken from ``1st`` / ``2nd`` / ``10th`` and the
    voltage from the first ``<digits>V`` token (``43V``, ``48V`` or ``3V``).

    Args:
        name: file name to parse.

    Returns:
        ``(sample, charge)`` where ``sample`` is one of ``"BM_NCM622"``,
        ``"BM_NCMA"``, ``"BM_NCM712"`` and ``charge`` is a dict with the keys
        ``"cycle"``, ``"voltage"`` and ``"state"``.

    Raises:
        KeyError: if the sample, cycle or voltage cannot be recognised. This
            includes names with no ``<digits>V`` token at all, so callers need
            to handle only this one exception type.
    """
    if "622" in name:
        sample = "BM_NCM622"
    elif "NCMA" in name:
        sample = "BM_NCMA"
    elif "712" in name:
        sample = "BM_NCM712"
    else:
        raise KeyError(f"unable to parse sample from {name}")

    if "Pristine" in name:
        cycle, voltage, state = 0, 0.0, "DC"
    else:
        if "1st" in name:
            cycle = 1
        elif "2nd" in name:
            cycle = 2
        elif "10th" in name:
            cycle = 10
        else:
            raise KeyError(f"unable to parse cycle from {name}")

        # Raw string so that \d is a regex digit class, not a string escape.
        match = re.search(r"(\d*)V", name)
        if match is None:
            raise KeyError(f"unable to parse voltage from {name}")
        voltage_str = match[0]

        # Voltage token -> (voltage, charge state).
        known_voltages = {
            "43V": (4.3, "C"),
            "48V": (4.8, "C"),
            "3V": (3.0, "DC"),
        }
        if voltage_str not in known_voltages:
            raise KeyError(f"unable to parse voltage from {voltage_str}")
        voltage, state = known_voltages[voltage_str]

    # Spelled out here rather than zipped with a notebook-level list of keys,
    # so the function does not depend on state defined in another cell.
    charge = {"cycle": cycle, "voltage": voltage, "state": state}
    return sample, charge

In [None]:
# All *.txt scan files in the Ni L3 directory of the "wanli" data, collected into
# a list so that it can be inspected before it is passed to ingest_wanli().
files = list(
    (data_path / "wanli" / "Unimodal NCM622_712Al-doped_NCMA_Ni L3").glob("*.txt")
)

In [None]:
def ingest_wanli(db, files, sample_ids):
    """Ingest Ni L3-edge scan files into ``db.measurements``.

    The sample and charge state of each scan are derived from its file name
    with ``parse_filename``. Files whose name cannot be parsed are skipped,
    and the reason is printed.

    Args:
        db: database handle; documents are inserted into ``db.measurements``.
        files: iterable of ``pathlib.Path`` objects pointing at the scan files.
        sample_ids: mapping of sample name to the ``_id`` of its sample document.

    Raises:
        KeyError: if a parsed sample name is missing from ``sample_ids``.
    """
    counter = defaultdict(int)
    for file in files:
        fname = file.name
        try:
            sample, charge = parse_filename(fname)
        except KeyError as err:
            # Report the parser's own message: the failure may concern the
            # sample, the cycle or the voltage, not only the sample.
            reason = err.args[0] if err.args else "unrecognised file name"
            print(f"skipping {fname}: {reason}")
            continue

        element = aimmdb.models.XDIElement(symbol="Ni", edge="L3")

        with open(file) as f:
            df = read_wanli(f)
        data = aimmdb.models.DataFrameData.from_pandas(df)

        if charge and charge["cycle"] > 0:
            cycle, voltage, state = charge["cycle"], charge["voltage"], charge["state"]
            name = f"{element.symbol}-{element.edge}-cycle{cycle:d}-{voltage:0.1f}V-{state}"
        else:
            name = f"{element.symbol}-{element.edge}-pristine"

        # Repeated scans of the same sample/state get a running suffix -1, -2, ...
        key = (sample, name)
        counter[key] += 1
        name += f"-{counter[key]}"

        print(sample, name)
        sample_id = sample_ids[sample]

        metadata = aimmdb.models.XASMetadata(
            name=name,
            element=element,
            provenance={"source_id": "aimm_ncm_wanli"},
            charge=charge,
            sample=sample,
            fname=fname,
        )

        doc = aimmdb.models.XASMeasurement(
            structure_family="dataframe",
            metadata=metadata,
            data=data,
            sample_id=sample_id,
        )

        db.measurements.insert_one(doc.dict())

In [None]:
# Ingest the Ni L3-edge files listed in `files` as measurements of the NCM samples.
ingest_wanli(db, files, sample_ids)

In [None]:
# db.create_collection("tree")
# db.tree.create_index("path", unique=True)

In [None]:
def make_ncm_tiled_tree(db):
    """Build the ``/NCM`` branch of the ``tree`` collection.

    The branch has three levels, addressed by materialized paths:

    * ``/NCM`` -- the root node,
    * ``/NCM/<sample name>`` -- one node per sample whose
      ``metadata.provenance.source_id`` is ``"aimm_ncm"``,
    * ``/NCM/<sample name>/<measurement name>`` -- one leaf per measurement of
      that sample; its ``data_id`` is the ``_id`` of the measurement document.

    Args:
        db: database handle; reads ``db.samples`` and ``db.measurements`` and
            inserts into ``db.tree``.
    """
    root = aimmdb.models.Node(
        name="NCM", path="/NCM", structure_family="node", metadata={}, data_id=None
    )
    db.tree.insert_one(root.dict())

    for sample in db.samples.find({"metadata.provenance.source_id": "aimm_ncm"}):
        sample_name = sample["name"]

        # Interior node for the sample: carries its metadata but no data.
        sample_node = aimmdb.models.Node(
            name=sample_name,
            path=f"/NCM/{sample_name}",
            structure_family="node",
            metadata=sample["metadata"],
            data_id=None,
        )
        db.tree.insert_one(sample_node.dict())

        for measurement in db.measurements.find({"sample_id": sample["_id"]}):
            m_name = measurement["metadata"]["name"]
            # Leaf node: points at the measurement document holding the data.
            leaf = aimmdb.models.Node(
                name=m_name,
                path=f"/NCM/{sample_name}/{m_name}",
                structure_family="dataframe",
                metadata=measurement["metadata"],
                data_id=measurement["_id"],
            )
            db.tree.insert_one(leaf.dict())

In [None]:
# Create the /NCM entries in the `tree` collection.
make_ncm_tiled_tree(db)

## Ingest Core Data

This section ingests miscellaneous data sets, focusing on "standard" compounds that have an unambiguous structural identity.

In [None]:
def ingest_wanli_oxygen_K(db, data_path):
    """Ingest the O K-edge spectra stored as ``data_path/O_K/*.txt``.

    Every file is read as a headerless, tab-separated table with the columns
    ``energy`` and ``mu``. For each file a new sample named after the file
    stem is inserted into ``db.samples``, and the spectrum is inserted into
    ``db.measurements`` with a ``sample_id`` pointing at that sample.

    Args:
        db: database handle with ``samples`` and ``measurements`` collections.
        data_path: ``pathlib.Path`` of the directory that contains ``O_K/``.
    """
    element = aimmdb.models.XDIElement(symbol="O", edge="K")
    provenance = {"source_id": "core_wanli"}

    for path in (data_path / "O_K").glob("*.txt"):
        name = path.stem
        print(f"{name}")

        spectrum = pd.read_csv(
            path, header=None, delimiter="\t", names=["energy", "mu"]
        )
        data = aimmdb.models.DataFrameData.from_pandas(spectrum)
        xas_name = f"{element.symbol}-{element.edge}"

        # One sample per file; the id of the inserted sample links the two documents.
        sample = aimmdb.models.Sample(name=name, metadata={"provenance": provenance})
        sample_id = db.samples.insert_one(sample.dict()).inserted_id

        measurement = aimmdb.models.XASMeasurement(
            structure_family="dataframe",
            metadata=aimmdb.models.XASMetadata(
                name=xas_name, element=element, provenance=provenance
            ),
            data=data,
            sample_id=sample_id,
        )
        db.measurements.insert_one(measurement.dict())

In [None]:
# Point data_path at the "core" data set (this rebinds the notebook-level name).
# NOTE: expanduser() has no effect here because the path contains no "~".
data_path = pathlib.Path(
    "/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/wanli/core/"
).expanduser()
# Ingest the O K-edge spectra found under data_path / "O_K".
ingest_wanli_oxygen_K(db, data_path)

In [None]:
def ingest_wanli_TM_L(db, data_path):
    """Parse the L-edge spectra stored under ``data_path/TM_L/<symbol>/*.txt``.

    Each sub-directory of ``TM_L`` is named after an element symbol. Files
    whose stem starts with ``IgorPlot`` are ignored. A file may start with a
    one-field title line, which is skipped; the rest is read as a
    tab-separated ``energy`` / ``mu`` table.

    NOTE: this function is incomplete. The spectra are parsed and validated,
    but nothing is written to ``db`` yet.

    Args:
        db: database handle (currently unused).
        data_path: ``pathlib.Path`` of the directory that contains ``TM_L/``.

    Raises:
        ValueError: if the first line of a file has neither one nor two
            whitespace-separated fields.
    """
    files = data_path / "TM_L"

    for d in files.iterdir():
        if d.is_file():
            continue
        symbol = d.stem
        element = aimmdb.models.XDIElement(symbol=symbol, edge="L")

        for f in d.glob("*.txt"):
            if f.stem.startswith("IgorPlot"):
                continue

            name = f.stem
            print(f"{name}")

            # Only the first line is needed to decide whether a title is present.
            with open(f, "r") as ff:
                first_line = ff.readline()
            n_fields = len(first_line.split())

            if n_fields == 1:
                # A single field is a title line, not data.
                skiprows = 1
                title = first_line
            elif n_fields == 2:
                # The first line is already an (energy, mu) data row.
                skiprows = 0
                title = None
            else:
                # Raise explicitly: an assert is stripped under -O and would
                # not say which file is malformed.
                raise ValueError(
                    f"unexpected first line in {f}: expected 1 or 2 fields, got {n_fields}"
                )

            df = pd.read_csv(
                f, delimiter="\t", names=["energy", "mu"], skiprows=skiprows
            )

            # TODO: title, data and xas_name are prepared but not stored yet.
            data = aimmdb.models.DataFrameData.from_pandas(df)
            xas_name = f"{element.symbol}-{element.edge}"

In [None]:
# FIXME: ingest_wanli_TM_L does not write anything to the database yet.
# The file names still need to be parsed to extract the sample part before the
# spectra can be stored.
# NOTE: expanduser() has no effect here because the path contains no "~".
data_path = pathlib.Path(
    "/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/wanli/core/"
).expanduser()
ingest_wanli_TM_L(db, data_path)

In [None]:
def ingest_newville(db, data_path):
    """Ingest every ``*.xdi`` file below ``data_path``.

    Files are grouped by the sample name recorded in their XDI metadata
    (``metadata["Sample"]["name"]``). One document is inserted into
    ``db.samples`` per distinct sample name, and one document into
    ``db.measurements`` per file.

    Args:
        db: database handle with ``samples`` and ``measurements`` collections.
        data_path: ``pathlib.Path`` searched recursively for ``*.xdi`` files.

    Returns:
        A DataFrame cataloguing the files that were found, with the columns
        ``name`` (file stem), ``sample_name`` and ``file`` (path). It is empty
        when no files are found.
    """
    files = list(data_path.rglob("*.xdi"))
    print(f"found {len(files)} xdi files to ingest")

    provenance = {
        "source_id": "newville",
        "url": "https://github.com/XraySpectroscopy/XASDataLibrary",
    }

    # First pass: read only the metadata to learn which sample each file
    # belongs to.
    records = []
    for f in files:
        _, metadata = read_xdi(str(f))
        fields = metadata.pop("fields")
        metadata.update(**fields)
        records.append(
            {"name": f.stem, "sample_name": metadata["Sample"]["name"], "file": f}
        )
    # Name the columns explicitly so that an empty file list still yields a
    # frame that can be grouped by "sample_name".
    catalog = pd.DataFrame(records, columns=["name", "sample_name", "file"])

    # Second pass: one sample document per group, one measurement per file.
    for sample_name, group in catalog.groupby("sample_name"):
        sample = aimmdb.models.Sample(
            name=sample_name, metadata={"provenance": provenance}
        )
        sample_id = db.samples.insert_one(sample.dict()).inserted_id

        for f in group["file"]:
            name = f.stem
            print(f"{name}")
            df_xas, metadata_xdi = read_xdi(str(f))
            fields = metadata_xdi.pop("fields")
            metadata_xdi.update(**fields, file=str(f))
            element = aimmdb.models.XDIElement(**metadata_xdi.pop("Element"))
            xas_name = f"{element.symbol}-{element.edge}-{name}"

            metadata = aimmdb.models.XASMetadata(
                name=xas_name, element=element, provenance=provenance, **metadata_xdi
            )

            data = aimmdb.models.DataFrameData.from_pandas(df_xas)
            doc = aimmdb.models.XASMeasurement(
                structure_family="dataframe",
                metadata=metadata,
                data=data,
                sample_id=sample_id,
            )

            db.measurements.insert_one(doc.dict())

    return catalog

In [None]:
# Point data_path at the XDI data directory (this rebinds the notebook-level name).
# NOTE: expanduser() has no effect here because the path contains no "~".
data_path = pathlib.Path(
    "/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/newville/data"
).expanduser()
# Ingest every *.xdi file below data_path; `df` keeps whatever the call returns.
df = ingest_newville(db, data_path)

In [None]:
def make_core_tree(db):
    """Build the ``/core`` branch of the ``tree`` collection.

    The branch has three levels, addressed by materialized paths:

    * ``/core`` -- the root node,
    * ``/core/<sample name>-<k>`` -- one node per sample whose
      ``metadata.provenance.source_id`` is ``"core_wanli"`` or ``"newville"``,
    * ``/core/<sample name>-<k>/<measurement name>`` -- one leaf per
      measurement of that sample; its ``data_id`` is the ``_id`` of the
      measurement document.

    ``<k>`` is a running count per sample name (1, 2, ...), so samples that
    share a name get distinct paths.

    Args:
        db: database handle; reads ``db.samples`` and ``db.measurements`` and
            inserts into ``db.tree``.
    """
    root = aimmdb.models.Node(
        name="core", path="/core", structure_family="node", metadata={}, data_id=None
    )
    db.tree.insert_one(root.dict())

    # Use a counter to disambiguate samples that share a name.
    sample_name_counter = defaultdict(int)

    for src in ["core_wanli", "newville"]:
        for x in db.samples.find({"metadata.provenance.source_id": src}):
            sample_name = x["name"]
            sample_metadata = x["metadata"]
            sample_id = x["_id"]

            sample_name_counter[sample_name] += 1
            count = sample_name_counter[sample_name]
            sample_name += f"-{count:d}"

            # Interior node for the sample: carries its metadata but no data.
            node = aimmdb.models.Node(
                name=sample_name,
                path=f"/core/{sample_name}",
                structure_family="node",
                metadata=sample_metadata,
                data_id=None,
            )

            db.tree.insert_one(node.dict())
            for m in db.measurements.find({"sample_id": sample_id}):
                m_name = m["metadata"]["name"]
                # Leaf node: points at the measurement document holding the data.
                node = aimmdb.models.Node(
                    name=m_name,
                    path=f"/core/{sample_name}/{m_name}",
                    structure_family="dataframe",
                    metadata=m["metadata"],
                    data_id=m["_id"],
                )
                db.tree.insert_one(node.dict())

In [None]:
# Create the /core entries in the `tree` collection.
make_core_tree(db)