In [None]:
import argparse
import pathlib
import json
import getpass
from collections import defaultdict

import pymongo
from pymongo import MongoClient

import uuid
import shortuuid
import itertools

import pandas as pd

from tqdm.notebook import tqdm

import h5py

import pprint

import tiled

from tiled.examples.xdi import read_xdi

import numpy as np
import matplotlib.pyplot as plt

In [None]:
import models
import ingest

In [None]:
%load_ext autoreload
%autoreload 1

In [None]:
%aimport models
%aimport ingest

In [None]:
# Connection string for the MongoDB server on localhost:27017, database "aimm";
# credentials are checked against the "admin" database (authSource=admin).
mongo_uri = "mongodb://localhost:27017/aimm?authSource=admin"
# Prompt for the password interactively so it is never written into the notebook.
mongo_pass = getpass.getpass()

In [None]:
# Connect as user "root" using mongo_uri / mongo_pass from the previous cell,
# then select the "aimm" database.
db_name = "aimm"
client = MongoClient(mongo_uri, username="root", password=mongo_pass)
db = client[db_name]

In [None]:
# db.drop_collection("spike")
# c = db.create_collection("spike")
# c.create_index([("name", pymongo.ASCENDING)])
# c.create_index([("path", pymongo.ASCENDING)], unique=True)

In [None]:
# Handle on the "tiled" collection; every query and ingest call in this notebook goes through `c`.
c = db["tiled"]

In [None]:
# Inspect: all documents whose "name" field is exactly "newville".
list(c.find({"name" : "newville"}))

In [None]:
# DESTRUCTIVE: removes at most one document whose "name" is "newville".
c.delete_one({"name" : "newville"})

In [None]:
# DESTRUCTIVE: removes every document whose "path" contains "/core/newville"
# (the regex is unanchored, so it matches anywhere in the string).
# NOTE(review): ingest_edrixs in this notebook builds paths as "/".join(root + [name]),
# i.e. without a leading slash. If ingest.ingest_newville does the same, a path such as
# "core/newville/..." would NOT match this pattern -- confirm against the stored paths.
c.delete_many({"path" : {"$regex" : "/core/newville"}})

In [None]:
# Machine-specific absolute path to the Wanli "core" data directory -- adjust before running.
# (expanduser() only affects paths starting with "~", so it is a no-op for this absolute path.)
data_path = pathlib.Path("/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/wanli/core/").expanduser()
print(data_path)
# Ingest into collection `c` under the path components ["core", "wanli"].
# Judging by its name this handles the oxygen K-edge data -- see ingest.py to confirm.
ingest.ingest_wanli_oxygen_K(c, data_path, root=["core", "wanli"])

In [None]:
# Second Wanli ingest with the same root. Relies on `data_path` still pointing at the
# Wanli directory from the previous cell, so run it before `data_path` is reassigned below.
ingest.ingest_wanli_TM_L(c, data_path, root=["core", "wanli"])

In [None]:
# Reassigns `data_path` to the Newville data directory (machine-specific absolute path).
data_path = pathlib.Path("/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/newville/data").expanduser()
print(data_path)
# Ingest into collection `c` under the path components ["core", "newville"].
ingest.ingest_newville(c, data_path, root=["core", "newville"])

In [None]:
# Non-folder documents that have a measurement whose element symbol is "Fe";
# the projection returns only "name" and "path" (and suppresses "_id").
list(c.find({"folder" : False, "measurements.element.symbol" : "Fe"}, {"name" : 1, "path" : 1, "_id" : 0}))

In [None]:
# Reassigns `data_path` to the EDRIXS results directory (machine-specific absolute path).
data_path = pathlib.Path("/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/edrixs").expanduser()
print(data_path)
# Recursively collect every *.h5 file below that directory and report how many were found.
files = list(data_path.rglob("*.h5"))
print("{:d} files".format(len(files)))

In [None]:
import util
import models

In [None]:
def ingest_edrixs(c, files, root=["edrixs"]):
  """Insert one ``models.Sample`` document per EDRIXS HDF5 result file.

  For each file the ``/input`` and ``/output`` groups are read with
  ``util.read_group`` and converted into two measurements sharing one element
  description (symbol taken from ``input["args"]["atom"]``, edge fixed to "L"):

  * XAS: a parquet-serialized dataframe with columns ``energy``
    (``output["xas"]["omega_in"]``) and ``mu`` (column 0 of
    ``output["xas"]["data"]``).
  * RIXS: the raw bytes of ``output["rixs"]["data"]`` together with its shape;
    the incident-energy and energy-loss grid limits are stored as metadata.

  Parameters
  ----------
  c
      Collection-like object providing ``insert_one`` (the notebook passes a
      pymongo collection). It is also handed to ``util.mk_path``.
  files
      Iterable of path objects pointing at HDF5 files. The name of each file's
      parent directory must parse as a UUID; its shortuuid encoding becomes the
      sample name.
  root
      Path components under which the samples are stored. The stored ``path``
      of a sample is ``"/".join(root + [name])``.

  Raises
  ------
  ValueError
      From ``uuid.UUID`` when a parent directory name is not a valid UUID.
  """
  # Work on a private copy: the default list object is shared between calls and the
  # sequence is handed to util.mk_path, so neither it nor a caller's list can be
  # altered through this function.
  root = list(root)
  util.mk_path(c, root)

  edge = "L"  # the same edge label is used for every file

  for f in tqdm(files):
    # The directory containing the file is named by a UUID; its shortuuid
    # encoding is used as the sample name.
    name = shortuuid.encode(uuid.UUID(f.parent.stem))

    # Read everything needed while the file is open; the handle is closed afterwards.
    with h5py.File(f, "r") as h5f:
      input_data = util.read_group(h5f["/input"], jsoncompat=True)
      output_data = util.read_group(h5f["/output"])

    args = input_data["args"]
    element = models.XDIElement(symbol=args["atom"], edge=edge)

    # --- XAS: energy grid plus the first column of the spectrum array ---
    df = pd.DataFrame({
      "energy": output_data["xas"]["omega_in"],
      "mu": output_data["xas"]["data"][:, 0],
    })
    xas_data = models.DataFrameData(
      columns=list(df.columns),
      media_type="application/x-parquet",
      blob=util.serialize_parquet(df).tobytes(),
    )
    xas = models.XASMeasurement(element=element, metadata={}, data=xas_data)

    # --- RIXS: raw array bytes plus shape ---
    # NOTE(review): only the shape accompanies the raw bytes here; the dtype is not
    # recorded in this call -- confirm that readers of the blob know it.
    rixs_array = output_data["rixs"]["data"]
    rixs_data = models.ArrayData(
      shape=rixs_array.shape,
      media_type="application/octet-stream",
      blob=memoryview(rixs_array).tobytes(),
    )

    # The incident-energy limits are shifted by the "off" parameter of the
    # calculation; the energy-loss limits are stored as given.
    off = input_data["params"]["off"]
    rixs_metadata = {
      "e_in_min": args["omega_min"] + off,
      "e_in_max": args["omega_max"] + off,
      "n_e_in": args["n_omega"],
      "e_loss_min": args["eloss_min"],
      "e_loss_max": args["eloss_max"],
      "n_e_loss": args["n_eloss"],
    }
    rixs = models.RIXSMeasurement(element=element, metadata=rixs_metadata, data=rixs_data)

    path = "/".join(root + [name])
    sample = models.Sample(name=name, folder=False, path=path, metadata={}, measurements=[xas, rixs])
    c.insert_one(sample.dict())

In [None]:
# Ingest the EDRIXS files collected above with the default root ["edrixs"]
# (one insert_one call per file).
ingest_edrixs(c, files)

In [None]:
# c.delete_many({"path" : {"$regex" : "edrixs/"}})

In [None]:

# Match only entries directly at the root: the regex ^[^/]*$ accepts a "path" only when it
# contains no "/" at all. Show "name" and "path" for at most 10 such documents.
list(c.find({"path" : {"$regex" : r"^[^/]*$"}}, {"name" : 1, "path" : 1, "_id" : 0}).limit(10))

In [None]:
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable``, each holding at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing (list, tuple, str, ...).
    The final slice is shorter when the length is not a multiple of ``n``.
    """
    total = len(iterable)
    for start in range(0, total, n):
        # Clamp the upper bound so the last slice never reaches past the end.
        stop = start + n
        if stop > total:
            stop = total
        yield iterable[start:stop]

In [None]:
def make_doc(f):
  """Build the XAS document (not yet inserted) for one EDRIXS HDF5 file.

  The shortuuid encoding of the UUID-named parent directory serves as both the
  document name and the sample id. The complete ``/input`` group is kept as
  metadata; the XAS spectrum (energy grid and column 0 of the data array) is
  stored as a parquet blob.

  Returns a dict with the keys ``name`` and ``content``.
  """
  sample_id = shortuuid.encode(uuid.UUID(f.parent.stem))

  # Pull both groups out while the file is open.
  with h5py.File(f, "r") as h5f:
    input_data = util.read_group(h5f["/input"], jsoncompat=True)
    xas_group = util.read_group(h5f["/output/xas"])

  spectrum = xas_group["data"]
  energy = xas_group["omega_in"]
  frame = pd.DataFrame({"energy" : energy, "mu" : spectrum[:,0]})

  # Bookkeeping fields kept apart from the user-facing metadata.
  internal = {
    "tags" : ["theory", "edrixs"],
    "sample_id" : sample_id,
    "type" : "xas",
    "element" : {"symbol" : input_data["args"]["atom"], "edge" : "L"},
    "columns" : list(frame.columns),
  }

  payload = {
    "df" : {
      "media_type": "application/x-parquet",
      "structure_family": "dataframe",
      "blob": util.serialize_parquet(frame).tobytes(),
    }
  }

  return {
    "name" : sample_id,
    "content" : {
      "data": payload,
      "metadata": dict(input_data),  # shallow copy of the input group
      "internal" : internal,
    },
  }

def ingest_edrixs_xas(c, files, batch_size=64):
  """Insert the XAS documents for ``files`` below the ``["edrixs", "xas"]`` node.

  The parent node is looked up with ``util.get_path``; every new document is
  marked as a leaf, points to that node as ``parent`` and carries the node's
  ancestors plus the node itself as ``ancestors``. Documents are built with
  ``make_doc`` and written with one ``insert_many`` call per batch.

  Parameters
  ----------
  c
      Collection-like object providing ``insert_many``; also passed to
      ``util.get_path``.
  files
      Sequence (supports ``len`` and slicing) of paths to EDRIXS HDF5 files.
  batch_size
      Number of documents per ``insert_many`` call; must be positive.
  """
  node = util.get_path(c, ["edrixs", "xas"])
  parent = node["_id"]
  # Build a fresh list instead of appending to the fetched document's own list.
  ancestors = [*node["ancestors"], parent]

  # Ceiling division: number of batches, so the progress bar can show a total
  # (a generator has no length tqdm could use).
  n_batches = (len(files) + batch_size - 1) // batch_size

  for chunk in tqdm(batch(files, batch_size), total=n_batches):
    docs = []
    for f in chunk:
      child = make_doc(f)
      child.update({"leaf" : True, "ancestors" : ancestors, "parent" : parent})
      docs.append(child)

    c.insert_many(docs)

In [None]:
# Rebuild the list of EDRIXS *.h5 files from `data_path` (same expression as in the
# earlier file-count cell); requires `data_path` to still point at the EDRIXS directory.
files = list(data_path.rglob("*.h5"))

In [None]:
%%time

# Batched insert of the XAS documents; util.get_path must be able to find ["edrixs", "xas"].
# NOTE(review): no visible cell creates ["edrixs", "xas"] (compare the util.mk_path call
# for ["edrixs", "rixs"] further down) -- confirm the node exists before running this.
ingest_edrixs_xas(c, files)

In [None]:
# Prepare the ["edrixs", "rixs"] node that ingest_edrixs_rixs looks up via util.get_path.
# Presumably mk_path creates the node when it is missing -- confirm in util.py.
util.mk_path(c, ["edrixs", "rixs"])

In [None]:
def make_edrixs_rixs_doc(f):
  """Build the RIXS document (not yet inserted) for one EDRIXS HDF5 file.

  The shortuuid encoding of the UUID-named parent directory serves as both the
  document name and the sample id. The complete ``/input`` group is kept as
  metadata. The RIXS array and its two axes (``omega_in`` and ``eloss``) are
  each stored as raw bytes next to their shape.

  Returns a dict with the keys ``name`` and ``content``.
  """
  def array_entry(values):
    # Common layout for every array stored in the document.
    return {
      "media_type": "application/octet-stream",
      "structure_family": "array",
      "shape" : values.shape,
      "blob": memoryview(values).tobytes()
    }

  sample_id = shortuuid.encode(uuid.UUID(f.parent.stem))

  # Pull both groups out while the file is open.
  with h5py.File(f, "r") as h5f:
    input_data = util.read_group(h5f["/input"], jsoncompat=True)
    rixs_group = util.read_group(h5f["/output/rixs"])

  intensity = rixs_group["data"]
  incident_energy = rixs_group["omega_in"]
  energy_loss = rixs_group["eloss"]

  # Bookkeeping fields kept apart from the user-facing metadata.
  internal = {
    "tags" : ["theory", "edrixs"],
    "sample_id" : sample_id,
    "type" : "rixs",
    "element" : {"symbol" : input_data["args"]["atom"], "edge" : "L"},
  }

  payload = {
    "rixs" : array_entry(intensity),
    "e_in" : array_entry(incident_energy),
    "e_loss" : array_entry(energy_loss),
  }

  return {
    "name" : sample_id,
    "content" : {
      "data": payload,
      "metadata": dict(input_data),  # shallow copy of the input group
      "internal" : internal,
    },
  }

def ingest_edrixs_rixs(c, files, batch_size=64):
  """Insert the RIXS documents for ``files`` below the ``["edrixs", "rixs"]`` node.

  The parent node is looked up with ``util.get_path``; every new document is
  marked as a leaf, points to that node as ``parent`` and carries the node's
  ancestors plus the node itself as ``ancestors``. Documents are built with
  ``make_edrixs_rixs_doc`` and written with one ``insert_many`` call per batch.

  Parameters
  ----------
  c
      Collection-like object providing ``insert_many``; also passed to
      ``util.get_path``.
  files
      Sequence (supports ``len`` and slicing) of paths to EDRIXS HDF5 files.
  batch_size
      Number of documents per ``insert_many`` call; must be positive.
  """
  node = util.get_path(c, ["edrixs", "rixs"])
  parent = node["_id"]
  # Build a fresh list instead of appending to the fetched document's own list.
  ancestors = [*node["ancestors"], parent]

  # Integer ceiling division gives the number of batches for the progress bar.
  # Done with plain arithmetic because this notebook does not import `math`.
  n_batches = (len(files) + batch_size - 1) // batch_size

  # Use the same batch_size for slicing and for the total so the two cannot drift apart.
  for chunk in tqdm(batch(files, batch_size), total=n_batches):
    docs = []
    for f in chunk:
      child = make_edrixs_rixs_doc(f)
      child.update({"leaf" : True, "ancestors" : ancestors, "parent" : parent})
      docs.append(child)

    c.insert_many(docs)

In [None]:
# Batched insert of the RIXS documents; util.get_path must be able to find ["edrixs", "rixs"].
ingest_edrixs_rixs(c, files)