In [None]:
import argparse
import getpass
import itertools
import json
import math
import pathlib
import pprint
import uuid
from collections import defaultdict

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
import shortuuid
import tiled
from pymongo import MongoClient
from tiled.examples.xdi import read_xdi
from tqdm.notebook import tqdm

In [None]:
# `%autoreload 1` only reloads modules registered through `%aimport`, so the
# local `util` helper module is imported and tracked here; every `util.*` call
# below relies on this cell having run.
%load_ext autoreload
%autoreload 1
%aimport util

In [None]:
# The connection string carries no credentials: the password is prompted for
# interactively so it is never stored in the notebook.
# NOTE(review): a commented-out URI with an embedded root password used to sit
# here; if that password was ever real it should be rotated.
mongo_uri = "mongodb://localhost:27017/aimm?authSource=admin"
mongo_pass = getpass.getpass()

In [None]:
# Connect as `root` with the prompted password and select the working database.
db_name = "aimm"
client = MongoClient(mongo_uri, username="root", password=mongo_pass)
db = client[db_name]

In [None]:
# Load the collection schema from the working directory and show it.  It is
# only consumed by the commented-out `create_collection` call below.
with open("schema.json", "r") as f:
  schema = json.load(f)
  
pprint.pprint(schema)

In [None]:
# `c` is the collection every ingest function in this notebook writes to.
collection = "spike"
# Uncomment to (re)create the collection from `schema`.
# NOTE(review): `overwrite=True` presumably discards existing contents - confirm in util.
# c = util.create_collection(db, collection, schema, overwrite=True)
c = db[collection]

In [None]:
# Look up the container documents for the edrixs results.
# NOTE(review): these paths are only created by the `util.mk_path` cells further
# down, so on a fresh database this cell presumably cannot succeed when the
# notebook is run top to bottom - confirm what `util.get_path` returns for a
# missing path.
rixs_dir = util.get_path(c, ["edrixs", "rixs"])
xas_dir = util.get_path(c, ["edrixs", "xas"])

In [None]:
# Time fetching children of the rixs container with `content.data` excluded by
# the projection; zipping with range(100) stops after at most 100 documents.
%time x = list(zip(range(100), c.find({"parent" : rixs_dir["_id"]}, {"content.data" : 0})))

In [None]:
# Same probe for the xas container, this time fetching full documents.
%time x = list(zip(range(100), c.find({"parent" : xas_dir["_id"]})))

In [None]:
# Root of the wanli core-level data.  This is a machine-specific absolute path;
# `data_path` is a global read by the ingest functions when they are called.
data_path = pathlib.Path("/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/wanli/core/").expanduser()
print(data_path)

In [None]:
# Prepare the wanli/core path that the two wanli ingest functions look up.
util.mk_path(c, ["wanli", "core"])

In [None]:
def ingest_oxygen_K():
  """Insert the oxygen K-edge spectra under ``data_path / "O_K"`` as leaves of wanli/core.

  Every ``*.txt`` file is read as a headerless, tab-separated table with the
  columns ``energy`` and ``mu``, serialized through ``util.serialize_parquet``
  and inserted into the module-level collection ``c``.  The file stem becomes
  the document name and each document gets a fresh random ``sample_id``.
  Nothing is returned.
  """
  spectrum_files = (data_path / "O_K").glob("*.txt")

  element = {"symbol" : "O", "edge" : "K"}
  tags = ["experiment", "wanli"]

  parent_doc = util.get_path(c, ["wanli", "core"])
  parent = parent_doc["_id"]
  # The parent's own ancestor list is extended in place and then shared by
  # every document inserted below.
  ancestors = parent_doc["ancestors"]
  ancestors += [parent]

  for txt_path in spectrum_files:
    name = txt_path.stem
    print(f"{name=}")

    spectrum = pd.read_csv(txt_path, header=None, delimiter="\t", names=["energy", "mu"])

    sample_id = str(uuid.uuid4())
    columns = list(spectrum.columns)

    c.insert_one({
      "name" : name,
      "leaf" : True,
      "ancestors" : ancestors,
      "parent" : parent,
      "content" : {
        "data" : {
          "df" : {
            "media_type": "application/x-parquet",
            "structure_family": "dataframe",
            "blob": util.serialize_parquet(spectrum).tobytes(),
          }
        },
        "metadata" : {"element" : element, "columns" : columns},
        "internal" : {"tags" : tags, "sample_id" : sample_id, "type" : "xas", "element" : element, "columns" : columns},
      },
    })

In [None]:
# Run the O K-edge ingest against the wanli `data_path` set above.
# NOTE(review): the function inserts without checking for existing documents,
# so re-running this cell presumably duplicates them - confirm.
ingest_oxygen_K()

In [None]:
def _tm_l_skiprows(path):
  """Return how many leading rows of ``path`` precede the two-column data.

  The first line decides the layout: one whitespace-separated field is a
  title line (skip it), two fields mean the data starts immediately.

  Raises:
    ValueError: for any other field count, including an empty first line.
  """
  with open(path, "r") as fh:
    first_line = fh.readline()

  n_fields = len(first_line.split())
  if n_fields == 1:
    return 1
  if n_fields == 2:
    return 0
  # An explicit exception instead of `assert False`: asserts are removed under
  # `python -O`, which would let an unsupported file be parsed with a wrong
  # (or undefined) skiprows value.
  raise ValueError(
    f"{path}: expected 1 (title) or 2 (data) fields on the first line, found {n_fields}"
  )


def ingest_TM_L():
  """Insert the transition-metal L-edge spectra under ``data_path / "TM_L"`` as leaves of wanli/core.

  Each sub-directory is treated as one element: its name (stem) is used as
  the element symbol and the edge is always ``"L"``.  Every ``*.txt`` file in
  it whose stem does not start with ``IgorPlot`` is read as a tab-separated
  ``energy`` / ``mu`` table and inserted into the module-level collection
  ``c`` with a fresh random ``sample_id``.  An optional title line at the top
  of a file is skipped and not stored.  Nothing is returned.

  Raises:
    ValueError: if the first line of a file has neither one nor two
      whitespace-separated fields.
  """
  tm_root = data_path / "TM_L"

  tags = ["experiment", "wanli"]

  parent_doc = util.get_path(c, ["wanli", "core"])
  parent = parent_doc["_id"]
  # Build a new list instead of appending to the one owned by parent_doc.
  ancestors = [*parent_doc["ancestors"], parent]

  # Sorted so the ingestion order is reproducible; directory listings come
  # back in arbitrary order.
  for element_dir in sorted(tm_root.iterdir()):
    if element_dir.is_file():
      continue
    element = {"symbol" : element_dir.stem, "edge" : "L"}

    for f in sorted(element_dir.glob("*.txt")):
      if f.stem.startswith("IgorPlot"):
        continue

      name = f.stem
      print(f"{name=}")

      df = pd.read_csv(f, delimiter="\t", names=["energy", "mu"], skiprows=_tm_l_skiprows(f))

      sample_id = str(uuid.uuid4())
      columns = list(df.columns)
      internal = {"tags" : tags, "sample_id" : sample_id, "type" : "xas", "element" : element, "columns" : columns}

      metadata = {"element" : element, "columns" : columns}

      data = {
        "df" : {
          "media_type": "application/x-parquet",
          "structure_family": "dataframe",
          "blob": util.serialize_parquet(df).tobytes(),
        }
      }

      content = {"data": data, "metadata": metadata, "internal" : internal}

      c.insert_one({"name" : name, "leaf" : True, "ancestors" : ancestors, "parent" : parent, "content" : content})

In [None]:
# Run the transition-metal L-edge ingest; it reads the wanli `data_path`, so it
# must run before `data_path` is rebound in the next cell.
ingest_TM_L()

In [None]:
# Rebind the global `data_path` to the newville XDI collection
# (machine-specific absolute path).
data_path = pathlib.Path("/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/newville/data").expanduser()
print(data_path)

In [None]:
# Prepare the newville path that `ingest_newville` looks up.  The commented
# line is kept for manually clearing a previous ingest first.
# util.rm_path(c, ["newville"])
util.mk_path(c, ["newville"])

In [None]:
def ingest_newville():
  """Insert every ``*.xdi`` file found recursively below ``data_path`` as a leaf of newville.

  Each file is parsed with ``read_xdi``.  The entries of the parsed
  ``fields`` mapping are hoisted to the top level of the metadata and all
  top-level keys are lower-cased; the element symbol and edge are then taken
  from ``metadata["element"]``.  Documents are inserted one by one into the
  module-level collection ``c``.  Nothing is returned.
  """
  tags = ["experiment", "newville", "xas", "xdi"]

  parent_doc = util.get_path(c, ["newville"])
  parent = parent_doc["_id"]
  # Extended in place; the same list object is attached to every new document.
  ancestors = parent_doc["ancestors"]
  ancestors += [parent]

  xdi_paths = list(data_path.rglob("*.xdi"))
  print(f"found {len(xdi_paths)} xdi files to ingest")

  for xdi_path in xdi_paths:
    df, parsed = read_xdi(str(xdi_path))
    # Flatten: promote everything under "fields" to top-level metadata keys.
    parsed.update(**parsed.pop("fields"))
    metadata = {key.lower() : value for key, value in parsed.items()}

    name = xdi_path.stem
    print(f"{name=}")

    blob = util.serialize_parquet(df).tobytes()
    columns = list(df.columns)
    element = {part : metadata["element"][part] for part in ("symbol", "edge")}
    sample_id = str(uuid.uuid4())

    c.insert_one({
      "name" : name,
      "leaf" : True,
      "ancestors" : ancestors,
      "parent" : parent,
      "content" : {
        "data" : {
          "df" : {
            "media_type": "application/x-parquet",
            "structure_family": "dataframe",
            "blob": blob,
          }
        },
        "metadata" : metadata,
        "internal" : {"tags" : tags, "sample_id" : sample_id, "type" : "xas", "element" : element, "columns" : columns},
      },
    })

In [None]:
# Run the newville ingest; it reads the newville `data_path`, so it must run
# before `data_path` is rebound in the next cell.
ingest_newville()

In [None]:
# Rebind the global `data_path` to the edrixs simulation output
# (machine-specific absolute path).
data_path = pathlib.Path("/run/media/joseph/seagate/jkleinhenz/projects/aimm/data/edrixs").expanduser()
print(data_path)

In [None]:
# Prepare the edrixs/xas path that `ingest_edrixs_xas` looks up.
util.mk_path(c, ["edrixs", "xas"])

In [None]:
# Display what `util.tree` reports for the collection as a sanity check.
d = util.tree(c)
pprint.pprint(d)

In [None]:
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` containing at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing.  The last slice is
    shorter when the length is not a multiple of ``n``; an empty input yields
    nothing.
    """
    size = len(iterable)
    yield from (
        iterable[start:min(start + n, size)]
        for start in range(0, size, n)
    )

In [None]:
def make_doc(f):
  """Build the ``name`` / ``content`` part of an XAS document from one edrixs HDF5 file.

  The name (also used as ``sample_id``) is the shortuuid encoding of the UUID
  given by the stem of the file's parent directory.  The ``/input`` group
  becomes the metadata, and ``/output/xas`` supplies the spectrum: ``omega_in``
  as ``energy`` and column 0 of ``data`` as ``mu``.  The caller is expected to
  add the tree fields (``leaf``, ``ancestors``, ``parent``).

  Returns:
    dict with the keys ``name`` and ``content``.
  """
  sample_id = shortuuid.encode(uuid.UUID(f.parent.stem))

  with h5py.File(f, "r") as h5f:
    input_data = util.read_group(h5f["/input"], jsoncompat=True)
    xas_group = util.read_group(h5f["/output/xas"])

  intensity = xas_group["data"]
  omega_in = xas_group["omega_in"]
  # Only the first column of the xas data is stored as mu.
  spectrum = pd.DataFrame({"energy" : omega_in, "mu" : intensity[:,0]})

  element = {"symbol" : input_data["args"]["atom"], "edge" : "L"}
  columns = list(spectrum.columns)

  return {
    "name" : sample_id,
    "content" : {
      "data" : {
        "df" : {
          "media_type": "application/x-parquet",
          "structure_family": "dataframe",
          "blob": util.serialize_parquet(spectrum).tobytes(),
        }
      },
      # Shallow copy of the simulation input.
      "metadata" : {**input_data},
      "internal" : {"tags" : ["theory", "edrixs"], "sample_id" : sample_id, "type" : "xas", "element" : element, "columns" : columns},
    },
  }

def ingest_edrixs_xas(c, files, batch_size=64):
  """Insert one XAS leaf document per edrixs HDF5 file under edrixs/xas.

  Args:
    c: collection to insert into; it is also passed to ``util.get_path`` to
      find the parent.
    files: sequence of HDF5 file paths; it must support ``len()`` and slicing
      because it is chunked with ``batch``.
    batch_size: number of documents sent per ``insert_many`` call.

  Documents are built with ``make_doc`` and completed with the tree fields
  (``leaf``, ``ancestors``, ``parent``).  Nothing is returned; an empty
  ``files`` inserts nothing.
  """
  parent_doc = util.get_path(c, ["edrixs", "xas"])
  parent = parent_doc["_id"]
  # Build a new list instead of appending to the one owned by parent_doc.
  ancestors = [*parent_doc["ancestors"], parent]

  # `batch` is a generator without a length, so the progress bar needs the
  # number of batches spelled out to show a total.
  total = math.ceil(len(files) / batch_size)

  for chunk in tqdm(batch(files, batch_size), total=total):
    docs = [
      {**make_doc(f), "leaf" : True, "ancestors" : ancestors, "parent" : parent}
      for f in chunk
    ]
    c.insert_many(docs)

In [None]:
# Collect every HDF5 file below the edrixs `data_path`; the same list feeds
# both the XAS and the RIXS ingest.
files = list(data_path.rglob("*.h5"))

In [None]:
%%time

# Timed XAS ingest (the `%%time` cell magic has to stay on the first line).
ingest_edrixs_xas(c, files)

In [None]:
# Prepare the edrixs/rixs path that `ingest_edrixs_rixs` looks up.
util.mk_path(c, ["edrixs", "rixs"])

In [None]:
def make_edrixs_rixs_doc(f):
  """Build the ``name`` / ``content`` part of a RIXS document from one edrixs HDF5 file.

  The name (also used as ``sample_id``) is the shortuuid encoding of the UUID
  given by the stem of the file's parent directory.  The ``/input`` group
  becomes the metadata; ``data``, ``omega_in`` and ``eloss`` from
  ``/output/rixs`` are stored as raw byte blobs together with their shapes
  under ``rixs``, ``e_in`` and ``e_loss``.  The caller is expected to add the
  tree fields (``leaf``, ``ancestors``, ``parent``).

  Returns:
    dict with the keys ``name`` and ``content``.
  """
  def array_entry(values):
    # NOTE(review): only the shape is recorded next to the bytes, not the
    # dtype - readers presumably have to know it; confirm.
    return {
      "media_type": "application/octet-stream",
      "structure_family": "array",
      "shape" : values.shape,
      "blob": memoryview(values).tobytes()
    }

  sample_id = shortuuid.encode(uuid.UUID(f.parent.stem))

  with h5py.File(f, "r") as h5f:
    input_data = util.read_group(h5f["/input"], jsoncompat=True)
    rixs_group = util.read_group(h5f["/output/rixs"])

  intensity = rixs_group["data"]
  omega_in = rixs_group["omega_in"]
  energy_loss = rixs_group["eloss"]

  element = {"symbol" : input_data["args"]["atom"], "edge" : "L"}

  return {
    "name" : sample_id,
    "content" : {
      "data" : {
        "rixs" : array_entry(intensity),
        "e_in" : array_entry(omega_in),
        "e_loss" : array_entry(energy_loss),
      },
      # Shallow copy of the simulation input.
      "metadata" : {**input_data},
      "internal" : {"tags" : ["theory", "edrixs"], "sample_id" : sample_id, "type" : "rixs", "element" : element},
    },
  }

def ingest_edrixs_rixs(c, files, batch_size=64):
  """Insert one RIXS leaf document per edrixs HDF5 file under edrixs/rixs.

  Args:
    c: collection to insert into; it is also passed to ``util.get_path`` to
      find the parent.
    files: sequence of HDF5 file paths; it must support ``len()`` and slicing
      because it is chunked with ``batch``.
    batch_size: number of documents sent per ``insert_many`` call.

  Documents are built with ``make_edrixs_rixs_doc`` and completed with the
  tree fields (``leaf``, ``ancestors``, ``parent``).  Nothing is returned; an
  empty ``files`` inserts nothing.
  """
  parent_doc = util.get_path(c, ["edrixs", "rixs"])
  parent = parent_doc["_id"]
  # Build a new list instead of appending to the one owned by parent_doc.
  ancestors = [*parent_doc["ancestors"], parent]

  # The same batch_size drives both the progress total and the actual
  # chunking, so the two cannot drift apart.
  total = math.ceil(len(files) / batch_size)

  for chunk in tqdm(batch(files, batch_size), total=total):
    docs = [
      {**make_edrixs_rixs_doc(f), "leaf" : True, "ancestors" : ancestors, "parent" : parent}
      for f in chunk
    ]
    c.insert_many(docs)

In [None]:
# Run the RIXS ingest over the same HDF5 file list used for the XAS ingest.
ingest_edrixs_rixs(c, files)