In [1]:
import copy
import os
import re
from pathlib import Path
import numpy as np
import rasterio as rio
from xml_utils import XMLRecord, XMLNode

Note: I could not get the `pymdwizard` package itself installed in my conda env. I don't know much about installing Python packages manually, though it is probably trivial for those familiar with packaging. As a shortcut, I copied the `pymdwizard/core/xml_utils.py` module into the working directory of this notebook.

We are going to load the original metadata, iterate over all of the data files, and populate the entity and attribute entries.

Load the metadata:

In [2]:
# Location of the original metadata record.
fp = Path("Littell_AKclimate_19Apr23.xml")

# Parse the original record, then work on a deep copy so that the parsed
# original stays available unchanged for comparison.
original_md = XMLRecord(os.fspath(fp))
new_md = copy.deepcopy(original_md)

Get a list of all of the data filepaths:

In [3]:
# Data files sit either two or one directory level(s) below "Data Release";
# the deeper pattern is globbed first, matching the order of the entries
# generated later.
fps = [
    tif
    for pattern in ("*/*/*.tif", "*/*.tif")
    for tif in Path("Data Release").glob(pattern)
]

We will iterate over these files and extract/generate the info needed for populating the metadata. Define a function to pull the needed values from the files:

In [4]:
def get_eainfo(fp):
    """Summarise a raster file for the entity/attribute metadata section.

    Reads the raster as floats, replaces cells equal to the dataset's nodata
    value with NaN, and reports the range of the remaining cells.

    Parameters
    ----------
    fp : str or pathlib.Path
        Path to a raster file that rasterio can open.

    Returns
    -------
    dict
        ``"range_min"`` and ``"range_max"``: NaN-ignoring minimum and maximum
        of the array returned by ``src.read()``; ``"nodata"``: the dataset's
        nodata value (``None`` when the file does not declare one).
    """
    with rio.open(fp) as src:
        # Pull everything needed from the dataset while it is still open,
        # rather than relying on attributes of an already-closed handle.
        arr = src.read().astype(float)
        nodata = src.meta["nodata"]

    # With no declared nodata value there is nothing to mask; skip the
    # comparison instead of depending on how ``arr == None`` happens to behave.
    if nodata is not None:
        arr[arr == nodata] = np.nan

    return {
        "range_min": np.nanmin(arr),
        "range_max": np.nanmax(arr),
        "nodata": nodata,
    }
    

Define a function to update the metadata:

In [5]:
def update_metadata(md, file):
    """Append one populated ``<detailed>`` entity/attribute entry to ``md`` for ``file``.

    The file name is split on ``.`` and ``_`` and the last piece (the
    extension) is dropped. Every remaining piece is looked up in
    ``fn_attr_lu`` to build the entity description, and the first piece is
    looked up in ``rdom_lu`` for the units and measurement resolution. The
    nodata value and value range come from ``get_eainfo(file)``.

    ``md`` is modified in place, so pass a copy if the original record must be
    kept. Every call appends another entry; calling it twice for the same file
    produces two entries.

    Parameters
    ----------
    md : XMLRecord
        Metadata record whose ``metadata.eainfo`` node receives the new entry.
    file : pathlib.Path
        Path of the raster data file to describe.

    Raises
    ------
    KeyError
        If a file-name piece is missing from ``fn_attr_lu`` or the first piece
        is missing from ``rdom_lu``.
    IndexError
        If the file name yields no pieces once the extension is dropped.
    """
    fn = file.name
    # Raw string: "\." inside a normal string literal is an invalid escape sequence.
    attrs = re.split(r"\.|_", fn)[:-1]  # drop ext

    # Resolve every lookup and read the data file BEFORE touching md, so a
    # failure cannot leave a half-populated <detailed> node in the record.
    unknown = [piece for piece in attrs if piece not in fn_attr_lu]
    if unknown:
        raise KeyError(
            "{}: no description in fn_attr_lu for {}".format(fn, unknown)
        )
    variable = attrs[0]
    if variable not in rdom_lu:
        raise KeyError(
            "{}: no units/resolution in rdom_lu for {!r}".format(fn, variable)
        )
    description = ", ".join(fn_attr_lu[piece] for piece in attrs)
    value_domain = rdom_lu[variable]
    file_info = get_eainfo(file)

    # create a new entry, with two children: enttyp, attr
    ea_detail_template = (
        "<detailed>\n<enttyp>\n<enttypl></enttypl>\n<enttypd></enttypd>\n<enttypds></enttypds>\n</enttyp>\n"
        "<attr>\n<attrlabl></attrlabl>\n<attrdef></attrdef>\n<attrdefs></attrdefs>\n<attrdomv>\n<edom>\n<edomv></edomv>\n<edomvd></edomvd>\n<edomvds></edomvds>\n</edom>\n</attrdomv>\n"
        "<attrdomv>\n<rdom>\n<rdommin></rdommin>\n<rdommax></rdommax>\n<attrunit></attrunit>\n<attrmres></attrmres>\n</rdom>\n</attrdomv>\n</attr>\n</detailed>"
    )
    eainfo_node = md.metadata.eainfo
    eainfo_node.add_child(XMLNode(ea_detail_template))
    # new child is appended, so fetch it back from the end of the children list
    ea_detail = eainfo_node.children[-1]

    # populate the entity type fields
    enttyp = ea_detail.enttyp
    enttyp.enttypl.text = fn
    enttyp.enttypd.text = description
    enttyp.enttypds.text = "Producer Defined"

    # populate the attribute fields
    attr_node = ea_detail.attr
    attr_node.attrlabl.text = "Value"
    attr_node.attrdef.text = "Unique numeric values contained in each raster cell."
    attr_node.attrdefs.text = "Producer Defined"

    # the first <attrdomv> in the template holds the enumerated domain (nodata value)
    edom_attr = ea_detail.attr.attrdomv[0]
    edom_attr.edom.edomv.text = str(file_info["nodata"])
    # NOTE(review): the description is the literal "-Inf" for every file,
    # whatever the actual nodata value is - confirm this is intended.
    edom_attr.edom.edomvd.text = "-Inf"
    edom_attr.edom.edomvds.text = "Producer Defined"

    # the second <attrdomv> in the template holds the range domain
    rdom_attr = ea_detail.attr.attrdomv[1]
    rdom_attr.rdom.rdommin.text = str(file_info["range_min"])
    rdom_attr.rdom.rdommax.text = str(file_info["range_max"])
    # units / resolution are keyed on the first file-name piece only
    rdom_attr.rdom.attrunit.text = value_domain["units"]
    rdom_attr.rdom.attrmres.text = value_domain["resolution"]


Define two lookup tables: one mapping every possible file-name attribute to the corresponding text that should be included in a description, and one giving the units and measurement resolution for each variable:

In [6]:
# Maps each piece of a data file name (the name split on "." and "_") to the
# text used for that piece in the entity description.
fn_attr_lu = {
    # what is measured
    "dpr": "Change in total precipitation",
    "dtas": "Change in surface air temperature",
    "dswe": "Change in total snow water equivalent",
    "swe": "Snow water equivalent",
    "SFEtoP": "Ratio of total snowfall water equivalent to total precipitation",
    "monr70": "70% threshold for snowfall water equivalent / precipitation ratio",
    "dfs": "Change in months of reliable snow",
    "dfpcen": "Change in fires per century",
    "dvgcen": "Change in vegetation types per century",
    "NuPiBA0t5": "New white spruce colonization (basal area 0 to 5 m^2/ha)",
    "NuPiBA5t12": "New white spruce colonization (basal area 5 to 12 m^2/ha)",
    "dDEF": "Change in total water balance deficit, PET - AET",
    "dPET": "Change in total potential evapotranspiration",
    "dq": "Change in total runoff",
    # season or month
    "ANN": "Annual",
    "MAM": "Spring - March, April, May",
    "JJA": "Summer - June, July, August",
    "SON": "Autumn - September, October, November",
    "DJF": "Winter - December, January, February",
    "ONDJFM": "Snow season - October through March",
    "AMJJAS": "Growing season - April through September",
    "APR": "April",
    "FEB": "February",
    "MAR": "March",
    "MAY": "May",
    # climate model, time period and scenario
    "5mm": "5 GCM mean",
    "ccsm4": "NCARCCSM4",
    "cgcm3": "MRICGCM3",
    "hist": "historical mean",
    "2050s": "2040-2069",
    "2080s": "2070-2099",
    "1980s": "1970-1999",
    "2050": "2050-2059",
    "2100": "End of 2000-2100 century model run",
    "r85": "RCP 8.5",
    "rcp85": "RCP 8.5",
    "r45": "RCP 4.5",
    "2C": "+2C global pseudo global warming composite ensemble, similar to RCP 4.5",
    "4C": "+4C global pseudo global warming composite ensemble, similar to RCP 8.5",
}

# Units and measurement resolution per variable, keyed by the first piece of
# a data file name. Each row is (variable, units, resolution).
rdom_lu = {
    variable: {"units": units, "resolution": resolution}
    for variable, units, resolution in (
        ("dfs", "months", "1"),
        ("dtas", "C", "1"),
        ("dpr", "percent / 100", "0.001"),
        ("dswe", "percent", "1"),
        ("swe", "mm", "1"),
        ("SFEtoP", "percent", "1"),
        ("dfpcen", "fires", "1"),
        ("dvgcen", "vegetation types", "1"),
        ("dDEF", "mm", "1"),
        ("dPET", "mm", "1"),
        ("dq", "mm", "1"),
        ("NuPiBA0t5", "percent", "1"),
        ("NuPiBA5t12", "percent", "1"),
    )
}

Now we will iterate and update the record copy.

Looks like the original metadata has some empty entries, just going to wipe the EA section:

In [7]:
# Remove all existing children of <eainfo> on the working copy, so the entries
# generated below are added to an empty section.
new_md.metadata.eainfo.clear_children()
# Display the node to confirm it is now empty.
new_md.metadata.eainfo

<eainfo>
</eainfo>

Then iterate over the files and update:

In [8]:
# Append one <detailed> entry per data file to the working copy.
# update_metadata() always appends, so re-running this cell without first
# re-running the clear_children() cell adds a second copy of every entry.
# The loop variable is deliberately not called `fp`, which already holds the
# path of the metadata XML file.
for data_fp in fps:
    update_metadata(new_md, data_fp)

Check out some sample entries:

In [9]:
# First generated entry; which file this is depends on the order of `fps`.
new_md.metadata.eainfo.children[0]

<detailed>
  <enttyp>
    <enttypl>dtas_SON_5mm.2050s.r85.tif</enttypl>
    <enttypd>Change in surface air temperature, Autumn - September, October, November, 5 GCM mean, 2040-2069, RCP 8.5</enttypd>
    <enttypds>Producer Defined</enttypds>
  </enttyp>
  <attr>
    <attrlabl>Value</attrlabl>
    <attrdef>Unique numeric values contained in each raster cell.</attrdef>
    <attrdefs>Producer Defined</attrdefs>
    <attrdomv>
      <edom>
        <edomv>-3.3999999521443642e+38</edomv>
        <edomvd>-Inf</edomvd>
        <edomvds>Producer Defined</edomvds>
      </edom>
    </attrdomv>
    <attrdomv>
      <rdom>
        <rdommin>2.4666664600372314</rdommin>
        <rdommax>7.933333873748779</rdommax>
        <attrunit>C</attrunit>
        <attrmres>1</attrmres>
      </rdom>
    </attrdomv>
  </attr>
</detailed>

In [10]:
# An arbitrary later entry (index 50); which file this is depends on the order of `fps`.
new_md.metadata.eainfo.children[50]

<detailed>
  <enttyp>
    <enttypl>NuPiBA0t5.cgcm3.2050.r85.tif</enttypl>
    <enttypd>New white spruce colonization (basal area 0 to 5 m^2/ha), MRICGCM3, 2050-2059, RCP 8.5</enttypd>
    <enttypds>Producer Defined</enttypds>
  </enttyp>
  <attr>
    <attrlabl>Value</attrlabl>
    <attrdef>Unique numeric values contained in each raster cell.</attrdef>
    <attrdefs>Producer Defined</attrdefs>
    <attrdomv>
      <edom>
        <edomv>255.0</edomv>
        <edomvd>-Inf</edomvd>
        <edomvds>Producer Defined</edomvds>
      </edom>
    </attrdomv>
    <attrdomv>
      <rdom>
        <rdommin>0.0</rdommin>
        <rdommax>1.0</rdommax>
        <attrunit>percent</attrunit>
        <attrmres>1</attrmres>
      </rdom>
    </attrdomv>
  </attr>
</detailed>

Save the new metadata:

In [11]:
# Save the edited copy under a different file name than the record that was
# loaded ("Littell_AKclimate_19Apr23.xml").
new_md.save(fname="Littell_AKclimate_19Apr28.xml")

Ways this process can be improved:

Function:
* Update the spatial information based on the files
* allow mappings for units/resolution text

error handling:
* warning if not all lookups were used
* warning / error if not all files have the same spatial information
