# Convert atomic data

This notebook demonstrates how a specific atomic data file is converted from an old continuum TARDIS version to modern TARDIS so the two can be compared directly without re-generating atomic data for a deprecated and unsupported atom data format.

In [1]:
import hashlib
import pickle
import platform
import uuid
from datetime import datetime
from pathlib import Path

import h5py
import numpy as np
import pandas as pd
import pytz

## Utility functions



The basic procedure here is to convert a structured array into a `pandas` dataframe matching the structure of our template atomic data.

To accomplish this, the structures we need to create from the old data format are a `pd.Index` object to organize the rows, columns with datatypes matching the template, and the table data itself.

Most of these table conversions can be reduced to two basic operations - porting a table that has a multi-index, and porting a table that has a simple index.

### Multi-Index Port

Tables that have a multi-index in the new format are stored in the old format without the formal `pd.MultiIndex` structure, but they contain all the same index values as ordinary columns. These are fairly straightforward to convert.

The only trick is making sure the dtypes of the ported data columns match the format of the template atom data. After the `pd.DataFrame` is created from the old data, we loop through each column and set its dtype to be equal to that of the template.

The same procedure is applied to each level of the `pd.MultiIndex`, as these are converted to floats by default, but we typically want `int` for atomic number, levels, etc.

In [2]:
def multiindex_port(olddata, templatedata, templatekey, oldkey=None):
    """
    Convert a structured-array dataset into a pandas.DataFrame that matches the
    index names and dtypes of a provided template.

    Parameters
    ----------
    olddata : mapping (e.g. h5py.File)
        Container holding the structured array(s) to be converted. The
        structured array is read as ``olddata[oldkey][:]``.
    templatedata : mapping (e.g. pandas.HDFStore)
        Container providing a template pandas object (DataFrame or Series) whose
        index names and column/element dtypes should be matched. The template is
        read as ``templatedata[templatekey]``.
    templatekey : hashable
        Key to select the template object from templatedata.
    oldkey : hashable, optional
        Key to select the old structured array from olddata. If None, templatekey
        is used.

    Returns
    -------
    pandas.DataFrame
        A DataFrame constructed from the old structured array with its index set to
        the template's index names, column dtypes coerced to the template dtypes,
        and index (or index-level) dtypes converted to match the template.

    Raises
    ------
    KeyError
        If templatekey or oldkey is not present in the provided containers, or if
        a template index/column name is missing from the old data.
    TypeError, ValueError
        If coercion to the template dtypes or index conversions fail.
    """
    if oldkey is None:
        oldkey = templatekey

    # Look the template up once: for an on-disk store every lookup re-reads
    # the whole table.
    template = templatedata[templatekey]

    # Get the format of the (multi-)index from the desired template
    index_names = list(template.index.names)

    # Attempt conversion of the old structured array to a pd DataFrame
    newdata = pd.DataFrame(olddata[oldkey][:]).set_index(index_names)

    # Check datatypes of columns, convert if necessary
    if hasattr(template, "columns"):
        # Handle multiple columns
        for col in template.columns:
            desired_dtype = template[col].dtype
            if desired_dtype == np.dtype("object"):
                desired_dtype = str
            newdata[col] = newdata[col].astype(desired_dtype)
    else:
        # Handle single columns (template is a Series) - in this case, we
        # don't need to loop over multiple column names or data types
        col = [template.name]
        newdata[col] = newdata[col].astype(template.dtype)

    # Convert the index dtypes. pd.MultiIndex is a subclass of pd.Index, so the
    # MultiIndex case must be tested first or it would never be reached.
    if isinstance(newdata.index, pd.MultiIndex):
        # Handle multi-index - need to loop over each index level
        template_level_dtypes = template.index.dtypes
        for i, indname in enumerate(newdata.index.names):
            template_ind_dtype = template_level_dtypes[indname]
            converted_index = newdata.index.levels[i].astype(template_ind_dtype)
            newdata.index = newdata.index.set_levels(converted_index, level=i)
    else:
        # Handle single index object
        newdata.index = newdata.index.astype(template.index.dtype)
    return newdata

### Simple Port

For tables that only have a single index, the conversion is much simpler. We just need to create the `pd.DataFrame` from the old structured array and then ensure the dtypes of the columns and (single) index match up.

In [3]:
def simple_port(olddata, templatedata, templatekey, oldkey=None):
    """
    Build a pandas.DataFrame from an old structured array and cast its columns
    to the dtypes used by the corresponding template table.

    Parameters
    ----------
    olddata : mapping (e.g. h5py.File)
        Container holding the old structured array(s); the array is read as
        ``olddata[oldkey][:]``.
    templatedata : mapping (e.g. pandas.HDFStore)
        Container holding the template DataFrame whose column dtypes are copied.
    templatekey : hashable
        Key of the template table inside templatedata.
    oldkey : hashable, optional
        Key of the structured array inside olddata. Defaults to templatekey.

    Returns
    -------
    pandas.DataFrame
        The old data as a DataFrame (default integer index) in which every
        column named by the template has the template's dtype. Template columns
        of object dtype are cast to ``str``.

    Raises
    ------
    KeyError
        If a key is missing from either container, or a template column is
        missing from the old data.
    TypeError, ValueError
        If a column cannot be cast to the template dtype.
    """
    source_key = templatekey if oldkey is None else oldkey

    ported = pd.DataFrame(olddata[source_key][:])
    template_table = templatedata[templatekey]

    # Cast column by column so each one follows its own template dtype
    for column_name in template_table.columns:
        template_dtype = template_table[column_name].dtype
        target_dtype = str if template_dtype == np.dtype("object") else template_dtype
        ported[column_name] = ported[column_name].astype(target_dtype)

    return ported

### Checksum Utility Functions

These are included directly from the `carsus` utilities for writing checksum metadata for an HDF5 file.

In [4]:
def serialize_pandas_object(pd_object):
    """
    Turn a Pandas object into its Pickle byte representation.

    Parameters
    ----------
    pd_object : pandas.Series or pandas.DataFrame
        Object to serialize.

    Returns
    -------
    bytes
        Output of ``pickle.dumps`` for `pd_object`.
    """
    serialized = pickle.dumps(pd_object)
    return serialized


def hash_pandas_object(pd_object, algorithm="md5"):
    """
    Hash Pandas objects.

    The object is serialized with `serialize_pandas_object` and the resulting
    bytes are hashed.

    Parameters
    ----------
    pd_object : pandas.Series or pandas.DataFrame
        Pandas object to be hashed.
    algorithm : str, optional
        Name of an algorithm available in `hashlib` (case-insensitive), by
        default "md5"

    Returns
    -------
    str
        Hexadecimal digest of the serialized object.

    Raises
    ------
    ValueError
        If `algorithm` is not available in `hashlib`.
    """
    algorithm = algorithm.lower()

    # Validate against the real algorithm list. A plain hasattr() check would
    # also accept hashlib attributes that are not hash constructors (e.g.
    # "new" or "algorithms_available") and fail later with a TypeError.
    if algorithm not in hashlib.algorithms_available:
        raise ValueError('algorithm not supported')

    serialized = serialize_pandas_object(pd_object)
    return hashlib.new(algorithm, serialized).hexdigest()

## Converting the atomic data

The utility functions above are included for completeness. For the rest of this notebook, I'll import these functions directly from the Python script in `./tardis/scripts/convert_atomic_data.py`.

In [5]:
from tardis.scripts import convert_atomic_data

### Loading the files

We need to use three existing atomic data files to produce our new file in the modern TARDIS format:

1. Old atomic data
2. Old photoionization data
3. A template to use as a reference for what our desired format should be

**Be sure to change the file paths below to reflect the directory structure of your system**. By default, I've selected an NLTE atom data file to use as a template for conversion.

In [6]:
# Input/output locations used by the rest of the notebook.
# The first two and the last are relative paths, so they are resolved against
# the directory the notebook kernel runs in.
oldatomdata_filename = "merged_mod_20SNG_forbidden_yg_fix_H30_cmfgen_yg.h5"
oldpidata_filename = "photoionization_data_H30_He.h5"
# NOTE(review): absolute, machine-specific path - must be edited before running
# on another system.
template_filename = "/home/connor/tardis-regression-data/atom_data/nlte_atom_data/TestNLTE_He_Ti.h5"
# Destination file; the next cell refuses to run if it already exists.
new_filename = "converted_atom_data.h5"

In [7]:
# Open HDFStore file objects for each of the paths defined above.
# By default, HDFStore will *create* the file if it doesn't exist,
# which we don't want for the old files we're trying to convert.
# All paths are validated before anything is opened, so a failed check does
# not leave earlier stores open.
for input_filename in (oldatomdata_filename, oldpidata_filename, template_filename):
    if not Path(input_filename).is_file():
        raise FileNotFoundError(f"Input file {input_filename} does not exist.")

if Path(new_filename).is_file():
    raise FileExistsError(f"Destination file {new_filename} already exists. Delete it or specify a different destination.")

# The three inputs are only ever read, so open them read-only to rule out
# accidental modification.
old_df = pd.HDFStore(oldatomdata_filename, mode="r")
pi_data = pd.HDFStore(oldpidata_filename, mode="r")
template = pd.HDFStore(template_filename, mode="r")

# The destination does not exist yet (checked above), so this creates it.
new = pd.HDFStore(new_filename)

### Collisions data

Let's take a look at the structure of our data to see what we're working with.

In [8]:
# List the keys pandas can see in the old atomic data store.
old_df.keys()

['/Y/g']

There's certainly more data in this file, but the only dataframe `pandas` can access from this format is `/Y/g`, which contains the collision data. We can extract and convert this first and then re-open the same file with `h5py` directly to get the rest of the information we need.

In [9]:
# List the keys stored in the template file; these are the target layout.
template.keys()

['/atom_data',
 '/collisions_data',
 '/collisions_metadata',
 '/ionization_data',
 '/levels_data',
 '/lines_data',
 '/lines_metadata',
 '/macro_atom_data',
 '/macro_atom_references',
 '/metadata',
 '/photoionization_data',
 '/zeta_data']

The keys above represent the format into which we want to convert the old atom data. We'll start with the collisions data and then proceed through this list.

Here we convert the first four columns of the collisions data to a `pd.MultiIndex` to match the template data structure and rename the columns to the index of the temperature bin each column represents.

(In the old atomic data format, the temperatures themselves are used as the column headers. In the new format, the *index* of these temperatures is used as the column header and the temperatures themselves are stored in the `collisions_metadata` field.)

In [10]:
### COLLISIONS DATA
multiindex_cols = list(old_df["/Y/g"].columns[:4])
new["collisions_data"] = old_df["/Y/g"].set_index(multiindex_cols)
tempcols = list(new['collisions_data'].columns)
new['collisions_data'] = new['collisions_data'].rename(lambda f: tempcols.index(f), axis=1)

### Re-load Old Atom Data with h5py



We've gotten all useful info out of the old dataframe using `pandas`, so now load the same file with h5py directly to get the rest.

In [11]:
# Release the pandas handle before re-opening the same file with h5py.
old_df.close()
# Open read-only explicitly: the file is only read from here on, and the
# default mode has differed between h5py releases.
old = h5py.File(oldatomdata_filename, "r")

In [12]:
# List the top-level members h5py exposes in the old atomic data file.
old.keys()

<KeysViewHDF5 ['Y', 'basic_atom_data', 'ionization_data', 'levels_data', 'lines_data', 'macro_atom_data', 'macro_atom_references', 'synpp_refs', 'zeta_data']>

Now we can see the other fields that will populate our new atom data file---we just have to convert them to the same data structures and key names as the template.

### Collisions Metadata

This particular atomic data format stores the temperatures for the collisions data table as a separate array stored in a "metadata" field. The `AtomData` loader class in TARDIS checks for this field when loading data written in this format.

In [13]:
# Column labels captured from the old collisions table before they were
# renamed to integer positions.
print(tempcols)

['2000', '5000', '10000', '20000', '50000', '100000', '200000', '500000', '1000000']


These are the temperatures associated with the `collisions_data` we loaded in previously - let's insert these values in the existing metadata structure of the template atom data file.

In [14]:
# Convert the saved column labels to 64-bit integers, place them in a copy of
# the template's collisions_metadata under "temperatures", and store the result.
temperatures = np.array(tempcols, dtype=np.int64)

new_metadata = template["collisions_metadata"].copy()
new_metadata["temperatures"] = temperatures

new["collisions_metadata"] = new_metadata

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->values] [items->None]

  new["collisions_metadata"] = new_metadata


### Atom Data

In [15]:
# The old file stores this table under "basic_atom_data" while the template
# calls it "atom_data", hence the explicit oldkey.
new_atom_data = convert_atomic_data.multiindex_port(
    old, template, "atom_data", oldkey="basic_atom_data"
)
new["atom_data"] = new_atom_data

### Ionization Data

In [16]:
# Same key in the old file and the template, so no oldkey override is needed.
new["ionization_data"] = convert_atomic_data.multiindex_port(
    old, template, "ionization_data"
)

### Levels Data

In [17]:
# Port "levels_data" using the template's index names and dtypes.
new["levels_data"] = convert_atomic_data.multiindex_port(
    old, template, "levels_data"
)

### Lines Data

In [18]:
# Port "lines_data" using the template's index names and dtypes.
new["lines_data"] = convert_atomic_data.multiindex_port(
    old, template, "lines_data"
)

### Lines Metadata

In [19]:
# Inspect the template's lines_metadata table.
print(template["lines_metadata"])

               value
field  key          
format version   1.0


This is a super simple field that doesn't exist in the original format, so we can just copy it over for consistency in case a particular atom data reader needs it.

In [20]:
# Copy the template's lines_metadata table unchanged into the new store.
new["lines_metadata"] = template["lines_metadata"].copy()

### Macroatom Data

In [21]:
# This table is ported with simple_port rather than multiindex_port, i.e.
# only the column dtypes are matched to the template.
new["macro_atom_data"] = convert_atomic_data.simple_port(old, template, "macro_atom_data")

### Macroatom References

In [22]:
# Port "macro_atom_references" using the template's index names and dtypes.
new["macro_atom_references"] = convert_atomic_data.multiindex_port(
    old, template, "macro_atom_references"
)

### Photoionization Data

Note that this data is stored in an entirely different file - in this case, it is already in the correct format for our new atomic data file, so we can just copy it into our new dataframe.

In [23]:
# Inspect the photoionization table read from the separate photoionization file.
print(pi_data['photoionization_data'])

                                                 nu        x_sect
atomic_number ion_number level_number                            
1             0          0             3.288087e+15  6.307684e-18
                         0             3.333138e+15  6.081111e-18
                         0             3.378806e+15  5.862911e-18
                         0             3.425100e+15  5.652766e-18
                         0             3.472028e+15  5.450373e-18
...                                             ...           ...
2             0          50            1.168172e+14  5.755967e-20
                         50            1.184178e+14  5.527215e-20
                         50            1.200402e+14  5.307454e-20
                         50            1.216849e+14  5.096434e-20
                         50            1.233522e+14  4.893821e-20

[16200 rows x 2 columns]


In [24]:
# Copy the photoionization table as-is from the photoionization file into the
# new store.
new["photoionization_data"] = pi_data["photoionization_data"].copy()

### Zeta Data

In [25]:
# Read the whole zeta_data dataset from the old file and show it.
print(old['zeta_data'][:])

[[ 1.      1.      0.339  ...  0.5342  0.5373  0.5401]
 [ 1.      2.      0.     ...  0.      0.      0.    ]
 [ 2.      1.      0.4012 ...  0.679   0.6835  0.6878]
 ...
 [28.     26.      0.0859 ...  0.1413  0.1417  0.1421]
 [28.     27.      0.0645 ...  0.1095  0.1098  0.1101]
 [28.     28.      0.1149 ...  0.1938  0.195   0.1963]]


This isn't formatted as a structured array, so the `multiindex_port` function won't be able to grab specific columns by name. We could add this functionality to the porting function, but this is the only field for which this is a problem. We'll just handle this one by hand.

In [26]:
# zeta_data is a plain 2-D array: the first two columns become the row
# MultiIndex (cast to int64) and the remaining columns are the table values.
zeta_dataset = old["zeta_data"]
zeta_values = zeta_dataset[:]

zeta_index_arrays = zeta_values[:, :2].T.astype(np.int64)
zeta_index = pd.MultiIndex.from_arrays(
    zeta_index_arrays, names=template["zeta_data"].index.names
)

# Column labels come from the dataset's "t_rad" attribute.
zeta_temps = pd.Index(zeta_dataset.attrs["t_rad"].astype(np.float64), name="temp")

new_zeta_data = pd.DataFrame(
    zeta_values[:, 2:], index=zeta_index, columns=zeta_temps
)
new["zeta_data"] = new_zeta_data

### Metadata

In [27]:
# Copied over from Andrew's notebook demonstrating how to do this
meta = []
meta.append(("format", "version", "1.0"))

total_checksum = hashlib.md5()
for key in new.keys():
   # update the total checksum to sign the file
   total_checksum.update(convert_atomic_data.serialize_pandas_object(new[key]))

   # save individual DataFrame/Series checksum
   checksum = convert_atomic_data.hash_pandas_object(new[key])
   meta.append(("md5sum", key.lstrip("/"), checksum))

# relevant package versions
meta.append(("software", "python", platform.python_version()))
imports = [
   "carsus",
   "astropy",
   "numpy",
   "pandas",
   "tables",
   "ChiantiPy",
]
for package in imports:
   meta.append(("software", package, __import__(package).__version__))
meta_df = pd.DataFrame.from_records(
   meta, columns=["field", "key", "value"], index=["field", "key"]
)
uuid1 = uuid.uuid1().hex
new.root._v_attrs["MD5"] = total_checksum.hexdigest()
new.root._v_attrs["UUID1"] = uuid1
new.root._v_attrs["FORMAT_VERSION"] = "1.0"
tz = pytz.timezone("UTC")
date = datetime.now(tz).isoformat()
new.root._v_attrs["DATE"] = date

 ChiantiPy version 0.15.2 


### Write out / Close files

In [28]:
# Close every open file handle; closing the new store finishes writing the
# converted file.
for open_handle in (old, template, pi_data, new):
    open_handle.close()