In [None]:
import os

import numpy as np
import pandas
import matplotlib.pyplot as plt
import h5py
import astropy.constants as const
import astropy.units as u
import astropy.table
import ChiantiPy.tools.util as ch_util
import ChiantiPy.tools.io as ch_io
import ChiantiPy.core as ch

%matplotlib inline

In [None]:
# Root directory of the CHIANTI database, taken from the XUVTOP environment
# variable; this lookup raises KeyError if the variable is not set.
ch_dbase = os.environ['XUVTOP']

# Parse Raw CHIANTI Data
In this notebook, we'll develop some tools for parsing the raw ASCII data in the CHIANTI atomic database and transforming it into pandas dataframes. It is important to include a way to very easily parse the actual raw data in the ChiantiPy package.

Then, the next step is to transform these dataframes into one large HDF5 file. ChiantiPy will then use this file like a database to stream the needed atomic data from. This provides a sleeker interface to the data itself and can be much more efficient than the current system.

## Notes on Database Structure
Other directories/files that are **not** ions:
* __abundance__
* ancillary_data
* continuum
* dem
* __ioneq__
* __ip__
* masterlist
* VERSION (file)

The items in bold can be parsed appropriately and attached to the ion data objects.

The filetypes for each ion are:
* ~~**.elvlc**~~
* ~~**.wgfa**~~
* ~~.scups~~
* .psplups
* .rrparams
* .trparams
* .diparams
* .drparams
* ~~**.easplom**~~
* ~~**.easplups**~~
* ~~**.fblvl**~~
* .cilvl
* .reclvl

The items in bold have a basic row-column structure and can be easily read using just a list of column names and datatypes.

Those not bolded will be a bit more complicated.

One possible idea would be to force everything into the basic row-column format. This could be done by providing preprocessors for specific filetypes and then a general file reader. This would help the code be less verbose and more maintainable.

For those entries which are arrays, the preprocessor could take those rows, turn them into comma-separated lists, make them an item in the row (i.e. a single entry) and then give them a custom datatype that can later be used to convert the entry into a NumPy array.

Goal of the preprocessor should be to force every distinct entry into a single space-delimited row where each entry is unique.

## Read and Parse Raw ASCII Data
Each CHIANTI filetype seems to have a completely different layout. We do not want the user to have to worry about this. We've implemented a basic factory pattern that creates a different parser based on the input filetype.

In [None]:
class ParserFactory(type):
    """
    Metaclass that turns a class into a factory for filetype-specific parsers.

    Calling a class that uses this metaclass inspects the extension of the
    ion filename and returns an instance of the matching parser class.
    Unrecognized extensions fall back to ordinary instantiation of the class
    that was called.
    """

    def __call__(cls, *args, **kwargs):
        """
        Build the parser matching the extension of the ion filename.

        The filename is read from the first positional argument or, when no
        positional arguments are given, from the ``ion_filename`` keyword.
        All arguments are forwarded unchanged to the selected constructor.
        """
        if args:
            ion_filename = args[0]
        elif 'ion_filename' in kwargs:
            ion_filename = kwargs['ion_filename']
        else:
            # No filename supplied: defer to the normal constructor so the
            # caller gets the usual "missing argument" TypeError.
            return type.__call__(cls, *args, **kwargs)

        filetype = ion_filename.split('.')[-1]
        # The parser classes are looked up by name only when their branch is
        # taken, so they may be defined after this metaclass.
        if filetype == 'elvlc':
            return ElvlcParser(*args, **kwargs)
        elif filetype == 'wgfa':
            return WgfaParser(*args, **kwargs)
        elif filetype == 'fblvl':
            return FblvlParser(*args, **kwargs)
        elif filetype == 'scups':
            return ScupsParser(*args, **kwargs)
        elif filetype == 'easplom':
            return EasplomParser(*args, **kwargs)
        elif filetype == 'easplups':
            return EasplupsParser(*args, **kwargs)
        elif filetype == 'psplups':
            return PsplupsParser(*args, **kwargs)
        else:
            return type.__call__(cls, *args, **kwargs)

        
class GenericParser(object):
    """
    Base parser for CHIANTI ion files with a row-column layout.

    Subclasses supply three parallel class attributes with one entry per
    table column: ``headings`` (column names), ``units`` (a unit or None)
    and ``dtypes`` (the type each column is cast to). Subclasses may also
    override `preprocessor` to reshape raw lines into table rows.
    """

    def __init__(self, ion_filename):
        """
        Store the filename and derive the filetype, ion name and element.

        For ``'fe_6.scups'`` this gives filetype ``'scups'``, ion name
        ``'fe_6'`` and element ``'fe'``.
        """
        self.ion_filename = ion_filename
        self.filetype = self.ion_filename.split('.')[-1]
        self.ion_name = self.ion_filename.split('.')[0]
        self.element = self.ion_name.split('_')[0]

    def parse(self):
        """
        Generate Astropy QTable from a CHIANTI ion file

        The file is read from ``<ch_dbase>/<element>/<ion_name>/<ion_filename>``.
        Lines are split on double spaces and handed to `preprocessor` until a
        line whose first field is ``-1`` is found; everything after that line
        is stored as ``meta['footer']``. If no such line exists the footer is
        an empty string. Blank lines are skipped.
        """
        path = os.path.join(ch_dbase, self.element, self.ion_name, self.ion_filename)
        with open(path, 'r') as f:
            lines = f.readlines()
        table = []
        # Default footer so a file without the '-1' terminator still parses.
        comment = ''
        for i, line in enumerate(lines):
            line = list(filter(None, line.strip().split('  ')))
            if not line:
                # Blank line: nothing to parse. The file line index passed to
                # the preprocessor is unaffected by skipping it.
                continue
            if line[0] == '-1':
                comment = ''.join(lines[i+1:])
                break
            self.preprocessor(table, line, i)

        # Transpose the list of rows into a list of columns.
        df = astropy.table.QTable(data=list(map(list, zip(*table))), names=self.headings)
        for name, unit, dtype in zip(self.headings, self.units, self.dtypes):
            df[name].unit = unit
            df[name] = df[name].astype(dtype)

        df.meta['footer'] = comment
        df.meta['element'] = self.element
        df.meta['ion'] = self.ion_name
        return df

    def preprocessor(self, table, line, index):
        """
        Default preprocessor method

        Appends the split ``line`` to ``table`` as one row. ``index`` is the
        line's position in the file and is unused here.
        """
        table.append(line)
        

class Parser(GenericParser, metaclass=ParserFactory):
    """
    Single entry point for parsing an ion file.

    Because of the ``ParserFactory`` metaclass, ``Parser(filename)`` returns
    the parser registered for the filename's extension, or a plain ``Parser``
    when the extension is not recognized.
    """

    def __init__(self, ion_filename):
        super(Parser, self).__init__(ion_filename)

### Preprocessors
Preprocessors are essentially middleware that know the details of the specific filetype we're parsing. This makes it so that there is one common interface to parsing all files and the middleware is selected based on the supplied extension. These are instantiated by the factory and should not be instantiated manually.

In [None]:
class ElvlcParser(GenericParser):
    """
    Column layout for `.elvlc` files.

    The three lists are parallel, with one entry per table column. No
    preprocessor is overridden, so the inherited one is used.
    """
    headings = [
        'level index',
        'configuration',
        'multiplicity',
        'orbital angular momentum',
        'total angular momentum',
        'observed energy',
        'theoretical energy',
    ]
    dtypes = [
        int,    # level index
        str,    # configuration
        int,    # multiplicity
        str,    # orbital angular momentum
        float,  # total angular momentum
        float,  # observed energy
        float,  # theoretical energy
    ]
    units = [
        None,                      # level index
        None,                      # configuration
        None,                      # multiplicity
        None,                      # orbital angular momentum
        u.dimensionless_unscaled,  # total angular momentum
        1/u.cm,                    # observed energy
        1/u.cm,                    # theoretical energy
    ]

    
class FblvlParser(GenericParser):
    """
    Column layout for `.fblvl` files.

    The three lists are parallel, with one entry per table column. No
    preprocessor is overridden, so the inherited one is used.
    """
    headings = [
        'level index',
        'configuration',
        'principal quantum number',
        'azimuthal quantum number',
        'orbital angular momentum',
        'multiplicity',
        'observed energy',
        'theoretical energy',
    ]
    dtypes = [
        int,    # level index
        str,    # configuration
        int,    # principal quantum number
        int,    # azimuthal quantum number
        str,    # orbital angular momentum
        int,    # multiplicity
        float,  # observed energy
        float,  # theoretical energy
    ]
    units = [
        None,    # level index
        None,    # configuration
        None,    # principal quantum number
        None,    # azimuthal quantum number
        None,    # orbital angular momentum
        None,    # multiplicity
        1/u.cm,  # observed energy
        1/u.cm,  # theoretical energy
    ]

    
class ScupsParser(GenericParser):
    """
    Column layout and preprocessor for `.scups` files.

    Each table row is assembled from three consecutive file lines: one line
    of scalar values followed by two lines that are stored as float arrays
    in the last two (object-typed) columns.
    """

    headings = [
        'lower level index',
        'upper level index',
        'delta energy',
        'oscillator strength',
        'high-temperature limit',
        'number of scaled temperatures',
        'Burgess-Tully scaling type',
        'Burgess-Tully scaling parameter',
        'Burgess-Tully scaled temperatures',
        'Burgess-Tully scaled effective collision strengths',
    ]
    dtypes = [
        int,       # lower level index
        int,       # upper level index
        float,     # delta energy
        float,     # oscillator strength
        float,     # high-temperature limit
        int,       # number of scaled temperatures
        int,       # Burgess-Tully scaling type
        float,     # Burgess-Tully scaling parameter
        'object',  # Burgess-Tully scaled temperatures
        'object',  # Burgess-Tully scaled effective collision strengths
    ]
    units = [
        None,                      # lower level index
        None,                      # upper level index
        u.Ry,                      # delta energy
        u.dimensionless_unscaled,  # oscillator strength
        1/u.Ry,                    # high-temperature limit
        None,                      # number of scaled temperatures
        None,                      # Burgess-Tully scaling type
        u.dimensionless_unscaled,  # Burgess-Tully scaling parameter
        u.dimensionless_unscaled,  # Burgess-Tully scaled temperatures
        u.dimensionless_unscaled,  # Burgess-Tully scaled effective collision strengths
    ]

    def preprocessor(self, table, line, index):
        """
        Start a new row on every third file line, otherwise extend the last row.

        Lines whose file index is a multiple of three hold the scalar columns.
        Any other line is converted to a float array and appended to the most
        recently started row.
        """
        starts_new_row = (index % 3 == 0)
        if starts_new_row:
            table.append(line)
            return
        values = np.array(line, dtype=float)
        table[-1].append(values)
            
class PsplupsParser(GenericParser):
    """
    Column layout and preprocessor for `.psplups` files.

    Each file line becomes one table row: six scalar fields followed by all
    remaining values gathered into a single float array.
    """

    dtypes = [int,int,int,float,float,float,float]
    units = [None,None,None,u.dimensionless_unscaled,u.Ry,u.dimensionless_unscaled,
             u.dimensionless_unscaled]
    headings = ['lower level index','upper level index','Burgess-Tully scaling type',
                'oscillator strength','delta energy','Burgess-Tully scaling parameter',
                'Burgess-Tully scaled effective collision strengths']

    def preprocessor(self, table, line, index):
        """
        Split one line into six scalar fields plus an array of the rest.

        Numbers in these files can run together when a value is negative
        (e.g. ``1.500e+00-2.190e-03``). Such tokens are separated before the
        '-' and the sign is kept with the number that follows it. A '-' that
        follows an ``e``/``E`` is an exponent sign and is left alone, as is a
        '-' at the start of a token.

        ``index`` is unused.
        """
        import re  # local import: only this preprocessor needs it

        # Insert a space before every '-' that directly follows a character
        # other than an exponent marker or whitespace.
        text = re.sub(r'(?<=[^eE\s])-', ' -', ' '.join(line))
        tokens = text.split()
        scups = np.array(tokens[6:], dtype=float)
        table.append(tokens[:6] + [scups])
        
            
class EasplomParser(GenericParser):
    """
    Column layout and preprocessor for `.easplom` files.

    The first two whitespace-separated fields of every line are dropped, the
    next six become scalar columns, and the remainder is stored as a float
    array in the last column.
    """

    headings = [
        'lower level index',
        'upper level index',
        'Burgess-Tully scaling type',
        'oscillator strength',
        'delta energy',
        'Burgess-Tully scaling parameter',
        'Burgess-Tully scaled cross-section',
    ]
    dtypes = [
        int,    # lower level index
        int,    # upper level index
        int,    # Burgess-Tully scaling type
        float,  # oscillator strength
        float,  # delta energy
        float,  # Burgess-Tully scaling parameter
        float,  # Burgess-Tully scaled cross-section
    ]
    units = [
        None,                      # lower level index
        None,                      # upper level index
        None,                      # Burgess-Tully scaling type
        u.dimensionless_unscaled,  # oscillator strength
        u.Ry,                      # delta energy
        u.dimensionless_unscaled,  # Burgess-Tully scaling parameter
        u.dimensionless_unscaled,  # Burgess-Tully scaled cross-section
    ]

    def preprocessor(self, table, line, index):
        """
        Re-split the line on any whitespace and build one table row from it.

        ``index`` is unused.
        """
        rejoined = '      '.join(line)
        tokens = [tok for tok in rejoined.split() if tok]
        trailing = np.array(tokens[8:], dtype=float)
        scalars = tokens[2:8]
        table.append(scalars + [trailing])
        
        
class EasplupsParser(EasplomParser):
    """
    Column layout for `.easplups` files.

    Inherits the preprocessor from ``EasplomParser``; only the column
    descriptions are overridden here.
    """
    headings = [
        'lower level index',
        'upper level index',
        'Burgess-Tully scaling type',
        'oscillator strength',
        'delta energy',
        'upsilon coefficient',
        'excitation-autoionization rate coefficients',
    ]
    dtypes = [
        int,    # lower level index
        int,    # upper level index
        int,    # Burgess-Tully scaling type
        float,  # oscillator strength
        float,  # delta energy
        float,  # upsilon coefficient
        float,  # excitation-autoionization rate coefficients
    ]
    units = [
        None,                      # lower level index
        None,                      # upper level index
        None,                      # Burgess-Tully scaling type
        u.dimensionless_unscaled,  # oscillator strength
        u.Ry,                      # delta energy
        u.dimensionless_unscaled,  # upsilon coefficient
        u.dimensionless_unscaled,  # excitation-autoionization rate coefficients
    ]
    
    
class WgfaParser(GenericParser):
    """
    Column layout and preprocessor for `.wgfa` files.

    The last two fields of each split line are level labels (lower, then
    upper). Each label is broken up into configuration, multiplicity,
    orbital angular momentum and total angular momentum columns.
    """

    # Builtin int/float are used rather than the np.int/np.float aliases,
    # which were removed from NumPy.
    dtypes = [int,int,float,float,float,
              str,int,str,float,
              str,int,str,float]
    units = [None,None,u.angstrom,u.dimensionless_unscaled,1/u.s,None,None,None,None,
             None,None,None,None]
    headings = ['lower level index','upper level index',
                'transition wavelength','oscillator strength','radiative decay rate',
                'lower level configuration','lower level multiplicity',
                'lower level orbital angular momentum',
                'lower level total angular momentum',
                'upper level configuration','upper level multiplicity',
                'upper level orbital angular momentum',
                'upper level total angular momentum']

    @staticmethod
    def _parse_level(tokens):
        """
        Turn the whitespace-split tokens of a level label into four columns.

        The last token is read as ``<multiplicity><orbital letter><J>`` with
        the first two parts taken as one character each; ``J`` is either a
        plain number or a fraction such as ``3/2``. All earlier tokens are
        joined into the configuration string.

        Returns ``[configuration, multiplicity, orbital, J]`` with ``J`` as a
        float.
        """
        term = tokens[-1]
        config = ' '.join(tokens[:-1])
        mult = term[0]
        orb = term[1]
        j_text = term[2:]
        if '/' in j_text:
            numerator, denominator = j_text.split('/')
            total = float(numerator) / float(denominator)
        else:
            # Integer J of any number of digits.
            total = float(j_text)
        return [config, mult, orb, total]

    def preprocessor(self, table, line, index):
        """
        Expand the two trailing level labels of ``line`` into eight columns.

        ``index`` is unused.
        """
        lower_tokens = line[-2].strip().split()
        del lower_tokens[-1]  # delete rogue dash that ends the lower-level label
        upper_tokens = line[-1].strip().split()
        lower = self._parse_level(lower_tokens)
        upper = self._parse_level(upper_tokens)
        table.append(line[:-2] + lower + upper)

## Trying it out
Parse some example files for all of the filetypes using our parser factory approach.

 `.scups` Files

In [None]:
# The `.scups` extension makes the factory return a ScupsParser; parse() builds the table.
Parser('fe_6.scups').parse()

 `.psplups` Files

In [None]:
# The `.psplups` extension makes the factory return a PsplupsParser; parse() builds the table.
Parser('ca_17.psplups').parse()

`.elvlc` Files

In [None]:
# The `.elvlc` extension makes the factory return an ElvlcParser; parse() builds the table.
Parser('h_1.elvlc').parse()

 `.easplom` Files

In [None]:
# The `.easplom` extension makes the factory return an EasplomParser; parse() builds the table.
Parser('fe_6.easplom').parse()

 `.easplups` Files

In [None]:
# The `.easplups` extension makes the factory return an EasplupsParser; parse() builds the table.
Parser('fe_6.easplups').parse()

`.fblvl` Files

In [None]:
# The `.fblvl` extension makes the factory return a FblvlParser; parse() builds the table.
Parser('ar_16.fblvl').parse()

`.wgfa` Files

In [None]:
# The `.wgfa` extension makes the factory return a WgfaParser; parse() builds the table.
Parser('he_2.wgfa').parse()

## Check Raw Text
Call `head` on the actual files to make sure we are parsing the right stuff.

In [None]:
%%bash
# Print the first 10 lines of the raw fe_6.scups file for comparison with the parsed table.
head -n 10 $XUVTOP/fe/fe_6/fe_6.scups

In [None]:
%%bash
# Print the first 10 lines of the raw ca_17.psplups file for comparison with the parsed table.
head -n 10 $XUVTOP/ca/ca_17/ca_17.psplups

In [None]:
%%bash
# Print the first 10 lines of the raw h_1.elvlc file for comparison with the parsed table.
head -n 10 $XUVTOP/h/h_1/h_1.elvlc

In [None]:
%%bash
# Print the first 100 lines of the raw fe_6.easplom file for comparison with the parsed table.
head -n 100 $XUVTOP/fe/fe_6/fe_6.easplom

In [None]:
%%bash
# Print the first 10 lines of the raw fe_6.easplups file for comparison with the parsed table.
head -n 10 $XUVTOP/fe/fe_6/fe_6.easplups

In [None]:
%%bash
# Print the first 30 lines of the raw h_1.fblvl file.
# NOTE(review): the parsing example in this notebook reads ar_16.fblvl, not h_1.fblvl; confirm which file is intended.
head -n 30 $XUVTOP/h/h_1/h_1.fblvl

In [None]:
%%bash
# Print the first 100 lines of the raw fe_3.wgfa file.
# NOTE(review): the parsing example in this notebook reads he_2.wgfa, not fe_3.wgfa; confirm which file is intended.
head -n 100 $XUVTOP/fe/fe_3/fe_3.wgfa