## NOAO data reduction
### WESmith

MIT License

Copyright (c) 2018 

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

In [1]:
import os
import fnmatch
import numpy as np
import pandas as pd
import io
import pdb
from   datetime import datetime

In [2]:
# Limit how much of a dataframe is echoed in the notebook.
# Fully-qualified option names are used on purpose: the bare patterns
# 'max_rows' / 'max_columns' also match the 'styler.render.*' options in
# newer pandas releases, which makes set_option() raise OptionError.
pd.set_option('display.max_rows', 32)
pd.set_option('display.max_columns', 40)

In [3]:
# Column headers that are kept in the processed dataframes.
# Left out for now: REFERENCE, FILESIZE, MD5SUM, PI, RELEASE_DATE, FOOTPRINT,
# EXPOSURE, DEPTH, SURVEYID, COLLECTIONID.
FIELDS = [
    'DATE-OBS',
    'DTCALDAT',
    'DTTELESC',
    'DTINSTRU',
    'OBSTYPE',
    'PROCTYPE',
    'PRODTYPE',
    'DTSITE',
    'OBSERVAT',
    'DTACQNAM',
    'DTPROPID',
    'RA',
    'DEC',
    'FILTER',
    'EXPTIME',
    'OBSMODE',
    'SEEING',
    'OBJECT',
]

In [21]:
# Where the test NOAO JSON data lives and where results are written.
# Paths are relative to BASEDIR; the leading '~' is not expanded here.
BASEDIR = '~/Devel/python/noao_data'
DATADIR = 'json-scrape'                 # sub-directory holding the JSON files
SNAPS = 'pandas-snapshots'              # location of intermediate HDF5 files
file_list = 'file_list_first_600.txt'   # listing of JSON files, inside DATADIR

In [22]:
# Derived locations: JSON input directory, snapshot output directory, and
# the path of the JSON file listing (which sits in the input directory).
TOPDIR, SAVDIR = (os.path.join(BASEDIR, sub) for sub in (DATADIR, SNAPS))
FILES = os.path.join(TOPDIR, file_list)

In [5]:
# HDF5 storage of dataframe metadata from 
# https://stackoverflow.com/questions/29129095/save-additional-attributes-in-pandas-dataframe/29130146#29130146
# note: needed to 'pip install --upgrade tables' for HDFStore

def h5store(filename, df, **kwargs):
    '''
    Write dataframe DF to the HDF5 file FILENAME together with metadata.

    filename: path of the HDF5 file to write
    df:       dataframe to store; it is saved under the key 'mydata'
    kwargs:   arbitrary keyword arguments; they are attached, as a dict,
              to the stored node's 'metadata' attribute
    '''
    # the context manager closes the store even if put() or the attribute
    # assignment raises, so no file handle is left open on failure
    with pd.HDFStore(filename) as store:
        store.put('mydata', df)
        store.get_storer('mydata').attrs.metadata = kwargs

def h5load(store):
    '''
    Return (dataframe, metadata) read from the already-open STORE.

    The dataframe is looked up under the key 'mydata'; the metadata is the
    'metadata' attribute attached to that same node.
    '''
    key = 'mydata'
    frame = store[key]
    return frame, store.get_storer(key).attrs.metadata

In [6]:
def line_count(filename):
    '''Return how many lines the text file FILENAME contains.'''
    with open(filename) as handle:
        return sum(1 for _ in handle)

In [7]:
def list_to_disk(fname, data):
    '''Write every element of DATA to the file FNAME, one element per line.'''
    with open(fname, 'w') as out:
        out.writelines("%s\n" % entry for entry in data)

In [13]:
class Finish():
    """Project the completion time of a long-running, countable job.

    total_count: count used as the denominator of the progress fraction
    start_count: count at which the job starts (default: 0)
    start_time:  datetime at which the job started; when not given, the
                 moment of construction is used
    """

    def __init__(self, total_count, start_count=0, start_time=None):
        self.total_count = total_count
        self.start_count = start_count
        self.start_time = start_time if start_time else datetime.now()

    def __str__(self):
        return '{}; {} to {}'.format(
            self.start_time.isoformat(), self.start_count, self.total_count)

    def est_complete(self, current_count):
        """Return 'done/total: when', where 'when' is the projected
        completion date/time in ISO format, truncated to minutes."""
        elapsed = datetime.now() - self.start_time
        done_count = current_count - self.start_count
        fraction = done_count / self.total_count * 1.0
        if fraction == 0:
            # nothing processed yet: nudge the fraction so the division
            # below is defined
            fraction += .0001
        finish_at = self.start_time + elapsed / fraction
        return '{}/{}: {}'.format(done_count,
                                  self.total_count,
                                  finish_at.isoformat(timespec='minutes'))

In [14]:
#%%writefile ProcessJSON.txt
# to write this cell out for printing, uncomment the line above: 
# otherwise leave it commented, or this cell will not compile

class GetDF(object):
    '''
    Build one pandas dataframe from many JSON files, writing intermediate
    HDF5 snapshots along the way (write_snaps_get_df), or rebuild the
    dataframe from snapshots written earlier (read_snaps_get_df).
    '''

    def __init__(self, savdir='~/pandas-snapshots', file_hdr='local_file'):
        '''
        savdir:   path to location of snapshots; a leading '~' is expanded
                  and the directory is created when it does not exist
        file_hdr: user-defined column heading for filename in final
                  dataframe (default: 'local_file')
        '''
        self._savdir    = os.path.expanduser(savdir)
        self._file_hdr  = file_hdr
        self._error_group_col = ('ERROR: grouping column {} in file {}'
                                 ' does not have a unique value')
        self._metadata  = {}
        self._important = None
        self._group_col = None
        self._progress  = None
        self._snapshot  = None
        self._multi_group_cols = None
        self._force_overwrite  = False  # yet to implement this

        os.makedirs(self._savdir, exist_ok=True)

    def _process(self, file_list, topdir):
        '''
        process group of json files,
        save intermediate dataframes to disk as hdf5 snapshots

        file_list: open text file holding one JSON filename per line
        topdir:    directory each filename is joined onto
        returns:   dataframe with the kept fields of every file read
                   (an empty dataframe when file_list lists no files)
        raises:    AssertionError when the grouping column of a file does
                   not hold exactly one distinct non-null value
        '''
        count = 0
        print('Collecting for fields: {}'.format(self._important))

        # the empty seed frame carries every kept column, so the column
        # selection after pd.concat() works even when no file in the
        # snapshot has a given field
        dd     = [pd.DataFrame(columns=self._important)]
        dd_tot = [] # total accumulator list; dd is the to-write accumulator list
        fnames = [] # filename accumulator list: can be used in reading hdf5 files

        num_files = line_count(file_list.name)
        ec = Finish(num_files)
        print('[{}] DBG: started reading files'.format(datetime.now().isoformat()))

        for line in file_list:
            fname = line.strip()  # strips off newline char
            filename = os.path.join(topdir, fname)

            count += 1

            if 0 == (count % self._progress):
                print('File progress: {}'.format(ec.est_complete(count)))

            jj = pd.read_json(filename)

            # verify the grouping-column value is unique and not missing
            # in this file across the HDUs; raised explicitly (rather than
            # with an assert statement) so the check survives 'python -O',
            # where a skipped check would let the first of several values
            # be broadcast silently.  AssertionError is kept so callers
            # see the same exception type as before.
            # TODO: make this a try/except: save bad filenames and keep moving
            if jj[self._group_col].nunique() != 1:
                raise AssertionError(
                    self._error_group_col.format(self._group_col, filename))

            # broadcast the unique grouping-column value to the entire
            # grouping column: this is required for proper grouping later;
            # usually grouping column is 'DTINSTRU', the instrument name
            jj[self._group_col] = jj[self._group_col].dropna().iloc[0]

            # add the file-name column to the dataframe:
            # this is required for grouping HDUs by filename
            jj[self._file_hdr] = os.path.basename(filename)

            dd.append(jj)

            if (0 == (count % self._snapshot)) or (count == num_files):
                # write snapshot as hdf5;
                # dd[0] is the empty seed frame, so len(dd) - 1 is the
                # number of files really held by this snapshot (the last
                # snapshot may hold fewer than self._snapshot files)
                self._metadata['num_files']    = len(dd) - 1
                self._metadata['file_hdr']     = self._file_hdr
                self._metadata['group_column'] = self._group_col
                hdf_name = '{}/snapshot-{}.hdf5'.format(self._savdir, count)
                df = pd.concat(dd)[self._important] # strip fields now
                h5store(hdf_name, df, **self._metadata)
                print('Wrote file {}'.format(hdf_name))

                dd_tot.append(df)  # accumulate stripped-field dataframes
                dd = [pd.DataFrame(columns=self._important)] # reset dd for next filewrite
                fnames.append(os.path.split(hdf_name)[-1])

        listname = '{}/snapshot_list_start-{}_end-{}.txt'.format(self._savdir,
                                                                 self._snapshot, count)
        list_to_disk(listname, fnames)
        print('[{}] DBG: All files read'.format(datetime.now().isoformat()))
        print('Wrote snapshots and snapshot list to: {}'.format(self._savdir))

        print('Generating full dataframe...')
        if not dd_tot:
            # empty file list: pd.concat([]) would raise, so hand back an
            # empty dataframe with the expected columns instead
            return pd.DataFrame(columns=self._important)
        return pd.concat(dd_tot)

    def write_snaps_get_df(self, file_list, fields, topdir='/',
                           group_col='DTINSTRU',
                           progress=1000, snapshot=2000):
        '''
        file_list: list of fullpaths to JSON files to read; it must be
                   an io.TextIOWrapper object (this can be produced in argparse
                   or by open(file_list) prior to this call)
        fields:    list of column headers to keep in processed dataframes
        topdir:    top dir of fullpaths to JSON files
        group_col: column name on which to group (default:'DTINSTRU')
        progress:  file interval at which file-reading progress is reported
                   (default: 1000)
        snapshot:  file interval at which snapshot files are created
                   (default: 2000)
        returns:   dataframe with FIELDS plus the filename column for all
                   files read
        '''
        # add file-header column for filename:
        # a new list is built on purpose; fields.append() would modify the
        # caller's list in place
        self._important = fields + [self._file_hdr]
        self._group_col = group_col
        self._progress  = progress
        self._snapshot  = snapshot
        self._multi_group_cols  = [self._group_col, self._file_hdr]

        return self._process(file_list, topdir)

    def read_snaps_get_df(self, file_list, topdir='/'):
        '''
        file_list: list of fullpaths to hdf5 snapshot files written previously
                   by write_snaps_get_df(); it must be
                   an io.TextIOWrapper object (this can be produced in argparse
                   or by open(file_list) prior to this call)
        topdir:    top dir of fullpaths to hdf5 snapshot files
        returns:   (dataframe, (group_column, file_hdr)): the concatenated
                   snapshots and the grouping info stored with the first one
        raises:    ValueError when file_list lists no snapshot files
        '''
        dd_tot = [] # dataframe accumulator
        gc     = [] # group-column info (for error check, and passing to analysis class)
        for line in file_list:
            fname = line.strip()  # strips off newline char
            filename = os.path.join(topdir, fname)
            print('reading {} from disk'.format(filename))
            # read-only: the default append mode would create an empty
            # HDF5 file when the listed snapshot does not exist
            with pd.HDFStore(filename, mode='r') as store:
                df, metadata = h5load(store)
                dd_tot.append(df)
                gc.append((metadata['group_column'], metadata['file_hdr']))
        if not dd_tot:
            raise ValueError('no snapshot files listed in file_list')
        if (len(set(gc)) != 1):
            print('WARNING: group_column and file_hdr are NOT the same for all hdf5 files:')
            print(gc)
        return pd.concat(dd_tot), gc[0]

In [23]:
# SAVDIR may still start with '~': GetDF expands it and creates the
# snapshot directory if it is missing
rr = GetDF(savdir=SAVDIR)

In [24]:
# write_snaps_get_df() needs an open text file (TextIOWrapper), not a path.
# As before, FILES ends up bound to that file object; the with-statement
# additionally guarantees the file is closed once every JSON file was read,
# also when reading fails half-way.
with open(os.path.expanduser(FILES), mode='rt') as FILES:
    df = rr.write_snaps_get_df(FILES, FIELDS, topdir=TOPDIR, progress=100, snapshot=250)

Collecting for fields: ['DATE-OBS', 'DTCALDAT', 'DTTELESC', 'DTINSTRU', 'OBSTYPE', 'PROCTYPE', 'PRODTYPE', 'DTSITE', 'OBSERVAT', 'DTACQNAM', 'DTPROPID', 'RA', 'DEC', 'FILTER', 'EXPTIME', 'OBSMODE', 'SEEING', 'OBJECT', 'local_file']
[2019-01-31T03:46:00.953057] DBG: started reading files
File progress: 100/600: 2019-01-31T03:46
File progress: 200/600: 2019-01-31T03:46


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['DATE-OBS', 'DTCALDAT', 'DTTELESC', 'DTINSTRU', 'OBSTYPE', 'PROCTYPE', 'PRODTYPE', 'DTSITE', 'OBSERVAT', 'DTACQNAM', 'DTPROPID', 'RA', 'DEC', 'FILTER', 'OBSMODE', 'SEEING', 'OBJECT', 'local_file']]



Wrote file /home/smithw/Devel/python/noao_data/pandas-snapshots/snapshot-250.hdf5
File progress: 300/600: 2019-01-31T03:46
File progress: 400/600: 2019-01-31T03:46
File progress: 500/600: 2019-01-31T03:46
Wrote file /home/smithw/Devel/python/noao_data/pandas-snapshots/snapshot-500.hdf5
File progress: 600/600: 2019-01-31T03:46
Wrote file /home/smithw/Devel/python/noao_data/pandas-snapshots/snapshot-600.hdf5
[2019-01-31T03:46:10.382876] DBG: All files read
Wrote snapshots and snapshot list to: /home/smithw/Devel/python/noao_data/pandas-snapshots
Generating full dataframe...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['DATE-OBS', 'DTCALDAT', 'DTTELESC', 'DTINSTRU', 'OBSTYPE', 'PROCTYPE', 'PRODTYPE', 'DTSITE', 'OBSERVAT', 'DTACQNAM', 'DTPROPID', 'RA', 'DEC', 'FILTER', 'OBSMODE', 'OBJECT', 'local_file']]



In [25]:
# Rebuild the dataframe from the snapshots listed in the snapshot-list file.
HDF5_files = '~/Devel/python/noao_data/pandas-snapshots/snapshot_list_start-250_end-600.txt'
HDF5_topdir = '~/Devel/python/noao_data/pandas-snapshots'
HDF5_topdir = os.path.expanduser(HDF5_topdir)
# read_snaps_get_df() needs an open text file; as before, HDF5_files ends up
# bound to that file object, and the with-statement closes it afterwards
with open(os.path.expanduser(HDF5_files), mode='rt') as HDF5_files:
    df2, gc = rr.read_snaps_get_df(HDF5_files, HDF5_topdir)

reading /home/smithw/Devel/python/noao_data/pandas-snapshots/snapshot-250.hdf5 from disk
reading /home/smithw/Devel/python/noao_data/pandas-snapshots/snapshot-500.hdf5 from disk
reading /home/smithw/Devel/python/noao_data/pandas-snapshots/snapshot-600.hdf5 from disk


In [26]:
gc

('DTINSTRU', 'local_file')

In [27]:
df2.equals(df)

True

In [28]:
df

Unnamed: 0,DATE-OBS,DTCALDAT,DTTELESC,DTINSTRU,OBSTYPE,PROCTYPE,PRODTYPE,DTSITE,OBSERVAT,DTACQNAM,DTPROPID,RA,DEC,FILTER,EXPTIME,OBSMODE,SEEING,OBJECT,local_file
0,2017-07-01T21:20:01.445,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,/lhome/data/observer/ccd170701bias.0001.fits,smarts,,,,,,,,c13a_170701_212001_zri.fits.json
1,2017-07-01,,,andicam,,,,,CTIO,,,11:17:30.03,-30:06:56.8,,0.0,,,BIASES,c13a_170701_212001_zri.fits.json
0,2017-07-01T21:20:52.614,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,/lhome/data/observer/ccd170701bias.0002.fits,smarts,,,,,,,,c13a_170701_212052_zri.fits.json
1,2017-07-01,,,andicam,,,,,CTIO,,,11:18:21.27,-30:06:56.6,,0.0,,,BIASES,c13a_170701_212052_zri.fits.json
0,2017-07-01T21:21:43.718,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,/lhome/data/observer/ccd170701bias.0003.fits,smarts,,,,,,,,c13a_170701_212143_zri.fits.json
1,2017-07-01,,,andicam,,,,,CTIO,,,11:19:12.52,-30:06:56.4,,0.0,,,BIASES,c13a_170701_212143_zri.fits.json
0,2017-07-01T21:22:34.807,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,/lhome/data/observer/ccd170701bias.0004.fits,smarts,,,,,,,,c13a_170701_212234_zri.fits.json
1,2017-07-01,,,andicam,,,,,CTIO,,,11:20:03.55,-30:06:56.2,,0.0,,,BIASES,c13a_170701_212234_zri.fits.json
0,2017-07-01T21:23:25.909,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,/lhome/data/observer/ccd170701bias.0005.fits,smarts,,,,,,,,c13a_170701_212325_zri.fits.json
1,2017-07-01,,,andicam,,,,,CTIO,,,11:20:54.79,-30:06:56.0,,0.0,,,BIASES,c13a_170701_212325_zri.fits.json


In [60]:
class ProcessJSON(object):
    '''
    Summary tables computed from the full dataframe: per grouping-column
    value (usually the instrument, 'DTINSTRU') and per file.
    '''

    def __init__(self, full_dataframe=None, group_col='DTINSTRU',
                 file_hdr='local_file', important=None):
        '''
        full_dataframe: dataframe to analyse, e.g. the one returned by
                        GetDF.write_snaps_get_df() or
                        GetDF.read_snaps_get_df()
        group_col:      column name on which to group (default: 'DTINSTRU')
        file_hdr:       name of the column holding the filename
                        (default: 'local_file')
        important:      list of fields reported by get_all_fields
                        (default: all columns of full_dataframe)
        '''
        self._full_dataframe = full_dataframe
        self._group_col = group_col
        self._file_hdr = file_hdr
        self._multi_group_cols = [group_col, file_hdr]
        if important is None and full_dataframe is not None:
            important = list(full_dataframe.columns)
        self._important = important
        self._errmsg = 'ERROR: no dataframe has been loaded'

    @property
    def get_full_dataframe(self):
        '''The dataframe itself (not a copy); AssertionError when unset.'''
        # raised explicitly so the check is not removed by 'python -O';
        # AssertionError is the exception type the assert statement gave
        if self._full_dataframe is None:
            raise AssertionError(self._errmsg)
        return self._full_dataframe

    @property
    def get_instr_vs_fields_unique_all_data(self):
        '''
        Number of distinct values of each field per grouping-column value
        (rows: fields, columns: grouping-column values).  The filename row
        and the 'DTACQNAM' row are moved to the top, when present, for
        direct comparison; all other rows keep their order.
        '''
        # TODO: this needs to be generalized so user can define the top rows
        #       for display
        gg = self.get_full_dataframe.groupby(self._group_col).nunique().T
        # reorder by name rather than by position, so the result does not
        # depend on how many fields there are or where they sit
        lead = [name for name in (self._file_hdr, 'DTACQNAM')
                if name in gg.index]
        rest = [name for name in gg.index if name not in lead]
        return gg.loc[lead + rest, :]

    @property
    def get_HDU_uniqueness_per_file(self):
        '''
        min/max/mean/std (rounded to 2 decimals), per grouping-column value,
        of the number of distinct values each field takes within one file.
        '''
        keys = self._multi_group_cols
        gg = self.get_full_dataframe.groupby(keys).nunique()
        # some pandas versions also report the grouping columns themselves
        # in the nunique() result, others leave them out: drop them only
        # when they are there
        gg = gg.drop(columns=keys, errors='ignore')
        return gg.groupby(level=keys[0]).\
                  agg(['min','max','mean','std']).round(2).stack().T

    @property
    def get_all_fields(self):
        '''List of fields held by this object.'''
        return self._important

    @property
    def get_HDU_stats(self):
        '''
        min/max/mean/std, per grouping-column value, of the number of rows
        each file contributes.
        '''
        gg = self.get_full_dataframe.groupby(self._multi_group_cols).size()
        return gg.groupby(level=self._group_col).\
                  agg(['min','max','mean','std']).\
                  rename_axis('HDU stats:', axis=1)

    def _files_writing_fields(self):
        '''Boolean table, one row per (group, file): True where the file
        has at least one non-null value in the field.'''
        df = self.get_full_dataframe
        written = df.groupby(self._multi_group_cols).nunique() > 0
        # the grouping columns are non-null in every group by construction;
        # set them explicitly because, depending on the pandas version,
        # nunique() may or may not report them
        for key in self._multi_group_cols:
            written[key] = True
        # restore the column order of the dataframe
        return written.reindex(columns=list(df.columns))

    def get_num_files_writing_fields(self, instr=True, percent=True):
        '''
        instr:   if True, list percentages (or raw numbers) of files per
                 instrument that write each field, if False list total
                 number of files (or percentages) over ALL instruments
                 (default=True)
        percent: if True, list percentages of files that write each field,
                 if False, list raw numbers of files (default=True)
        '''
        zz = self._files_writing_fields()
        if not instr:
            gg = zz.sum()
            return (gg/gg[self._file_hdr]*100).round(2) if percent else gg
        # the grouping column comes back in through reset_index(); the
        # filename column is always True, so its per-group sum is the
        # number of files: it is kept under the name 'COUNT'
        gg = zz.drop(columns=self._group_col).\
                rename(columns={self._file_hdr:'COUNT'}).\
                reset_index().drop(columns=self._file_hdr)
        gg = gg.groupby(self._group_col).sum().T
        return (gg/gg.loc['COUNT']*100).round(2) if percent else gg

    def get_unique_values_of_field(self, field):
        '''List of the distinct non-null values of FIELD.'''
        return list(self.get_full_dataframe[field].dropna().unique())

    def get_num_unique_values_by_keys(self, field1, field2):
        '''
        Number of distinct files for every (FIELD1, FIELD2) combination,
        as a one-column dataframe named 'TOTAL OCCURRENCES'.
        '''
        # only the filename column is needed, so only that one is counted
        gg = self.get_full_dataframe.groupby([field1, field2])
        return gg[self._file_hdr].nunique().to_frame('TOTAL OCCURRENCES')

In [61]:
# NOTE(review): BASE is not defined in any cell visible in this notebook
# export -- confirm where it comes from before re-running this cell
proc = ProcessJSON(BASE)

In [62]:
# NOTE(review): DATE and important are not defined in any visible cell, and
# the ProcessJSON class shown in this notebook has no run() method; the
# recorded output below presumably stems from an earlier version of the
# class -- confirm before re-running this cell
dates = DATE
num = None #100  # 'None' to get all files
force_overwrite = False
proc.run(dates, important=important, group_col='DTINSTRU', num_to_read=num, force_overwrite=force_overwrite) 

reading ../pydata-book/processed/20170701-processed.hdf5 from disk
reading ../pydata-book/processed/20170702-processed.hdf5 from disk
reading ../pydata-book/processed/20170703-processed.hdf5 from disk
reading ../pydata-book/processed/20170704-processed.hdf5 from disk
reading ../pydata-book/processed/20170705-processed.hdf5 from disk
reading ../pydata-book/processed/20170706-processed.hdf5 from disk
reading ../pydata-book/processed/20170707-processed.hdf5 from disk
reading ../pydata-book/processed/20170708-processed.hdf5 from disk
reading ../pydata-book/processed/20170709-processed.hdf5 from disk
reading ../pydata-book/processed/20170710-processed.hdf5 from disk
reading ../pydata-book/processed/20170711-processed.hdf5 from disk
reading ../pydata-book/processed/20170712-processed.hdf5 from disk
reading ../pydata-book/processed/20170713-processed.hdf5 from disk
reading ../pydata-book/processed/20170714-processed.hdf5 from disk
reading ../pydata-book/processed/20170715-processed.hdf5 from 

## TESTING

In [9]:
aa = proc.get_full_dataframe.copy()  # copy to experiment on: get_full_dataframe returns proc's own dataframe object, not a copy

In [10]:
bb = proc.get_instr_vs_fields_unique_all_data

In [11]:
cc = proc.get_HDU_uniqueness_per_file

In [12]:
dd = proc.get_all_fields # list

In [13]:
ee = proc.get_HDU_stats

In [14]:
ff = proc.get_unique_values_of_field('OBSTYPE')

In [15]:
gg1 = proc.get_num_unique_values_by_keys('DTINSTRU', 'OBSTYPE')

In [16]:
gg2 = proc.get_num_unique_values_by_keys('DTINSTRU', 'FILTER')

In [17]:
gg3 = proc.get_num_unique_values_by_keys('DTTELESC','DTINSTRU')

In [18]:
gg4 = proc.get_num_unique_values_by_keys('DTINSTRU', 'DTCALDAT')

In [78]:
hh1 = proc.get_num_files_writing_fields(instr=True, percent=True)

In [79]:
hh2 = proc.get_num_files_writing_fields(instr=True, percent=False)

In [80]:
hh3 = proc.get_num_files_writing_fields(instr=False, percent=True)

In [81]:
hh4 = proc.get_num_files_writing_fields(instr=False, percent=False)

In [19]:
aa # too big for html: 389000 rows!

Unnamed: 0,DATE-OBS,DTCALDAT,DTTELESC,DTINSTRU,OBSTYPE,PROCTYPE,PRODTYPE,DTSITE,OBSERVAT,REFERENCE,FILESIZE,MD5SUM,DTACQNAM,DTPROPID,PI,RELEASE_DATE,RA,DEC,FOOTPRINT,FILTER,EXPOSURE,OBSMODE,SEEING,DEPTH,SURVEYID,COLLECTIONID,OBJECT,RADIUS / BOX,RADIUS/BOX,local_file
0,2017-07-01T21:20:01.445,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,,,,/lhome/data/observer/ccd170701bias.0001.fits,smarts,,,,,,,,,,,,,,,,20170701/ct13m/smarts/c13a_170701_212001_zri.f...
1,2017-07-01,,,andicam,,,,,CTIO,,,,,,,,11:17:30.03,-30:06:56.8,,,,,,,,,BIASES,,,20170701/ct13m/smarts/c13a_170701_212001_zri.f...
0,2017-07-01T21:20:52.614,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,,,,/lhome/data/observer/ccd170701bias.0002.fits,smarts,,,,,,,,,,,,,,,,20170701/ct13m/smarts/c13a_170701_212052_zri.f...
1,2017-07-01,,,andicam,,,,,CTIO,,,,,,,,11:18:21.27,-30:06:56.6,,,,,,,,,BIASES,,,20170701/ct13m/smarts/c13a_170701_212052_zri.f...
0,2017-07-01T21:21:43.718,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,,,,/lhome/data/observer/ccd170701bias.0003.fits,smarts,,,,,,,,,,,,,,,,20170701/ct13m/smarts/c13a_170701_212143_zri.f...
1,2017-07-01,,,andicam,,,,,CTIO,,,,,,,,11:19:12.52,-30:06:56.4,,,,,,,,,BIASES,,,20170701/ct13m/smarts/c13a_170701_212143_zri.f...
0,2017-07-01T21:22:34.807,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,,,,/lhome/data/observer/ccd170701bias.0004.fits,smarts,,,,,,,,,,,,,,,,20170701/ct13m/smarts/c13a_170701_212234_zri.f...
1,2017-07-01,,,andicam,,,,,CTIO,,,,,,,,11:20:03.55,-30:06:56.2,,,,,,,,,BIASES,,,20170701/ct13m/smarts/c13a_170701_212234_zri.f...
0,2017-07-01T21:23:25.909,2017-07-01,ct13m,andicam,BIAS,raw,image,ct,CTIO,,,,/lhome/data/observer/ccd170701bias.0005.fits,smarts,,,,,,,,,,,,,,,,20170701/ct13m/smarts/c13a_170701_212325_zri.f...
1,2017-07-01,,,andicam,,,,,CTIO,,,,,,,,11:20:54.79,-30:06:56.0,,,,,,,,,BIASES,,,20170701/ct13m/smarts/c13a_170701_212325_zri.f...


In [23]:
# TODO: make optional csv, html output a method in ProcessJSON
bb.to_html('html/get_instr_vs_fields_unique_all_data.html')

In [25]:
bb.to_csv('csv/get_instr_vs_fields_unique_all_data.csv')

In [26]:
cc.to_html('html/get_HDU_uniqueness_per_file.html')

In [27]:
cc.to_csv('csv/get_HDU_uniqueness_per_file.csv')

In [28]:
dd  # list

['DATE-OBS',
 'DTCALDAT',
 'DTTELESC',
 'DTINSTRU',
 'OBSTYPE',
 'PROCTYPE',
 'PRODTYPE',
 'DTSITE',
 'OBSERVAT',
 'REFERENCE',
 'FILESIZE',
 'MD5SUM',
 'DTACQNAM',
 'DTPROPID',
 'PI',
 'RELEASE_DATE',
 'RA',
 'DEC',
 'FOOTPRINT',
 'FILTER',
 'EXPOSURE',
 'OBSMODE',
 'SEEING',
 'DEPTH',
 'SURVEYID',
 'COLLECTIONID',
 'OBJECT',
 'RADIUS / BOX',
 'RADIUS/BOX',
 'local_file']

In [29]:
ee.to_html('html/get_HDU_stats.html')

In [30]:
ee.to_csv('csv/get_HDU_stats.csv')

In [31]:
gg1.to_html('html/get_num_unique_values_by_keys_DTINSTRU_OBSTYPE.html')

In [32]:
gg1.to_csv('csv/get_num_unique_values_by_keys_DTINSTRU_OBSTYPE.csv')

In [33]:
gg2.to_html('html/get_num_unique_values_by_keys_DTINSTRU_FILTER.html')

In [34]:
gg2.to_csv('csv/get_num_unique_values_by_keys_DTINSTRU_FILTER.csv')

In [35]:
gg3.to_html('html/get_num_unique_values_by_keys_DTTELESC_DTINSTRU.html')

In [36]:
gg3.to_csv('csv/get_num_unique_values_by_keys_DTTELESC_DTINSTRU.csv')

In [37]:
gg4.to_html('html/get_num_unique_values_by_keys_DTINSTRU_DTCALDAT.html')

In [38]:
gg4.to_csv('csv/get_num_unique_values_by_keys_DTINSTRU_DTCALDAT.csv')

In [70]:
hh1.to_html('html/get_num_files_writing_fields(instr=True, percent=True).html')

In [71]:
hh2.to_html('html/get_num_files_writing_fields(instr=True, percent=False).html')

In [75]:
hh3.to_frame().to_html('html/get_num_files_writing_fields(instr=False, percent=True).html')

In [77]:
hh4.to_frame().to_html('html/get_num_files_writing_fields(instr=False, percent=False).html')