# readCornishMBH
Reads the massive black hole binary (MBH) chain files provided by Neil Cornish and builds pandas data frames from them, which are saved as weekly HDF5 catalogs.

In [1]:
import os
from datetime import datetime
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Collect every chain file (chain_<week>_<source>.dat) found in the data directory
dataDir = '/home/centos/Data/CornishChains'
chainPattern = os.path.join(dataDir, 'chain_*')
chainFiles = glob.glob(chainPattern)

In [3]:
# Build the initial file catalog: one row per chain file, tagged with the
# observation week and source number encoded in the file name
# (chain_<week>_<source>.<ext>).
# NOTE(review): np.empty defaults to float64, so both columns are stored as
# floats (3.0, 158.0, ...). This is kept as-is so downstream catalogs do not
# change; consider dtype=int if integer columns are preferred.
wks = np.empty(len(chainFiles))
srcs = np.empty(len(chainFiles))
for idx,chainFile in enumerate(chainFiles):
    # Parse the bare file name without its extension, so the result does not
    # depend on how the directory prefix is spelled or on the extension length.
    stem = os.path.splitext(os.path.basename(chainFile))[0]
    prts = stem.split('_')
    wks[idx] = int(prts[1])
    srcs[idx] = int(prts[2])

fileDict = {'chain file' : chainFiles, 'observation week' : wks, 'source number' : srcs}
fileDF = pd.DataFrame(fileDict)
fileDF.head()

Unnamed: 0,chain file,observation week,source number
0,/home/centos/Data/CornishChains/chain_3_158.dat,3.0,158.0
1,/home/centos/Data/CornishChains/chain_8_711.dat,8.0,711.0
2,/home/centos/Data/CornishChains/chain_8_6.dat,8.0,6.0
3,/home/centos/Data/CornishChains/chain_7_435.dat,7.0,435.0
4,/home/centos/Data/CornishChains/chain_14_6.dat,14.0,6.0


In [4]:
# Distinct observation weeks present in the file catalog (np.unique returns them sorted)
weekColumn = fileDF['observation week']
wks = np.unique(weekColumn)
wks

array([ 1.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
       15.])

In [5]:
# Count the distinct source numbers that appear across all chain files
srcs = np.unique(fileDF['source number'])
srcs.size

12

In [6]:
# Write one HDF5 catalog per observation week. Each file receives:
#   'metadata'      - a one-row table describing the catalog and its parent
#   '<name>_chain'  - the full chain for every source observed that week
#   'detections'    - one row of point estimates (chain medians) per source

# Column layout of the space-separated chain files. The first column
# ('Iteration') becomes the index when a chain is read. Defined once here
# because it does not change from week to week.
colNames = ['Iteration',
            'Log Likelihood',
            'Mass 1',
            'Mass 2',
            'Spin 1',
            'Spin 2',
            'Merger Phase',
            'Barycenter Merge Time',
            'Luminosity Distance',
            'cos ecliptic colatitude',
            'Ecliptic Longitude',
            'Polarization',
            'cos inclination',
            'Detector Merger Time']

for idx,wk in enumerate(wks):
    # set up output file
    outputFile = ('/home/centos/Data/MBH_wk%03i.h5' % wk)

    # make metadata dataframe
    now = datetime.now()
    metaDict = {'author' : 'Neil Cornish', 'creation date' : now.strftime("%Y-%m-%d %H:%M:%S"), 'observation week' : wk}
    # The parent is the previous week present in wks. Test the loop position
    # rather than the week value, so the first catalog never wraps around to
    # wks[-1] when the earliest available week is not week 1.
    if idx > 0:
        metaDict['parent'] = 'MBHcatalog_week%03i' % wks[idx-1]
    else:
        metaDict['parent'] = None

    # Use a list (ordered) rather than a set literal for the single-row index.
    metaDF = pd.DataFrame(data=metaDict, index=['MBHcatalog_week%03i' % wk])
    # mode='w' starts a fresh file, so re-running the cell does not keep stale keys
    metaDF.to_hdf(outputFile, key='metadata', mode='w')

    # pick out right subset of catalogs
    catwk = fileDF[fileDF['observation week']==wk]

    # loop through entries and get point-estimates (median) as well as chain files
    weekSrcs = list()   # separate name so the unique source numbers in `srcs` are not clobbered
    for entry in catwk.index:
        # read chain file
        chainDF = pd.read_csv(catwk.loc[entry, 'chain file'], sep=' ', index_col=0, names=colNames)
        # latitude = pi/2 - colatitude
        chainDF['Ecliptic Latitude'] = np.pi/2-np.arccos(np.array(chainDF['cos ecliptic colatitude']))

        # take the median to get the point-estimates (maybe not good for multi-modal?)
        src = chainDF.median()
        # the source name is derived from the median barycenter merger time
        src['name'] = 'MBH%09i' % src['Barycenter Merge Time']
        src['chain file'] = os.path.basename(outputFile)
        src['source number'] = catwk.loc[entry, 'source number']
        src['observation week'] = wk
        weekSrcs.append(src)

        # append the full chain to this week's file under '<name>_chain'
        chainDF.to_hdf(outputFile, key=('%s_chain' % src['name']), mode='a')

    entryDF = pd.DataFrame(weekSrcs)
    entryDF.set_index('name', inplace=True)
    entryDF.to_hdf(outputFile, key='detections', mode='a')


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['author', 'creation date', 'parent'], dtype='object')]

  encoding=encoding,


## Convert to cumulative catalogs
We want to build a more realistic set of "cumulative" catalogs. This next section does that, although it cheats by using the source number information to track the lineage of the sources.

In [7]:
# Step one: assemble a master catalog holding every detection from every week.
catPath = '/home/centos/Data'   # output path for the pandas catalogs
# the [!C] in the pattern skips files whose name ends in 'C.h5'
catFiles = glob.glob(catPath + '/MBH_wk*[!C].h5')

# Each weekly file stores several tables; the 'detections' key selects the
# table of point estimates.
dfs = [pd.read_hdf(catFile, key = 'detections') for catFile in catFiles]

# Order by source and then by week so that consecutive rows of the same
# source follow its week-to-week history.
allCats = pd.concat(dfs,axis=0).sort_values(by=['source number', 'observation week'],ascending = True, axis=0)

# A row's parent is the row directly above it when both belong to the same
# source; the first row of each source gets an empty parent.
parStr = list()
curr_src = -1
for pos,new_src in enumerate(allCats['source number']):
    if new_src != curr_src:
        parStr.append('')
        curr_src = new_src
    else:
        parStr.append(allCats.index[pos-1])

allCats.insert(0,'Parent',parStr,allow_duplicates=True)
allCats

Unnamed: 0_level_0,Parent,Log Likelihood,Mass 1,Mass 2,Spin 1,Spin 2,Merger Phase,Barycenter Merge Time,Luminosity Distance,cos ecliptic colatitude,Ecliptic Longitude,Polarization,cos inclination,Detector Merger Time,Ecliptic Latitude,chain file,source number,observation week
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
MBH011281654,,56.88291,94116.38,43884.03306,0.105217,0.095709,1.576882,11281650.0,13.335744,-0.042806,2.747777,1.574148,0.598801,11281660.0,-0.042819,MBH_wk006.h5,6.0,6.0
MBH011289993,MBH011281654,78.604067,94055.93,43972.895728,0.3193,0.17823,1.573542,11289990.0,13.694113,-0.082289,1.350098,1.573576,0.680395,11289950.0,-0.082383,MBH_wk007.h5,6.0,7.0
MBH011296095,MBH011289993,103.889425,104964.8,40084.515488,0.529682,0.154848,1.574801,11296100.0,14.085472,-0.155118,1.245274,1.572586,0.738628,11296050.0,-0.155747,MBH_wk008.h5,6.0,8.0
MBH011294126,MBH011296095,135.824209,79700.16,50898.513935,0.3551,0.139837,1.581469,11294130.0,14.600668,-0.261373,1.173424,1.56232,0.79237,11294040.0,-0.264445,MBH_wk009.h5,6.0,9.0
MBH011308511,MBH011294126,171.155975,108595.6,39049.282454,0.760077,0.23533,1.560354,11308510.0,15.938038,-0.437332,1.084969,1.589925,0.862427,11308440.0,-0.452629,MBH_wk010.h5,6.0,10.0
MBH011312326,MBH011308511,229.097125,104742.7,40255.521041,0.790765,0.425125,1.569293,11312330.0,16.672126,-0.498357,1.188901,1.571588,0.902949,11312290.0,-0.521703,MBH_wk011.h5,6.0,11.0
MBH011310024,MBH011312326,298.218347,88127.93,46626.854887,0.731714,0.350058,1.588459,11310020.0,15.798658,0.694093,4.242791,1.578812,-0.906093,11310110.0,0.767159,MBH_wk012.h5,6.0,12.0
MBH011313425,MBH011310024,389.308427,92855.5,44609.845146,0.785946,0.533847,1.569179,11313430.0,15.070081,0.619688,4.204966,1.588681,-0.868136,11313480.0,0.668345,MBH_wk013.h5,6.0,13.0
MBH011317078,MBH011313425,527.595085,132558.2,33332.942037,0.857195,0.585346,1.566156,11317080.0,14.264659,0.675707,4.247968,1.572822,-0.8386,11317140.0,0.741924,MBH_wk014.h5,6.0,14.0
MBH011318669,MBH011317078,740.270521,142698.4,31498.663144,0.880107,0.685761,1.572697,11318670.0,13.886229,0.788169,4.306482,1.567044,-0.831652,11318750.0,0.907828,MBH_wk015.h5,6.0,15.0


In [8]:
# Build the cumulative catalogs: for every week, keep the most recent
# detection of each source seen up to and including that week, and write it
# (with its chain) to MBH_wk<week>C.h5.
wks = np.unique(fileDF['observation week'])
for idx,wk in enumerate(wks):
    # set up output file
    outputFile = ('/home/centos/Data/MBH_wk%03iC.h5' % wk)
    catPath = os.path.dirname(outputFile)   # the weekly (non-cumulative) files are read from the same directory

    # make metadata dataframe
    now = datetime.now()
    metaDict = {'author' : 'Neil Cornish', 'creation date' : now.strftime("%Y-%m-%d %H:%M:%S"), 'observation week' : wk}
    if idx > 0:
        metaDict['parent'] = 'MBHcatalog_week%03i' % wks[idx-1]
    else:
        metaDict['parent'] = None

    # Use a list (ordered) rather than a set literal for the single-row index.
    metaDF = pd.DataFrame(data=metaDict, index=['MBHcatalog_week%03i' % wk])
    # mode='w' starts a fresh file, so re-running the cell does not keep stale keys
    metaDF.to_hdf(outputFile, key='metadata', mode='w')

    # pick out right subset of catalogs: allCats is sorted by source and week,
    # so keep='last' selects each source's latest detection up to this week
    catwkx = allCats[allCats['observation week'] <= wk].drop_duplicates(subset='source number',keep='last').copy()

    # copy over chain files from the weekly file each detection came from
    for name, weeklyFile in zip(catwkx.index, catwkx['chain file']):
        chainKey = name + '_chain'
        sourceDF = pd.read_hdf(os.path.join(catPath, weeklyFile), key=chainKey)
        # outputFile is already an absolute path, so it is used directly
        sourceDF.to_hdf(outputFile, key=chainKey)

    # fix lineage for sources which have "expired" (last seen before this
    # week): such an entry is carried over unchanged, so it becomes its own parent
    # NOTE(review): an expired source that was only ever detected once
    # (Parent == '') keeps an empty parent here, although it was presumably
    # also carried in the previous cumulative catalog - confirm this is intended.
    parStr = list()
    for name, lastWk, parent in zip(catwkx.index, catwkx['observation week'], catwkx['Parent']):
        if lastWk < wk and parent != '':
            parStr.append(name)
        else:
            parStr.append(parent)

    # every entry now points at the cumulative file that holds its chain
    catwkx['chain file'] = os.path.basename(outputFile)
    catwkx['Parent'] = parStr
    catwkx.drop(columns = ['source number','observation week'],inplace=True)
    catwkx.sort_values(by = 'Barycenter Merge Time',inplace=True)
    catwkx.to_hdf(outputFile, key = 'detections')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['author', 'creation date', 'parent'], dtype='object')]

  encoding=encoding,


In [9]:
# Inspect the cumulative catalog left in catwkx by the loop above, i.e. the one built for the last week in wks
catwkx

Unnamed: 0_level_0,Parent,Log Likelihood,Mass 1,Mass 2,Spin 1,Spin 2,Merger Phase,Barycenter Merge Time,Luminosity Distance,cos ecliptic colatitude,Ecliptic Longitude,Polarization,cos inclination,Detector Merger Time,Ecliptic Latitude,chain file
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
MBH000373540,,60.28295,11191.400618,1573.34712,0.912502,0.627869,1.555474,373540.2,22.205113,0.288597,2.336538,1.68453,-0.316716,373452.5,0.292761,MBH_wk015C.h5
MBH001865195,MBH001865195,164.506458,24034.822836,8748.509775,0.972429,0.918015,2.297636,1865195.0,95.079793,0.195796,3.250084,2.404726,-0.500549,1864860.0,0.197069,MBH_wk015C.h5
MBH002301433,,67.615841,12659.90728,3912.600441,0.961883,0.867574,1.575534,2301434.0,34.400859,0.3731,1.242727,1.585468,-0.013747,2301210.0,0.382348,MBH_wk015C.h5
MBH004556400,MBH004556400,243.337498,15868.967713,9036.333963,0.977598,0.962171,1.671275,4556400.0,28.921349,0.756914,0.471104,1.576525,-0.410775,4556105.0,0.858578,MBH_wk015C.h5
MBH004650719,MBH004650719,446.562104,22450.923508,8796.170612,0.974075,0.933973,1.493337,4650720.0,59.148264,-0.182069,1.626839,1.601119,0.460113,4650671.0,-0.18309,MBH_wk015C.h5
MBH005546845,MBH005546845,652.996924,13238.473697,2707.606755,0.986599,0.903154,1.673776,5546846.0,30.323258,0.480294,3.662275,1.810179,-0.800118,5547205.0,0.500989,MBH_wk015C.h5
MBH006058694,MBH006058694,119.338404,10349.085349,3303.28292,0.96701,0.885836,1.61407,6058695.0,47.235091,0.333101,1.464468,1.373326,-0.442646,6058450.0,0.33959,MBH_wk015C.h5
MBH006253789,MBH006253789,260.016246,51499.191655,5166.812496,0.990408,0.815457,1.460127,6253789.0,24.506354,0.592016,6.08403,1.503608,-0.374348,6253739.0,0.633558,MBH_wk015C.h5
MBH007449510,MBH007449510,72.207804,38276.504295,2615.575967,0.970797,0.368108,1.559661,7449511.0,100.93043,-0.682983,0.640946,1.612438,0.667808,7449392.0,-0.751839,MBH_wk015C.h5
MBH007807200,MBH007807200,189597.62955,641359.177017,83719.952286,0.988151,0.986588,0.975498,7807201.0,14.897055,0.981188,4.567808,1.592036,-0.570242,7807296.0,1.376523,MBH_wk015C.h5
