## Import libraries

In [1]:
import numpy as np
import pandas as pd

import dask.dataframe as dd
import dask.array as da
import dask.bag as db
import dask

import re

## Importing all files as Dask DataFrames
(output_*.csv, evolved_*.dat, logfile_*.dat)

In [2]:
# All result files of this run live in the same directory; build the three
# glob patterns from a single base path so they cannot drift apart.
DATA_DIR = '../Data/sevn_output_Z0.001A1L1/sevn_output_Z0.001A1L1/0'

# output_*.csv: read with the default read_csv settings.
output = dd.read_csv(f'{DATA_DIR}/output_*.csv')

# evolved_*.dat: whitespace-separated. The separator is a regular expression,
# so it is written as a raw string: '\s' inside a plain string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
evolved = dd.read_table(f'{DATA_DIR}/evolved_*.dat', sep=r'\s+')

# logfile_*.dat: read without a header row. squeeze() is applied so that the
# result can be used through the `.str` accessor later on
# (presumably each log line ends up in a single column -- TODO confirm).
logfile = dd.read_csv(f'{DATA_DIR}/logfile_*.dat', header=None).squeeze()
# logfile = dd.read_table('sevn_output_Z0.001A1L1/sevn_output_Z0.001A1L1/0/logfile_*.dat', sep=';', header=None, names=['Object', 'name', 'ID', 'event', 'time', 'info'])

# Report how many partitions Dask created for each collection.
print(f'Number of pratitions for evolved files : {evolved.npartitions}')
print(f'Number of pratitions for output files : {output.npartitions}')
print(f'Number of pratitions for logfile files : {logfile.npartitions}')

Number of pratitions for evolved files : 30
Number of pratitions for output files : 60
Number of pratitions for logfile files : 30


## Filtering the log files to count the number of Roche-lobe overflow (RLO) and common-envelope (CE) events

In [3]:
def _count_binary_events(log_lines, pattern, count_col):
    """Count, for every binary name, the log lines matching ``pattern``.

    Parameters
    ----------
    log_lines : string Series with one log line per element.
    pattern : regular expression with two capture groups; group 0 is stored
        as 'name' and group 1 as 'ID'.
    count_col : name given to the column holding the number of matches.

    Returns
    -------
    A frame with the columns 'name' and ``count_col``; names without any
    matching line do not appear in it.
    """
    # Lines that do not match yield NaN in both groups and are dropped.
    matches = log_lines.str.extract(pattern).dropna()
    matches = matches.rename(columns={0: 'name', 1: 'ID'})
    per_name = matches.groupby('name').size()
    return per_name.to_frame(name=count_col).reset_index()


# Number of RLO_BEGIN and CE log entries per binary.
df_RLO = _count_binary_events(logfile, r"B;((?:\d*\_)?\d+);(\d+);RLO_BEGIN;", 'RLO')
df_CE = _count_binary_events(logfile, r"B;((?:\d*\_)?\d+);(\d+);CE;", 'CE')

This is an alternative method. The previous one is better, since it parallelizes all the operations.

In [4]:
# df_logfile = logfile.str.findall(r"B;((?:\d*\_)?\d+);(\d+);RLO_BEGIN;").to_frame(name='id').explode('id').dropna().compute()
# df_logfile[['name','ID']] = pd.DataFrame(df_logfile['id'].tolist(), index=df_logfile.index)
# df_logfile =df_logfile.drop(columns=['id']).groupby('name').size().to_frame(name='RLO').reset_index()

# df_logfile.head(10)

## Filtering the output files to keep only binary black holes

In [5]:
# Boolean mask selecting rows where both components have RemnantType == 6
# (black holes, per the section title) and the semi-major axis is not null.
idxBHBH = (
    (output['RemnantType_0'] == 6)
    & (output['RemnantType_1'] == 6)
    & output['Semimajor'].notnull()
)

# Distributed DataFrame containing only the binary black holes.
output_bhbh = output[idxBHBH]
# output_bhbh = output_bhbh.compute()

## Joining tables
Because of the reshuffling among partitions, it is not clear whether it is more convenient to join the tables through Dask or locally (https://docs.dask.org/en/stable/dataframe-groupby.html).

In [11]:
# Build the joined table, always matching on the 'name' column.
bhbh = (
    output_bhbh
    # Inner join: keep only the systems that are also present in `evolved`.
    .merge(evolved, on='name', how='inner')
    # Left joins: systems without a counted event keep a missing value (NaN)
    # in the 'RLO' / 'CE' column instead of being dropped.
    .merge(df_RLO, on='name', how='left')
    .merge(df_CE, on='name', how='left')
)

In [12]:
# Preview the first rows of the joined table.
bhbh.head()

Unnamed: 0,ID,name,Mass_0_x,MHE_0,MCO_0,Radius_0,RHE_0,RCO_0,Luminosity_0,Temperature_0,...,spin_1,SN_1,Tstart_1,a,e,Tend,Dtout,Seed,RLO,CE
0,113161,0_633970558512000,13.43931,0.0,0.0,5.7e-05,0.0,0.0,1e-10,2416.568,...,0.0,rapid_gauNS,zams,2150.0,0.558,end,events,584653665136967,2.0,2.0
1,115326,0_982302651669916,48.80722,0.0,0.0,0.000207,0.0,0.0,1e-10,1268.076,...,0.0,rapid_gauNS,zams,7850.0,0.428,end,events,517569089613274,,
2,119321,0_192184231174707,35.85404,0.0,0.0,0.000152,0.0,0.0,1e-10,1479.512,...,0.0,rapid_gauNS,zams,96000.0,0.578,end,events,696338637666074,,
3,415724,0_194426475249492,24.70646,0.0,0.0,0.000105,0.0,0.0,1e-10,1782.306,...,0.0,rapid_gauNS,zams,803.0,0.0356,end,events,246916327964595,1.0,1.0
4,703313,0_990268926659684,38.02112,0.0,0.0,0.000161,0.0,0.0,1e-10,1436.73,...,0.0,rapid_gauNS,zams,459.0,0.235,end,events,599453684608335,2.0,


# Some Analysis

In [14]:
# List all the columns of the joined table.
# NOTE(review): the recorded output of this cell already contains 'tdelay',
# a column that is only created by the following cell (executed earlier, as
# In [13]). On a fresh "Restart & Run All" this cell will not list 'tdelay'
# unless the two cells are reordered.
print(bhbh.columns)

Index(['ID', 'name', 'Mass_0_x', 'MHE_0', 'MCO_0', 'Radius_0', 'RHE_0',
       'RCO_0', 'Luminosity_0', 'Temperature_0', 'Lambda_0', 'Phase_0',
       'PhaseBSE_0', 'RemnantType_0', 'Zams_0', 'Mass_1_x', 'MHE_1', 'MCO_1',
       'Radius_1', 'RHE_1', 'RCO_1', 'Luminosity_1', 'Temperature_1',
       'Lambda_1', 'Phase_1', 'PhaseBSE_1', 'RemnantType_1', 'Zams_1',
       'Semimajor', 'Eccentricity', 'GWtime', 'BWorldtime', 'BEvent', '#ID',
       'Mass_0_y', 'Z_0', 'spin_0', 'SN_0', 'Tstart_0', 'Mass_1_y', 'Z_1',
       'spin_1', 'SN_1', 'Tstart_1', 'a', 'e', 'Tend', 'Dtout', 'Seed', 'RLO',
       'CE', 'tdelay'],
      dtype='object')


In [13]:
# Merging (delay) time of each system, stored in the new column 'tdelay':
# the sum of the 'GWtime' and 'BWorldtime' columns.
gw_time = bhbh['GWtime']
world_time = bhbh['BWorldtime']
bhbh['tdelay'] = gw_time + world_time

In [None]:
# Preview the table again (presumably to check the newly added 'tdelay' column).
bhbh.head()