In [1]:
import numpy as np
import pandas as pd

import dask.dataframe as dd
import dask.array as da
import dask.bag as db
import dask

import re

## Importing all files as Dask DataFrames
(output_*.csv, evolved_*.dat, logfile_*.dat)

In [2]:
# Directory that holds the run products (output_*.csv, evolved_*.dat, logfile_*.dat).
# Kept in one place so the three readers below cannot drift apart.
DATA_DIR = 'sevn_output_Z0.001A1L1/sevn_output_Z0.001A1L1/0'

# Each glob is read into a multi-partition Dask collection.
output = dd.read_csv(f'{DATA_DIR}/output_*.csv')
# Raw string for the regex separator: '\s' in a plain literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
evolved = dd.read_table(f'{DATA_DIR}/evolved_*.dat', sep=r'\s+')
# header=None keeps the first log line as data; squeeze() is relied on to give a
# single string column so that `.str` can be used on `logfile` further down.
# NOTE(review): this assumes the log lines contain no ',' (the default
# delimiter), otherwise more than one column is produced -- confirm.
logfile = dd.read_csv(f'{DATA_DIR}/logfile_*.dat', header=None).squeeze()
# logfile = dd.read_table('sevn_output_Z0.001A1L1/sevn_output_Z0.001A1L1/0/logfile_*.dat', sep=';', header=None, names=['Object', 'name', 'ID', 'event', 'time', 'info'])

print(f'Number of pratitions for evolved files : {evolved.npartitions}')
print(f'Number of pratitions for output files : {output.npartitions}')
print(f'Number of pratitions for logfile files : {logfile.npartitions}')

Number of pratitions for evolved files : 30
Number of pratitions for output files : 60
Number of pratitions for logfile files : 30


## Filtering the logfiles to count the number of Roche-lobe overflow (RLO) and common-envelope (CE) events

In [3]:
def _count_log_events(log_lines, event, count_col):
    """Count, per binary name, the log lines that record a given event.

    Parameters
    ----------
    log_lines : Series-like of str
        One raw log line per element; must expose the `.str` accessor
        (here the Dask Series `logfile`).
    event : str
        Event label exactly as written in the log, e.g. 'RLO_BEGIN' or 'CE'.
    count_col : str
        Name given to the column holding the number of matching lines.

    Returns
    -------
    DataFrame-like
        Columns 'name' and `count_col`. Only names with at least one
        matching line are present; names without a match get no row.
    """
    # Lines of the form "B;<name>;<ID>;<event>;..." -- group 0 is the name,
    # group 1 the ID. The ID is captured but does not enter the count.
    pattern = rf"B;((?:\d*\_)?\d+);(\d+);{re.escape(event)};"
    return (
        log_lines.str.extract(pattern)
        .dropna()  # non-matching lines come back as all-NaN rows
        .rename(columns={0: 'name', 1: 'ID'})
        .groupby('name')
        .size()
        .to_frame(name=count_col)
        .reset_index()
    )


df_RLO = _count_log_events(logfile, 'RLO_BEGIN', 'RLO')
df_CE = _count_log_events(logfile, 'CE', 'CE')

This is an alternative method. The previous one is better, since it parallelizes all the operations.

In [4]:
# df_logfile = logfile.str.findall(r"B;((?:\d*\_)?\d+);(\d+);RLO_BEGIN;").to_frame(name='id').explode('id').dropna().compute()
# df_logfile[['name','ID']] = pd.DataFrame(df_logfile['id'].tolist(), index=df_logfile.index)
# df_logfile =df_logfile.drop(columns=['id']).groupby('name').size().to_frame(name='RLO').reset_index()

# df_logfile.head(10)

## Filtering output files to keep only binary black holes

In [5]:
# Boolean row mask: both components have remnant type 6 (the code this
# section treats as a black hole) and the semi-major axis is not null.
# NOTE(review): a null Semimajor presumably marks a system that is no longer
# a bound binary -- confirm against the output format.
idxBHBH = (
    (output['RemnantType_0'] == 6)
    & (output['RemnantType_1'] == 6)
    & output['Semimajor'].notnull()
)

# Still a distributed (Dask) DataFrame, restricted to the rows selected above.
output_bhbh = output[idxBHBH]
# output_bhbh = output_bhbh.compute()

## Joining tables
Because of the reshuffling among the partitions, it is not clear whether it is more convenient to join the tables through Dask or locally (https://docs.dask.org/en/stable/dataframe-groupby.html)

In [6]:
# Attach the `evolved` columns and the per-binary RLO / CE counts to the
# filtered output rows, matching every table on the shared 'name' key.
# NOTE(review): all three joins are inner joins, so a binary that has no row
# in df_RLO or df_CE (no matching log line) is dropped from `bhbh` rather
# than kept with a count of 0 -- confirm this is the intended selection.
bhbh = (
    output_bhbh
    .merge(evolved, how='inner', on='name')
    .merge(df_RLO, how='inner', on='name')
    .merge(df_CE, how='inner', on='name')
)

In [7]:
# Preview the first rows of the joined table (default row count of head()).
# NOTE(review): `bhbh` is built from Dask collections, so this is presumably
# the first point where the lazy pipeline above is actually evaluated.
bhbh.head()

Unnamed: 0,ID,name,Mass_0_x,MHE_0,MCO_0,Radius_0,RHE_0,RCO_0,Luminosity_0,Temperature_0,...,spin_1,SN_1,Tstart_1,a,e,Tend,Dtout,Seed,RLO,CE
0,98,0_724296633239333,25.35502,0.0,0.0,0.000108,0.0,0.0,1e-10,1759.363,...,0.0,rapid_gauNS,zams,2690.0,0.0264,end,events,324470267133292,2,2
1,302986,0_710936744924376,41.39549,0.0,0.0,0.000176,0.0,0.0,1e-10,1376.927,...,0.0,rapid_gauNS,zams,4700.0,0.532,end,events,190919426911395,2,1
2,303827,0_712650372297186,17.9499,0.0,0.0,7.6e-05,0.0,0.0,1e-10,2091.012,...,0.0,rapid_gauNS,zams,435.0,0.0711,end,events,44464430933553,2,1
3,304803,0_447617162191296,8.636704,0.0,0.0,3.7e-05,0.0,0.0,1e-10,3014.487,...,0.0,rapid_gauNS,zams,89.8,0.00204,end,events,659506961639401,3,1
4,103663,0_933499514149433,25.235,0.0,0.0,0.000107,0.0,0.0,1e-10,1763.542,...,0.0,rapid_gauNS,zams,323.0,0.00775,end,events,27886306847488,2,1
