In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns

import dask.dataframe as dd
import dask.array as da
import dask.bag as db
import dask
from dask.distributed import Client, LocalCluster,SSHCluster
import dask.distributed

# import hvplot.dask

In [2]:
# Hosts handed to SSHCluster. Per the Dask documentation the first entry runs
# the scheduler and the remaining entries run workers, which is why "bhbh-1"
# is listed twice (scheduler + one worker).
ssh_hosts = ["bhbh-1", "bhbh-1", "bhbh-2", "bhbh-3"]

cluster = SSHCluster(
    ssh_hosts,
    # Fixed scheduler port; dashboard served on port 8787.
    scheduler_options={"port": 8786, "dashboard_address": ":8787"},
    # NOTE(review): the original comment stated that each bhbh-* host has
    # 4 cores, yet 8 threads per worker are requested -- confirm that this
    # oversubscription is intended.
    worker_options={"nthreads": 8},
    # Private key used for the SSH connections to the hosts.
    connect_options={"client_keys": "/home/ubuntu/private/tbertola_key.pem"},
)

2023-06-02 16:22:28,806 - distributed.deploy.ssh - INFO - 2023-06-02 16:22:28,804 - distributed.scheduler - INFO - State start
2023-06-02 16:22:28,813 - distributed.deploy.ssh - INFO - 2023-06-02 16:22:28,812 - distributed.scheduler - INFO -   Scheduler at:   tcp://10.67.22.140:8786
2023-06-02 16:22:29,753 - distributed.deploy.ssh - INFO - 2023-06-02 16:22:29,752 - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.140:44459'
2023-06-02 16:22:30,470 - distributed.deploy.ssh - INFO - 2023-06-02 16:22:30,469 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-w03vteix', purging
2023-06-02 16:22:30,502 - distributed.deploy.ssh - INFO - 2023-06-02 16:22:30,501 - distributed.worker - INFO -       Start worker at:   tcp://10.67.22.140:35207
2023-06-02 16:22:31,577 - distributed.deploy.ssh - INFO - 2023-06-02 16:22:31,574 - distributed.nanny - INFO -         Start Nanny at: 'tcp://10.67.22.220:46139'
2023-06-02 16:22:31,839 - dis

In [3]:
# Connect this notebook session to the scheduler of the SSH cluster created above.
# NOTE: the recorded output of this cell is a version-mismatch table: the workers
# report tornado 6.2 and 6.3.2 while client and scheduler both run 6.3.2.
client = Client(cluster)


+---------+--------+-----------+------------------+
| Package | Client | Scheduler | Workers          |
+---------+--------+-----------+------------------+
| tornado | 6.3.2  | 6.3.2     | {'6.2', '6.3.2'} |
+---------+--------+-----------+------------------+


In [4]:
# Display the worker processes managed by the cluster
# (the recorded run lists three workers, all with status=running).
cluster.workers

{0: <distributed.deploy.ssh.Worker: status=running>,
 1: <distributed.deploy.ssh.Worker: status=running>,
 2: <distributed.deploy.ssh.Worker: status=running>}

## Importing all files as Dask DataFrames
(output_*.csv, evolved_*.dat, logfile_*.dat)

In [5]:
# Per-star quantities of the output tables that are discarded; each of them is
# removed for both suffixes (_0 and _1).
_per_star_output_fields = ('MHE', 'MCO', 'Radius', 'RHE', 'RCO', 'Luminosity',
                           'Temperature', 'Lambda', 'Phase', 'PhaseBSE', 'Zams')

# Columns dropped from the output_*.csv tables: 'ID' followed by every per-star
# field above (all *_0 names first, then all *_1 names).
output_column_to_remove = ['ID'] + [
    f'{field}_{star}' for star in (0, 1) for field in _per_star_output_fields
]

# Columns dropped from the evolved_*.dat tables.
evolved_column_to_remove = [
    '#ID',
    'spin_0', 'Tstart_0',
    'spin_1', 'Tstart_1',
    'Tend', 'Dtout', 'Seed',
]

In [25]:
#sevn_output_Z*/sevn_output_Z*/0
#sevn_output_Z0.01*/sevn_output_Z0.01*


# All three file families are read from the same run directories; only the
# file-name pattern differs between them.
run_dir_glob = '/mnt/bhbh/fiducial_Hrad_5M/sevn_output_Z0.0001*/0'

# Every reader uses include_path_column=True so each row carries the path of the
# file it came from (parsed later for Z / alpha / directory), and blocksize=None
# so that files are not split into several partitions.

# output_*.csv: rename the two mass columns to *_out and drop the unused columns.
output = (
    dd.read_csv(f'{run_dir_glob}/output_*.csv',
                include_path_column=True, blocksize=None)
    .rename(columns={'Mass_0': 'Mass_0_out', 'Mass_1': 'Mass_1_out'})
    .drop(columns=output_column_to_remove)
    # .repartition(npartitions=16)
)

# evolved_*.dat: whitespace-separated table. The separator is written as a raw
# string: '\s' inside a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
evolved = (
    dd.read_table(f'{run_dir_glob}/evolved_*.dat',
                  include_path_column=True, sep=r'\s+', blocksize=None)
    .drop(columns=evolved_column_to_remove)
    # .repartition(npartitions=16)
)

# logfile_*.dat: read without a header row; the lines are parsed with regular
# expressions further down.
logfile = dd.read_csv(f'{run_dir_glob}/logfile_*.dat',
                      header=None, include_path_column=True, blocksize=None)
# .repartition(npartitions=16)

print(f'Number of pratitions for evolved files : {evolved.npartitions}')
print(f'Number of pratitions for output files : {output.npartitions}')
print(f'Number of pratitions for logfile files : {logfile.npartitions}')

Number of pratitions for evolved files : 120
Number of pratitions for output files : 120
Number of pratitions for logfile files : 120


In [26]:
# for i in [output, evolved, logfile]:
#     print(i.columns, '\n')

In [28]:
# paths=output['path'].head()

In [None]:
# paths[0]

In [None]:
# output.dtypes

In [None]:
# evolved.dtypes

In [None]:
# logfile.dtypes

# Output: filtering to get only binary black holes, add Z value and Alpha parameter

In [29]:
# Row mask: both remnants have RemnantType 6 and the semi-major axis is defined.
# (Per the section heading, this selects the binary black holes.)
both_remnants_type6 = (output['RemnantType_0'] == 6) & (output['RemnantType_1'] == 6)
idxBHBH = both_remnants_type6 & output['Semimajor'].notnull()
output = output[idxBHBH]

# Metallicity: the token following a 'Z' in the file path, extracted as text
# (single-column frame squeezed to a series) and converted to float.
output['Z_0'] = (
    output['path'].str.extract(r".+((?<=Z)\d.\d+)").squeeze()
    .astype(float)
)

# Alpha: the text between an 'A' and a following 'L' in the file path,
# extracted as text and converted to float.
output['alpha'] = (
    output['path'].str.extract(r".+(?<=A)(.*)(?=L)").squeeze()
    .astype(float)
)


# Evolved: add the Alpha parameter

In [30]:
# Alpha: the text between an 'A' and a following 'L' in the file path,
# extracted as text (single-column frame squeezed to a series), then cast to float.
evolved['alpha'] = (
    evolved['path'].str.extract(r".+(?<=A)(.*)(?=L)").squeeze()
    .astype(float)
)


# Logfile add Z value and Alpha parameter

In [31]:
# Collapse every row of the log table into a single value by summing across columns.
# Presumably all columns are strings (log text split at commas by read_csv, plus the
# path column), so the sum concatenates them back into one line per row -- TODO confirm.
# NOTE(review): this rebinds `logfile` from a frame to a series, so the cell cannot be
# re-run without re-running the loading cell first.
logfile = logfile.sum(axis=1)

In [32]:
def count_log_events(log_lines, event_label, count_column):
    """Count the log lines of one event type per (name, Z_0, alpha).

    Parameters
    ----------
    log_lines : string series (Dask or pandas)
        One log line per element. The pattern expects a "B;<name>;<ID>;<event>;"
        record followed, later in the same string, by a 'Z<number>' token and an
        'A<...>L' token (presumably coming from the appended file path -- confirm).
    event_label : str
        Event tag to match; it is inserted verbatim into the regular expression,
        so it must not contain regex metacharacters.
    count_column : str
        Name of the column that receives the number of matching lines.

    Returns
    -------
    Frame with the columns 'name', 'Z_0' (float), 'alpha' (float) and
    `count_column`; one row per (name, Z_0, alpha) group.
    """
    pattern = (r"B;((?:\d*\_)?\d+);(\d+);" + event_label
               + r";.+((?<=Z)\d.\d+).+(?<=A)(.*)(?=L)")
    counts = (
        log_lines.str.extract(pattern)
        .dropna()  # lines that do not match the pattern yield missing values
        .rename(columns={0: 'name', 1: 'ID', 2: 'Z_0', 3: 'alpha'})
        .groupby(['name', 'Z_0', 'alpha'])
        .size()
        .to_frame(name=count_column)
        .reset_index()
    )
    # Convert the extracted text to numbers using the table's OWN columns.
    # The previous code assigned df_RLO's Z_0/alpha to df_CE and df_BSN,
    # overwriting their group keys with values from an unrelated table.
    counts['Z_0'] = counts['Z_0'].astype(float)
    counts['alpha'] = counts['alpha'].astype(float)
    return counts


df_RLO = count_log_events(logfile, 'RLO_BEGIN', 'RLO')
df_CE = count_log_events(logfile, 'CE', 'CE')
df_BSN = count_log_events(logfile, 'BSN', 'BSN')

In [33]:
# df_BSN.compute()

## Joining tables
Because joins reshuffle data among the partitions, it is not clear whether it is more convenient to join the tables through Dask or locally (https://docs.dask.org/en/stable/dataframe-groupby.html).

In [34]:
# Columns to discard after the join; only referenced by the commented-out
# `.drop(columns=drop_list)` step of the merge cell further down.
# 'path_x' / 'path_y' are presumably the suffixed 'path' columns produced by the merge -- confirm.
drop_list = ['RemnantType_0',  'RemnantType_1', 'path_x', 'path_y']

Useful references on Dask merges and on `set_index`:

https://www.coiled.io/blog/dask-dataframe-merge-join
https://www.coiled.io/blog/dask-set-index-dataframe

In [None]:
# ev=evolved.head()

In [None]:
# type(evolved['path'].str.extract(r"(?<=5M/)(.*)(?=/0)"))

In [None]:
# Directory label of each row: the part of the file path found between
# "5M/" and "/0". The same pattern is applied to both tables.
dir_pattern = r"(?<=5M/)(.*)(?=/0)"
evolved['dir'] = evolved['path'].str.extract(dir_pattern).squeeze()
output['dir'] = output['path'].str.extract(dir_pattern).squeeze()

In [37]:
# Materialise the whole filtered `output` table on the client.
# NOTE: in the recorded run this failed with KilledWorker (an 'assign' task was
# retried on 3 workers that all died); check the worker logs before re-running.
output.compute()

KilledWorker: Attempted to run task ('assign-890cffe053cff1b8989fff511c946b62', 10) on 3 different workers, but all those workers died while running it. The last worker that attempt to run the task was tcp://10.67.22.220:35483. Inspecting worker logs is often a good next step to diagnose what went wrong. For more information see https://distributed.dask.org/en/stable/killed.html.

In [35]:
# 'thr': the part of the file name between the "<kind>_" prefix and the extension.
# The evolved files are read from 'evolved_*.dat', so their lookahead must be
# '.dat'; with the previous '.csv' lookahead the pattern never matched and 'thr'
# (and therefore 'idx') was missing for every evolved row.
evolved['thr'] = evolved['path'].str.extract(r"(?<=0\/evolved_)(.*)(?=\.dat)").squeeze()
output['thr'] = output['path'].str.extract(r"(?<=0\/output_)(.*)(?=\.csv)").squeeze()

# 'idx': directory label concatenated with the file tag.
evolved['idx'] = evolved['dir'] + evolved['thr']
output['idx'] = output['dir'] + output['thr']
# logfile['dir']=logfile['path'].str.extract(r"(?<=5M/)(.*)(?=/0)").squeeze()

In [36]:
# Materialise the whole `output` table on the client.
# NOTE: the recorded run of this cell was interrupted manually (KeyboardInterrupt).
output.compute()

KeyboardInterrupt: 

In [None]:
# Read the list of directory names (one per line) used as divisions when the
# tables are indexed by 'dir'. The context manager closes the file handle,
# which the previous open()/readlines() version never did.
with open('lista_directories', 'r') as f:
    part = [line.strip() for line in f]

In [None]:
# large_sorted = large.set_index("id", divisions=unique_divisions)

In [None]:
# Index both tables by 'dir', passing the directory list read from 'lista_directories'
# as explicit divisions.
# NOTE(review): Dask expects divisions to be sorted partition boundaries -- confirm
# that the file lists the directories in sorted order.
evolved= evolved.set_index('dir', divisions=part)
output= output.set_index('dir', divisions=part)

In [None]:
# cluster.close()

In [None]:
# evolved.compute()
# dask.compute(evolved)

In [None]:
# Number of division boundaries of `output` after it was re-indexed by 'dir'.
len(output.divisions)


In [None]:
# dask.compute( output)

In [None]:
# Start computing both indexed tables on the cluster and keep the results in
# worker memory, so that the merge below does not re-read and re-index the files.
evolved=evolved.persist()
output=output.persist()

In [None]:
# Inner join of the evolved and output tables on ('dir', 'name').
# NOTE(review): 'dir' was turned into the index by the set_index cell -- confirm that
# merging on an index name together with the 'name' column behaves as intended.
# The commented-out continuation would left-join the RLO / CE / BSN event counts,
# fill missing counts with 0 and drop the columns in `drop_list`.
bhbh = evolved.merge(output, on=['dir','name'], how='inner')#.\
#                merge(df_RLO, on=['name','Z_0', 'alpha'], how='left')#.\
#                merge(df_CE,  on=['name','Z_0', 'alpha'], how='left').\
#                merge(df_BSN, on=['name','Z_0', 'alpha'], how='left').\
#                fillna(value=0).\
#                drop(columns=drop_list)


In [None]:
# Materialise the joined table on the client (executes the merge on the cluster).
bhbh.compute()

In [None]:
# Shorthands for the two output masses used by all mass-derived columns below.
mass_0 = bhbh['Mass_0_out']
mass_1 = bhbh['Mass_1_out']

# tdelay: sum of the GWtime and BWorldtime columns.
bhbh['tdelay'] = bhbh['GWtime'] + bhbh['BWorldtime']

#bhbh.drop(columns=['GWtime', 'BWorldtime'])

# Mass_max_out: Mass_1_out where it exceeds Mass_0_out, otherwise Mass_0_out.
bhbh['Mass_max_out'] = mass_1.where(cond=(mass_1 > mass_0), other=mass_0)

# q: Mass_1_out / Mass_0_out where Mass_1_out is the smaller mass,
# otherwise the inverse ratio.
bhbh['q'] = (mass_1 / mass_0).where(cond=(mass_1 < mass_0), other=mass_0 / mass_1)

# Mass_chirp: (m0 * m1)^(3/5) / (m0 + m1)^(1/5).
bhbh['Mass_chirp'] = ((mass_0 * mass_1)**(3/5)) / ((mass_0 + mass_1)**(1/5))

In [None]:
# List the column names of the joined table, including the derived columns added above.
bhbh.columns

In [None]:
# Show the dtype of every column of the joined table.
bhbh.dtypes

In [None]:
# Materialise the joined table, now including the derived columns, on the client.
bhbh.compute()

# Save in a distributed way 

In [None]:
# ! touch /mnt/bhbh/test.txt

In [None]:
# !echo test > /mnt/bhbh/test.txt

In [None]:
# test_pd=pd.DataFrame(data=[[i for i in range(4)] for j in range(1000000)], columns=[str(k) for k in range(4)])

# test_dd=dd.from_pandas(test_pd, npartitions=20)

# dd.to_parquet(test_dd, '/mnt/bhbh/test_parquet')

# The real saving

In [None]:
# bhbh.to_csv('./BHBH_partitioned') 

In [None]:
# bhbh.to_parquet('/mnt/bhbh/partitioned_summary')