This notebook uses the data generated with [Ethtx](https://github.com/EthTx) using their [beta data warehouses](https://tokenflow.live/blog/edw-open). The data refers to the transactions of the [LANDProxy](https://etherscan.io/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d) contract and the subcalls of each transaction.

The goal is to produce one dataframe for each unique `FUNCTION_NAME` contained in the data. Each of these dataframes contains all the transactions and subcalls for that `FUNCTION_NAME`.

In [2]:
import glob, os
import pandas as pd

# Show full cell contents when frames are displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)

# Folder that holds the decoded-call CSV exports.
path = r'../data/LAND_decoded_calls'

# To load every export in the folder instead of the hand-picked list below:
# all_files = glob.glob(os.path.join(path, "*.csv"))

# Exports to load; uncomment an entry to include it.
file_names = [
    'LAND_decoded_calls_1_6_0.csv',
    # 'LAND_decoded_calls_0_1_0.csv',
    # 'LAND_decoded_calls_2_4_0.csv',
    # 'LAND_decoded_calls_3_3_0.csv',
    # 'LAND_decoded_calls_0_4_0.csv',
]

# os.path.join picks the separator of the host OS; a hard-coded backslash
# separator only resolves on Windows.
all_files = [os.path.join(path, file_name) for file_name in file_names]

# Backslash is the escape character inside the CSV fields.
df = pd.concat((pd.read_csv(f,  sep=",", engine="python", escapechar='\\')
               for f in all_files))


print(df.shape[0])
print(df.columns)



10737
Index(['LOAD_ID', 'CHAIN_ID', 'BLOCK', 'TIMESTAMP', 'TX_HASH', 'CALL_ID',
       'CALL_TYPE', 'FROM_ADDRESS', 'FROM_NAME', 'TO_ADDRESS', 'TO_NAME',
       'FUNCTION_SIGNATURE', 'FUNCTION_NAME', 'VALUE', 'ARGUMENTS',
       'RAW_ARGUMENTS', 'OUTPUTS', 'RAW_OUTPUTS', 'GAS_USED', 'ERROR',
       'STATUS', 'ORDER_INDEX', 'DECODING_STATUS', 'STORAGE_ADDRESS'],
      dtype='object')


To give the records an order, we sort them by the `TIMESTAMP` and `ORDER_INDEX` fields and then reset the index, so that the row labels seen while iterating with `iterrows()` run from 0 to `len(df) - 1`.

To make the dataset work with [BPMN Miner](https://www.sciencedirect.com/science/article/abs/pii/S0306437915001325) we need to make some changes to the records. The `FUNCTION_NAME` of the subcalls is prefixed with `{function_name}_{row['FROM_NAME']}.` (e.g. `approve_LAND.approve`) to highlight the sub-process name (`function_name`) and the smart contract that is calling it (`row['FROM_NAME']`). The `id` column is added to the transaction record and to the related subcalls to group them in traces. This column contains the address of the user invoking the "top-level" transaction.
Moreover, we need to add new columns with the name of the "top-level" transaction method (e.g. `destroy`, `transferFrom`, `createEstate`) to the subcalls in order to group them in subprocesses. These columns have as values the hash of the top-level transaction. 

For events belonging to subcall sequences of length 1, the name of the corresponding `function_name` column is prefixed with `FK_` to indicate that the column should not be selected during the primary key selection step in ProM.

In [3]:
# Failed calls are deliberately kept. The filter
#   df = df[df['ERROR'] == '\\N']
# is left disabled because, per the original note, it disturbs the order of
# the transaction calls.

# Order by TIMESTAMP, then ORDER_INDEX, and rebuild the index as 0..n-1:
# the annotation loop compares the row label against len(df) - 1.
# NOTE(review): rows of different transactions that share a TIMESTAMP are
# separated only by ORDER_INDEX -- confirm their calls cannot interleave.
df = df.sort_values(by=["TIMESTAMP", "ORDER_INDEX"]).reset_index(drop=True)

# State carried from row to row by the annotation loop.
user_address, function_name = "", ""  # sender / method of the current top-level call
calls_length = 0                      # subcalls seen since the last top-level call
columns = {}                          # column name -> final name (plain or "FK_"-prefixed)

def update_columns(calls_length, function_name):
    """Record the final column name for ``function_name`` in ``columns``.

    Writes to the module-level ``columns`` dict, which is later used as the
    rename mapping for the dataframe.

    Args:
        calls_length: Number of subcalls counted for the transaction that
            just ended.
        function_name: Name of that transaction's top-level method; it is
            also the name of the column being registered.

    Behaviour:
        * ``calls_length != 1``: the column keeps its plain name, overwriting
          any ``FK_`` entry stored earlier.
        * ``calls_length == 1``: the column is mapped to ``FK_<name>``, unless
          it is already registered under its plain name.
    """
    if calls_length != 1:
        columns[function_name] = function_name
        return

    # A plain-name registration is never downgraded to the FK_ form.
    if columns.get(function_name) != function_name:
        columns[function_name] = f"FK_{function_name}"

# Label of the final row; the loop adds columns to df but never rows.
last_index = len(df) - 1

for index, row in df.iterrows():
    if row["CALL_ID"] != "\\N":
        # Subcall of the current top-level transaction: tag the activity with
        # the sub-process name and the calling contract, and store the
        # transaction hash in the column named after the top-level method.
        calls_length += 1
        df.at[index, 'FUNCTION_NAME'] = f"{function_name}_{row['FROM_NAME']}.{row['FUNCTION_NAME']}"
        df.at[index, function_name] = row["TX_HASH"]
    else:
        # A new top-level transaction starts: first register the column name
        # of the transaction that just ended, then reset the running state.
        update_columns(calls_length=calls_length, function_name=function_name)

        calls_length = 0
        user_address = row["FROM_ADDRESS"]
        function_name = row["FUNCTION_NAME"]
        df.at[index, 'FUNCTION_NAME'] = f"{row['TO_NAME']}.{function_name}"

    # The last transaction has no successor to trigger its registration.
    if index == last_index:
        update_columns(calls_length=calls_length, function_name=function_name)

    # Every row is grouped under the sender of its top-level transaction.
    df.at[index, 'id'] = user_address

# Apply the registered names (plain or "FK_"-prefixed) to the sub-process columns.
df = df.rename(columns=columns)

Create the `.xes` log with the `id` column as trace key and remove "nan" attributes.

In [4]:
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter

# Run pm4py's timestamp-column conversion helper on the frame.
df = dataframe_utils.convert_timestamp_columns_in_df(df)

# XES standard attribute -> column it is copied from.
xes_sources = {
    "case:concept:name": "id",
    "time:timestamp": "TIMESTAMP",
    "concept:name": "FUNCTION_NAME",
}
for xes_name, source_name in xes_sources.items():
    df[xes_name] = df[source_name]

# Raw warehouse columns that are not needed in the exported log.
raw_columns = [
    'LOAD_ID', 'CHAIN_ID', 'BLOCK', 'TIMESTAMP', 'TX_HASH', 'CALL_ID',
    'CALL_TYPE', 'FROM_ADDRESS', 'FROM_NAME', 'TO_ADDRESS', 'TO_NAME',
    'FUNCTION_SIGNATURE', 'FUNCTION_NAME', 'VALUE', 'ARGUMENTS',
    'RAW_ARGUMENTS', 'OUTPUTS', 'RAW_OUTPUTS', 'GAS_USED', 'ERROR',
    'STATUS', 'ORDER_INDEX', 'DECODING_STATUS', 'STORAGE_ADDRESS',
]
df.drop(columns=raw_columns, inplace=True)

# Tell the converter that 'case:concept:name' is the case identifier column.
case_id_key = log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY
parameters = {case_id_key: 'case:concept:name'}
log = log_converter.apply(df,
                          variant=log_converter.Variants.TO_EVENT_LOG,
                          parameters=parameters)

# Total number of events over all traces.
events = sum(len(trace) for trace in log)

# Replace each event with a plain dict holding only the attributes whose
# value contains no NaN/NA.
for trace in log:
    for position, event in enumerate(trace):
        trace[position] = {
            key: value
            for key, value in event.items()
            if pd.Series(value).notna().all()
        }

print(f"Traces: {len(log)}")
print(f"Events: {events}")

# Columns left besides 'id', 'case:concept:name', 'time:timestamp' and 'concept:name'.
events_type = len(df.columns) - 4

print(f"Events type: {events_type}")
print(f"Duplication ratio: {events / events_type}")

xes_exporter.apply(log, "../data/logs/land_proxy_internal_160.xes")

  from .autonotebook import tqdm as notebook_tqdm


Traces: 840
Events: 10737
Events type: 15
Duplication ratio: 715.8


exporting log, completed traces :: 100%|██████████| 840/840 [00:00<00:00, 3401.51it/s]
