This notebook uses the data generated with [Ethtx](https://github.com/EthTx) using their [beta data warehouses](https://tokenflow.live/blog/edw-open). The data refers to the transactions of the [LANDProxy](https://etherscan.io/address/0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d) contract and the subcalls of each transaction.

The goal is to produce a dataframe for each unique `FUNCTION_NAME` contained in the data. On such dataframes, all the transactions and subcalls for the `FUNCTION_NAME` are present.

In [100]:
import glob, os
import pandas as pd

pd.set_option('display.max_colwidth', None)

# path = r'../data/LAND_decoded_calls'
# all_files = glob.glob(os.path.join(path, "*.csv"))
# df = pd.concat((pd.read_csv(f,  sep=",", engine="python", escapechar='\\')
#                for f in all_files))

df = pd.read_csv(r'../data/LAND_decoded_calls\LAND_decoded_calls_0_0_0.csv', sep=",", engine="python", escapechar='\\')

print(df.shape[0])
print(df.columns)

9724
Index(['LOAD_ID', 'CHAIN_ID', 'BLOCK', 'TIMESTAMP', 'TX_HASH', 'CALL_ID',
       'CALL_TYPE', 'FROM_ADDRESS', 'FROM_NAME', 'TO_ADDRESS', 'TO_NAME',
       'FUNCTION_SIGNATURE', 'FUNCTION_NAME', 'VALUE', 'ARGUMENTS',
       'RAW_ARGUMENTS', 'OUTPUTS', 'RAW_OUTPUTS', 'GAS_USED', 'ERROR',
       'STATUS', 'ORDER_INDEX', 'DECODING_STATUS', 'STORAGE_ADDRESS'],
      dtype='object')


Since transactions can happen inside the same block, they will have the same timestamp. To give a time order to the records, we sort them using `TIMESTAMP` and `ORDER_INDEX` fields and add incrementally 1 second to records with same timestamp.

Moreover, we add the `ORIGIN_ADDRESS` field to the transaction record and to the related subcalls, and the `FUNCTION_NAME` of the subcalls is prefixed with the `FROM_NAME` (e.g. `LAND.approve`).

In [101]:
df = df[df['ERROR'] == '\\N'] # remove errored transactions

df = df.sort_values(by=["TIMESTAMP", "ORDER_INDEX"]) # sort by TIMESTAMP and ORDER_INDEX
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP']) # convert TIMESTAMP from object to datetime

last_timestamp = ""
counter = 1
user_address = ""
function_name = ""
calls_length = 0

for index, row in df.iterrows():
    if(row["TIMESTAMP"] == last_timestamp):
        counter = counter + 1
        new_timestamp = pd.to_datetime(row["TIMESTAMP"] + pd.to_timedelta(counter, unit='s'))
    else:
        last_timestamp = row["TIMESTAMP"]
        counter = 1
        new_timestamp = pd.to_datetime(row["TIMESTAMP"] + pd.to_timedelta(counter, unit='s')) 

    df.at[index,'TIMESTAMP'] = new_timestamp

    print(calls_length)
    if(row["CALL_ID"] == "\\N"):
        if(calls_length == 1):
            prevIndex = index - 1
            df.at[prevIndex, function_name] = 1

        user_address = row["FROM_ADDRESS"]
        function_name = row["FUNCTION_NAME"]
        calls_length = 0
    else:
        calls_length = calls_length + 1
        df.at[index, 'FUNCTION_NAME'] = f"{function_name}_{row['FUNCTION_NAME']}"
        df.at[index, function_name] = row["TX_HASH"]

    df.at[index, 'ORIGIN_ADDRESS'] = user_address

0
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
3
4
5
6
7
8
9


Create the `.xes` log with the `ORIGIN_ADDRESS` as trace key

In [102]:
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter

df = dataframe_utils.convert_timestamp_columns_in_df(
    df)
df = df.sort_values(
    by=["TIMESTAMP"])

#! remove the records with `CALL_ID == \\N` since we are interested only on the internal calls
df = df[df['CALL_ID'] != '\\N']

# create columns: from -> case:concept:name, inputFunctionName -> concept:name, timeStamp -> time:timestamp, from -> org:resource
df["org:resource"] = df["FROM_ADDRESS"]
df["case:concept:name"] = df["ORIGIN_ADDRESS"]
df["time:timestamp"] = df["TIMESTAMP"]
df["concept:name"] = df["FUNCTION_NAME"]

# remove unnecessary fields
df.drop(['LOAD_ID', 'CHAIN_ID', 'BLOCK', 'TIMESTAMP', 'TX_HASH', 'CALL_ID',
       'CALL_TYPE', 'FROM_ADDRESS', 'FROM_NAME', 'TO_ADDRESS', 'TO_NAME',
       'FUNCTION_SIGNATURE', 'FUNCTION_NAME', 'VALUE', 'ARGUMENTS',
       'RAW_ARGUMENTS', 'OUTPUTS', 'RAW_OUTPUTS', 'GAS_USED', 'ERROR',
       'STATUS', 'ORDER_INDEX', 'DECODING_STATUS', 'STORAGE_ADDRESS'], axis=1, inplace=True)

# specify that the field identifying the case identifier attribute is the field with name 'case:concept:name'
parameters = {
    log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: 'case:concept:name'}
log = log_converter.apply(df, parameters=parameters,
                          variant=log_converter.Variants.TO_EVENT_LOG)


xes_exporter.apply(
    log, "../data/logs/test.xes")

exporting log, completed traces :: 100%|██████████| 849/849 [00:00<00:00, 1123.08it/s]
