In [None]:
import pandas as pd
import numpy as np

import os
from os.path import join

from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()  

# Base directory for the project data files; raises KeyError if PROJECT_PATH is not set.
path = os.environ['PROJECT_PATH']

### List of unique sender & receiver 

In [None]:
# RETAINED METHODOLOGICAL DOCUMENTATION
# Builds the list of distinct addresses that occur as sender or receiver in the
# refined governance-token transfer file.

# load the refined governance token transfers
df_tt = pd.read_csv(join(path, 'token_transfers_22021209_refined.csv'))

# distinct senders and distinct receivers
unique_sender = df_tt['from_address'].unique()
unique_receiver = df_tt['to_address'].unique()

# stack receivers and senders into one array (receivers first)
array_SR = np.concatenate((unique_receiver, unique_sender))

# np.unique sorts the values and drops addresses that occur on both sides
unique_addresses = np.unique(array_SR)

# wrap the result in a single-column DataFrame so it can be filtered and stored
df_unique_addresses = pd.DataFrame({'unique_addresses': unique_addresses})

In [None]:
# Addresses treated as burner/null addresses; they are excluded from the
# unique-address list in the filter cell that follows.
# Each address is listed exactly once (the zero address used to appear twice).
# Entries are lower-case and matched exactly via `isin` — assumes the addresses
# in the transfer CSV are lower-case as well; TODO confirm.
known_burner_addresses = ['0x0000000000000000000000000000000000000000',
                        '0x0000000000000000000000000000000000000001',
                        '0x0000000000000000000000000000000000000002',
                        '0x0000000000000000000000000000000000000003',
                        '0x0000000000000000000000000000000000000004',
                        '0x0000000000000000000000000000000000000005',
                        '0x0000000000000000000000000000000000000006',
                        '0x0000000000000000000000000000000000000007',
                        '0x000000000000000000000000000000000000dead']

In [None]:

# Keep every address that is NOT on the burner list.
df_unique_addresses_filtered = df_unique_addresses.loc[
    ~df_unique_addresses['unique_addresses'].isin(known_burner_addresses)
]

In [None]:
# Save the filtered address list as CSV.
# The DataFrame index is written as the first CSV column (pandas default), so the
# addresses land in field 2 — the awk filter in the Bash section reads `$2`.
df_unique_addresses_filtered.to_csv(path_or_buf=join(path, 'df_unique_addresses2.csv'))


In [None]:
# Reload the stored address list from disk (checkpoint, e.g. after a kernel restart).
# index_col=0: the CSV's first column is the index written by `to_csv`; reading it
# back as the index restores the frame as it was saved instead of adding a
# spurious 'Unnamed: 0' data column.
df_unique_addresses_filtered = pd.read_csv(join(path, 'df_unique_addresses2.csv'), index_col=0)
df_unique_addresses_filtered

### Bash Manipulation
Step 1: Consolidate all CSV files on the Ubuntu command line (the header row is kept from the first file only):
> nohup awk '(NR == 1) || (FNR > 1)' transactions_merged_to_11307940.csv transactions_to_15050010.csv > transactions_merged_all.csv


Step 2: Keep only the rows of transactions_merged_all whose sender (field 6) or receiver (field 7) is one of the unique addresses, and store them in a new file:
> awk -F, '(NR==FNR){a[$2];next}(($6 in a) || ($7 in a))' df_unique_addresses2.csv transactions_merged_all.csv > tx_all_uniq_addresses2.csv

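Step 3: Remove the columns that are not needed (fields 2, 3, 5, 11 and 13–15: nonce, block_hash, transaction_index, input, max_fee_per_gas, max_priority_fee_per_gas, transaction_type):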
> cut -d, -f2,3,5,11,13-15 --complement tx_all_uniq_addresses_selected.csv 

Rationale: 
- Irrelevant for the clustering heuristic

Ref.: https://linuxconfig.org/how-to-remove-columns-from-csv-based-on-column-number-using-bash-shell

# Deciding on relevant columns in Data Set

In [None]:
import numpy as np
import pandas as pd

import dask.dataframe as dd
import dask.array as da
import dask.bag as db

In [None]:
# Columns of the raw transaction CSVs with a keep/remove note per column.
# Presumably listed in file order: the awk filter in the Bash section addresses
# from_address / to_address as fields 6 and 7, which matches their position here.
# NOTE(review): the `cut` command in the Bash section drops fields 2, 3, 5, 11 and
# 13-15, i.e. it removes 'input' and keeps 'block_timestamp' — the opposite of the
# notes on those two columns below. Confirm which selection is intended.
columns = [
       'hash', # unique identifier of tx
       'nonce', # --> remove
       'block_hash', # --> remove
       'block_number', # necessary when filtering for snapshot dates
       'transaction_index', # --> remove
       'from_address', # --> network node 
       'to_address', # --> network node
       'value', # --> weight 
       'gas', # --> clustering heuristic gas pattern (Beres et al. 2020)
       'gas_price', # --> clustering heuristic gas pattern (Beres et al. 2020)
       'input', # --> indicates smart contract interaction (NOTE(review): field 11 is dropped by the `cut` command)
       'block_timestamp', # --> remove, can be inferred from block height (NOTE(review): field 12 is not dropped by the `cut` command)
       'max_fee_per_gas', # --> no decision noted; field 13 is dropped by the `cut` command
       'max_priority_fee_per_gas', # --> no decision noted; field 14 is dropped by the `cut` command
       'transaction_type'# --> no decision noted; field 15 is dropped by the `cut` command
       ]

['hash', 'block_number','from_address', 'to_address', 'value', 'gas', 'gas_price', 'input']

In [None]:
# Print every column name with its 1-based field number (the numbering awk/cut use).
for position, name in enumerate(columns, start=1):
    print(f'Col: {name} || Col Num: {position}')

In [None]:
# Open one raw transaction dump as a dask dataframe to inspect its columns.
ddf_sample_tx = dd.read_csv(join(path,'raw/transactions_to_11286141.csv')) # row count noted by the author: 1,622,714,622 (NOTE(review): confirm which file/set this number refers to)

In [None]:
# Show the column labels of the raw transaction sample.
ddf_sample_tx.columns

In [None]:
# select relevant columns 


# Dask 

In [None]:
# Load the reduced transaction file as a dask dataframe, every column read as string.
# The names are the eight fields that remain once the `cut` command in the Bash
# section has removed fields 2, 3, 5, 11 and 13-15 from the raw layout.
# Column names are supplied explicitly; presumably the file has no header row
# (the awk filter only passes rows whose from/to address matches) — TODO confirm.
df = dd.read_csv(
    join(path, 'tx_all_uniq_addresses_reduced.csv'),
    dtype='str',
    names=[
        'hash',
        'block_number',
        'from_address',
        'to_address',
        'value',
        'gas',
        'gas_price',
        'block_timestamp',
    ],
)


In [None]:
# Preview the first rows of the reduced transaction table.
df.head()

# Getting data into Edge List Format

- Reference: https://networkx.org/documentation/stable/reference/readwrite/edgelist.html

In [None]:
import networkx as nx

# Edge-list file inside the project data directory.
# It has to be assigned before the read below; assigning it afterwards raises
# NameError on a fresh kernel.
file = 'tx_all_uniq_addresses.edgelist'

# data=False: build the graph from the node pairs only, without parsing edge data.
G = nx.read_edgelist(join(path, file), data=False)


In [None]:
# Number of nodes in the graph G.
G.number_of_nodes()

In [None]:
from networkit import Graph
import networkit as nk

# Empty graph, kept so the name `g` stays defined by this cell.
g = Graph()

# Load the small test edge list with networkit.
# EdgeListReader(separator, firstNode) parses one edge per line and its read()
# returns the Graph; a Graph cannot be constructed from a reader object.
# Assumes a space-separated file whose node ids start at 1 (the next cell names
# the EdgeListSpaceOne format) — TODO confirm against edge_simple_test.txt.
reader = nk.graphio.EdgeListReader(' ', 1)
graph = reader.read("edge_simple_test.txt")






# nk.graphio.EdgeListReader("edge_simple_test.txt")



In [None]:
# Format.EdgeListSpaceOne is an enum member that names a file format, not a callable
# reader; it is passed to readGraph together with the file path.
nk.graphio.readGraph('edge_simple_test.txt', nk.graphio.Format.EdgeListSpaceOne)

In [None]:
# Attempt to load the test edge list through a `GraphReader` object.
# NOTE(review): could not confirm that networkit exposes a top-level `GraphReader`
# that takes a file name, nor a `readGraph()` method on it — verify against the
# installed networkit version before relying on this cell.
from networkit import GraphReader

gr = GraphReader("edge_simple_test.txt")
g = gr.readGraph()

In [None]:
import igraph as ig 

# Graph.TupleList expects an iterable of (source, target, ...) tuples, not a file
# name, so the edge list is parsed into tuples first. Blank lines are skipped.
# Assumes one whitespace-separated edge per line — TODO confirm against the file.
with open('edge_simple_test.txt') as edge_file:
    edge_tuples = [tuple(line.split()) for line in edge_file if line.strip()]

g = ig.Graph.TupleList(edge_tuples)



In [None]:
import networkx as nx

# Toy graph with a single edge 1-2 that carries a weight and a color attribute.
G = nx.Graph()
G.add_edges_from([(1, 2, {"weight": 7, "color": "red"})])
# nx.write_edgelist(G, "test1.edgelist", data=["color", "weight"])

In [None]:
# Read a test edge list back into a graph; data=True asks networkx to parse the edge data on each line.
# NOTE(review): the commented-out write in the previous cell targets "test1.edgelist",
# while this cell reads "test.edgelist" — confirm the intended file name.
G = nx.read_edgelist("test.edgelist", data=True)
# list(G.edges(data=True))

### Other & Out-Takes

In [None]:
#### removing tokens 
import web3 as Web3

In [None]:
# Convert the lower-case address (the second pattern of the grep command below) to its checksum form.
# NOTE(review): newer web3.py releases name this method `to_checksum_address` — confirm against the installed version.
Web3.Web3.toChecksumAddress('0x3432b6a60d23ca0dfca7761b7ab56459d9c964d0')

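Grep command to delete the transfers of Gnosis (GNO, `0x6810e776880C02933D47DB1b9fc05908e5386b96`) and Frax Share (FXS, `0x3432b6a60d23ca0dfca7761b7ab56459d9c964d0`), because Gnosis is fragmenting in terms of governance power and Frax Share is a non-standard ERC-20 token. Note: `grep` matches case-sensitively, so each pattern must use the same letter case as the addresses in the CSV.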

> grep -v -e '0x6810e776880C02933D47DB1b9fc05908e5386b96' -e '0x3432b6a60d23ca0dfca7761b7ab56459d9c964d0' token_transfers_22021209.csv > token_transfers_22021209_refined.csv