In [1]:
import numpy as np
import pandas as pd
import datetime
import os
import time

# Disable the garbage collector, which slows the code down when large dataframes are in memory
import gc
gc.disable()

import pyarrow as pa
import pyarrow.csv as csv


# Directory that holds the swap CSV files read below and the Parquet files written by this notebook
file_dir = "/Users/zhicong/Dropbox/DEXs/Data/dex_swaps"

# Count the entries in that directory whose name ends with '.csv'
num_of_files = sum(1 for entry in os.listdir(file_dir) if entry.endswith('.csv'))
print("Total number of files:", num_of_files)

Total number of files: 1561


# Data Formatting

This section combines the datasets into groups of 100 files and converts them to Parquet format for more efficient storage and reading.

In [None]:
def skip_action(row):
    """Invalid-row handler passed to the pyarrow CSV parser below.

    The description of the offending row is ignored; the handler always
    answers "skip".
    """
    decision = "skip"
    return decision

# Initial setup
# Names of the thirteen comma-separated fields of a swap record, in file order.
swap_columns = ['BLOCK_NUMBER','BLOCK_TIMESTAMP','TX_HASH','SENDER','TX_TO','PLATFORM','POOL_NAME','CONTRACT_ADDRESS',
                'EVENT_NAME','AMOUNT_IN','SYMBOL_IN','AMOUNT_OUT','SYMBOL_OUT']

# Token names that contain commas would break the comma split below, so each one is
# swapped for a unique placeholder first. Entries are (text to find, placeholder, is_regex)
# and are applied in this exact order.
weird_token_replacements = [
    ('~=[,,_,,]:3', 'WeirdToken1', False),
    ('Donald Trump, 1980', 'WeirdToken2', False),
    ('Fear, Uncertainty and Doubt', 'WeirdToken3', False),
    ('[🧠,💥]', 'WeirdToken4', False),
    (',,,', ',WeirdToken5,', False),   # This token simply has the symbol as comma
    (',,$', ',WeirdToken5', True),     # Same token when it is the last field of the line
    ('$10,000', 'WeirdToken6', False),
    ('PONGO,', 'WeirdToken7', False),
    ('Caca,', 'WeirdToken8', False),
    ('ROME (🏛,🏛)', 'WeirdToken9', False),
    ('(3, 3)', 'WeirdToken10', False),
    # NOTE(review): '3,3' can also match a field ending in 3 followed by a field
    # starting with 3 (e.g. an amount then a symbol) -- confirm against the raw data.
    ('3,3', 'WeirdToken11', False),
    ('100% NOT TOMATO, really', 'WeirdToken12', False),
    ('( milady,remilio )', 'WeirdToken13', False),
    # Was 'WeirdToken13' as well, which merged two different tokens into one label.
    ('range(50k, 1M);', 'WeirdToken14', False),
]

last_block_tx_hash = pd.Series(dtype = "str")
frames = []  # per-file frames; concatenated once after the loop instead of on every iteration

# Loop over the CSV files of this batch (start index hard-coded to 1501)
for i in range(1501, num_of_files+1):

    # Read csv file. A tab delimiter is used so that each line arrives as one single
    # string column; rows the parser rejects are dropped by skip_action.
    temp = csv.read_csv(os.path.join(file_dir, "DEXs_swaps_" + str(i) + ".csv"),
                        parse_options = csv.ParseOptions(delimiter = "\t", invalid_row_handler = skip_action)).to_pandas()

    # Change weird token names, i.e. token names with commas
    temp.columns = ['column']
    lines = temp['column']
    for pattern, placeholder, is_regex in weird_token_replacements:
        lines = lines.str.replace(pattern, placeholder, regex = is_regex)

    # Split columns; lines with fewer than thirteen fields contain None and are dropped
    temp = lines.str.split(',', n = 12, expand = True).dropna()
    del lines
    temp.columns = swap_columns

    # Remove duplicated transactions (this occurs between data files)
    temp = temp[~temp['TX_HASH'].isin(last_block_tx_hash)]

    # Only keep the hashes of the lowest block of this file for the next comparison, to
    # save memory. BLOCK_NUMBER is text after the split, so it is converted before taking
    # the minimum: a plain string minimum would be lexicographic ("9" > "10").
    block_numbers = pd.to_numeric(temp['BLOCK_NUMBER'], errors = 'coerce')
    last_block_tx_hash = temp.loc[block_numbers == block_numbers.min(), 'TX_HASH']
    del block_numbers

    frames.append(temp)
    del temp

# Single concatenation; fall back to an empty frame with the right columns if no file was read
data = pd.concat(frames, axis = 0) if frames else pd.DataFrame(columns = swap_columns)
del frames

data.to_parquet(file_dir + "/DEXs_swaps_1501-1561.gzip", compression = "gzip", index = False)

# Data Cleaning

Here, we merge all the datasets into a single dataset for future analysis.

In [2]:
# Column layout used for the empty fallback frame below.
swap_columns = ['BLOCK_NUMBER','BLOCK_TIMESTAMP','TX_HASH','SENDER','TX_TO','PLATFORM','POOL_NAME','CONTRACT_ADDRESS',
                'EVENT_NAME','AMOUNT_IN','SYMBOL_IN','AMOUNT_OUT','SYMBOL_OUT']

# Grouped Parquet files to merge, in processing order.
group_files = ['DEXs_swaps_1-100.gzip', 'DEXs_swaps_101-200.gzip', 'DEXs_swaps_201-300.gzip', 'DEXs_swaps_301-400.gzip',
               'DEXs_swaps_401-500.gzip', 'DEXs_swaps_501-600.gzip', 'DEXs_swaps_601-700.gzip', 'DEXs_swaps_701-800.gzip',
               'DEXs_swaps_801-900.gzip', 'DEXs_swaps_901-1000.gzip', 'DEXs_swaps_1001-1100.gzip', 'DEXs_swaps_1101-1200.gzip',
               'DEXs_swaps_1201-1300.gzip', 'DEXs_swaps_1301-1400.gzip', 'DEXs_swaps_1401-1500.gzip', 'DEXs_swaps_1501-1561.gzip']

last_block_tx_hash = pd.Series(dtype = "str")
frames = []  # one frame per group file; concatenated once after the loop

for file_name in group_files:

    print(file_name)

    # Read files
    temp = pd.read_parquet(os.path.join(file_dir, file_name))

    # Remove duplicated transactions (this occurs between data files)
    temp = temp[~temp['TX_HASH'].isin(last_block_tx_hash)]

    # Only keep the hashes of the lowest block of this file for the next comparison, to
    # save memory. BLOCK_NUMBER is only cast to int in a later cell, so it is presumably
    # still text here; convert it so the minimum is numeric rather than lexicographic.
    block_numbers = pd.to_numeric(temp['BLOCK_NUMBER'], errors = 'coerce')
    last_block_tx_hash = temp.loc[block_numbers == block_numbers.min(), 'TX_HASH']
    del block_numbers

    frames.append(temp)
    del temp

# Single concatenation instead of re-copying the accumulated frame for every file
data = pd.concat(frames, axis = 0) if frames else pd.DataFrame(columns = swap_columns)
del frames

DEXs_swaps_1-100.gzip
DEXs_swaps_101-200.gzip
DEXs_swaps_201-300.gzip
DEXs_swaps_301-400.gzip
DEXs_swaps_401-500.gzip
DEXs_swaps_501-600.gzip
DEXs_swaps_601-700.gzip
DEXs_swaps_701-800.gzip
DEXs_swaps_801-900.gzip
DEXs_swaps_901-1000.gzip
DEXs_swaps_1001-1100.gzip
DEXs_swaps_1101-1200.gzip
DEXs_swaps_1201-1300.gzip
DEXs_swaps_1301-1400.gzip
DEXs_swaps_1401-1500.gzip
DEXs_swaps_1501-1561.gzip


In [3]:
# Cast block numbers to integers, then order the rows by descending block number
data['BLOCK_NUMBER'] = data['BLOCK_NUMBER'].astype(int)
data.sort_values("BLOCK_NUMBER", ascending = False, inplace = True)
data.reset_index(inplace = True, drop = True)

# Report the frame size in units of 10^9 bytes. memory_usage is called without
# deep=True, so object columns are counted shallowly.
total_bytes = data.memory_usage(index = True).sum()
print("Byte size of DataFrame", total_bytes/1000000000, "GigaBytes")

data.head(10)

Byte size of DataFrame 16.217388432 GigaBytes


Unnamed: 0,BLOCK_NUMBER,BLOCK_TIMESTAMP,TX_HASH,SENDER,TX_TO,PLATFORM,POOL_NAME,CONTRACT_ADDRESS,EVENT_NAME,AMOUNT_IN,SYMBOL_IN,AMOUNT_OUT,SYMBOL_OUT
0,17244715,2023-05-12 14:26:35.000,0x8f97d6d39577690712052b707005ed22973fdf3b1d88...,0xdef1c0ded9bec7f1a1670819833240f027b25eff,0xe66b31678d6c16e9ebf358268a790b763c133750,uniswap-v3,cbETH-WETH 500 10,0x840deeef2f115cf50da625f7368c24af6fe74410,Swap,0.2811618733,WETH,0.2717965972,cbETH
1,17244715,2023-05-12 14:26:35.000,0xdd2d6d6e17ebc7eecc16307a998daed6a35eb18af622...,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,0x87440f2ab4c3f901b207749547ebada1a91df20b,uniswap-v2,JAN-WETH,0x2481f31c15324fd3079790b2b884e69d3ecd6200,Swap,0.03,WETH,58959.215645394,JAN
2,17244715,2023-05-12 14:26:35.000,0xd0abad35c8e2973da9e87833be15d13eb4496e25df12...,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,uniswap-v2,FOUR-WETH,0xd101821c56b4405af4a376cbe81fa0dc90207dc2,Swap,4.57572130895647e+27,FOUR,29.869240287,WETH
3,17244715,2023-05-12 14:26:35.000,0x890522376b1a8815257bea945c1aa06cc5511da42eb4...,0x6b75d8af000000e20b7a7ddf000ba900b4009a80,0x6b75d8af000000e20b7a7ddf000ba900b4009a80,uniswap-v2,LADS-WETH,0x47f6cae32e714d49d688f35b9f074827f90ff7fc,Swap,5679665411937.38,LADS,0.4236608483,WETH
4,17244715,2023-05-12 14:26:35.000,0x1b0194672242ac640fd4c0471cad22d9e20b277cfcd0...,0x6b75d8af000000e20b7a7ddf000ba900b4009a80,0x6b75d8af000000e20b7a7ddf000ba900b4009a80,uniswap-v2,LADS-WETH,0x47f6cae32e714d49d688f35b9f074827f90ff7fc,Swap,0.3778953819,WETH,5679665411937.38,LADS
5,17244715,2023-05-12 14:26:35.000,0xffade960384450e22ac2d331926ea8cf9987c9bd9bd4...,0x68b3465833fb72a70ecdf485e0e4c7bd8665fc45,0xbb04979a8a6df58395e885f2db41a247a096a010,uniswap-v3,PP-WETH 3000 60,0xab4bd97fc92dfc35d489447df9f5250432736533,Swap,0.05,WETH,78073.181314632,PP
6,17244715,2023-05-12 14:26:35.000,0x9b297d57d740b2fadddca3aaeb2ca4f7afb2ed7e881f...,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,0xea197d00969ae5350f4af952c309a08fceb58129,uniswap-v3,PP-WETH 3000 60,0xab4bd97fc92dfc35d489447df9f5250432736533,Swap,0.175,WETH,334825.388071209,PP
7,17244715,2023-05-12 14:26:35.000,0x5b15a2f1d9c37ad503d94ecdb8eb2af8ed362ad18503...,0x00000000a991c429ee2ec6df19d40fe0c80088b8,0x00000000a991c429ee2ec6df19d40fe0c80088b8,uniswap-v3,PP-WETH 3000 60,0xab4bd97fc92dfc35d489447df9f5250432736533,Swap,2842035.2683507,PP,1.631028227,WETH
8,17244715,2023-05-12 14:26:35.000,0x8f2438f67b93fdfe85efae4f6d5253e9f113a1c63033...,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,0x00878e2a3f0612e8000efbb15703428aac697723,uniswap-v3,HEX-WETH 3000 60,0x9e0905249ceefffb9605e034b534544684a58be6,Swap,0.065,WETH,2232.42061573,HEX
9,17244715,2023-05-12 14:26:35.000,0x9aa49e7cb8db4772959324f059d2613834c6c2107860...,0x4870525eae23fceb31df613d179ef6275e1b93a9,0x058d79a4c6eb5b11d0248993ffa1faa168ddd3c0,uniswap-v3,wNXM-ICHI 3000 60,0x96467eae14bf1da07b5fedd07b2a4030b8f1481d,Swap,1494.078159075,ICHI,156.024944166,wNXM


In [4]:
# Write the merged dataset to a gzip-compressed Parquet file, without the index
full_dataset_path = file_dir + "/DEXs_swaps_full.gzip"
data.to_parquet(full_dataset_path, compression = "gzip", index = False)

# Special Case: "USDC-WETH 500 10" pool

In [2]:
def skip_action(row):
    """Handler given to the pyarrow CSV parser for rows it cannot parse.

    Whatever row description is received, the answer is always "skip".
    """
    verdict = "skip"
    return verdict

# Initial setup
# Names of the thirteen comma-separated fields of a swap record, in file order.
swap_columns = ['BLOCK_NUMBER','BLOCK_TIMESTAMP','TX_HASH','SENDER','TX_TO','PLATFORM','POOL_NAME','CONTRACT_ADDRESS',
                'EVENT_NAME','AMOUNT_IN','SYMBOL_IN','AMOUNT_OUT','SYMBOL_OUT']

last_block_tx_hash = pd.Series(dtype = "str")
frames = []  # per-file frames; concatenated once after the loop instead of on every iteration

# Loop over the CSV files of this batch (start index hard-coded to 1501)
for i in range(1501, num_of_files+1):
    if i % 100 == 0: print(i)

    # Read csv file. A tab delimiter is used so that each line arrives as one single
    # string column; rows the parser rejects are dropped by skip_action.
    temp = csv.read_csv(os.path.join(file_dir, "DEXs_swaps_" + str(i) + ".csv"),
                        parse_options = csv.ParseOptions(delimiter = "\t", invalid_row_handler = skip_action)).to_pandas()

    # Split columns; lines with fewer than thirteen fields contain None and are dropped
    temp.columns = ['column']
    temp = temp['column'].str.split(',', n = 12, expand = True).dropna()
    temp.columns = swap_columns

    # Filter for pool of interest
    temp = temp[temp['POOL_NAME'] == "USDC-WETH 500 10"]

    # Remove duplicated transactions (this occurs between data files)
    temp = temp[~temp['TX_HASH'].isin(last_block_tx_hash)]

    # Only keep the hashes of the lowest block of this file for the next comparison, to
    # save memory. BLOCK_NUMBER is text after the split, so it is converted before taking
    # the minimum: a plain string minimum would be lexicographic ("9" > "10").
    block_numbers = pd.to_numeric(temp['BLOCK_NUMBER'], errors = 'coerce')
    last_block_tx_hash = temp.loc[block_numbers == block_numbers.min(), 'TX_HASH']
    del block_numbers

    frames.append(temp)
    del temp

# Single concatenation; fall back to an empty frame with the right columns if nothing matched
data = pd.concat(frames, axis = 0) if frames else pd.DataFrame(columns = swap_columns)
del frames

data.to_parquet(file_dir + "/USDC-WETH 500 10_4.gzip", compression = "gzip", index = False)