In [1]:
import numpy as np
import pandas as pd
import os
import datetime

import pyarrow.parquet as pq

# Go to settings and set files.maxMemoryForLargeFilesMB to 16384 before running this

# Directory that holds the swap data files.
# Set the DEX_SWAPS_DIR environment variable to run this notebook on another
# machine; when it is unset, the original local Dropbox path is used.
file_dir = os.environ.get('DEX_SWAPS_DIR', '/Users/zhicong/Dropbox/DEXs/Data/dex_swaps')

# Load Data

In [2]:
# Columns every swap record is expected to carry. They are placed first in the
# final frame and define its schema when no swap matches the token pair.
swap_columns = ['BLOCK_NUMBER','BLOCK_TIMESTAMP','TX_HASH','SENDER','TX_TO','PLATFORM','EVENT_NAME','AMOUNT_IN','SYMBOL_IN','AMOUNT_OUT','SYMBOL_OUT']
parquet_file = pq.ParquetFile(os.path.join(file_dir, "DEXs_swaps_full.gzip"))

# Set tokens that we want to analyze
token1 = "USDC"
token2 = "WETH"

# Read by batch to be able to fit dataframe into memory.
# Filtered batches are collected in a list and concatenated once after the
# loop: concatenating inside the loop re-copies the accumulated frame on every
# iteration.
filtered_batches = []
for batch in parquet_file.iter_batches(batch_size = 1000000):

    # Filter out only tokens we want (swaps between token1 and token2, either direction)
    batch_df = batch.to_pandas()
    is_pair = (((batch_df['SYMBOL_IN'] == token1) & (batch_df['SYMBOL_OUT'] == token2))
               | ((batch_df['SYMBOL_IN'] == token2) & (batch_df['SYMBOL_OUT'] == token1)))

    # Skip batches without any matching swap so no empty frames are concatenated
    if is_pair.any():
        filtered_batches.append(batch_df[is_pair])
    del batch_df

if filtered_batches:
    data = pd.concat(filtered_batches, axis = 0)
    # Expected columns first, any additional columns from the file after them
    extra_columns = [col for col in data.columns if col not in swap_columns]
    data = data.reindex(columns = swap_columns + extra_columns)
else:
    # No matching swap in the whole file: keep an empty frame with the expected schema
    data = pd.DataFrame(columns = swap_columns)
del filtered_batches

# Final cleaning: fresh 0..n-1 index, numeric amounts, datetime timestamps
data = data.reset_index(drop = True)
data['AMOUNT_IN'] = data['AMOUNT_IN'].astype(float)
data['AMOUNT_OUT'] = data['AMOUNT_OUT'].astype(float)
data['BLOCK_TIMESTAMP'] = pd.to_datetime(data['BLOCK_TIMESTAMP'])

# Get swap price of token2 quoted in token1 (WETH/USDC with the tokens set above).
# The direction test uses token2 instead of a hard-coded symbol so it stays
# consistent with the token filter.
token2_sold = data['SYMBOL_IN'] == token2
data['swap_price'] = np.where(token2_sold, data['AMOUNT_OUT']/data['AMOUNT_IN'], data['AMOUNT_IN']/data['AMOUNT_OUT'])

# A zero amount makes the ratio infinite or undefined; treat those prices as missing
# and drop the rows. Only the price column needs the inf -> NaN replacement.
data['swap_price'] = data['swap_price'].replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset = ['swap_price'])

# Remove anomalies (simply drop absurd prices outside the open interval (100, 5000))
# and exclude swaps executed on the 'curve' platform.
plausible_price = (data['swap_price'] > 100) & (data['swap_price'] < 5000)
data = data[plausible_price & ~data['PLATFORM'].isin(['curve'])].reset_index(drop = True)

# Add a column called 'volume' that indicates the volume of swap in token1 (USDC):
# the amount received when token2 was sold, otherwise the amount paid in.
data['volume'] = np.where(data['SYMBOL_IN'] == token2, data['AMOUNT_OUT'], data['AMOUNT_IN'])

data

Unnamed: 0,BLOCK_NUMBER,BLOCK_TIMESTAMP,TX_HASH,SENDER,TX_TO,PLATFORM,EVENT_NAME,AMOUNT_IN,SYMBOL_IN,AMOUNT_OUT,SYMBOL_OUT,POOL_NAME,CONTRACT_ADDRESS,swap_price,volume
0,17244711,2023-05-12 14:25:47,0x09536b4a391886d4be909cffea3ce156aade96de09f0...,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,uniswap-v3,Swap,113907.573860,USDC,64.054072,WETH,USDC-WETH 500 10,0x88e6a0c2ddd26feeb64f039a2c41296fcb3f5640,1778.303398,113907.573860
1,17244711,2023-05-12 14:25:47,0xd18e70fa4d6e6bda6b8d1e4fa04b6cee7b6c3d429307...,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,0xdf31685465ddf564c111f3ddc3fc9043d9eb8552,uniswap-v3,Swap,4.000000,WETH,7104.979083,USDC,USDC-WETH 500 10,0x88e6a0c2ddd26feeb64f039a2c41296fcb3f5640,1776.244771,7104.979083
2,17244711,2023-05-12 14:25:47,0xeb896c0bce00527e71102ef43480a851eafacb92e86e...,0x9507c04b10486547584c37bcbd931b2a4fee9a41,0x9507c04b10486547584c37bcbd931b2a4fee9a41,uniswap-v3,Swap,66.305009,WETH,117791.972275,USDC,USDC-WETH 500 10,0x88e6a0c2ddd26feeb64f039a2c41296fcb3f5640,1776.516948,117791.972275
3,17244710,2023-05-12 14:25:35,0x2dbbc5778bc6db4bc9c92d9ddf2a5998667c86737763...,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,0xef1c6e67703c7bd7107eed8303fbe6ec2554bf6b,uniswap-v3,Swap,500.000000,USDC,0.281206,WETH,USDC-WETH 500 10,0x88e6a0c2ddd26feeb64f039a2c41296fcb3f5640,1778.053952,500.000000
4,17244710,2023-05-12 14:25:35,0xd9c448a5888047ed94e0898f9cb6b003845bcd319037...,0xdef1c0ded9bec7f1a1670819833240f027b25eff,0xdef1c0ded9bec7f1a1670819833240f027b25eff,uniswap-v3,Swap,1000.000000,USDC,0.562414,WETH,USDC-WETH 500 10,0x88e6a0c2ddd26feeb64f039a2c41296fcb3f5640,1778.050681,1000.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11288602,10045107,2020-05-11 13:08:48,0x840bb5ab0a779b4f733443651ac54dabe24e243c3606...,0x57ead0a9f49fafdd2447f615b036f3c316af5171,0x57ead0a9f49fafdd2447f615b036f3c316af5171,uniswap-v2,Swap,0.001000,WETH,0.204870,USDC,USDC-WETH,0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc,204.870000,0.204870
11288603,10014418,2020-05-06 18:51:12,0x735cf98e86a5df67b6a837ae50de1d7a589d9f6baaf0...,0xf164fc0ec4e93095b804a4795bbe1e041497b92a,0x12ede161c702d1494612d19f05992f43aa6a26fb,uniswap-v2,Swap,0.000689,USDC,0.000003,WETH,USDC-WETH,0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc,201.910785,0.000689
11288604,10013764,2020-05-06 16:24:55,0x43b6bfd06dde0814fe9c1b63ce98ec4c67c72d96169d...,0xf164fc0ec4e93095b804a4795bbe1e041497b92a,0x8688a84fcfd84d8f78020d0fc0b35987cc58911f,uniswap-v2,Swap,0.000005,WETH,0.001000,USDC,USDC-WETH,0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc,200.678676,0.001000
11288605,10008585,2020-05-05 21:12:35,0x697b7aaca56a80a8d3a2f560ed7f1ecb97c22b2edd6e...,0xf164fc0ec4e93095b804a4795bbe1e041497b92a,0x11e4857bb9993a50c685a79afad4e6f65d518dda,uniswap-v2,Swap,0.000049,WETH,0.010000,USDC,USDC-WETH,0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc,202.919065,0.010000


# Address Clustering

Before identifying arbitrageurs, we want to cluster addresses (for example, several addresses generated by the same wallet application) that belong to the same entity.

We make the simplifying assumption that users only conduct swaps on DEXs, so the `SENDER` and `TX_TO` addresses of a transaction always belong to the same entity.

In [3]:
# Distinct (SENDER, TX_TO) pairs in which the two addresses differ
address_pairs = data[['SENDER','TX_TO']].drop_duplicates()
address_pairs = address_pairs[address_pairs['SENDER'] != address_pairs['TX_TO']]

# For every sender, the list of distinct counterpart addresses (groupby sorts by SENDER)
receivers_by_sender = address_pairs.groupby('SENDER')['TX_TO'].apply(list)

# One row per sender: a running integer id and the cluster's address list, which
# starts with the sender itself followed by its TX_TO addresses. The frame is
# built in a single step rather than by assigning list values cell by cell.
# Note: clusters are keyed by SENDER only; clusters that share an address are
# not merged with each other.
address_cluster = pd.DataFrame({
    'id': range(len(receivers_by_sender)),
    'address': [[sender] + receivers for sender, receivers in receivers_by_sender.items()],
})

address_cluster

Unnamed: 0,id,address
0,0,"[0x000000000000006f6502b7f2bbac8c30a3f67e9a, 0..."
1,1,"[0x0000000000000eb4ec62758aae93400b3e5f7f18, 0..."
2,2,"[0x0000000000002d534ff79e9c69e7fcc742f0be83, 0..."
3,3,"[0x00000000000062f06c7007906b2a4034fa5c4818, 0..."
4,4,"[0x000000000000660def84e69995117c0176ba446e, 0..."
...,...,...
1417,1417,"[0xff26d7d5710b916b62c8317c9228c06a38446203, 0..."
1418,1418,"[0xff2fbc735d33ae830f056107f1b551783ec4ed5b, 0..."
1419,1419,"[0xff3d72aef9e537699287b1049b2284510c000000, 0..."
1420,1420,"[0xffcd3404852afb1420540c8465433f832727bdd7, 0..."


We now have a DataFrame in which each row holds a cluster `id` and the list of addresses assigned to that cluster.

# Identifying Arbitrageurs

We search for pairs of trades that are:
- made by the same address,
- of the same volume but on different exchanges,
- and within 3 blocks.

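We first search for addresses that have traded on more than one exchange. Here we only consider Uniswap v2 and SushiSwap, and only swaps whose `SENDER` and `TX_TO` are the same address.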

In [4]:
# Swaps sent and received by the same address on the two exchanges of interest
is_self_swap = data['SENDER'] == data['TX_TO']
on_target_exchange = data['PLATFORM'].isin(['uniswap-v2','sushiswap'])
self_swaps = data.loc[is_self_swap & on_target_exchange, ['SENDER','PLATFORM']]

# Number of distinct exchanges each address has used
exchange_counts = self_swaps.groupby('SENDER')['PLATFORM'].nunique().reset_index(name = 'No_of_Exchange')

# Keep the addresses seen on at least two exchanges
arbitrageurs = exchange_counts.loc[exchange_counts['No_of_Exchange'] >= 2, 'SENDER']
arbitrageurs

0       0x000000000000006f6502b7f2bbac8c30a3f67e9a
3       0x0000000000000eb4ec62758aae93400b3e5f7f18
4       0x00000000000017c75025d397b91d284bbe8fc7f2
6       0x0000000000002d534ff79e9c69e7fcc742f0be83
7       0x0000000000005117dd3a72e64a705198753fdd54
                           ...                    
1981    0xff26d7d5710b916b62c8317c9228c06a38446203
1982    0xff2fbc735d33ae830f056107f1b551783ec4ed5b
1983    0xff40485f9b692b3775edfc1550964e2ff613cbc6
1985    0xffbb58e97332d9244b997ec2b81eff5bf1be576f
1991    0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63
Name: SENDER, Length: 994, dtype: object

In [5]:
# All self-sent swaps (SENDER equals TX_TO) made by the candidate addresses,
# ordered by sender and block number, both descending.
sent_to_self = data['SENDER'] == data['TX_TO']
by_candidate = data['SENDER'].isin(arbitrageurs)
arbitrage_trades = data[sent_to_self & by_candidate].sort_values(by = ['SENDER','BLOCK_NUMBER'], ascending = False)
arbitrage_trades.head(50)

Unnamed: 0,BLOCK_NUMBER,BLOCK_TIMESTAMP,TX_HASH,SENDER,TX_TO,PLATFORM,EVENT_NAME,AMOUNT_IN,SYMBOL_IN,AMOUNT_OUT,SYMBOL_OUT,POOL_NAME,CONTRACT_ADDRESS,swap_price,volume
8706516,12543172,2021-05-31 16:23:57,0x5f06d3abc9a1d51f19849f6dfd46f1dcf9cdeda66008...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,sushiswap,Swap,4806.072,USDC,1.795842,WETH,USDC-WETH,0x397ff1542f962076d0bfe58ea045ffa2d347aca0,2676.221649,4806.072
8706520,12543172,2021-05-31 16:23:57,0x5f06d3abc9a1d51f19849f6dfd46f1dcf9cdeda66008...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,sushiswap,Swap,64211.96,USDC,24.0016,WETH,USDC-WETH,0x397ff1542f962076d0bfe58ea045ffa2d347aca0,2675.319764,64211.96
8717914,12536963,2021-05-30 17:07:17,0x459447e288e94ccde579576896acba83a16d0e5408c9...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,sushiswap,Swap,2.91082,WETH,7000.016,USDC,USDC-WETH,0x397ff1542f962076d0bfe58ea045ffa2d347aca0,2404.826316,7000.016
8768725,12514741,2021-05-27 06:09:40,0xeb80bb20785ccc9adca1079b3874c791f6719bd73e10...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,sushiswap,Swap,7.544656,WETH,20441.86,USDC,USDC-WETH,0x397ff1542f962076d0bfe58ea045ffa2d347aca0,2709.449359,20441.86
8773253,12512562,2021-05-26 22:17:30,0xaf69766db3e8580a15dd1fad940e5772a20437592dea...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,uniswap-v2,Swap,1637.039,USDC,0.5781349,WETH,USDC-WETH,0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc,2831.586712,1637.039
8773254,12512562,2021-05-26 22:17:30,0xaf69766db3e8580a15dd1fad940e5772a20437592dea...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,sushiswap,Swap,20695.45,USDC,7.303297,WETH,USDC-WETH,0x397ff1542f962076d0bfe58ea045ffa2d347aca0,2833.712699,20695.45
8831169,12490576,2021-05-23 12:42:01,0xd941290d9f139a04fdca2a43e534c83b2ec9ebd7e45f...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,sushiswap,Swap,26.74074,WETH,52949.99,USDC,USDC-WETH,0x397ff1542f962076d0bfe58ea045ffa2d347aca0,1980.124565,52949.99
8834711,12489642,2021-05-23 08:58:57,0xa59747b8be5abbac5b069e0fbcdc1d300d88b2791703...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,sushiswap,Swap,25.12064,WETH,50428.44,USDC,USDC-WETH,0x397ff1542f962076d0bfe58ea045ffa2d347aca0,2007.450115,50428.44
8834754,12489633,2021-05-23 08:57:50,0xd621f6246eae33a943145a815031e5d8e24221c922e9...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,sushiswap,Swap,9.719491,WETH,19478.53,USDC,USDC-WETH,0x397ff1542f962076d0bfe58ea045ffa2d347aca0,2004.069054,19478.53
8835056,12489578,2021-05-23 08:44:14,0x01284a577ababfa3663fc52ddb69363a5bf921e7fe18...,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,0xfffb2ab6f574dd3b2a9661cafb5e60208686ed63,sushiswap,Swap,6.348813,WETH,13148.86,USDC,USDC-WETH,0x397ff1542f962076d0bfe58ea045ffa2d347aca0,2071.073782,13148.86
