In [None]:
import numpy as np
import pandas as pd
import os
import pyarrow.parquet as pq
import datetime as dt
import networkx as nx

from plotnine import *
from mizani.formatters import percent_format, date_format
from mizani.breaks import date_breaks

import math
from sympy import symbols, Symbol, Eq, nsolve, solve, sqrt
from decimal import Decimal, getcontext
getcontext().prec = 50

# Root directory of the DEX data set. The DEXS_DATA_DIR environment variable
# overrides it so the notebook can run on another machine; when the variable
# is not set, the original local path is used unchanged.
data_dir = os.environ.get('DEXS_DATA_DIR', '/Users/zhicong/Dropbox/DEXs')

# Pools kept when loading swaps. Each entry is '<TOKEN_A>-<TOKEN_B> <n1> <n2>'.
# NOTE(review): the two trailing numbers look like fee tier and tick spacing
# -- confirm against the data provider's POOL_NAME convention.
pool_name = [
    'DAI-USDC 100 1', 'DAI-USDC 500 10', 'DAI-USDC 3000 60',
    'FRAX-USDC 100 1', 'FRAX-USDC 500 10', 'FRAX-USDC 3000 60',
    'USDC-USDT 100 1', 'USDC-USDT 500 10', 'USDC-USDT 3000 60',
    'USDC-WETH 100 1', 'USDC-WETH 500 10', 'USDC-WETH 3000 60',
    'WBTC-WETH 100 1', 'WBTC-WETH 500 10', 'WBTC-WETH 3000 60',
    'WETH-sETH2 500 10', 'WETH-sETH2 3000 60',
    'WETH-USDT 100 1', 'WETH-USDT 500 10', 'WETH-USDT 3000 60',
]

# Load Data

In [None]:
# Build `swaps`: one row per swap event in the pools listed in `pool_name`,
# with signed token0 / token1 amounts, ordered by block number.

# Columns of the raw swap table; used as the schema of the empty fallback frame.
raw_swap_columns = ['BLOCK_NUMBER','BLOCK_TIMESTAMP','TX_HASH','SENDER','TX_TO','PLATFORM','POOL_NAME','CONTRACT_ADDRESS',
                    'EVENT_NAME','AMOUNT_IN','SYMBOL_IN','AMOUNT_OUT','SYMBOL_OUT']

parquet_file = pq.ParquetFile(os.path.join(data_dir, "dex_swaps", "DEXs_swaps_full.gzip"))

# Read by batch to be able to fit dataframe into memory. Only the rows of the
# selected pools are kept from each batch, and the pieces are concatenated
# once at the end (concatenating inside the loop re-copies the accumulated
# frame on every iteration).
filtered_batches = []
for batch in parquet_file.iter_batches(batch_size = 1000000):
    batch_df = batch.to_pandas()
    filtered_batches.append(batch_df[batch_df['POOL_NAME'].isin(pool_name)])
    del batch_df

if filtered_batches:
    swaps = pd.concat(filtered_batches, axis = 0, ignore_index = True)
else:
    # The file yielded no batches: keep an empty frame with the expected schema.
    swaps = pd.DataFrame(columns = raw_swap_columns)
del filtered_batches

# Final cleaning
swaps['BLOCK_NUMBER'] = swaps['BLOCK_NUMBER'].astype(int)
swaps['AMOUNT_IN'] = swaps['AMOUNT_IN'].astype(float)
swaps['AMOUNT_OUT'] = swaps['AMOUNT_OUT'].astype(float)
swaps['BLOCK_TIMESTAMP'] = pd.to_datetime(swaps['BLOCK_TIMESTAMP'])

# The pool's transaction fee on the incoming token is NOT subtracted here; the
# adjustment below was written for a 5 basis point fee and is left disabled.
# NOTE(review): the selected pools appear to have different fee tiers, so a
# single 0.0005 factor would not fit all of them -- confirm before enabling.
#swaps['AMOUNT_IN'] = swaps['AMOUNT_IN']*(1-0.0005)

# Changing signs of addition and extraction: amounts leaving the pool are negative
swaps['AMOUNT_OUT'] = -swaps['AMOUNT_OUT']

# Derive the token pair from POOL_NAME ('<TOKEN0>-<TOKEN1> ...') per row
# instead of hard-coding USDC/WETH, which mislabelled every other pair in
# `pool_name`. For 'USDC-WETH' pools the result is identical to before.
# NOTE(review): assumes the order in POOL_NAME is the pool's token0-token1
# order -- confirm for the non USDC-WETH pools.
token_pair = swaps['POOL_NAME'].str.extract(r'^([^-\s]+)-(\S+)')
token1_is_incoming = swaps['SYMBOL_IN'] == token_pair[1]

# When token1 is the incoming token, token0 is the one leaving the pool (and vice versa).
swaps['AMOUNT0_ADJUSTED'] = np.where(token1_is_incoming, swaps['AMOUNT_OUT'], swaps['AMOUNT_IN'])
swaps['TOKEN0_SYMBOL'] = token_pair[0]
swaps['AMOUNT1_ADJUSTED'] = np.where(token1_is_incoming, swaps['AMOUNT_IN'], swaps['AMOUNT_OUT'])
swaps['TOKEN1_SYMBOL'] = token_pair[1]
del token_pair, token1_is_incoming

# Keep only the columns used downstream, in block order, with a fresh index.
swaps = (
    swaps[['POOL_NAME','BLOCK_NUMBER','BLOCK_TIMESTAMP','SENDER','EVENT_NAME',
           'AMOUNT0_ADJUSTED','TOKEN0_SYMBOL','AMOUNT1_ADJUSTED','TOKEN1_SYMBOL']]
    .sort_values(by = 'BLOCK_NUMBER', ascending = True)
    .reset_index(drop = True)
)

swaps.head()