In [95]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
#######################################################################################################################
## ADDING THE GAS LIMITS AND BASE FEE INTO THE DATAFRAME
#######################################################################################################################
## first we put the gas limits as well as the base fee into the  transaction dataframe
#######################################################################################################################


# Read in the data; adjust the paths below to match your own machine.
transaction_data = pd.read_csv("/Users/jacobmcgraw/Downloads/CAMCOS_Datasets/txs_12965_12967.csv")
block_data = pd.read_csv("/Users/jacobmcgraw/Downloads/CAMCOS_Datasets/bxs_12965_12967.csv")


# Build lookup tables keyed by block number so that each transaction can be
# given the gas limit and base fee of the block it belongs to. If a block
# number appears more than once in block_data, the last row wins.
gas_limit_tracker = dict(zip(block_data.block_number, block_data.gas_limit))
base_fee_tracker = dict(zip(block_data.block_number, block_data.base_fee_per_gas))


# Block number of every transaction, in row order.
transaction_block_numbers = list(transaction_data.block_number)


# Look up the gas limit and base fee for each transaction. Plain dict indexing
# is used on purpose: a transaction whose block is missing from block_data
# raises KeyError instead of silently receiving a missing value.
gas_limits_for_transaction_data = [gas_limit_tracker[b] for b in transaction_block_numbers]
base_fee_for_transaction_data = [base_fee_tracker[b] for b in transaction_block_numbers]


# Add the new columns to the dataframe.
transaction_data['gas_limit'] = gas_limits_for_transaction_data
transaction_data['base_fee'] = base_fee_for_transaction_data


# The final step is to remove the rows with NA fee fields. Testing with
# np.sum(transaction_data.isna()) showed that max_fee_per_gas and
# max_priority_fee_per_gas have the same number of NA's, but equal counts do
# not guarantee that the NA's sit in the same rows, so rows missing either
# field are dropped. The explicit copy makes the result an independent
# dataframe, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
transaction_data = transaction_data.dropna(
    subset=['max_fee_per_gas', 'max_priority_fee_per_gas']
).copy()


#######################################################################################################################
## ADDING THE RESCALED GAS PRICES INTO THE DATAFRAME
#######################################################################################################################
## now we need to add another column that converts the gas price into a metric that we 
## can compare to the pre EIP 1559 data. to do this, we work under the
## assumption that gas limits represent the same metric that they do in the pre EIP 
## 1559 network (an assumption that the previous paper made and that we 
## continue in this proposal). the user bid in the post EIP 1559 data is
## min(base fee + tip, max fee), while in the pre EIP data the user bid is equal 
## to gas price * gas limit. therefore, if we set these two bids to be equal, we can 
## solve for the equivalent of the gas price in the post EIP data by taking 
## min(base fee + tip, max fee) and dividing it by the gas limit
#######################################################################################################################


# Walk the four fee-related columns in lockstep, one tuple per transaction.
fee_rows = zip(
    transaction_data.base_fee,
    transaction_data.gas_limit,
    transaction_data.max_fee_per_gas,
    transaction_data.max_priority_fee_per_gas,
)


# Rescale each gas price with the mechanism described above: the smaller of
# (base fee + tip) and the max fee, divided by the gas limit.
rescaled_gas_prices = [
    min(base + priority, cap) / limit
    for base, limit, cap, priority in fee_rows
]


# Store the rescaled gas prices as a new dataframe column.
transaction_data['rescaled_gas_prices'] = rescaled_gas_prices


#######################################################################################################################
## COMPARING VARIANCE
#######################################################################################################################
## We will be comparing the variance in two ways. First, we simply take 
## the variance of each entire dataset. Then we run a simulation in which the 
## code repeatedly draws random sets of 500 observations from both the pre and 
## post EIP 1559 data, compares their variances, and reports the results.
#######################################################################################################################



#######################################################################################################################
## Clean up the data for comparison, remove outliers
#######################################################################################################################
##
##I will be using the "03_22_03_26.csv" dataset in the CAMCOS google drive for the
## largest portion of data available
##
##
## we randomly pick 40,000 observations, for a better comparison of the variance 
#######################################################################################################################



# Read the pre EIP gas prices and convert them to a plain list.
large_pre_gas_prices = list(pd.read_csv("/Users/jacobmcgraw/Downloads/CAMCOS_Datasets/03_22_03_26.csv").gas_price)


# Draw 40000 indexes into the larger dataset, uniformly and with replacement.
# randint's upper bound is exclusive, so every index from 0 to len-1 is
# equally likely. (Rounding a continuous uniform draw on [0, len-2] could
# never select the last observation and under-weighted the end points.)
pre_index = np.random.randint(0, len(large_pre_gas_prices), 40000).tolist()


# The 40000 randomly chosen observations from the larger dataset.
pre_gas_prices = [large_pre_gas_prices[i] for i in pre_index]


# Grab the post EIP gas prices for later comparison.
post_gas_prices = list(transaction_data.rescaled_gas_prices)

# 10% and 90% quantiles of both samples, used below as outlier cut-offs.
pre_up_lim = np.quantile(pre_gas_prices, 0.9)
pre_lo_lim = np.quantile(pre_gas_prices, 0.1)
post_up_lim = np.quantile(post_gas_prices, 0.9)
post_lo_lim = np.quantile(post_gas_prices, 0.1)


# Remove outliers from both the pre and post EIP data: keep only values
# strictly between the 10% and 90% quantiles. pre_gas and post_gas are the
# final trimmed gas prices used from here on.
pre_gas = [price for price in pre_gas_prices if pre_lo_lim < price < pre_up_lim]
post_gas = [price for price in post_gas_prices if post_lo_lim < price < post_up_lim]

        
#######################################################################################################################
## Simulation 1: with non-ideal data
##
##
## NOTE!!! the data was of notably different scales, so in order to compare variance
## with any sort of accuracy, we must normalize the data in our simulation
#######################################################################################################################



#a function designed to take two lists, pre and post EIP respectively,
#and return False if post is bigger and True if post is smaller
def variance_checker(pre,post):
    """Report whether the post EIP data is no more variable than the pre EIP data.

    Args:
        pre: values from the pre EIP dataset (anything np.var accepts).
        post: values from the post EIP dataset (anything np.var accepts).

    Returns:
        False when np.var(pre) is strictly smaller than np.var(post),
        True otherwise (including when the two variances are equal).
    """
    return not (np.var(pre) < np.var(post))
    
    
#a function designed to take two lists, along with a specified integer, and then
#generate that many random indexes into each of the two lists
def random_index_generator(list1,list2,number):
    """Draw random positions into two lists, uniformly and with replacement.

    Every valid index 0 .. len(list) - 1 is equally likely. Rounding a
    continuous uniform draw on [0, len - 2] could never return the last
    index, gave the end points half the weight of interior indexes, and
    produced negative indexes for lists shorter than two elements.

    Args:
        list1: first sequence to index into; only its length is used.
        list2: second sequence to index into; only its length is used.
        number: how many indexes to draw for each list.

    Returns:
        [indexes_for_list1, indexes_for_list2], each a list of `number`
        Python ints.

    Raises:
        ValueError: if `number` is positive and either list is empty, or if
            `number` is negative.
    """
    def _draw(size):
        # Nothing to draw: valid even for an empty list.
        if number == 0:
            return []
        if size == 0:
            raise ValueError("cannot draw random indexes from an empty list")
        # The upper bound of randint is exclusive, so this covers 0..size-1.
        return np.random.randint(0, size, number).tolist()

    return [_draw(len(list1)), _draw(len(list2))]


# Number of trials to run in the simulation.
trials = 10000
# One boolean per trial, as returned by variance_checker.
results = []


# Each trial draws 500 random observations from each trimmed dataset, rescales
# each draw with preprocessing.normalize (passed as a single row), and records
# whether the post EIP draw had the smaller (or equal) variance.
for _ in range(trials):
    index_1, index_2 = random_index_generator(pre_gas, post_gas, 500)
    pre_sample = [pre_gas[x] for x in index_1]
    post_sample = [post_gas[x] for x in index_2]
    my_pre_gas = preprocessing.normalize([pre_sample])
    my_post_gas = preprocessing.normalize([post_sample])
    results.append(variance_checker(my_pre_gas, my_post_gas))

    
# Report the variance of each full (normalized) dataset together with the
# share of simulation trials in which the post EIP draw was not more variable.
post_normalized_variance = np.var(preprocessing.normalize([post_gas]))
pre_normalized_variance = np.var(preprocessing.normalize([pre_gas]))
percent_lower = int(round((sum(results) / len(results)) * 100))
print(
    "the variance in the post EIP-1559 data is ",
    post_normalized_variance,
    " and the variance in the pre EIP-1559 data is ",
    pre_normalized_variance,
    ". the percentage of times the variance was lower in post EIP-1559 data ",
    "durring our simulation after normalizing was ",
    percent_lower,
    '%. Note, the data had ',
    "to be normalized to make up for the discrepency of size in the units",
    sep="",
)


# Print the same summary statistics for the pre and post EIP gas prices; only
# the heading differs between the two sections.
for heading, sample in (
    ('\n Some summary stats: \n \t Pre-EIP: \n', pre_gas),
    ('\n \n \t Post-EIP:', post_gas),
):
    print(heading)
    print('\t Max: ', np.max(sample), sep='')
    print('\n \t Min: ', np.min(sample), sep='')
    print('\n \t Mean: ', np.mean(sample), sep='')
    print('\n \t Variance: ', np.var(sample), sep='')
    quartiles = [str(np.quantile(sample, q)) for q in (0.25, 0.5, 0.75)]
    print('\n \t Quartile 25,50,75: ' + ','.join(quartiles))

the variance in the post EIP-1559 data is 2.3283230549441218e-06 and the variance in the pre EIP-1559 data is 1.5433711088319542e-06. the percentage of times the variance was lower in post EIP-1559 data durring our simulation after normalizing was 0%. Note, the data had to be normalized to make up for the discrepency of size in the units

 Some summary stats: 
 	 Pre-EIP: 

	 Max: 257600000000

 	 Min: 107000001459

 	 Mean: 165073852387.94214

 	 Variance: 1.4151026183521242e+21

 	 Quartile 25,50,75: 134200000000.0,158000000000.0,190000000000.0

 
 	 Post-EIP:
	 Max: 2743.2609377589847

 	 Min: 1097.6

 	 Mean: 1524.066205733545

 	 Variance: 186551.99617085487

 	 Quartile 25,50,75: 1229.9784848915706,1373.867470296797,1663.615035941559
