# Data Set Up

In [1]:
!pip install pyathena

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting pyathena
  Downloading PyAthena-2.3.0-py3-none-any.whl (37 kB)
Installing collected packages: pyathena
Successfully installed pyathena-2.3.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
!pip install awswrangler

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting awswrangler
  Downloading awswrangler-2.12.1-py3-none-any.whl (211 kB)
     |████████████████████████████████| 211 kB 6.9 MB/s            
Collecting requests-aws4auth<2.0.0,>=1.1.1
  Downloading requests_aws4auth-1.1.1-py2.py3-none-any.whl (31 kB)
Collecting opensearch-py<2.0.0,>=1.0.0
  Downloading opensearch_py-1.0.0-py2.py3-none-any.whl (207 kB)
     |████████████████████████████████| 207 kB 100.9 MB/s            
[?25hCollecting jsonpath-ng<2.0.0,>=1.5.3
  Downloading jsonpath_ng-1.5.3-py3-none-any.whl (29 kB)
Collecting pymysql<1.1.0,>=0.9.0
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
     |████████████████████████████████| 43 kB 4.8 MB/s             
Collecting progressbar2<4.0.0,>=3.53.3
  Downloading progressbar2-3.55.0-py2.py3-none-any.whl (26 kB)
Collecting pg8000<1.22.0,>=1.16.0
  Downloading pg8000-1.21.3-py3-none-any.whl (34 kB)
Collecting redshift-connector<2.1.0,>=2.

In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from numpy import intersect1d
from datetime import datetime
from pyathena import connect
import boto3
import awswrangler as wr

Matplotlib is building the font cache; this may take a moment.


In [4]:
from ethereum import (
    timestamp_to_datetime,
    EthereumData
)
from utils import (
    latest,
    lead_lag,
    add_latest_avail_block,
    lagged_block_data
)

## Blocks

In [5]:
blocks_df = wr.s3.read_csv(path="s3://sagemaker-w210-eth/raw_data/infuria/blocks")

In [6]:
blocks_df.head()

Unnamed: 0,number,hash,parent_hash,nonce,sha3_uncles,logs_bloom,transactions_root,state_root,receipts_root,miner,difficulty,total_difficulty,size,extra_data,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas
0,12962018,0x93288d45fd0daac2605174fc6d14d27a1f18ed6d72ae...,0x9c42cc01eb5564709c476eff08065a67fc8e479e996c...,0xf1828e43fadfc703,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a...,0x60af99ca65d7000f80980c1cef011a3b585619005110...,0xa6d240b3a7409a4f197e347d5dbed9159802e4787b82...,0x6459e84f5309663bb565625ed662809c0233889631c8...,0x0cadca37374ac1442ab84af32b4d1dc0e90837b1fc71...,0x829bd824b016326a401d083b33d092293333a830,7604452341876898,28471525393295665706633,55433,0xe4b883e5bda9e7a59ee4bb99e9b1bc020321,15000000,14994306,1628126126,160,
1,12962019,0xeb13f4f4bc463d6ff438df7ea04074a064ca95b474cc...,0x93288d45fd0daac2605174fc6d14d27a1f18ed6d72ae...,0xa4bfd2e7099b4a6b,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a...,0x532383a629274b7190ccc2d0aa02986d54e296d49823...,0x9b9701c0c94d550770390ec857e85e650874ec32cd79...,0xb75212595e6aedf34bae4f08d1dddd3a34a933f0cb09...,0x95cda15e5b562751ce34c3760b5ebedae43fc60fe9d5...,0x8595dd9e0438640b5e1254f9df579ac12a86865f,7604589780830370,28471532997885446537003,79403,0x657a696c2e6d65,14985353,14969945,1628126136,217,
2,12962020,0x50ded5bb455b2bc00392dd36dcfff3d3019eb9517478...,0xeb13f4f4bc463d6ff438df7ea04074a064ca95b474cc...,0xe21691d992c44366,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a...,0x30f7606e59608213d0e6f610b860da97c90291082a18...,0x6e39ad0229b49c1c0793d4ac9f84ee3cfafde05f31ca...,0xb533fcc03912aad33abe748bd627bd1ca1233284a462...,0xdcabba7c1257c64d0ed6d845048727a3291aed33f358...,0x7f101fe45e6649a6fb8f3f8b43ed03d353f2b90c,7601014041179921,28471540598899487716924,69949,0x466c6578706f6f6c2f53302f55532d45617374202d20...,14970720,14964505,1628126157,155,
3,12962021,0xc3e9e7c41845ae59c14e7b278ab9a2d4c56368761d3c...,0x50ded5bb455b2bc00392dd36dcfff3d3019eb9517478...,0x130800068dfdf6ed,0x34db8d1d8cd5772d238ecfa7e41114456c3f188235b9...,0xe0b1082ad3da6c93ca8e20eda8619e514f6651111166...,0x463fa688671b03a26f1812b161dab03b7ba55d4f2336...,0xd9e98811a4b7d7a22912ded685ff1a1537eda9729509...,0xe654f88bcd7177c2e5457910a848513dd475920897db...,0x09ab1303d3ccaf5f018cd511146b07a240c70294,7601151480133393,28471548200050967850317,92704,0x6d696e6572616c6c2e696f,14956102,14944125,1628126167,195,
4,12962022,0x662aac52ee02b41ac3f3de32382eff0c5d1a4eeb31d2...,0xc3e9e7c41845ae59c14e7b278ab9a2d4c56368761d3c...,0x979e4f65cdc6a89d,0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a...,0x76a1f1b641489b8d5090e6dfa2781725a2c61333f898...,0x8371509cd79d207d83603fec4397472abc9deb1f8c82...,0xacc983062bb1a54aded29c48652d5def40c47987f424...,0xd6924c31610c313db53b63a03554b8e124d1b6a3195e...,0x5a0b54d5dc17e0aadc383d2db43b0a0d3e029c4c,7597577419340707,28471555797628387191024,65855,0xd883010a06846765746888676f312e31362e36856c69...,14963388,14943452,1628126200,178,


In [7]:
print("Blocks shape:", blocks_df.shape)
print("Minimum block number:", blocks_df.number.min())
print("Maximum block number:", blocks_df.number.max())

Blocks shape: (566141, 19)
Minimum block number: 12961718
Maximum block number: 13527858


In [8]:
cols = [
    'number', 'difficulty', 'total_difficulty', 'size', 'gas_limit', 
    'gas_used', 'timestamp', 'transaction_count', 'base_fee_per_gas'
]

blocks_df = blocks_df[cols]


In [9]:
print("# of unique blocks:", len(pd.unique(blocks_df.number)))

# of unique blocks: 566141


In [10]:
blocks_df.head()

Unnamed: 0,number,difficulty,total_difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas
0,12962018,7604452341876898,28471525393295665706633,55433,15000000,14994306,1628126126,160,
1,12962019,7604589780830370,28471532997885446537003,79403,14985353,14969945,1628126136,217,
2,12962020,7601014041179921,28471540598899487716924,69949,14970720,14964505,1628126157,155,
3,12962021,7601151480133393,28471548200050967850317,92704,14956102,14944125,1628126167,195,
4,12962022,7597577419340707,28471555797628387191024,65855,14963388,14943452,1628126200,178,


In [11]:
# set index and add readable timestamp
blocks_df = blocks_df.set_index('number')
blocks_df['timestamp_2'] = pd.to_datetime(blocks_df['timestamp'], unit='s')

In [12]:
blocks_df = blocks_df.sort_index()

In [13]:
blocks_df.head()

Unnamed: 0_level_0,difficulty,total_difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas,timestamp_2
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12961718,7774193366499997,28469210465037599682985,73653,14970676,14957821,1628121604,152,,2021-08-05 00:00:04
12961719,7778126798308205,28469218243164397991190,103790,14985294,14975998,1628121607,235,,2021-08-05 00:00:07
12961720,7766870496834469,28469226010034894825659,101080,14999927,14981552,1628121647,196,,2021-08-05 00:00:47
12961721,7770800353022723,28469233780835247848382,68027,15000000,14987640,1628121663,194,,2021-08-05 00:01:03
12961722,7767143455866321,28469241547978703714703,543,15000000,0,1628121685,0,,2021-08-05 00:01:25


In [14]:
blocks_df['base_fee_per_gas'].describe()

count    5.628590e+05
mean     8.389799e+10
std      7.564743e+10
min      1.000000e+09
25%      4.832533e+10
50%      6.923107e+10
75%      9.901271e+10
max      3.941920e+12
Name: base_fee_per_gas, dtype: float64

#### Create percent changes in variables

In [15]:
cols = ['difficulty', 'total_difficulty', 'size', 'gas_limit', 'base_fee_per_gas']
for c in cols:
    print(c, blocks_df[c].dtype)

difficulty int64
total_difficulty object
size int64
gas_limit int64
base_fee_per_gas float64


In [16]:
# Change difficulty to float to calculate change in difficulty
blocks_df['total_difficulty'] = blocks_df['total_difficulty'].astype(float)

In [17]:
cols = ['difficulty', 'total_difficulty', 'size', 'gas_limit', 'base_fee_per_gas']
for col in cols:
    blocks_df[col+'_pct_chg'] = blocks_df[col]/blocks_df[col].shift(1)-1

In [18]:
blocks_df.tail()

Unnamed: 0_level_0,difficulty,total_difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas,timestamp_2,difficulty_pct_chg,total_difficulty_pct_chg,size_pct_chg,gas_limit_pct_chg,base_fee_per_gas_pct_chg
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
13527854,10142061783382444,3.349723e+22,59466,30029295,14081332,1635724692,185,172171000000.0,2021-10-31 23:58:12,0.000495,3.027732e-07,3.496484,0.000976,-0.098995
13527855,10142130502859180,3.349724e+22,113946,30058619,30037769,1635724708,407,170833200000.0,2021-10-31 23:58:28,7e-06,3.027752e-07,0.916154,0.000977,-0.00777
13527856,10127342585857119,3.349725e+22,101346,30029266,20028596,1635724748,186,192157700000.0,2021-10-31 23:59:08,-0.001458,3.023336e-07,-0.110579,-0.000977,0.124827
13527857,10127411305333855,3.349726e+22,12068,30000000,2550891,1635724758,30,200178800000.0,2021-10-31 23:59:18,7e-06,3.023356e-07,-0.880923,-0.000975,0.041742
13527858,10132425049862023,3.349727e+22,132704,29980831,29719237,1635724760,408,179411800000.0,2021-10-31 23:59:20,0.000495,3.024852e-07,9.996354,-0.000639,-0.103743


#### Calculate lagged variables from blocks to join to PIT

In [19]:
cols = ['difficulty', 'total_difficulty', 'size', 'gas_limit', 'base_fee_per_gas']
for col in cols:
    # Last 5 blocks
    blocks_df[col+'_pct_chg_last_5'] = blocks_df[col]/blocks_df[col].shift(5)-1
    # 25 blocks ago to 5 blocks ago percentage changes
    blocks_df[col+'_pct_chg_last_25_to_5'] = blocks_df[col].shift(5)/blocks_df[col].shift(25)-1
    # 50 blocks ago to 5 blocks ago percentage changes
    blocks_df[col+'_pct_chg_last_50_to_5'] = blocks_df[col].shift(5)/blocks_df[col].shift(50)-1
    # 100 blocks ago to 5 blocks ago percentage changes
    blocks_df[col+'_pct_chg_last_100_to_5'] = blocks_df[col].shift(5)/blocks_df[col].shift(100)-1

In [20]:
blocks_df.head()

Unnamed: 0_level_0,difficulty,total_difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas,timestamp_2,difficulty_pct_chg,...,size_pct_chg_last_50_to_5,size_pct_chg_last_100_to_5,gas_limit_pct_chg_last_5,gas_limit_pct_chg_last_25_to_5,gas_limit_pct_chg_last_50_to_5,gas_limit_pct_chg_last_100_to_5,base_fee_per_gas_pct_chg_last_5,base_fee_per_gas_pct_chg_last_25_to_5,base_fee_per_gas_pct_chg_last_50_to_5,base_fee_per_gas_pct_chg_last_100_to_5
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12961718,7774193366499997,2.846921e+22,73653,14970676,14957821,1628121604,152,,2021-08-05 00:00:04,,...,,,,,,,,,,
12961719,7778126798308205,2.846922e+22,103790,14985294,14975998,1628121607,235,,2021-08-05 00:00:07,0.000506,...,,,,,,,,,,
12961720,7766870496834469,2.846923e+22,101080,14999927,14981552,1628121647,196,,2021-08-05 00:00:47,-0.001447,...,,,,,,,,,,
12961721,7770800353022723,2.846923e+22,68027,15000000,14987640,1628121663,194,,2021-08-05 00:01:03,0.000506,...,,,,,,,,,,
12961722,7767143455866321,2.846924e+22,543,15000000,0,1628121685,0,,2021-08-05 00:01:25,-0.000471,...,,,,,,,,,,


In [21]:
blocks_df.describe()

Unnamed: 0,difficulty,total_difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas,difficulty_pct_chg,total_difficulty_pct_chg,...,size_pct_chg_last_50_to_5,size_pct_chg_last_100_to_5,gas_limit_pct_chg_last_5,gas_limit_pct_chg_last_25_to_5,gas_limit_pct_chg_last_50_to_5,gas_limit_pct_chg_last_100_to_5,base_fee_per_gas_pct_chg_last_5,base_fee_per_gas_pct_chg_last_25_to_5,base_fee_per_gas_pct_chg_last_50_to_5,base_fee_per_gas_pct_chg_last_100_to_5
count,566141.0,566141.0,566141.0,566141.0,566141.0,566141.0,566141.0,562859.0,566140.0,566140.0,...,566091.0,566041.0,566136.0,566116.0,566091.0,566041.0,562854.0,562834.0,562809.0,562759.0
mean,8881305000000000.0,3.086043e+22,77185.83,29923290.0,15442160.0,1631910000.0,188.815684,83897990000.0,7.155172e-07,2.872804e-07,...,4.08801,4.129498,1e-05,3.6e-05,8e-05,0.000167,0.012018,0.031181,0.061357,0.086395
std,777247200000000.0,1.456622e+21,57594.66,1146282.0,10294740.0,2193156.0,142.875043,75647430000.0,0.0007033183,1.28319e-08,...,23.283212,23.399238,0.003215,0.006055,0.008896,0.012699,0.160737,0.333681,0.758804,0.915449
min,7213602000000000.0,2.846921e+22,520.0,14890370.0,0.0,1628122000.0,0.0,1000000000.0,-0.008297229,2.519518e-07,...,-0.999368,-0.999269,-0.004873,-0.015512,-0.024138,-0.031755,-0.470438,-0.863385,-0.953787,-0.97217
25%,8224747000000000.0,2.957782e+22,28275.0,30000000.0,6122355.0,1630011000.0,74.0,48325330000.0,-0.0004816755,2.78465e-07,...,-0.597942,-0.598376,-0.000976,-0.000976,-0.000976,-0.000976,-0.100389,-0.131083,-0.149842,-0.167211
50%,8964495000000000.0,3.080368e+22,66197.0,30000000.0,13962690.0,1631902000.0,161.0,69231070000.0,6.689213e-06,2.894665e-07,...,-0.00124,-0.003027,0.0,0.0,0.0,0.0,-0.011156,-0.008042,-0.008954,-0.00853
75%,9495806000000000.0,3.211024e+22,118983.0,30029300.0,26336530.0,1633807000.0,283.0,99012710000.0,0.0004892359,2.967394e-07,...,1.486775,1.486212,0.000975,0.000976,0.000976,0.000976,0.099221,0.137729,0.160197,0.183855
max,1.066084e+16,3.349727e+22,1457148.0,30382990.0,30292930.0,1635725000.0,1431.0,3941920000000.0,0.0009949027,3.193828e-07,...,855.144424,889.976364,0.999015,0.996091,1.003898,0.998041,0.801837,9.533298,50.93609,58.384491


## Transactions

In [None]:
# Old way

# transactions_df_1 = pd.read_csv(r'data/transactions_small/12961718_13136426.csv')
# transactions_df_2 = pd.read_csv(r'data/transactions_small/13136427_13330089.csv')
# transactions_df_3 = pd.read_csv(r'data/transactions_small/13330090_13527858.csv')

# print("transactions_df_1 shape:", transactions_df_1.shape)
# print("transactions_df_2 shape:", transactions_df_2.shape)
# print("transactions_df_3 shape:", transactions_df_3.shape)

# transactions_df = pd.concat([transactions_df_1, transactions_df_2, transactions_df_3], ignore_index=True)
# transactions_df.shape

# transactions_df['block_timestamp_2'] = pd.to_datetime(transactions_df['block_timestamp'], unit='s')

# cols = [
#     'transaction_id', 'block_number', 'transaction_index', 'value', 
#     'gas', 'gas_price', 'block_timestamp', 'max_fee_per_gas', 'max_priority_fee_per_gas',
#     'transaction_type'
# ]
# transactions_df = transactions_df[cols]

# print("Minimum block number:", transactions_df.block_number.min())
# print("Maximum block number:", transactions_df.block_number.max())

In [26]:
transactions_df = pd.read_csv(r'data/transactions_small.csv')
transactions_df.shape

(106896300, 10)

In [27]:
transactions_df.head()

Unnamed: 0,transaction_id,block_number,transaction_index,value,gas,gas_price,max_fee_per_gas,max_priority_fee_per_gas,transaction_type,block_timestamp
0,133302900000000000,13330290,0,300000000000000000,611894,74759089945,75000000000.0,1500000000.0,2,1633049163
1,133302900000000001,13330290,1,0,184981,74759089945,177000000000.0,1500000000.0,2,1633049163
2,133302900000000002,13330290,2,0,278299,74259089945,75000000000.0,1000000000.0,2,1633049163
3,133302900000000003,13330290,3,74423577302830566,21000,1613659229046,,,0,1633049163
4,133302900000000004,13330290,4,9368049882777163,21000,111116168767,111116200000.0,111116200000.0,2,1633049163


In [28]:
print("Minimum block number:", transactions_df.block_number.min())
print("Maximum block number:", transactions_df.block_number.max())

Minimum block number: 12961718
Maximum block number: 13527858


In [29]:
print("# of unique blocks:", len(pd.unique(transactions_df.block_number)))

# of unique blocks: 556937


In [30]:
transactions_df.describe()

Unnamed: 0,transaction_id,block_number,transaction_index,gas,gas_price,max_fee_per_gas,max_priority_fee_per_gas,transaction_type,block_timestamp
count,106896300.0,106896300.0,106896300.0,106896300.0,106896300.0,52487540.0,52487540.0,106896300.0,106896300.0
mean,1.324922e+17,13249220.0,147.9638,173490.8,98893360000.0,159523900000.0,16931260000.0,0.982345,1631970000.0
std,1655212000000000.0,165521.2,131.4379,381874.5,5156944000000.0,15986180000000.0,7357534000000.0,0.9996853,2221511.0
min,1.296172e+17,12961720.0,0.0,21000.0,0.0,1423420000.0,0.0,0.0,1628122000.0
25%,1.310515e+17,13105140.0,51.0,36507.0,54964970000.0,78000000000.0,1500000000.0,0.0,1630036000.0
50%,1.324835e+17,13248350.0,117.0,99262.0,78022670000.0,111427200000.0,1940000000.0,0.0,1631951000.0
75%,1.339462e+17,13394620.0,213.0,226090.0,113925900000.0,165575100000.0,3000000000.0,2.0,1633919000.0
max,1.352786e+17,13527860.0,1430.0,30176250.0,5.324367e+16,1.15774e+17,5.324361e+16,2.0,1635725000.0


## Receipts

In [22]:
receipts_df = pd.read_csv(r'data/receipts_small.csv')
receipts_df.shape

(177191385, 7)

In [23]:
receipts_df.head()

Unnamed: 0,transaction_id,transaction_index,block_number,receipt_cumulative_gas_used,receipt_gas_used,receipt_status,receipt_effective_gas_price
0,130970770000000042,42,13097077,20269277,615110,1,699757885383
1,130792690000000094,94,13079269,4909942,105802,1,62815025192
2,130989030000000240,240,13098903,16465005,218423,1,69615073997
3,131641020000000106,106,13164102,7507934,21000,1,157010799148
4,131636110000000098,98,13163611,5929557,21000,1,99382274301


In [24]:
print("Minimum block number:", receipts_df.block_number.min())
print("Maximum block number:", receipts_df.block_number.max())

Minimum block number: 12961719
Maximum block number: 13538797


In [25]:
print("# of unique blocks:", len(pd.unique(receipts_df.block_number)))

# of unique blocks: 567717


In [31]:
receipts_df.describe()

Unnamed: 0,transaction_id,transaction_index,block_number,receipt_cumulative_gas_used,receipt_gas_used,receipt_status,receipt_effective_gas_price
count,177191400.0,177191400.0,177191400.0,177191400.0,177191400.0,177191400.0,177191400.0
mean,1.321286e+17,147.0498,13212860.0,10042850.0,82396.54,0.963564,95599850000.0
std,1573253000000000.0,131.1281,157325.3,7987629.0,169332.0,0.1873725,5661691000000.0
min,1.296172e+17,0.0,12961720.0,19260.0,13524.0,0.0,0.0
25%,1.308061e+17,51.0,13080610.0,3310022.0,21000.0,1.0,52000000000.0
50%,1.319993e+17,116.0,13199930.0,7915357.0,46109.0,1.0,74430280000.0
75%,1.332139e+17,211.0,13321380.0,15348480.0,92378.0,1.0,108748800000.0
max,1.35388e+17,1430.0,13538800.0,30292930.0,30000000.0,1.0,5.324367e+16


In [32]:
receipts_df.rename(columns={'receipt_cumulative_gas_used': 'cumulative_gas_used',
                            'receipt_gas_used': 'gas_used',
                            'receipt_status': 'status',
                            'receipt_effective_gas_price': 'effective_gas_price'
                           }, inplace=True)

In [33]:
receipts_df.head()

Unnamed: 0,transaction_id,transaction_index,block_number,cumulative_gas_used,gas_used,status,effective_gas_price
0,130970770000000042,42,13097077,20269277,615110,1,699757885383
1,130792690000000094,94,13079269,4909942,105802,1,62815025192
2,130989030000000240,240,13098903,16465005,218423,1,69615073997
3,131641020000000106,106,13164102,7507934,21000,1,157010799148
4,131636110000000098,98,13163611,5929557,21000,1,99382274301


In [34]:
cols = ['transaction_id', 'block_number', 'cumulative_gas_used',
    'gas_used', 'status', 'effective_gas_price']
receipts_df = receipts_df[cols]

In [None]:
# cols = [
#     'transaction_id', 'block_number', 'cumulative_gas_used',
#     'gas_used', 'status', 'effective_gas_price'
# ]

# # where block_number between 12967000 and 12968000

# receipts_df = pd.read_sql(f"SELECT {', '.join(cols)} FROM ethereumetl.receipts_small  ", conn)
# receipts_df.shape

## Merge Transactions and Receipts

In [35]:
transactions_receipts_df = transactions_df.merge(receipts_df,
                                             how='inner',
                                             left_on=['transaction_id', 'block_number'],
                                             right_on=['transaction_id', 'block_number'])
transactions_receipts_df.shape

(174852847, 14)

In [36]:
transactions_receipts_df.head()

Unnamed: 0,transaction_id,block_number,transaction_index,value,gas,gas_price,max_fee_per_gas,max_priority_fee_per_gas,transaction_type,block_timestamp,cumulative_gas_used,gas_used,status,effective_gas_price
0,133302900000000000,13330290,0,300000000000000000,611894,74759089945,75000000000.0,1500000000.0,2,1633049163,611894,611894,1,74759089945
1,133302900000000001,13330290,1,0,184981,74759089945,177000000000.0,1500000000.0,2,1633049163,746250,134356,1,74759089945
2,133302900000000002,13330290,2,0,278299,74259089945,75000000000.0,1000000000.0,2,1633049163,922183,175933,1,74259089945
3,133302900000000003,13330290,3,74423577302830566,21000,1613659229046,,,0,1633049163,943183,21000,1,1613659229046
4,133302900000000004,13330290,4,9368049882777163,21000,111116168767,111116200000.0,111116200000.0,2,1633049163,964183,21000,1,111116168767


In [37]:
#transactions_receipts_df = transactions_receipts_df.reset_index()
transactions_receipts_df.columns

Index(['transaction_id', 'block_number', 'transaction_index', 'value', 'gas',
       'gas_price', 'max_fee_per_gas', 'max_priority_fee_per_gas',
       'transaction_type', 'block_timestamp', 'cumulative_gas_used',
       'gas_used', 'status', 'effective_gas_price'],
      dtype='object')

In [38]:
# Calculate aggregated variables at block level
transactions_receipts_agg_df = transactions_receipts_df[['block_number', 'gas', 'gas_price', 'gas_used', 'effective_gas_price']]\
        .groupby('block_number').agg(['min', 'mean', 'count'])
transactions_receipts_agg_df.columns = transactions_receipts_agg_df.columns.map('_'.join).str.strip('_')
transactions_receipts_agg_df

Unnamed: 0_level_0,gas_min,gas_mean,gas_count,gas_price_min,gas_price_mean,gas_price_count,gas_used_min,gas_used_mean,gas_used_count,effective_gas_price_min,effective_gas_price_mean,effective_gas_price_count
block_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
12961719,21000,173559.787234,470,0,6.130997e+10,470,19730,63727.651064,470,0,6.130997e+10,470
12961720,21000,255567.969388,392,0,5.420722e+10,392,19853,76436.489796,392,0,5.420722e+10,392
12961721,21000,223509.886598,388,0,5.656310e+10,388,19301,77255.876289,388,0,5.656310e+10,388
12961723,21000,171902.410714,336,39000000000,4.676679e+10,336,19393,89120.190476,336,39000000000,4.676679e+10,336
12961724,21000,242598.598039,408,0,5.865438e+10,408,16220,73487.240196,408,0,5.865438e+10,408
...,...,...,...,...,...,...,...,...,...,...,...,...
13527854,21000,138959.481081,185,172555259180,1.880757e+11,185,21000,76115.308108,185,172555259180,1.880757e+11,185
13527855,21000,157866.857494,407,171833195532,2.008356e+11,407,21000,73802.872236,407,171833195532,2.008356e+11,407
13527856,21000,167440.677419,186,192157722010,2.027112e+11,186,21000,107680.623656,186,192157722010,2.027112e+11,186
13527857,21000,150144.533333,30,201588828056,2.168811e+11,30,21000,85029.700000,30,201588828056,2.168811e+11,30


In [39]:
# Keep only certain columns
transactions_receipts_agg_df = transactions_receipts_agg_df[['gas_min', 'gas_mean', 'gas_price_min', 'gas_price_mean', 
                                                            'gas_used_min', 'gas_used_mean', 'effective_gas_price_min',
                                                             'effective_gas_price_mean', 'effective_gas_price_count']]
transactions_receipts_agg_df.rename(columns={'effective_gas_price_count': 'number_transactions_in_block'}, inplace=True)
transactions_receipts_agg_df        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0_level_0,gas_min,gas_mean,gas_price_min,gas_price_mean,gas_used_min,gas_used_mean,effective_gas_price_min,effective_gas_price_mean,number_transactions_in_block
block_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12961719,21000,173559.787234,0,6.130997e+10,19730,63727.651064,0,6.130997e+10,470
12961720,21000,255567.969388,0,5.420722e+10,19853,76436.489796,0,5.420722e+10,392
12961721,21000,223509.886598,0,5.656310e+10,19301,77255.876289,0,5.656310e+10,388
12961723,21000,171902.410714,39000000000,4.676679e+10,19393,89120.190476,39000000000,4.676679e+10,336
12961724,21000,242598.598039,0,5.865438e+10,16220,73487.240196,0,5.865438e+10,408
...,...,...,...,...,...,...,...,...,...
13527854,21000,138959.481081,172555259180,1.880757e+11,21000,76115.308108,172555259180,1.880757e+11,185
13527855,21000,157866.857494,171833195532,2.008356e+11,21000,73802.872236,171833195532,2.008356e+11,407
13527856,21000,167440.677419,192157722010,2.027112e+11,21000,107680.623656,192157722010,2.027112e+11,186
13527857,21000,150144.533333,201588828056,2.168811e+11,21000,85029.700000,201588828056,2.168811e+11,30


In [40]:
cols = ['gas_mean', 'gas_price_mean', 'gas_used_mean', 'effective_gas_price_mean', 'number_transactions_in_block']
for col in cols:
    # Last 5 blocks
    transactions_receipts_agg_df[col+'_pct_chg_last_5'] = transactions_receipts_agg_df[col]/transactions_receipts_agg_df[col].shift(5)-1
    # 25 blocks ago to 5 blocks ago percentage changes
    transactions_receipts_agg_df[col+'_pct_chg_last_25_to_5'] = transactions_receipts_agg_df[col].shift(5)/transactions_receipts_agg_df[col].shift(25)-1
    # 50 blocks ago to 5 blocks ago percentage changes
    transactions_receipts_agg_df[col+'_pct_chg_last_50_to_5'] = transactions_receipts_agg_df[col].shift(5)/transactions_receipts_agg_df[col].shift(50)-1
    # 100 blocks ago to 5 blocks ago percentage changes
    transactions_receipts_agg_df[col+'_pct_chg_last_100_to_5'] = transactions_receipts_agg_df[col].shift(5)/transactions_receipts_agg_df[col].shift(100)-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
transactions_receipts_agg_df.head()

Unnamed: 0_level_0,gas_min,gas_mean,gas_price_min,gas_price_mean,gas_used_min,gas_used_mean,effective_gas_price_min,effective_gas_price_mean,number_transactions_in_block,gas_mean_pct_chg_last_5,...,gas_used_mean_pct_chg_last_50_to_5,gas_used_mean_pct_chg_last_100_to_5,effective_gas_price_mean_pct_chg_last_5,effective_gas_price_mean_pct_chg_last_25_to_5,effective_gas_price_mean_pct_chg_last_50_to_5,effective_gas_price_mean_pct_chg_last_100_to_5,number_transactions_in_block_pct_chg_last_5,number_transactions_in_block_pct_chg_last_25_to_5,number_transactions_in_block_pct_chg_last_50_to_5,number_transactions_in_block_pct_chg_last_100_to_5
block_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12961719,21000,173559.787234,0,61309970000.0,19730,63727.651064,0,61309970000.0,470,,...,,,,,,,,,,
12961720,21000,255567.969388,0,54207220000.0,19853,76436.489796,0,54207220000.0,392,,...,,,,,,,,,,
12961721,21000,223509.886598,0,56563100000.0,19301,77255.876289,0,56563100000.0,388,,...,,,,,,,,,,
12961723,21000,171902.410714,39000000000,46766790000.0,19393,89120.190476,39000000000,46766790000.0,336,,...,,,,,,,,,,
12961724,21000,242598.598039,0,58654380000.0,16220,73487.240196,0,58654380000.0,408,,...,,,,,,,,,,


## Point In Time

In [None]:
# cols = [
#     'value', 'transaction_hash', 'log_index', 'block_number'
# ]

pit_df_old = pd.read_sql(f"SELECT * FROM ethereumetl.latest_avail_block_60 where number between 12967000 and 12968000 ", conn)
pit_df_old.shape

In [None]:
pit_df_old.head()

In [42]:
pit_df = pd.read_csv(r'data/pit_60.csv')

In [43]:
pit_df.shape

(566141, 4)

In [44]:
pit_df.head()

Unnamed: 0,number,datetime,lag_cutoff,latest_avail_block
0,12961718,2021-08-05 00:00:04,2021-08-04 23:59:04,
1,12961719,2021-08-05 00:00:07,2021-08-04 23:59:07,
2,12961720,2021-08-05 00:00:47,2021-08-04 23:59:47,
3,12961721,2021-08-05 00:01:03,2021-08-05 00:00:03,
4,12961722,2021-08-05 00:01:25,2021-08-05 00:00:25,12961719.0


In [45]:
pit_df = pit_df.set_index('number')
pit_df.rename(columns={'lag_cutoff': 'lag_cutoff_60',
                       'latest_avail_block': 'latest_avail_60'},
             inplace=True)

In [None]:
# pit_df = pit_df.set_index('number')
# pit_df['latest_avail_time_dt'] = pd.to_datetime(pit_df['latest_avail_time'], unit='s')
# pit_df.rename(columns={'lag_cutoff': 'lag_cutoff_60',
#                        'latest_avail_block': 'latest_avail_60',
#                       'latest_avail_time': 'latest_avail_time_60',
#                       'latest_avail_time_dt': 'latest_avail_time_dt_60'},
#              inplace=True)

In [None]:
# pit_df = pit_df.set_index('number')
# pit_df['latest_avail_time_dt'] = pd.to_datetime(pit_df['latest_avail_time'], unit='s')
# pit_df.rename(columns={'lag_cutoff': 'lag_cutoff_60',
#                        'latest_avail': 'latest_avail_60',
#                       'latest_avail_time': 'latest_avail_time_60',
#                       'latest_avail_time_dt': 'latest_avail_time_dt_60'},
#              inplace=True)

In [46]:
pit_df.head()

Unnamed: 0_level_0,datetime,lag_cutoff_60,latest_avail_60
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12961718,2021-08-05 00:00:04,2021-08-04 23:59:04,
12961719,2021-08-05 00:00:07,2021-08-04 23:59:07,
12961720,2021-08-05 00:00:47,2021-08-04 23:59:47,
12961721,2021-08-05 00:01:03,2021-08-05 00:00:03,
12961722,2021-08-05 00:01:25,2021-08-05 00:00:25,12961719.0


# Merge Data Sets

In [47]:
print("blocks length:", len(blocks_df))
print("pit length:", len(pit_df))
print("transactions length:", len(transactions_df))
print("receipts length:", len(receipts_df))
print("transactions_receipts_agg_df length:", len(transactions_receipts_agg_df))

blocks length: 566141
pit length: 566141
transactions length: 106896300
receipts length: 177191385
transactions_receipts_agg_df length: 556936


In [48]:
# Merge point-in-time with blocks
# merged_df = pit_df[['lag_cutoff_60', 'latest_avail_60', 'latest_avail_time_60', 'latest_avail_time_dt_60']].merge(blocks_df,
#                         how='inner',
#                         left_index=True,
#                         right_index=True)

# Merge point-in-time with blocks
merged_df = pit_df[['lag_cutoff_60', 'latest_avail_60', 'datetime']].merge(blocks_df,
                        how='inner',
                        left_index=True,
                        right_index=True)

merged_df.shape

(566141, 37)

In [49]:
print("Length of merged_df", len(merged_df))
merged_df.tail()

Length of merged_df 566141


Unnamed: 0_level_0,lag_cutoff_60,latest_avail_60,datetime,difficulty,total_difficulty,size,gas_limit,gas_used,timestamp,transaction_count,...,size_pct_chg_last_50_to_5,size_pct_chg_last_100_to_5,gas_limit_pct_chg_last_5,gas_limit_pct_chg_last_25_to_5,gas_limit_pct_chg_last_50_to_5,gas_limit_pct_chg_last_100_to_5,base_fee_per_gas_pct_chg_last_5,base_fee_per_gas_pct_chg_last_25_to_5,base_fee_per_gas_pct_chg_last_50_to_5,base_fee_per_gas_pct_chg_last_100_to_5
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13527854,2021-10-31 23:57:12,13527848.0,2021-10-31 23:58:12,10142061783382444,3.349723e+22,59466,30029295,14081332,1635724692,185,...,0.24067,-0.881631,0.001617,-0.00064,-0.00064,-0.00064,-0.202803,0.322248,-0.527752,0.701928
13527855,2021-10-31 23:57:28,13527849.0,2021-10-31 23:58:28,10142130502859180,3.349724e+22,113946,30058619,30037769,1635724708,407,...,33.849049,0.481497,0.002595,-0.000639,0.0,-0.000639,-0.126427,0.064287,-0.54974,0.369973
13527856,2021-10-31 23:58:08,13527852.0,2021-10-31 23:59:08,10127342585857119,3.349725e+22,101346,30029266,20028596,1635724748,186,...,-0.704577,0.736733,0.000976,0.000639,0.000639,0.000639,-0.037228,-0.034343,-0.497471,0.429607
13527857,2021-10-31 23:58:18,13527854.0,2021-10-31 23:59:18,10127411305333855,3.349726e+22,12068,30000000,2550891,1635724758,30,...,6.175762,0.080803,-0.000976,0.000639,0.001616,0.000976,0.127682,-0.235097,-0.593387,0.441729
13527858,2021-10-31 23:58:20,13527854.0,2021-10-31 23:59:20,10132425049862023,3.349727e+22,132704,29980831,29719237,1635724760,408,...,2.207616,-0.644881,-0.000639,0.0,0.000639,0.000639,-0.061103,-0.14465,-0.512152,0.476677


In [50]:
# Merge point-in-time/blocks with transactions_receipts_agg_df
pipeline_df = merged_df.merge(transactions_receipts_agg_df,
                        how='left',
                        left_index=True,
                        right_index=True)
pipeline_df.shape

(566141, 66)

In [51]:
pipeline_df.columns

Index(['lag_cutoff_60', 'latest_avail_60', 'datetime', 'difficulty',
       'total_difficulty', 'size', 'gas_limit', 'gas_used', 'timestamp',
       'transaction_count', 'base_fee_per_gas', 'timestamp_2',
       'difficulty_pct_chg', 'total_difficulty_pct_chg', 'size_pct_chg',
       'gas_limit_pct_chg', 'base_fee_per_gas_pct_chg',
       'difficulty_pct_chg_last_5', 'difficulty_pct_chg_last_25_to_5',
       'difficulty_pct_chg_last_50_to_5', 'difficulty_pct_chg_last_100_to_5',
       'total_difficulty_pct_chg_last_5',
       'total_difficulty_pct_chg_last_25_to_5',
       'total_difficulty_pct_chg_last_50_to_5',
       'total_difficulty_pct_chg_last_100_to_5', 'size_pct_chg_last_5',
       'size_pct_chg_last_25_to_5', 'size_pct_chg_last_50_to_5',
       'size_pct_chg_last_100_to_5', 'gas_limit_pct_chg_last_5',
       'gas_limit_pct_chg_last_25_to_5', 'gas_limit_pct_chg_last_50_to_5',
       'gas_limit_pct_chg_last_100_to_5', 'base_fee_per_gas_pct_chg_last_5',
       'base_fee_per_gas_

In [52]:
pd.options.display.max_columns = None
pipeline_df.describe()

Unnamed: 0,latest_avail_60,difficulty,total_difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas,difficulty_pct_chg,total_difficulty_pct_chg,size_pct_chg,gas_limit_pct_chg,base_fee_per_gas_pct_chg,difficulty_pct_chg_last_5,difficulty_pct_chg_last_25_to_5,difficulty_pct_chg_last_50_to_5,difficulty_pct_chg_last_100_to_5,total_difficulty_pct_chg_last_5,total_difficulty_pct_chg_last_25_to_5,total_difficulty_pct_chg_last_50_to_5,total_difficulty_pct_chg_last_100_to_5,size_pct_chg_last_5,size_pct_chg_last_25_to_5,size_pct_chg_last_50_to_5,size_pct_chg_last_100_to_5,gas_limit_pct_chg_last_5,gas_limit_pct_chg_last_25_to_5,gas_limit_pct_chg_last_50_to_5,gas_limit_pct_chg_last_100_to_5,base_fee_per_gas_pct_chg_last_5,base_fee_per_gas_pct_chg_last_25_to_5,base_fee_per_gas_pct_chg_last_50_to_5,base_fee_per_gas_pct_chg_last_100_to_5,gas_min,gas_mean,gas_price_min,gas_price_mean,gas_used_min,gas_used_mean,effective_gas_price_min,effective_gas_price_mean,number_transactions_in_block,gas_mean_pct_chg_last_5,gas_mean_pct_chg_last_25_to_5,gas_mean_pct_chg_last_50_to_5,gas_mean_pct_chg_last_100_to_5,gas_price_mean_pct_chg_last_5,gas_price_mean_pct_chg_last_25_to_5,gas_price_mean_pct_chg_last_50_to_5,gas_price_mean_pct_chg_last_100_to_5,gas_used_mean_pct_chg_last_5,gas_used_mean_pct_chg_last_25_to_5,gas_used_mean_pct_chg_last_50_to_5,gas_used_mean_pct_chg_last_100_to_5,effective_gas_price_mean_pct_chg_last_5,effective_gas_price_mean_pct_chg_last_25_to_5,effective_gas_price_mean_pct_chg_last_50_to_5,effective_gas_price_mean_pct_chg_last_100_to_5,number_transactions_in_block_pct_chg_last_5,number_transactions_in_block_pct_chg_last_25_to_5,number_transactions_in_block_pct_chg_last_50_to_5,number_transactions_in_block_pct_chg_last_100_to_5
count,566137.0,566141.0,566141.0,566141.0,566141.0,566141.0,566141.0,566141.0,562859.0,566140.0,566140.0,566140.0,566140.0,562858.0,566136.0,566116.0,566091.0,566041.0,566136.0,566116.0,566091.0,566041.0,566136.0,566116.0,566091.0,566041.0,566136.0,566116.0,566091.0,566041.0,562854.0,562834.0,562809.0,562759.0,556936.0,556936.0,556936.0,556936.0,556936.0,556936.0,556936.0,556936.0,556936.0,556931.0,556911.0,556886.0,556836.0,556931.0,556911.0,556886.0,556836.0,556931.0,556911.0,556886.0,556836.0,556931.0,556911.0,556886.0,556836.0,556931.0,556911.0,556886.0,556836.0
mean,13244780.0,8881305000000000.0,3.086043e+22,77185.83,29923290.0,15442160.0,1631910000.0,188.815684,83897990000.0,7.155172e-07,2.872804e-07,5.531328,2e-06,0.003672,4e-06,1.4e-05,3.2e-05,6.8e-05,1.436403e-06,5.74562e-06,1.292769e-05,2.7e-05,4.114598,4.054488,4.08801,4.129498,1e-05,3.6e-05,8e-05,0.000167,0.012018,0.031181,0.061357,0.086395,21667.53,188929.7,85063280000.0,102234400000.0,21340.77,89072.96,85063280000.0,102234400000.0,313.955009,0.125859,0.133857,0.138133,0.142811,inf,inf,inf,inf,0.104803,0.117146,0.123761,0.12736,inf,inf,inf,inf,1.718159,1.703634,1.717208,1.731129
std,163429.8,777247200000000.0,1.456622e+21,57594.66,1146282.0,10294740.0,2193156.0,142.875043,75647430000.0,0.0007033183,1.28319e-08,30.235284,0.00154,0.086,0.001557,0.003085,0.004593,0.006636,6.415319e-08,2.56534e-07,5.769133e-07,1e-06,23.649602,23.228035,23.283212,23.399238,0.003215,0.006055,0.008896,0.012699,0.160737,0.333681,0.758804,0.915449,78597.86,199575.8,93825890000.0,232812300000.0,74542.71,99265.0,93825890000.0,232812300000.0,254.926877,1.20674,1.265515,1.248601,1.262789,,,,,1.144399,1.219309,1.305462,1.260407,,,,,8.335019,8.288193,8.58524,8.485832
min,12961720.0,7213602000000000.0,2.846921e+22,520.0,14890370.0,0.0,1628122000.0,0.0,1000000000.0,-0.008297229,2.519518e-07,-0.999018,-0.000977,-0.125,-0.012636,-0.017439,-0.02271,-0.030165,1.260136e-06,5.043736e-06,1.13541e-05,2.4e-05,-0.998664,-0.999158,-0.999368,-0.999269,-0.004873,-0.015512,-0.024138,-0.031755,-0.470438,-0.863385,-0.953787,-0.97217,21000.0,21000.0,0.0,0.0,13524.0,21000.0,0.0,0.0,1.0,-0.994731,-0.998408,-0.995015,-0.997452,-1.0,-1.0,-1.0,-1.0,-0.997642,-0.998496,-0.997669,-0.998602,-1.0,-1.0,-1.0,-1.0,-0.998529,-0.998674,-0.998836,-0.998945
25%,13103250.0,8224747000000000.0,2.957782e+22,28275.0,30000000.0,6122355.0,1630011000.0,74.0,48325330000.0,-0.0004816755,2.78465e-07,-0.639274,-0.000976,-0.074312,-0.000968,-0.001939,-0.002916,-0.004359,1.392347e-06,5.569531e-06,1.253254e-05,2.6e-05,-0.596841,-0.594604,-0.597942,-0.598376,-0.000976,-0.000976,-0.000976,-0.000976,-0.100389,-0.131083,-0.149842,-0.167211,21000.0,146996.1,48592840000.0,58780050000.0,21000.0,69216.48,48592840000.0,58780050000.0,117.0,-0.186976,-0.191853,-0.194747,-0.197589,-0.1023639,-0.1307723,-0.1486779,-0.1659071,-0.200695,-0.207281,-0.210872,-0.214521,-0.1023639,-0.1307723,-0.1486779,-0.1659071,-0.569849,-0.5671,-0.566667,-0.567908
50%,13244780.0,8964495000000000.0,3.080368e+22,66197.0,30000000.0,13962690.0,1631902000.0,161.0,69231070000.0,6.689213e-06,2.894665e-07,-0.025471,0.0,-0.009572,3.3e-05,6.9e-05,0.000152,0.000155,1.447359e-06,5.789464e-06,1.302601e-05,2.7e-05,0.004609,-0.000644,-0.00124,-0.003027,0.0,0.0,0.0,0.0,-0.011156,-0.008042,-0.008954,-0.00853,21000.0,168698.8,69553560000.0,82115630000.0,21000.0,81944.86,69553560000.0,82115630000.0,254.0,-0.001188,-0.000429,-0.000378,-0.000692,-0.005103424,-0.007376271,-0.01002255,-0.01028926,0.000486,0.001644,0.001492,0.000998,-0.005103424,-0.007376271,-0.01002255,-0.01028926,0.0,0.0,0.0,0.0
75%,13386320.0,9495806000000000.0,3.211024e+22,118983.0,30029300.0,26336530.0,1633807000.0,283.0,99012710000.0,0.0004892359,2.967394e-07,1.682185,0.000976,0.095224,0.000994,0.002026,0.003238,0.004561,1.483684e-06,5.934608e-06,1.33522e-05,2.8e-05,1.47121,1.464949,1.486775,1.486212,0.000975,0.000976,0.000976,0.000976,0.099221,0.137729,0.160197,0.183855,21000.0,196251.6,99475970000.0,116462000000.0,21000.0,97208.95,99475970000.0,116462000000.0,449.0,0.227457,0.236486,0.240042,0.245835,0.108407,0.1379401,0.1558095,0.1783967,0.251799,0.264973,0.270974,0.27636,0.108407,0.1379401,0.1558095,0.1783967,1.313433,1.303571,1.310481,1.309859
max,13527850.0,1.066084e+16,3.349727e+22,1457148.0,30382990.0,30292930.0,1635725000.0,1431.0,3941920000000.0,0.0009949027,3.193828e-07,831.663636,0.998047,0.125,0.003946,0.010374,0.017396,0.025886,1.596272e-06,6.375318e-06,1.432934e-05,3e-05,998.208729,1580.500921,855.144424,889.976364,0.999015,0.996091,1.003898,0.998041,0.801837,9.533298,50.93609,58.384491,29999970.0,29999970.0,7200000000000.0,147966900000000.0,29999970.0,29999970.0,7200000000000.0,147966900000000.0,2862.0,217.191753,262.436604,240.524252,195.207636,inf,inf,inf,inf,305.385106,378.465636,358.507674,438.580493,inf,inf,inf,inf,1195.0,687.5,1370.0,1343.0


In [53]:
pipeline_df.index

Int64Index([12961718, 12961719, 12961720, 12961721, 12961722, 12961723,
            12961724, 12961725, 12961726, 12961727,
            ...
            13527849, 13527850, 13527851, 13527852, 13527853, 13527854,
            13527855, 13527856, 13527857, 13527858],
           dtype='int64', name='number', length=566141)

# Save the final data set

In [54]:
from io import StringIO 
import boto3

bucket = 'sagemaker-w210-eth' # already created on S3
csv_buffer = StringIO()
pipeline_df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'pipeline_df_20211113_2.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'C0P4XA3PYEY1WE7G',
  'HostId': 'uzTd2oQWvb73tnUSFCb519kFFsWb/3eHrYgqbv9ts9k3cXQDqJNF5l99fvBwZaF6kfBK7IUtEpc=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'uzTd2oQWvb73tnUSFCb519kFFsWb/3eHrYgqbv9ts9k3cXQDqJNF5l99fvBwZaF6kfBK7IUtEpc=',
   'x-amz-request-id': 'C0P4XA3PYEY1WE7G',
   'date': 'Sun, 14 Nov 2021 01:28:08 GMT',
   'etag': '"230cd6e1e81d89f819b19406d8f2dece"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"230cd6e1e81d89f819b19406d8f2dece"'}