# Updated Data Transformations

In [13]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from numpy import intersect1d
from datetime import datetime

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
#!pip3 install awswrangler

In [14]:
import awswrangler as wr
import boto3
import io

import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

In [15]:
# Make the local `ethereum` checkout importable.
# The membership check keeps this cell idempotent: re-running it no longer
# stacks duplicate entries at the front of sys.path.
import sys

ETHEREUM_REPO_PATH = "/home/ec2-user/SageMaker/ethereum"
if ETHEREUM_REPO_PATH not in sys.path:
    sys.path.insert(0, ETHEREUM_REPO_PATH)

In [16]:
from ethereum import (
    timestamp_to_datetime,
    EthereumData
)
from utils import (
    latest,
    lead_lag,
    add_latest_avail_block,
    lagged_block_data,
    get_pit_blocks
)

In [17]:
# Resolve the IAM role ARN to use for SageMaker calls.
# The lookup must live inside the try block: an unguarded call placed before
# it would raise ValueError first and the fallback below could never run.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # No execution role could be resolved from the current identity, so ask
    # IAM directly.
    iam = boto3.client('iam')
    # get_role takes the role *name* (not its ARN); the ARN is read back from
    # the response payload.
    role = iam.get_role(RoleName='SageMaker')['Role']['Arn']

In [9]:
# Load the raw block export from S3 into a DataFrame.
BLOCKS_S3_PATH = "s3://sagemaker-w210-eth/raw_data/infuria/blocks"
blocks = wr.s3.read_csv(path=BLOCKS_S3_PATH)

In [10]:
# Block columns retained for the rest of the notebook; everything else in the
# raw export is dropped when the frame is subset with this list.
cols = [
    'number',
    'difficulty',
    'total_difficulty',
    'size',
    'gas_limit',
    'gas_used',
    'timestamp',
    'transaction_count',
    'base_fee_per_gas',
]

In [11]:
# Keep only the columns listed in `cols` (all rows, selected columns).
blocks = blocks.loc[:, cols]

In [12]:
# Preview the first five rows of the trimmed block frame.
blocks.head(n=5)

Unnamed: 0,number,difficulty,total_difficulty,size,gas_limit,gas_used,timestamp,transaction_count,base_fee_per_gas
0,12962018,7604452341876898,28471525393295665706633,55433,15000000,14994306,1628126126,160,
1,12962019,7604589780830370,28471532997885446537003,79403,14985353,14969945,1628126136,217,
2,12962020,7601014041179921,28471540598899487716924,69949,14970720,14964505,1628126157,155,
3,12962021,7601151480133393,28471548200050967850317,92704,14956102,14944125,1628126167,195,
4,12962022,7597577419340707,28471555797628387191024,65855,14963388,14943452,1628126200,178,


### Point-In-Time Blocks

In [17]:
# Build the point-in-time block table with a lag of 60.
# In the preview below, lag_cutoff trails datetime by one minute, and
# latest_avail_block is empty for the earliest rows.
pit_lag = 60
pit = get_pit_blocks(blocks, lag=pit_lag)
pit.head()

Unnamed: 0_level_0,datetime,lag_cutoff,latest_avail_block
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12961718,2021-08-05 00:00:04,2021-08-04 23:59:04,
12961719,2021-08-05 00:00:07,2021-08-04 23:59:07,
12961720,2021-08-05 00:00:47,2021-08-04 23:59:47,
12961721,2021-08-05 00:01:03,2021-08-05 00:00:03,
12961722,2021-08-05 00:01:25,2021-08-05 00:00:25,12961719.0


In [18]:
# Persist the point-in-time table as CSV (path is relative to the notebook).
pit_path = "../data/pit_60.csv"
pit.to_csv(path_or_buf=pit_path)

In [10]:
# Drop the references to both frames; neither is used by the cells that follow.
del blocks, pit

### Transactions Small

In [8]:
def get_size_gb(object_):
    """Return the size of ``object_`` in GiB, as reported by ``sys.getsizeof``.

    Parameters
    ----------
    object_ : any Python object.

    Returns
    -------
    float
        ``sys.getsizeof(object_)`` divided by 1024**3.
    """
    bytes_per_gb = 1024 ** 3
    size_in_bytes = sys.getsizeof(object_)
    return size_in_bytes / bytes_per_gb

In [9]:
def transaction_id(df, index_col='transaction_index', block_number_col='block_number'):
    """Build an integer transaction id from a block number and an in-block index.

    The transaction index is rendered as text and zero-filled on the left to
    6 characters. The block number is rendered as text and extended with
    trailing zeros to 12 characters. The two pieces are joined (block part
    first) and parsed as an integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns named below.
    index_col : str
        Column holding the transaction index.
    block_number_col : str
        Column holding the block number.

    Returns
    -------
    pandas.Series
        Integer series named ``"transaction_id"``, sharing ``df``'s index.
    """
    index_part = df[index_col].astype(str).str.rjust(6, '0')
    # NOTE(review): the block part is padded on the RIGHT, so ids are only
    # collision-free while every block number has the same digit count
    # (e.g. blocks 1 and 10 would both become "100000000000").
    block_part = df[block_number_col].astype(str).str.ljust(12, '0')
    combined = block_part + index_part
    return pd.Series(combined.astype(int), name="transaction_id")

In [10]:
def make_small_transactions(df):
    """Reduce a raw transactions frame to a fixed column subset plus an id.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw transactions; must contain every column listed in ``keep``.

    Returns
    -------
    pandas.DataFrame
        The ``transaction_id`` series as the first column, followed by the
        retained columns in the order listed.
    """
    keep = [
        'block_number',
        'transaction_index',
        'value',
        'gas',
        'gas_price',
        'max_fee_per_gas',
        'max_priority_fee_per_gas',
        'transaction_type',
        'block_timestamp',
    ]
    trimmed = df[keep]
    ids = transaction_id(trimmed)
    # Column-wise concat puts the id in front of the retained columns.
    return pd.concat([ids, trimmed], axis=1)

In [11]:
# Raw transaction exports on S3, one CSV per pair of numbers embedded in the
# file name (these look like first/last block numbers — confirm with the
# export job).
transactions_files = [
    f"s3://sagemaker-w210-eth/raw_data/infuria/transactions/transactions_{first}_{last}.csv"
    for first, last in (
        (12961718, 13136426),
        (13136427, 13330089),
        (13330090, 13527858),
    )
]

In [18]:
# Shrink every raw transactions export and store the reduced copy locally.
# The whole list is iterated (no leftover `[2:]` slice), so a fresh
# Restart & Run All regenerates all three outputs, as the receipts loop does.
for f in tqdm(transactions_files):

    ### read
    df = wr.s3.read_csv(f)

    print("current size: ")
    print(get_size_gb(df))

    ### reduce size
    df = make_small_transactions(df)

    print("new size: ")
    print(get_size_gb(df))
    print()

    ### save
    # Strip the S3 prefix so only "<first>_<last>.csv" remains as file name.
    save_file_name = f.replace(
        "s3://sagemaker-w210-eth/raw_data/infuria/transactions/transactions_", ""
    )
    save_file_path = "../data/transactions_small/" + save_file_name
    df.to_csv(save_file_path)

    # Release the frame before the next (large) file is read.
    del df

### Receipts Small

In [28]:
def make_small_receipts(df):
    """Reduce a raw receipts frame to a fixed column subset plus an id.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw receipts; must contain every column listed in ``keep``.

    Returns
    -------
    pandas.DataFrame
        The ``transaction_id`` series as the first column, followed by the
        retained columns in the order listed.
    """
    keep = [
        'transaction_index', 'block_number', 'cumulative_gas_used',
        'gas_used', 'status', 'effective_gas_price',
    ]
    trimmed = df[keep]
    ids = transaction_id(trimmed)
    # Column-wise concat puts the id in front of the retained columns.
    return pd.concat([ids, trimmed], axis=1)

In [20]:
# Raw receipt exports on S3, one CSV per pair of numbers embedded in the file
# name (these look like first/last block numbers — confirm with the export
# job).
receipts_files = [
    f"s3://sagemaker-w210-eth/raw_data/infuria/receipts/receipts_{first}_{last}.csv"
    for first, last in (
        (12961718, 13136426),
        (13136427, 13330089),
        (13330090, 13527858),
    )
]

In [None]:
# Shrink every raw receipts export and store the reduced copy locally.
receipts_prefix = "s3://sagemaker-w210-eth/raw_data/infuria/receipts/receipts_"
receipts_out_dir = "../data/receipts_small/"

for f in tqdm(receipts_files):

    # Read the raw export and report its in-memory size.
    df = wr.s3.read_csv(f)
    print("current size: ", get_size_gb(df), sep="\n")

    # Keep only the small column subset (plus transaction_id) and report again;
    # the trailing empty string reproduces the blank separator line.
    df = make_small_receipts(df)
    print("new size: ", get_size_gb(df), "", sep="\n")

    # Strip the S3 prefix so only "<first>_<last>.csv" remains as file name.
    save_file_name = f.replace(receipts_prefix, "")
    save_file_path = receipts_out_dir + save_file_name
    df.to_csv(save_file_path)

    # Release the frame before the next (large) file is read.
    del df

  0%|          | 0/3 [00:00<?, ?it/s]

current size: 
0.46388217620551586
new size: 
0.07776185125112534



 33%|███▎      | 1/3 [00:11<00:22, 11.42s/it]