In [14]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import boto3
from tqdm import tqdm
from sagemaker import get_execution_role
from sagemaker.session import Session

In [None]:
# Show the notebook kernel's current working directory (shell escape).

!pwd

In [None]:
# Resolve the IAM role ARN used for SageMaker calls.

import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

try:
    # The lookup must happen inside the try block: an unguarded call placed
    # before it would raise first and the fallback below would never run.
    role = sagemaker.get_execution_role()
except ValueError:
    # Fallback when no execution role can be resolved automatically.
    # IAM get_role expects the role *name*, not the full ARN; the ARN is
    # what the call returns under ['Role']['Arn'].
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='SageMaker')['Role']['Arn']

In [None]:
# Import files from S3 and preview a transaction-hash text file

import pandas as pd
import boto3
import io

bucket = "sagemaker-w210-eth"

# Txt file
data_key = '2021-09-01/transaction_hashes_13136427_13142881.txt'
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket, Key=data_key)
contents = obj['Body'].read()  # raw bytes of the whole object

# Print only the first few lines: dumping the full object would flood the
# cell output (and the saved notebook) if the file is large.
preview_lines = 10
print("\n".join(contents.decode("utf-8").splitlines()[:preview_lines]))


In [6]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [15]:
# Session-wide handles: execution role, working bucket and AWS client.
role = get_execution_role()
# NOTE(review): this rebinds `bucket`; the S3 import cell above set it to
# "sagemaker-w210-eth" — confirm which bucket is intended.
bucket = "w210-ethereum"    
# NOTE(review): this is a boto3 S3 client, but the sampling cell below calls
# client.query(...).to_dataframe(), which fails with the recorded
# AttributeError ('S3' object has no attribute 'query'). A BigQuery-style
# client is presumably what that cell needs — TODO confirm and replace.
client = boto3.client('s3')

In [16]:
# Map each short table name to its back-quoted, fully-qualified reference in
# the public crypto_ethereum dataset.
tables = {
    name: f"`bigquery-public-data.crypto_ethereum.{name}`"
    for name in (
        "blocks",
        "transactions",
        "tokens",
        "logs",
        "traces",
        "contracts",
        "balances",
        "token_transfers",
    )
}

In [17]:
# Display the short-name -> fully-qualified table reference mapping.
tables

{'blocks': '`bigquery-public-data.crypto_ethereum.blocks`',
 'transactions': '`bigquery-public-data.crypto_ethereum.transactions`',
 'tokens': '`bigquery-public-data.crypto_ethereum.tokens`',
 'logs': '`bigquery-public-data.crypto_ethereum.logs`',
 'traces': '`bigquery-public-data.crypto_ethereum.traces`',
 'contracts': '`bigquery-public-data.crypto_ethereum.contracts`',
 'balances': '`bigquery-public-data.crypto_ethereum.balances`',
 'token_transfers': '`bigquery-public-data.crypto_ethereum.token_transfers`'}

In [18]:
# Number of randomly ordered rows to pull from each table.
limit = 10000

# Build {table name: DataFrame} by running one sampling query per table,
# with a tqdm progress bar over the tables.
# NOTE(review): `client` must provide .query(sql).to_dataframe(); the client
# created earlier in this notebook is a boto3 S3 client, so this cell raises
# the AttributeError shown in its recorded output.
data = dict(
    (
        name,
        client.query(
            f"SELECT * FROM {ref} ORDER BY RAND() LIMIT {limit}"
        ).to_dataframe(),
    )
    for name, ref in tqdm(tables.items())
)

  0%|          | 0/8 [00:00<?, ?it/s]

AttributeError: 'S3' object has no attribute 'query'

## Functions

In [6]:
def scatterplots_over_time(df, time_col, figsize=(12,12)):
  """Create scatterplots over time

  Draws one scatterplot per numeric column of ``df`` (x = ``time_col``,
  y = the numeric column), arranged in a two-column grid.

  Parameters:
  -----------
  df: dataframe
    dataframe containing desired data
  time_col: str
    Name of column that contains date to plot over time
  figsize: tuple, optional
    (width, height) of the figure in inches; defaults to (12, 12)

  Returns:
  -------
  no return, just plots. Nothing is drawn when ``df`` has no numeric columns.
  """
  numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

  # Nothing to plot: a grid with zero rows is not a valid subplot layout
  if not numeric_cols:
    return

  # Set the number of rows and columns (ceiling division for the rows)
  num_cols = 2
  num_rows = -(-len(numeric_cols) // num_cols)

  # Create the figure. squeeze=False keeps the axes array 2-D even when the
  # grid has a single row; otherwise matplotlib returns a 1-D array and
  # [row, col] indexing fails for frames with only 1-2 numeric columns.
  fig, ax = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)

  # Row-major flat view: slot i of the grid belongs to numeric column i
  axes = ax.ravel()

  # Create subplots
  for axis, c in zip(axes, numeric_cols):
    # Create plot for one variable
    axis.scatter(x=df[time_col], y=df[c])

    # Set title
    axis.set_title(c)

  # Delete the unused trailing subplot(s), if any
  for axis in axes[len(numeric_cols):]:
    fig.delaxes(axis)

  # Set overall title
  fig.suptitle("Scatterplots of Numeric Columns")

  # Same layout treatment as the boxplot helpers: tighten the grid and leave
  # room at the top for the overall title
  fig.tight_layout()
  fig.subplots_adjust(top = 0.95)

In [7]:
def create_boxplots(df, figsize=(12,12)):
  """Create boxplots

  Draws one boxplot per numeric column of ``df``, arranged in a two-column
  grid.

  Parameters:
  -----------
  df: dataframe
    dataframe containing desired data
  figsize: tuple, optional
    (width, height) of the figure in inches; defaults to (12, 12)

  Returns:
  -------
  no return, just plots. Nothing is drawn when ``df`` has no numeric columns.
  """
  numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

  # Nothing to plot: a grid with zero rows is not a valid subplot layout
  if not numeric_cols:
    return

  # Set the number of rows and columns (ceiling division for the rows)
  num_cols = 2
  num_rows = -(-len(numeric_cols) // num_cols)

  # Create the figure. squeeze=False keeps the axes array 2-D even when the
  # grid has a single row; otherwise matplotlib returns a 1-D array and
  # [row, col] indexing fails for frames with only 1-2 numeric columns.
  fig, ax = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)

  # Row-major flat view: slot i of the grid belongs to numeric column i
  axes = ax.ravel()

  # Create subplots
  for axis, c in zip(axes, numeric_cols):
    # Create plot for one variable
    sns.boxplot(data=df, y=(c), ax=axis)

    # Set title
    axis.set_title(c)

  # Delete the unused trailing subplot(s), if any
  for axis in axes[len(numeric_cols):]:
    fig.delaxes(axis)

  # Set overall title
  fig.suptitle("Boxplots of Numeric Columns")

  # Tighten the grid, then leave room at the top for the overall title
  fig.tight_layout()
  fig.subplots_adjust(top = 0.95)

In [8]:
def create_boxplots_over_time(df, time_col, figsize=(12,12)):
  """Create boxplots over time

  Draws one panel per numeric column of ``df``, arranged in a two-column
  grid; each panel shows one box per calendar year of ``time_col``.

  Parameters:
  -----------
  df: dataframe
    dataframe containing desired data
  time_col: str
    Name of column that contains date to plot over time; it is accessed
    through ``.dt.year``, so it must be a datetime-like column
  figsize: tuple, optional
    (width, height) of the figure in inches; defaults to (12, 12)

  Returns:
  -------
  no return, just plots. Nothing is drawn when ``df`` has no numeric columns.
  """
  numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

  # Nothing to plot: a grid with zero rows is not a valid subplot layout
  if not numeric_cols:
    return

  # Set the number of rows and columns (ceiling division for the rows)
  num_cols = 2
  num_rows = -(-len(numeric_cols) // num_cols)

  # Create the figure. squeeze=False keeps the axes array 2-D even when the
  # grid has a single row; otherwise matplotlib returns a 1-D array and
  # [row, col] indexing fails for frames with only 1-2 numeric columns.
  fig, ax = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)

  # Row-major flat view: slot i of the grid belongs to numeric column i
  axes = ax.ravel()

  # The year grouping is the same for every panel, so compute it once
  years = df[time_col].dt.year

  # Create subplots
  for axis, c in zip(axes, numeric_cols):
    # Create plot for one variable
    sns.boxplot(data=df, x=years, y=(c), ax=axis)

    # Set title
    axis.set_title(c)

  # Delete the unused trailing subplot(s), if any
  for axis in axes[len(numeric_cols):]:
    fig.delaxes(axis)

  # Set overall title
  fig.suptitle("Boxplots of Numeric Columns By Year")

  # Tighten the grid, then leave room at the top for the overall title
  fig.tight_layout()
  fig.subplots_adjust(top = 0.95)

In [None]:
def autocorrelation_plot(df, col, num_lags):
    """Plot the autocorrelation of one column for lags 1..num_lags.

    The correlation at lag k is computed with ``Series.autocorr(lag=k)``,
    i.e. between the column and itself shifted by k rows. Rows are used in
    their existing order, so sort ``df`` chronologically beforehand if the
    lags are meant to represent time steps.

    Parameters:
    -----------
    df: dataframe
    dataframe containing desired data
    col: str
    name of column to calculate autocorrelation
    num_lags: int
    number of lags to include in plot

    Returns:
    -------
    no return, just plots. Nothing is drawn when the column has fewer than
    two rows.

    Raises:
    -------
    ValueError
    if num_lags is smaller than 1
    """
    if num_lags < 1:
        raise ValueError("num_lags must be a positive integer")

    series = df[col]

    # A lag of k needs more than k rows to have any overlapping pairs
    max_lag = min(num_lags, len(series) - 1)
    if max_lag < 1:
        return

    lags = list(range(1, max_lag + 1))
    correlations = [series.autocorr(lag=lag) for lag in lags]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(lags, correlations)

    # Zero reference line; correlations are bounded by [-1, 1]
    ax.axhline(0, color="black", linewidth=0.8)
    ax.set_ylim(-1, 1)

    ax.set_xlabel("Lag")
    ax.set_ylabel("Autocorrelation")
    ax.set_title(f"Autocorrelation of {col}")

    fig.tight_layout()

## Blocks

In [10]:
# Select the sampled `blocks` table as the working frame for this section.
# NOTE(review): depends on the sampling cell having populated `data`; the
# recorded output for this cell is a NameError.
df = data['blocks']

NameError: ignored

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Renumber the rows 0..n-1. drop=True discards the old index instead of
# inserting it as a column: that keeps the cell idempotent (without it each
# re-run adds another 'index'/'level_0' column and eventually raises) and
# keeps a meaningless numeric 'index' column out of the numeric-column
# summaries and plots further down.
df = df.reset_index(drop=True)

In [None]:
# List the column names of the working frame.
df.columns

In [None]:
# Summary statistics with extra tail percentiles (1st through 99th).
df.describe(percentiles=[.01, .05, .10, .25, .50, .75, .90, .95, .99])

In [None]:
# Show the dtype of each column.
df.dtypes

In [None]:
# Number of rows per (year, month) of the 'timestamp' column, as a line plot.
(
    df['timestamp']
    .groupby([
        df['timestamp'].dt.year.rename('y'),
        df['timestamp'].dt.month.rename('m'),
    ])
    .count()
    .plot(title='Count Over Time', legend=False)
)

In [None]:
# Per-column missing-value summary: absolute count and share of rows.
null_mask = df.isnull()  # computed once and reused for both statistics
total = null_mask.sum().sort_values(ascending=False)
# Mean of the boolean mask == missing count / row count
percent = null_mask.mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# The values describe what IS missing (the previous heading said
# "Percent Not Missing", the opposite of what is computed).
print("Missing Values by Column ('Percent' is the fraction of rows missing)")
missing_data.head(25)

In [None]:
# For every column: name, number of distinct values, number of non-null values.
print("Unique Values, Total Count")
for col in df.columns:
    column = df[col]
    print(col, column.nunique(), column.count())

In [None]:
# Correlation heatmap of the numeric block columns.
# Increase the size of the heatmap.
plt.figure(figsize=(16, 6))
# Correlate only the numeric columns, selected the same way as in the plotting
# helpers above: recent pandas versions raise when corr() meets non-numeric
# columns instead of silently skipping them.
# vmin/vmax pin the colormap to the full correlation range [-1, 1];
# annot=True writes each coefficient into its cell.
# The Axes is kept in `heatmap` so further tweaks (such as the title) can be applied.
heatmap = sns.heatmap(df.select_dtypes(include=np.number).corr(), vmin=-1, vmax=1, annot=True)
# Give a title to the heatmap. Pad defines the distance of the title from the top of the heatmap.
heatmap.set_title('Correlation Heatmap of Blocks', fontdict={'fontsize':12}, pad=12);

In [None]:
# Scatter each numeric column of the working frame against 'timestamp'.
scatterplots_over_time(df, 'timestamp')

In [None]:
# Boxplot of each numeric column of the working frame.
create_boxplots(df)

In [None]:
# Boxplots of each numeric column, grouped by the year of 'timestamp'.
create_boxplots_over_time(df, 'timestamp')