In [0]:
from delta.tables import *
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format
import pyspark.sql.functions as f
from pyspark.sql.types import StructType, StructField, DataType, BooleanType, NumericType, TimestampType, IntegerType, DecimalType, StringType, LongType, DateType, FloatType
import boto3
from botocore.client import Config
import requests
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import numpy as np
from datetime import datetime, timedelta
from itertools import groupby
from io import StringIO
import pandas as pd
import seaborn as sns
from decimal import Decimal
from matplotlib.ticker import StrMethodFormatter, PercentFormatter
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import math
import json



In [0]:
from pyspark.sql import functions as f

# Load the data
trades = (
    DeltaTable
    .forPath(spark, 's3://amberdata-marketdata-deltalake/spot/order-book-snapshots/').toDF()
    .where('year = "2022" \
            AND month = "02"\
            AND day = "01"\
            AND exchange = "binance" \
            AND pair = "eth_usdt"')
    .select(f.explode('orderBookSides')).select("col.exchangeTimestamp", "col.isBid", f.explode('col.data'))
)

# Extract quantity (the second item in 'col' list) and filter out null quantities
trades = trades.withColumn('quantity', trades['col'][1]).filter(f.col('quantity').isNotNull())

# Group by timestamp and isBid, then sum up quantities
result = trades.groupBy('exchangeTimestamp', 'isBid').sum('quantity')

result = result.toPandas()

result

Unnamed: 0,exchangeTimestamp,isBid,sum(quantity)
0,1643737200000,True,11593.4895
1,1643707860000,True,10208.9959
2,1643728140000,False,11722.9052
3,1643709960000,False,12308.6448
4,1643740560000,False,14852.2318
...,...,...,...
2875,1643746260000,False,13142.4084
2876,1643687760000,True,9101.4293
2877,1643713320000,False,8583.9532
2878,1643698200000,True,11338.0651


In [0]:
# Create new columns 'bid_quantity' and 'ask_quantity' based on 'isBid' column
result.loc[result['isBid'] == True, 'bid_quantity'] = result['sum(quantity)']
result.loc[result['isBid'] == False, 'ask_quantity'] = result['sum(quantity)']

# Fill NaN values with 0
result['ask_quantity'].fillna(0, inplace=True)
result['bid_quantity'].fillna(0, inplace=True)

# Sort values by 'exchangeTimestamp' and 'isBid'
df = result.sort_values(by=['exchangeTimestamp', 'isBid'])

# Pivot 'isBid' column to create 'bid_quantity' and 'ask_quantity'
final_df = df.pivot_table(index='exchangeTimestamp', columns='isBid', values='sum(quantity)', aggfunc='sum')

# Rename columns
final_df.columns = ['ask_quantity' if col == False else 'bid_quantity' for col in final_df.columns]

# Reset index
final_df.reset_index(inplace=True)

# Fill NaN values with 0
final_df['bid_quantity'].fillna(0, inplace=True)
final_df['ask_quantity'].fillna(0, inplace=True)

# Convert 'exchangeTimestamp' to datetime
final_df['exchangeTimestamp'] = pd.to_datetime(final_df['exchangeTimestamp'], unit='ms')

final_df.set_index('exchangeTimestamp', inplace=True)

# Resample to 1 minute intervals
final_df = final_df.resample('1T').sum()

final_df

Unnamed: 0_level_0,ask_quantity,bid_quantity
exchangeTimestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-02-01 00:00:00,15942.61322,16200.3272
2022-02-01 00:01:00,15366.17042,16786.4351
2022-02-01 00:02:00,15117.66102,16863.5252
2022-02-01 00:03:00,15748.63832,16682.6139
2022-02-01 00:04:00,14950.52272,16258.1865
...,...,...
2022-02-01 23:55:00,14356.90980,12476.9920
2022-02-01 23:56:00,14336.10870,12528.0474
2022-02-01 23:57:00,14387.11090,12615.2155
2022-02-01 23:58:00,14205.14270,12465.1727


In [0]:
final_df['difference'] = final_df['ask_quantity'] - final_df['bid_quantity']

In [0]:
import plotly.graph_objects as go

# Create traces
trace1 = go.Scatter(
    x = final_df.index,
    y = final_df['ask_quantity'],
    mode = 'lines',
    name = 'Active Ask Quantity'
)
trace2 = go.Scatter(
    x = final_df.index,
    y = final_df['bid_quantity'],
    mode = 'lines',
    name = 'Active Bid Quantity'
)

layout = go.Layout(
    title=f'Active ETH Bid vs Ask Quantity over Time on Binance',
    yaxis=dict(title='ETH Quantity'),
)

fig = go.Figure(data=[trace1, trace2], layout=layout)

fig.show()

In [0]:
API_Key = "UAK9097d1aac5f33bf4cd338875944e539d"

def get_data(start_date, end_date, headers):
    url = f"https://web3api.io/api/v2/market/spot/ohlcv/eth_usd/historical?exchange=bitstamp&startDate={start_date}&endDate={end_date}&timeInterval=minutes"
    response = requests.get(url, headers=headers)
    data = json.loads(response.text)
    ohlc = pd.DataFrame(data['payload']['data']['bitstamp'], columns=data['payload']['metadata']['columns'])
    ohlc['timestamp'] = pd.to_datetime(ohlc['timestamp'], unit='ms')
    ohlc.set_index('timestamp', inplace=True)
    ohlc.columns = [col.title() for col in ohlc.columns]
    return ohlc

headers = {
    "accept": "application/json",
    "x-api-key": API_Key
}

start_date = "2022-02-01T00:00:00"
end_date = "2022-02-02T00:00:00"

current_date = start_date

ohlc_list = []

while current_date != end_date:
    next_date = (datetime.strptime(current_date, '%Y-%m-%dT%H:%M:%S') + timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%S')
    current_ohlc = get_data(current_date, next_date, headers)
    ohlc_list.append(current_ohlc)
    current_date = next_date

ohlc = pd.concat(ohlc_list, axis=0, join='inner', ignore_index=False)

In [0]:
import pandas as pd
import numpy as np

# Define the number of bins
num_bins = 100

# Compute the bin edges based on the range of the 'Close' price
bin_edges = np.linspace(start=ohlc['Close'].min(), stop=ohlc['Close'].max(), num=num_bins+1)

# Create a new column 'PriceBin' in the dataframe that indicates the bin each price belongs to
ohlc['PriceBin'] = pd.cut(ohlc['Close'], bins=bin_edges)

# Group by the 'PriceBin' column and compute the total volume for each bin
volume_profile = ohlc.groupby('PriceBin')['Volume'].sum()

# Convert the 'volume_profile' series to a dataframe
volume_profile_df = volume_profile.reset_index()

volume_profile_df

Unnamed: 0,PriceBin,Volume
0,"(2678.51, 2679.857]",0.000000
1,"(2679.857, 2681.203]",0.000000
2,"(2681.203, 2682.55]",0.000000
3,"(2682.55, 2683.897]",7.264043
4,"(2683.897, 2685.244]",147.829015
...,...,...
95,"(2806.446, 2807.793]",182.949912
96,"(2807.793, 2809.14]",13.824023
97,"(2809.14, 2810.487]",0.000000
98,"(2810.487, 2811.833]",0.000000


In [0]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplot figure
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, subplot_titles=('ETH/USDT OHLC Close Prices', 'Volume Profile'), column_widths=[0.85, 0.15])

# Create trace for the OHLC close prices
trace0 = go.Scatter(
    x = ohlc.index,
    y = ohlc['Close'],
    mode = 'lines',
    name = 'OHLC Close Prices',
    line=dict(color='black')
)

# Add the trace to the figure
fig.add_trace(trace0, row=1, col=1)

# Create trace for the volume profile
trace1 = go.Bar(
    y = volume_profile_df['PriceBin'].apply(lambda x: x.mid),
    x = volume_profile_df['Volume'],
    orientation = 'h',
    marker_color='gray',
    name = 'Volume Profile'
)

# Add the trace to the figure
fig.add_trace(trace1, row=1, col=2)

# Update layout
fig.update_layout(
    height=500,
    width=1200,
    title_text="ETH/USDT OHLC Close Prices and Volume Profile over Time",
    showlegend=False,
)

# Update yaxis properties
fig.update_yaxes(title_text="Price", row=1, col=1)
fig.update_xaxes(title_text="Volume", row=1, col=2)

fig.show()

In [0]:
# Calculate Order Book Imbalance
final_df['imbalance'] = (final_df['bid_quantity'] - final_df['ask_quantity']) / (final_df['bid_quantity'] + final_df['ask_quantity'])

# Create subplot figure
fig = make_subplots(rows=2, cols=1)

# Create traces for the first subplot (ohlc close prices)
trace0 = go.Scatter(
    x = ohlc.index,
    y = ohlc['Close'],
    mode = 'lines',
    name = 'OHLC Close Prices'
)

# Add the trace to the figure
fig.add_trace(trace0, row=1, col=1)

# Create trace for the second subplot (Order Book Imbalance)
trace1 = go.Scatter(
    x = final_df.index,
    y = final_df['imbalance'],
    mode = 'lines',
    name = 'Order Book Imbalance'
)

# Add the trace to the figure
fig.add_trace(trace1, row=2, col=1)

# Update layout
fig.update_layout(height=500, width=1200, title_text="ETH/USDT OHLC Close Prices and Order Book Imbalance over Time")
fig.show()

In [0]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplot figure
fig = make_subplots(rows=2, cols=1)

# Create traces for the first subplot (ohlc close prices)
trace0 = go.Scatter(
    x = ohlc.index,
    y = ohlc['Close'],
    mode = 'lines',
    name = 'OHLC Close Prices'
)

# Add the trace to the figure
fig.add_trace(trace0, row=1, col=1)

# Create traces for the second subplot (ask and bid quantities)
trace1 = go.Scatter(
    x = final_df.index,
    y = final_df['ask_quantity'],
    mode = 'lines',
    name = 'Active Ask Quantity'
)

trace2 = go.Scatter(
    x = final_df.index,
    y = final_df['bid_quantity'],
    mode = 'lines',
    name = 'Active Bid Quantity'
)

# Add the traces to the figure
fig.add_trace(trace1, row=2, col=1)
fig.add_trace(trace2, row=2, col=1)

# Update layout
fig.update_layout(height=500, width=1200, title_text="ETH/USDT OHLC Close Prices and Active Ask vs Bid Quantity over Time")
fig.show()