In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

# Extract all zip files in 'data/btc' folder
data_path = Path("../../data/btc")
# i = 0
# for file in data_path.glob('*.zip'):
#     with zipfile.ZipFile(file, 'r') as zip_ref:
#         zip_ref.extractall(data_path)
#     i += 1
#
#     if i > 2:
#         break

# Read all CSV files and concatenate into a single DataFrame
csv_files = list(data_path.glob("bitstamp_ob*.csv"))
df = pd.concat((pd.read_csv(csv_file) for csv_file in csv_files), ignore_index=True)

# Display the head of the resulting DataFrame
df.head()

Unnamed: 0,time,type,price,amount
0,1534197602,b,6236.82,0.0796
1,1534197602,b,6236.79,0.040085
2,1534197602,b,6236.78,2.4022
3,1534197602,b,6235.02,0.0401
4,1534197602,b,6235.0,1.0


In [3]:
class OrderBookFeatures:
    def __init__(self) -> None:
        pass

    @staticmethod
    def get_volumes(
        orderbook: pd.DataFrame,
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        bid_volume = (
            orderbook[orderbook["type"] == "b"]
            .groupby("time", as_index=False)
            .sum()
            .set_index("time")["amount "]
            .rename("bid_volume")
        )
        ask_volume = (
            orderbook[orderbook["type"] == "a"]
            .groupby("time", as_index=False)
            .sum()
            .set_index("time")["amount "]
            .rename("ask_volume")
        )
        volume_diff = (ask_volume - bid_volume).rename("volume_diff")

        return ask_volume, bid_volume, volume_diff

    @staticmethod
    def get_spread(orderbook: pd.DataFrame) -> pd.DataFrame:
        highest_buy = (
            orderbook[orderbook["type"] == "b"].groupby("time", as_index=False).max()
        )
        lowest_sell = (
            orderbook[orderbook["type"] == "a"].groupby("time", as_index=False).min()
        )
        spread = (
            lowest_sell.set_index("time")["price"]
            - highest_buy.set_index("time")["price"]
        )
        spread = spread.rename("spread")
        return spread

    @staticmethod
    def get_orders(
        orderbook: pd.DataFrame,
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        bid_orders = (
            orderbook[orderbook["type"] == "b"]
            .groupby("time", as_index=False)
            .count()
            .set_index("time")["price"]
            .rename("bid_depth")
        )
        ask_orders = (
            orderbook[orderbook["type"] == "a"]
            .groupby("time", as_index=False)
            .count()
            .set_index("time")["price"]
            .rename("ask_depth")
        )
        depth_diff = (ask_orders - bid_orders).rename("depth_diff")
        return ask_orders, bid_orders, depth_diff

    @staticmethod
    def get_weighted_spread(orderbook: pd.DataFrame) -> pd.DataFrame:
        def get_cumulative_amount(group, order_type):
            group = group.sort_values(
                ["price"], ascending=False if order_type == "b" else True
            )
            total_amount = group["amount "].sum()
            threshold = total_amount * 0.1
            cumulative_amount = 0
            for _, row in group.iterrows():
                cumulative_amount += row["amount "]
                if cumulative_amount >= threshold:
                    return cumulative_amount
            return np.nan

        top_10_bid = (
            orderbook[orderbook["type"] == "b"]
            .sort_values(["time"], ascending=True)
            .groupby("time", as_index=False)
            .apply(lambda group: get_cumulative_amount(group, "b"))
        )
        top_10_bid = top_10_bid.set_index("time")
        top_10_bid.columns.values[0] = "bid_amount"

        top_10_ask = (
            orderbook[orderbook["type"] == "a"]
            .sort_values(["time"], ascending=True)
            .groupby("time", as_index=False)
            .apply(lambda group: get_cumulative_amount(group, "a"))
        )
        top_10_ask = top_10_ask.set_index("time")
        top_10_ask.columns.values[0] = "ask_amount"

        return (top_10_bid.iloc[:, 0] - top_10_ask.iloc[:, 0]).rename("weighted_spread")

    @staticmethod
    def get_slopes(orderbook: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
        def get_cumulative_price(group, order_type):
            group = group.sort_values(
                ["price"], ascending=False if order_type == "b" else True
            )
            total_amount = group["amount "].sum()
            threshold = total_amount * 0.1
            cumulative_amount = 0
            for _, row in group.iterrows():
                cumulative_amount += row["amount "]
                if cumulative_amount >= threshold:
                    return row["price"]
            return np.nan

        top_10_bid = (
            orderbook[orderbook["type"] == "b"]
            .sort_values(["time"], ascending=True)
            .groupby("time", as_index=False)
            .apply(lambda group: get_cumulative_price(group, "b"))
        )
        top_10_bid = top_10_bid.set_index("time")
        top_10_bid.columns.values[0] = "bid_slope"

        top_10_ask = (
            orderbook[orderbook["type"] == "a"]
            .sort_values(["time"], ascending=True)
            .groupby("time", as_index=False)
            .apply(lambda group: get_cumulative_price(group, "a"))
        )
        top_10_ask = top_10_ask.set_index("time")
        top_10_ask.columns.values[0] = "ask_slope"

        return top_10_ask, top_10_bid

    def get_features(self, orderbook: pd.DataFrame) -> pd.DataFrame:
        spread = self.get_spread(orderbook)
        ask_depth, bid_depth, depth_diff = self.get_orders(orderbook)
        ask_volume, bid_volume, volume_diff = self.get_volumes(orderbook)
        ws = self.get_weighted_spread(orderbook)
        ask_slope, bid_slope = self.get_slopes(orderbook)
        features = pd.concat(
            [
                spread,
                ask_depth,
                bid_depth,
                depth_diff,
                ask_volume,
                bid_volume,
                volume_diff,
                ws,
                ask_slope,
                bid_slope,
            ],
            axis=1,
        )
        features.index = pd.to_datetime(features.index, unit="s")
        return features

    def __call__(self, orderbook: pd.DataFrame) -> pd.DataFrame:
        return self.get_features(orderbook)

In [4]:
ob = OrderBookFeatures()
features = ob(df)
features

  .apply(lambda group: get_cumulative_amount(group, "b"))
  .apply(lambda group: get_cumulative_amount(group, "a"))
  .apply(lambda group: get_cumulative_price(group, "b"))
  .apply(lambda group: get_cumulative_price(group, "a"))


Unnamed: 0_level_0,spread,ask_depth,bid_depth,depth_diff,ask_volume,bid_volume,volume_diff,weighted_spread,ask_slope,bid_slope
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-06-04 22:00:02,6.19,5000,3510,1490,2695.804973,586356.113693,-583660.308720,80025.676785,7587.31,1.0
2018-06-04 22:00:34,6.19,5000,3510,1490,2696.561478,586351.269401,-583654.707923,80020.013482,7587.31,1.0
2018-06-04 22:01:01,4.97,5000,3510,1490,2699.165417,586350.938081,-583651.772664,80016.828223,7587.31,1.0
2018-06-04 22:01:33,4.96,5000,3508,1492,2656.934508,586307.355517,-583650.421009,79972.016340,7599.99,1.0
2018-06-04 22:02:02,4.90,5000,3509,1491,2657.946212,586317.596946,-583659.650734,79981.174003,7599.99,1.0
...,...,...,...,...,...,...,...,...,...,...
2018-09-30 21:57:34,1.58,5000,3546,1454,2954.913408,502228.305974,-499273.392566,59296.999012,6618.00,1.0
2018-09-30 21:58:01,1.59,5000,3542,1458,2926.069752,502208.483731,-499282.413979,59288.813590,6618.00,1.0
2018-09-30 21:58:34,0.76,5000,3543,1457,2927.899054,502203.145173,-499275.246118,59281.645729,6618.00,1.0
2018-09-30 21:59:01,0.75,5000,3543,1457,2959.560045,502203.053143,-499243.493098,59275.328542,6618.00,1.0


In [5]:
snapshot = df[df["time"] == 1534197602]
bids = snapshot[snapshot["type"] == "b"]
bids["cum_vol"] = bids["amount "].cumsum()
bids["p_vol"] = bids["cum_vol"] / bids["amount "].sum()
bids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bids["cum_vol"] = bids["amount "].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bids["p_vol"] = bids["cum_vol"] / bids["amount "].sum()


Unnamed: 0,time,type,price,amount,cum_vol,p_vol
0,1534197602,b,6236.82,0.079600,0.079600,1.322102e-07
1,1534197602,b,6236.79,0.040085,0.119685,1.987880e-07
2,1534197602,b,6236.78,2.402200,2.521885,4.188679e-06
3,1534197602,b,6235.02,0.040100,2.561985,4.255282e-06
4,1534197602,b,6235.00,1.000000,3.561985,5.916214e-06
...,...,...,...,...,...,...
3118,1534197602,b,0.05,10500.000000,288218.621986,4.787115e-01
3119,1534197602,b,0.04,1000.000000,289218.621986,4.803724e-01
3120,1534197602,b,0.03,12500.000000,301718.621986,5.011341e-01
3121,1534197602,b,0.02,71479.000000,373197.621986,6.198558e-01


In [6]:
# features = features.sort_index(ascending=True)
features.to_csv("../../data/btc/features.csv")

In [7]:
# df.to_csv('../../data/btc/df.csv')

In [8]:
ask_orders = (
    df[df["type"] == "a"]
    .groupby("time", as_index=False)
    .count()
    .set_index("time")["price"]
    .rename("ask_depth")
)
ask_orders.unique()

array([  5000,  85000,  10000,  30000, 130000, 135000,  20000,  15000])

In [9]:
csv_files

[PosixPath('../../data/btc/bitstamp_ob_14_08_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_14_09_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_16_07_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_16_06_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_08_08_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_08_09_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_24_07_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_24_06_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_26_08_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_26_09_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_31_07_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_18_09_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_18_08_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_28_06_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_28_07_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_06_06_2018.csv'),
 PosixPath('../../data/btc/bitstamp_ob_06_07_2018.csv'),
 PosixPath('../../data/btc/bits

In [10]:
df["time"].min(), df["time"].max()

(np.int64(1528149602), np.int64(1538344773))