In [9]:
from datetime import datetime, timezone

In [1]:
import os

import boto3

# S3 client pointed at the local MinIO endpoint. The credentials fall back to
# the literal values this notebook has always used, but can be overridden via
# MINIO_ROOT_USER / MINIO_ROOT_PASSWORD so that real secrets never have to be
# written into the notebook.
s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9100",
    aws_access_key_id=os.environ.get("MINIO_ROOT_USER", "minioadmin"),
    aws_secret_access_key=os.environ.get("MINIO_ROOT_PASSWORD", "minioadmin"),
    region_name="us-east-1",
)

In [2]:
# Connectivity check: list the buckets visible to the client created above.
s3.list_buckets()

{'ResponseMetadata': {'RequestId': '188A0DC42E8F6EA0',
  'HostId': '40fd399614142fea3be9690e18526c1881df2b9fc838b215f9c270b056695f9e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '369',
   'content-type': 'application/xml',
   'server': 'MinIO AIStor',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-id-2': '40fd399614142fea3be9690e18526c1881df2b9fc838b215f9c270b056695f9e',
   'x-amz-request-id': '188A0DC42E8F6EA0',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '624',
   'x-ratelimit-remaining': '624',
   'x-xss-protection': '1; mode=block',
   'date': 'Mon, 12 Jan 2026 18:02:19 GMT'},
  'RetryAttempts': 0},
 'Buckets': [{'Name': 'market-data',
   'CreationDate': datetime.datetime(2026, 1, 10, 16, 18, 43, 288000, tzinfo=tzutc())}],
 'Owner': {'DisplayName': 'minio',
  'ID': '02d6176db174dc93cb1b899f7c6078f08654445fe8cf1b6ce98d8855f66bdbf4'}}

In [3]:
# List the objects stored under one symbol/year prefix of the bucket.
# NOTE(review): this is a single call; check `IsTruncated` in the response
# before assuming the listing is complete.
s3.list_objects_v2(
    Bucket="market-data",
    Prefix="symbol=SPY/year=2020/"
)

{'ResponseMetadata': {'RequestId': '188A0DC64E4D98B4',
  'HostId': '40fd399614142fea3be9690e18526c1881df2b9fc838b215f9c270b056695f9e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '3168',
   'content-type': 'application/xml',
   'server': 'MinIO AIStor',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-id-2': '40fd399614142fea3be9690e18526c1881df2b9fc838b215f9c270b056695f9e',
   'x-amz-request-id': '188A0DC64E4D98B4',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '624',
   'x-ratelimit-remaining': '624',
   'x-xss-protection': '1; mode=block',
   'date': 'Mon, 12 Jan 2026 18:02:29 GMT'},
  'RetryAttempts': 0},
 'IsTruncated': False,
 'Contents': [{'Key': 'symbol=SPY/year=2020/month=01.parquet',
   'LastModified': datetime.datetime(2026, 1, 10, 16, 48, 27, 108000, tzinfo=tzutc()),
   'ETag': '"378e84d6e62f12118d4fb47da3b1108f"',
   'Size': 441587,
   'Stor

In [4]:
# Fetch one month's parquet object and read its whole body into memory.
obj = s3.get_object(
    Bucket="market-data",
    Key="symbol=SPY/year=2020/month=01.parquet"
)
data = obj["Body"].read()

In [6]:
import polars as pl

# Glob covering every parquet object stored for SPY. This has to be a plain
# string: a trailing comma after the literal would turn it into a 1-tuple.
path = "s3://market-data/symbol=SPY/**/*.parquet"
df = pl.scan_parquet(
    path,
    storage_options={
        "aws_access_key_id": "minioadmin",
        "aws_secret_access_key": "minioadmin",
        "endpoint_url": "http://localhost:9100",
    },
)

# Collect only the row count rather than the full dataset.
print(df.select(pl.len()).collect())


shape: (1, 1)
┌─────────┐
│ len     │
│ ---     │
│ u32     │
╞═════════╡
│ 1238488 │
└─────────┘


# Load ticks within a time range

In [7]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
import polars as pl

def month_range(start, end):
    """Yield one value per calendar month from ``start``'s month to ``end``'s.

    Each yielded value is ``start`` moved to day 1 of the month in question;
    every other field of ``start`` (time of day, tzinfo) is carried over
    unchanged.

    Months are compared as ``(year, month)`` pairs rather than as full
    timestamps. Comparing timestamps would drop the final month whenever
    ``end`` falls on the 1st at an earlier time of day than ``start``.

    Args:
        start: ``date``/``datetime`` inside the first month to yield.
        end: ``date``/``datetime`` inside the last month to yield (inclusive).

    Yields:
        Objects of the same type as ``start``, in ascending month order.
        Nothing is yielded when ``end``'s month precedes ``start``'s.
    """
    year, month = start.year, start.month
    last = (end.year, end.month)

    while (year, month) <= last:
        yield start.replace(year=year, month=month, day=1)
        # Step to the next calendar month, rolling over at December.
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1


def load_ticks(symbol, start_ts, end_ts):
    """Load one symbol's rows with ``start_ts <= timestamp <= end_ts``.

    One parquet object per month yielded by ``month_range(start_ts, end_ts)``
    is scanned from the ``market-data`` bucket. Rows outside the inclusive
    window are filtered out, and the result is sorted by ``timestamp`` before
    being collected.

    Args:
        symbol: Value substituted into the ``symbol=`` part of each object key.
        start_ts: Inclusive lower bound compared against ``timestamp``.
        end_ts: Inclusive upper bound compared against ``timestamp``.

    Returns:
        The collected frame, ordered by ``timestamp``.
    """
    # NOTE(review): endpoint and credentials are hard-coded here — consider
    # sourcing them from shared configuration.
    minio_options = {
        "aws_access_key_id": "minioadmin",
        "aws_secret_access_key": "minioadmin",
        "endpoint_url": "http://localhost:9100",
    }

    monthly_keys = [
        f"s3://market-data/symbol={symbol}/year={month.year}/month={month.month:02d}.parquet"
        for month in month_range(start_ts, end_ts)
    ]

    lazy_ticks = pl.scan_parquet(monthly_keys, storage_options=minio_options)

    # Both bounds are inclusive.
    not_before_start = pl.col("timestamp") >= start_ts
    not_after_end = pl.col("timestamp") <= end_ts
    windowed = lazy_ticks.filter(not_before_start & not_after_end)

    return windowed.sort("timestamp").collect()


In [12]:
# Show how a timezone-aware UTC datetime prints.
print(datetime(2020, 1, 2, tzinfo=timezone.utc))

2020-01-02 00:00:00+00:00


In [10]:
# Load SPY rows from 2020-01-02 through 2023-01-03 (UTC, both bounds inclusive).
ticks = load_ticks(
    symbol="SPY",
    start_ts=datetime(2020, 1, 2, tzinfo=timezone.utc),
    end_ts=datetime(2023, 1, 3, tzinfo=timezone.utc),
)

print(ticks.shape)
print(ticks.head())

(628944, 9)
shape: (5, 9)
┌────────┬──────────────────────┬────────┬────────┬───┬────────┬────────┬─────────────┬────────────┐
│ symbol ┆ timestamp            ┆ open   ┆ high   ┆ … ┆ close  ┆ volume ┆ trade_count ┆ vwap       │
│ ---    ┆ ---                  ┆ ---    ┆ ---    ┆   ┆ ---    ┆ ---    ┆ ---         ┆ ---        │
│ str    ┆ datetime[ns, UTC]    ┆ f64    ┆ f64    ┆   ┆ f64    ┆ f64    ┆ f64         ┆ f64        │
╞════════╪══════════════════════╪════════╪════════╪═══╪════════╪════════╪═════════════╪════════════╡
│ SPY    ┆ 2020-01-02 09:00:00  ┆ 323.52 ┆ 323.67 ┆ … ┆ 323.56 ┆ 9391.0 ┆ 18.0        ┆ 323.575879 │
│        ┆ UTC                  ┆        ┆        ┆   ┆        ┆        ┆             ┆            │
│ SPY    ┆ 2020-01-02 09:01:00  ┆ 323.68 ┆ 323.69 ┆ … ┆ 323.61 ┆ 905.0  ┆ 6.0         ┆ 323.651111 │
│        ┆ UTC                  ┆        ┆        ┆   ┆        ┆        ┆             ┆            │
│ SPY    ┆ 2020-01-02 09:02:00  ┆ 323.68 ┆ 323.68 ┆ … ┆ 323.68 ┆ 

In [11]:
# Display the loaded frame as this cell's output.
ticks

symbol,timestamp,open,high,low,close,volume,trade_count,vwap
str,"datetime[ns, UTC]",f64,f64,f64,f64,f64,f64,f64
"""SPY""",2020-01-02 09:00:00 UTC,323.52,323.67,323.52,323.56,9391.0,18.0,323.575879
"""SPY""",2020-01-02 09:01:00 UTC,323.68,323.69,323.61,323.61,905.0,6.0,323.651111
"""SPY""",2020-01-02 09:02:00 UTC,323.68,323.68,323.68,323.68,121.0,1.0,323.68
"""SPY""",2020-01-02 09:03:00 UTC,323.69,323.69,323.69,323.69,127.0,3.0,323.69
"""SPY""",2020-01-02 09:04:00 UTC,323.64,323.64,323.64,323.64,100.0,1.0,323.64
…,…,…,…,…,…,…,…,…
"""SPY""",2022-12-30 22:54:00 UTC,383.1,383.1,383.09,383.1,1670.0,17.0,383.099333
"""SPY""",2022-12-30 22:56:00 UTC,383.1,383.1,383.1,383.1,807.0,21.0,383.1
"""SPY""",2022-12-30 22:58:00 UTC,383.11,383.11,383.11,383.11,538.0,5.0,383.11
"""SPY""",2022-12-30 22:59:00 UTC,383.1,383.1,383.1,383.1,505.0,6.0,383.1


In [32]:
import polars as pl

def resample_bars(bars: pl.DataFrame, timeframe: str) -> pl.DataFrame:
    """Aggregate OHLCV rows into coarser bars.

    Args:
        bars: Frame with ``timestamp``, ``open``, ``high``, ``low``, ``close``,
            ``volume``, ``trade_count`` and ``vwap`` columns. Rows may be in
            any order; they are sorted by ``timestamp`` before aggregation.
        timeframe: One of ``"5m"``, ``"15m"``, ``"1h"``, ``"1d"``, ``"1w"``,
            ``"1mo"``.

    Returns:
        One row per bucket, sorted by ``bar``, with columns ``bar``, ``open``,
        ``high``, ``low``, ``close``, ``volume``, ``trade_count``, ``vwap``.
        ``bar`` is the truncated timestamp for ``5m``/``15m``/``1h``/``1w``
        and a ``Date`` for ``1d``/``1mo`` (first day of the month for ``1mo``).

    Raises:
        ValueError: If ``timeframe`` is not one of the supported values.
    """
    ts = pl.col("timestamp")

    if timeframe in {"5m", "15m", "1h", "1w"}:
        # The accepted labels are already valid truncation intervals.
        bucket = ts.dt.truncate(timeframe)
    elif timeframe == "1d":
        bucket = ts.dt.date()
    elif timeframe == "1mo":
        # First day of the month as a Date, without a string round-trip.
        bucket = ts.dt.date().dt.month_start()
    else:
        raise ValueError("timeframe must be one of: 5m, 15m, 1h, 1d, 1w, 1mo")

    # open/close are taken with first()/last(), which follow row order inside
    # each group, so the rows must be in chronological order. The stable sort
    # leaves rows that share a timestamp in their incoming order.
    ordered = bars.sort("timestamp", maintain_order=True)

    total_volume = pl.col("volume").sum()
    # NOTE(review): a bucket whose total volume is 0 divides by zero here.
    volume_weighted_price = (pl.col("vwap") * pl.col("volume")).sum() / total_volume

    return (
        ordered.with_columns(bucket.alias("bar"))
        .group_by("bar")
        .agg([
            pl.col("open").first().alias("open"),
            pl.col("high").max().alias("high"),
            pl.col("low").min().alias("low"),
            pl.col("close").last().alias("close"),
            total_volume.alias("volume"),
            pl.col("trade_count").sum().alias("trade_count"),
            volume_weighted_price.alias("vwap"),
        ])
        .sort("bar")
    )


In [33]:
# Build each resampled view from the same `ticks` frame.
bars_5m, bars_15m, bars_1h, bars_1d = (
    resample_bars(ticks, timeframe) for timeframe in ("5m", "15m", "1h", "1d")
)

print(bars_5m.head())


shape: (5, 8)
┌─────────────────────────┬────────┬────────┬────────┬────────┬─────────┬─────────────┬────────────┐
│ bar                     ┆ open   ┆ high   ┆ low    ┆ close  ┆ volume  ┆ trade_count ┆ vwap       │
│ ---                     ┆ ---    ┆ ---    ┆ ---    ┆ ---    ┆ ---     ┆ ---         ┆ ---        │
│ datetime[ns, UTC]       ┆ f64    ┆ f64    ┆ f64    ┆ f64    ┆ f64     ┆ f64         ┆ f64        │
╞═════════════════════════╪════════╪════════╪════════╪════════╪═════════╪═════════════╪════════════╡
│ 2020-01-02 09:00:00 UTC ┆ 323.52 ┆ 323.69 ┆ 323.52 ┆ 323.64 ┆ 10644.0 ┆ 29.0        ┆ 323.585423 │
│ 2020-01-02 09:05:00 UTC ┆ 323.71 ┆ 323.78 ┆ 323.7  ┆ 323.76 ┆ 3333.0  ┆ 11.0        ┆ 323.720596 │
│ 2020-01-02 09:10:00 UTC ┆ 323.76 ┆ 323.83 ┆ 323.73 ┆ 323.81 ┆ 3900.0  ┆ 13.0        ┆ 323.814102 │
│ 2020-01-02 09:15:00 UTC ┆ 323.76 ┆ 323.76 ┆ 323.76 ┆ 323.76 ┆ 100.0   ┆ 1.0         ┆ 323.76     │
│ 2020-01-02 09:20:00 UTC ┆ 323.76 ┆ 323.76 ┆ 323.76 ┆ 323.76 ┆ 100.0   ┆ 1.0