In [None]:
import glob
import os
import re
from io import StringIO

import pandas as pd

# Account identifier; interpolated into the raw-data directory below and into
# the pickle file name written at the end of the notebook.
ACCOUNT_ID = "6973"
# Directory (relative to the working directory) that is scanned for this
# account's "*.csv" files.
PATH_DIR = f"../data/raw/totals_by_date/{ACCOUNT_ID}/"


def parse_multi_table_csv(path):
    """Parse a multi-day, multi-table CSV export into one DataFrame per day.

    The file is read as a sequence of day blocks, each introduced by a line
    that starts with a date such as ``1/2/2024``. Inside a day block the text
    after the date line is split at every line starting with ``Fee``,
    ``Daily`` or ``Cash``; only the first non-empty piece is parsed and kept.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the UTF-8 encoded file to read.

    Returns
    -------
    dict
        Maps ``datetime.date`` to the ``pandas.DataFrame`` parsed from the
        first non-empty table of that day. Blocks whose first line cannot be
        parsed as a date are skipped. If a date occurs more than once, the
        later table overwrites the earlier one.
    """
    # Context manager guarantees the handle is closed, even if reading fails.
    with open(path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # Zero-width split: each date line stays at the head of its own block.
    day_blocks = re.split(r"(?=^\d{1,2}/\d{1,2}/\d{2,4})", text, flags=re.MULTILINE)

    result = {}
    for block in day_blocks:
        block = block.strip()
        if not block:
            continue

        lines = block.splitlines()
        try:
            date = pd.to_datetime(lines[0].strip()).date()
        except (ValueError, TypeError, OverflowError):
            # First line is not a parseable date (e.g. text that precedes the
            # first day block), so the whole block is ignored.
            continue

        body = "\n".join(lines[1:])

        # Split on any line that starts with "Fee", "Daily" or "Cash".
        sub_blocks = re.split(r"(?=^(?:Fee|Daily|Cash))", body, flags=re.MULTILINE)

        for i, sub in enumerate(sub_blocks):
            sub = sub.strip()
            if not sub:
                continue

            sub_lines = sub.splitlines()
            # Piece 0 starts directly with its CSV header; any later piece
            # begins with a title line that must not reach the CSV parser.
            first_csv_line = 0 if i == 0 else 1
            csv_text = "\n".join(sub_lines[first_csv_line:])

            result[date] = pd.read_csv(StringIO(csv_text))
            # Only the first non-empty table of the day is kept.
            break

    return result

In [None]:
# read all csv files in the directory and combine them into a single dataframe


def read_csv_files_in_directory(directory):
    """Load every ``*.csv`` file in ``directory`` into one combined DataFrame.

    Each file is parsed with ``parse_multi_table_csv``; every per-date table
    it returns gets a ``date`` column holding that date, and all tables are
    concatenated with a fresh integer index.

    Parameters
    ----------
    directory : str
        Directory to search (non-recursively) for files ending in ``.csv``.

    Returns
    -------
    pandas.DataFrame
        The concatenated tables, or an empty DataFrame (without a ``date``
        column) when no table was found.
    """
    # glob.glob yields matches in arbitrary order; sorting makes the row order
    # of the combined frame reproducible between runs and machines.
    csv_files = sorted(glob.glob(os.path.join(directory, "*.csv")))
    print(f"Found {len(csv_files)} files.")

    # assign() returns a copy, so the parsed tables themselves stay untouched.
    frames = [
        table.assign(date=date)
        for file in csv_files
        for date, table in parse_multi_table_csv(file).items()
    ]

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Load every per-date table found for the configured account.
totals_by_date = read_csv_files_in_directory(PATH_DIR)

# Move "date" to the first position; the remaining columns keep their order.
totals_by_date.insert(0, "date", totals_by_date.pop("date"))

# Order the rows by date and preview the first 15.
totals_by_date = totals_by_date.sort_values(by=["date"])
totals_by_date.head(15)

In [None]:
# Show the column labels of the combined frame.
totals_by_date.columns

In [None]:
# Individual fee columns that are collapsed into a single "fee_sum" column.
fee_columns = [
    "Comm",
    "Ecn Fee",
    "SEC",
    "ORF",
    "CAT",
    "TAF",
    "FTT",
    "NSCC",
    "Acc",
    "Clr",
    "Misc",
]
# Coerce each fee column to numeric *before* summing. If any of them was read
# as object dtype, a row-wise sum over the raw values is not a numeric total,
# and coercing "fee_sum" afterwards cannot repair it. Unparseable entries
# become NaN, which sum() skips.
totals_by_date["fee_sum"] = (
    totals_by_date[fee_columns].apply(pd.to_numeric, errors="coerce").sum(axis=1)
)
# The per-fee columns are no longer needed once the total exists.
totals_by_date = totals_by_date.drop(columns=fee_columns)
totals_by_date.head(15)

In [None]:
# Raw column headers -> snake_case names. The plain headers are simply
# lower-cased; the headers containing "δ" or sharing a prefix are spelled out.
rename_map = {
    **{
        header: header.lower()
        for header in ("Symbol", "Orders", "Fills", "Qty", "Gross", "Net")
    },
    "Unrealized δ": "unrealized_delta",
    "Total δ": "total_delta",
    "Unrealized": "unrealized",
}
totals_by_date = totals_by_date.rename(columns=rename_map)
# Peek at rows 12-15 (by position) to check the new column names.
totals_by_date.iloc[12:16]

In [None]:
# Columns that must be numeric for the aggregations that follow.
numeric_cols = [
    "orders",
    "fills",
    "qty",
    "gross",
    "net",
    "unrealized_delta",
    "total_delta",
    "unrealized",
    "fee_sum",
]
# errors="coerce" turns anything that is not a number into NaN instead of raising.
totals_by_date[numeric_cols] = totals_by_date[numeric_cols].apply(
    pd.to_numeric, errors="coerce"
)
# The "date" column holds datetime.date objects (created in
# parse_multi_table_csv), not "%y%m%d" strings, so no format string applies:
# let pandas convert the objects directly to datetime64.
totals_by_date["date"] = pd.to_datetime(totals_by_date["date"])

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
display(totals_by_date.describe(include="all"))

In [None]:
# Print the frame's column dtypes and non-null counts.
totals_by_date.info()

In [None]:
# One row per date: count distinct symbols and total every numeric measure.
summed_columns = [
    "orders",
    "fills",
    "qty",
    "net",
    "unrealized_delta",
    "total_delta",
    "gross",
    "fee_sum",
]
aggregations = {
    "symbols": ("symbol", "nunique"),
    **{column: (column, "sum") for column in summed_columns},
}
per_date = totals_by_date.groupby("date").agg(**aggregations)
# Turn the "date" group key back into an ordinary column.
predictors = per_date.reset_index()

In [None]:
# The rolling windows below are row-based (a fixed number of rows, not a time
# span), so the rows must be in chronological order. "date" is an ordinary
# column here and the index is positional, so order by the column explicitly
# instead of by the index.
predictors = predictors.sort_values("date").reset_index(drop=True)

# Mean of total_delta over the current row and up to two preceding rows
# (one row per date).
predictors["td_3d_mean"] = predictors["total_delta"].rolling(3, min_periods=1).mean()

# Standard deviation of total_delta over up to five rows; the single-row
# window has no defined std (NaN) and is reported as 0.
predictors["td_5d_std"] = (
    predictors["total_delta"].rolling(5, min_periods=1).std().fillna(0)
)

In [None]:
# Preview the first five rows of the per-date aggregates.
predictors.head()

In [None]:
import numpy as np

# Build the feature frame from the per-date aggregates, indexed and ordered by date.
df = (
    predictors.set_index("date")
    .sort_index()
    .assign(
        # Rolling P/L history over rows (one row per date).
        td_3d_mean=lambda frame: frame["total_delta"].rolling(3, min_periods=1).mean(),
        td_5d_std=lambda frame: (
            frame["total_delta"].rolling(5, min_periods=1).std().fillna(0)
        ),
        # Fee impact: fees relative to gross; a zero gross yields 0 rather than inf.
        fee_impact=lambda frame: (
            frame["fee_sum"] / frame["gross"].replace(0, np.nan)
        ).fillna(0),
        # Calendar features taken from the date index (dayofweek: 0=Mon).
        dow=lambda frame: frame.index.dayofweek,
        month=lambda frame: frame.index.month,
        quarter=lambda frame: frame.index.quarter,
    )
)

In [None]:
# Preview the first five rows of the feature frame.
df.head()

In [None]:
# Persist the feature frame as a pickle; the account id is appended directly
# to the "totals_by_date" file stem (no separator).
df.to_pickle(f"../data/train/totals_by_date{ACCOUNT_ID}.pkl")

In [None]:
# Show the first 15 rows of the feature frame.
display(df.head(15))