In [1]:
import pandas as pd
from io import StringIO
import re


def parse_multi_table_csv(path):
    """Parse a multi-day, multi-table CSV export into one DataFrame per day.

    The file is read as a sequence of day blocks. A day block starts at any
    line that begins with a date written as ``M/D/YY`` or ``M/D/YYYY``; the
    rest of the block is the body. Inside the body, every line starting with
    ``Fee``, ``Daily`` or ``Cash`` opens a new named table, and whatever
    precedes the first such line is the leading (unnamed) table.

    Only the first non-empty table of each day is parsed and kept. When that
    table is a named one, its name line is dropped before parsing.

    Parameters
    ----------
    path : str or path-like
        Location of the UTF-8 encoded export file.

    Returns
    -------
    dict
        Maps ``datetime.date`` to the ``pandas.DataFrame`` parsed from that
        day's first table. Blocks whose first line cannot be parsed as a date
        (for example a preamble before the first day) are skipped. If the
        same date appears more than once, the last block wins.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    pandas.errors.EmptyDataError, pandas.errors.ParserError
        Propagated from ``pandas.read_csv`` when a table has no content or is
        malformed.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # Zero-width split: each piece starts with its own date line.
    day_blocks = re.split(r"(?=^\d{1,2}/\d{1,2}/\d{2,4})", text, flags=re.MULTILINE)

    result = {}
    for block in day_blocks:
        block = block.strip()
        if not block:
            continue

        lines = block.splitlines()
        date_str = lines[0].strip()
        try:
            date = pd.to_datetime(date_str).date()
        except (ValueError, TypeError, OverflowError):
            # Not a parseable date line: skip this block, but let unrelated
            # errors (and KeyboardInterrupt) propagate.
            continue

        body = "\n".join(lines[1:])

        # Split on any line that starts with "Fee", "Daily" or "Cash".
        sub_blocks = re.split(r"(?=^(?:Fee|Daily|Cash))", body, flags=re.MULTILINE)

        for i, sub in enumerate(sub_blocks):
            sub = sub.strip()
            if not sub:
                continue

            sub_lines = sub.splitlines()
            # Chunk 0 starts directly with its CSV header; every later chunk
            # begins with a table-name line that must not reach the parser.
            first_csv_line = 0 if i == 0 else 1
            csv_text = "\n".join(sub_lines[first_csv_line:])

            result[date] = pd.read_csv(StringIO(csv_text))
            # Only the first non-empty table of the day is wanted.
            break

    return result


# Load the raw export: `tables` maps each date found in the file to the
# DataFrame parsed from that day's first table.
tables = parse_multi_table_csv("../data/raw/totalsByDate_new.csv")

In [2]:
# 1. tag each per-day table with its date; assign() returns a copy, so the
#    frames stored in `tables` are left untouched
df_list = [day_frame.assign(date=day) for day, day_frame in tables.items()]

# 2. stack them into one DataFrame and move "date" to the first position
totals_by_date = pd.concat(df_list, ignore_index=True)
date_col = totals_by_date.pop("date")
totals_by_date.insert(0, "date", date_col)
totals_by_date.head()

  totals_by_date = pd.concat(df_list, ignore_index=True)


Unnamed: 0,date,Symbol,Orders,Fills,Qty,Gross,Comm,Ecn Fee,SEC,ORF,...,TAF,FTT,NSCC,Acc,Clr,Misc,Net,Unrealized δ,Total δ,Unrealized
0,2023-04-17,BABA,4,47,20000,9760.99,70.0,-4.602,7.74,0,...,1.45,0,0.8968,0,0,0,9685.5052,0.0,9685.5052,0.0
1,2023-04-17,BBBY,3,19,50000,0.0,175.0,47.997,0.0,0,...,0.0,0,1.2708,0,0,0,-224.2678,-1994.0,-2218.2678,-1994.0
2,2023-04-17,CFRX,16,134,70000,15300.11,245.0,22.2506,0.78,0,...,5.085,0,2.789,0,0,0,15024.2054,0.0,15024.2054,0.0
3,2023-04-17,CNSP,3,67,20000,-2712.6,70.0,44.7481,0.23,0,...,1.457,0,0.9467,0,0,0,-2829.9818,0.0,-2829.9818,0.0
4,2023-04-17,GOOGL,2,116,10000,-1443.315,35.0,32.7655,4.05,0,...,0.742,0,1.18,0,0,0,-1517.0525,0.0,-1517.0525,0.0


In [3]:
# Inspect the full column list (the head() preview above elides the middle columns).
totals_by_date.columns

Index(['date', 'Symbol', 'Orders', 'Fills', 'Qty', 'Gross', 'Comm', 'Ecn Fee',
       'SEC', 'ORF', 'CAT', 'TAF', 'FTT', 'NSCC', 'Acc', 'Clr', 'Misc', 'Net',
       'Unrealized δ', 'Total δ', 'Unrealized'],
      dtype='object')

In [4]:
# Individual charge columns that are collapsed into a single total below.
fee_columns = [
    "Comm", "Ecn Fee", "SEC", "ORF", "CAT", "TAF",
    "FTT", "NSCC", "Acc", "Clr", "Misc",
]

# Row-wise total of every charge, stored before the source columns are dropped.
totals_by_date["fee_sum"] = totals_by_date.loc[:, fee_columns].sum(axis="columns")

# NOTE: this mutates the frame in place, so re-running the cell without
# re-running the ones above raises a KeyError (the fee columns are gone).
totals_by_date.drop(columns=fee_columns, inplace=True)
totals_by_date.head()

Unnamed: 0,date,Symbol,Orders,Fills,Qty,Gross,Net,Unrealized δ,Total δ,Unrealized,fee_sum
0,2023-04-17,BABA,4,47,20000,9760.99,9685.5052,0.0,9685.5052,0.0,75.4848
1,2023-04-17,BBBY,3,19,50000,0.0,-224.2678,-1994.0,-2218.2678,-1994.0,224.2678
2,2023-04-17,CFRX,16,134,70000,15300.11,15024.2054,0.0,15024.2054,0.0,275.9046
3,2023-04-17,CNSP,3,67,20000,-2712.6,-2829.9818,0.0,-2829.9818,0.0,117.3818
4,2023-04-17,GOOGL,2,116,10000,-1443.315,-1517.0525,0.0,-1517.0525,0.0,73.7375


In [5]:
# Export headers -> snake_case names ("date" and "fee_sum" already conform).
rename_map = {
    # headers containing the delta sign need an explicit ASCII name
    "Unrealized δ": "unrealized_delta",
    "Total δ": "total_delta",
    # everything else only needs lower-casing
    **{
        header: header.lower()
        for header in ("Symbol", "Orders", "Fills", "Qty", "Gross", "Net", "Unrealized")
    },
}
totals_by_date.rename(columns=rename_map, inplace=True)

# Peek at the rows where the first day ends and the second begins.
totals_by_date.iloc[12:16]

Unnamed: 0,date,symbol,orders,fills,qty,gross,net,unrealized_delta,total_delta,unrealized,fee_sum
12,2023-04-17,VBLT,2,16,30000,0.0,-126.262649,-330.5497,-456.812349,-330.5497,126.262649
13,2023-04-17,XPEV,3,94,20000,-32.92,-156.0539,0.0,-156.0539,0.0,123.1339
14,2023-04-18,AMD,2,48,20000,-9681.85,-9813.2516,0.0,-9813.2516,0.0,131.4016
15,2023-04-18,BAC,8,40,30000,-1303.8,-1470.4278,0.0,-1470.4278,0.0,166.6278


In [6]:
# Columns that must be numeric for the aggregations that follow.
numeric_cols = [
    "orders",
    "fills",
    "qty",
    "gross",
    "net",
    "unrealized_delta",
    "total_delta",
    "unrealized",
    "fee_sum",
]
# errors="coerce": values that cannot be parsed become NaN instead of raising.
totals_by_date[numeric_cols] = totals_by_date[numeric_cols].apply(
    pd.to_numeric, errors="coerce"
)
# The "date" column holds datetime.date objects (it was filled from the
# parser's dict keys), not "%y%m%d" strings, so no strptime format is passed.
# The previous format="%y%m%d" never matched the data and only worked because
# pandas converts date objects without consulting the format.
totals_by_date["date"] = pd.to_datetime(totals_by_date["date"])

In [7]:
# Summary statistics for every column (include="all" also covers the non-numeric ones).
display(totals_by_date.describe(include="all"))

Unnamed: 0,date,symbol,orders,fills,qty,gross,net,unrealized_delta,total_delta,unrealized,fee_sum
count,2971,2971,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0,2971.0
unique,,600,,,,,,,,,
top,,TSLA,,,,,,,,,
freq,,177,,,,,,,,,
mean,2024-02-19 03:43:26.395152896,,7.301919,79.977785,13576.852238,158.691211,77.914624,4.897986e-15,77.914624,836.981859,80.776587
min,2023-04-17 00:00:00,,0.0,0.0,0.0,-109834.0,-110490.331685,-55600.0,-110490.331685,-55600.0,0.0
25%,2023-08-09 00:00:00,,2.0,16.0,2000.0,-2870.955,-2951.472351,0.0,-3274.324864,0.0,13.35251
50%,2024-02-09 00:00:00,,5.0,44.0,7000.0,0.0,-33.999,0.0,-490.6448,0.0,41.444076
75%,2024-07-30 00:00:00,,10.0,98.5,16000.0,1526.0175,1485.111442,0.0,2142.344045,0.0,98.806477
max,2025-02-28 00:00:00,,118.0,1359.0,200000.0,156570.89,155998.689847,58535.8,155998.689847,96684.49,1345.560152


In [8]:
# Check dtypes and non-null counts after the numeric / datetime conversion.
totals_by_date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2971 entries, 0 to 2970
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              2971 non-null   datetime64[ns]
 1   symbol            2971 non-null   object        
 2   orders            2971 non-null   int64         
 3   fills             2971 non-null   int64         
 4   qty               2971 non-null   int64         
 5   gross             2971 non-null   float64       
 6   net               2971 non-null   float64       
 7   unrealized_delta  2971 non-null   float64       
 8   total_delta       2971 non-null   float64       
 9   unrealized        2971 non-null   float64       
 10  fee_sum           2971 non-null   float64       
dtypes: datetime64[ns](1), float64(6), int64(3), object(1)
memory usage: 255.4+ KB


In [9]:
# Collapse the per-symbol rows into one row per date.
# Every numeric column is summed; "symbols" counts the distinct tickers that day.
summed_columns = [
    "orders",
    "fills",
    "qty",
    "net",
    "unrealized_delta",
    "total_delta",
    "gross",
    "fee_sum",
]
daily_agg_spec = {
    "symbols": ("symbol", "nunique"),
    **{column: (column, "sum") for column in summed_columns},
}
predictors = totals_by_date.groupby("date").agg(**daily_agg_spec).reset_index()

In [10]:
# Integer rolling windows are positional, so the rows must be in chronological
# order. Sort by the "date" column explicitly: sort_index() would only reorder
# by the integer index left by reset_index(), which says nothing about dates.
predictors = predictors.sort_values("date", ignore_index=True)

# 3-row rolling mean of total_delta (rows are days present in the data, not calendar days)
predictors["td_3d_mean"] = predictors["total_delta"].rolling(3, min_periods=1).mean()

# 5-row rolling std (volatility); the std of a single observation is NaN, hence fillna(0)
# NOTE(review): both windows include the current row. If total_delta is later
# used as a prediction target, shift by one row first to avoid leakage - confirm.
predictors["td_5d_std"] = (
    predictors["total_delta"].rolling(5, min_periods=1).std().fillna(0)
)

In [11]:
# Preview the daily aggregates together with the two rolling features.
predictors.head()

Unnamed: 0,date,symbols,orders,fills,qty,net,unrealized_delta,total_delta,gross,fee_sum,td_3d_mean,td_5d_std
0,2023-04-17,14,70,1073,383000,21678.245762,-716.7197,20961.526062,23605.575,1927.329238,20961.526062,0.0
1,2023-04-18,7,17,177,66000,-27834.145527,1747.17,-26086.975527,-27430.24,403.905527,-2562.724733,33268.314518
2,2023-04-19,10,84,916,283800,-51621.684762,-1625.0,-53246.684762,-50310.3495,1311.335262,-19457.378076,37545.682977
3,2023-04-20,20,156,2528,424000,93030.321081,16725.7647,109756.085781,95428.5053,2398.184219,10140.808497,71510.94567
4,2023-04-21,12,90,1797,273000,-46621.954396,-22603.875,-69225.829396,-45183.225,1438.729396,-4238.809459,71989.708649


In [12]:
import numpy as np

# Re-key the daily aggregates by date, in chronological order.
by_day = predictors.set_index("date").sort_index()
daily_delta = by_day["total_delta"]

df = by_day.assign(
    # 1.1 Rolling P/L history: 3-row mean and 5-row std of total_delta
    #     (the std of a single row is NaN and is replaced by 0)
    td_3d_mean=daily_delta.rolling(3, min_periods=1).mean(),
    td_5d_std=daily_delta.rolling(5, min_periods=1).std().fillna(0),
    # 1.2 Fee impact: fees as a fraction of gross; days with gross == 0 yield 0
    #     NOTE(review): gross can be negative, which makes this ratio negative
    #     on losing days - confirm that the sign flip is intended.
    fee_impact=by_day["fee_sum"].div(by_day["gross"].replace(0, np.nan)).fillna(0),
    # 1.3 Time features derived from the date index
    dow=by_day.index.dayofweek,  # 0=Mon
    month=by_day.index.month,
    quarter=by_day.index.quarter,
)

In [13]:
# Preview the date-indexed feature frame, including the new fee and calendar columns.
df.head()

Unnamed: 0_level_0,symbols,orders,fills,qty,net,unrealized_delta,total_delta,gross,fee_sum,td_3d_mean,td_5d_std,fee_impact,dow,month,quarter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-04-17,14,70,1073,383000,21678.245762,-716.7197,20961.526062,23605.575,1927.329238,20961.526062,0.0,0.081647,0,4,2
2023-04-18,7,17,177,66000,-27834.145527,1747.17,-26086.975527,-27430.24,403.905527,-2562.724733,33268.314518,-0.014725,1,4,2
2023-04-19,10,84,916,283800,-51621.684762,-1625.0,-53246.684762,-50310.3495,1311.335262,-19457.378076,37545.682977,-0.026065,2,4,2
2023-04-20,20,156,2528,424000,93030.321081,16725.7647,109756.085781,95428.5053,2398.184219,10140.808497,71510.94567,0.025131,3,4,2
2023-04-21,12,90,1797,273000,-46621.954396,-22603.875,-69225.829396,-45183.225,1438.729396,-4238.809459,71989.708649,-0.031842,4,4,2


In [14]:
# Persist the date-indexed feature frame as a pickle. The path is relative, so
# the result depends on the notebook's working directory.
df.to_pickle("../data/train/predictors_new.pkl")

In [15]:
# Final look at the saved feature frame (pandas truncates the middle rows in the rendered output).
display(df)

Unnamed: 0_level_0,symbols,orders,fills,qty,net,unrealized_delta,total_delta,gross,fee_sum,td_3d_mean,td_5d_std,fee_impact,dow,month,quarter
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-04-17,14,70,1073,383000,21678.245762,-716.7197,20961.526062,23605.5750,1927.329238,20961.526062,0.000000,0.081647,0,4,2
2023-04-18,7,17,177,66000,-27834.145527,1747.1700,-26086.975527,-27430.2400,403.905527,-2562.724733,33268.314518,-0.014725,1,4,2
2023-04-19,10,84,916,283800,-51621.684762,-1625.0000,-53246.684762,-50310.3495,1311.335262,-19457.378076,37545.682977,-0.026065,2,4,2
2023-04-20,20,156,2528,424000,93030.321081,16725.7647,109756.085781,95428.5053,2398.184219,10140.808497,71510.945670,0.025131,3,4,2
2023-04-21,12,90,1797,273000,-46621.954396,-22603.8750,-69225.829396,-45183.2250,1438.729396,-4238.809459,71989.708649,-0.031842,4,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-02-21,7,73,557,113800,15653.238106,0.0000,15653.238106,16270.5750,617.336894,19979.684778,12814.447820,0.037942,4,2,1
2025-02-25,6,62,435,49000,-27588.270176,0.0000,-27588.270176,-27273.2300,315.040176,787.383599,21547.985857,-0.011551,1,2,1
2025-02-26,3,40,352,74000,11178.913912,0.0000,11178.913912,11553.4300,374.516088,-252.039386,21539.877418,0.032416,2,2,1
2025-02-27,9,112,2158,219000,49249.559899,0.0000,49249.559899,50811.6100,1562.050101,10946.734545,27260.322408,0.030742,3,2,1
