In [8]:
import pandas as pd

# Load pipeline output
df = pd.read_csv("data/processed/market_daily.csv", parse_dates=True, index_col=0)

# Drop any rows with missing data
df_clean = df.dropna(how="any")

# Save clean versions
df_clean.to_csv("data/processed/market_daily_clean.csv")
df_clean.to_parquet("data/processed/market_daily_clean.parquet")


In [11]:
import os
import pandas as pd

processed_path = "data/processed"
cleaned_path = os.path.join(processed_path, "cleaned")
os.makedirs(cleaned_path, exist_ok=True)

for fname in os.listdir(processed_path):
    if ("clean" in fname.lower()) or fname.startswith(".") or fname.endswith(".txt"):
        continue  # skip already-cleaned, hidden, or placeholder files

    fpath = os.path.join(processed_path, fname)
    base, ext = os.path.splitext(fname)

    if ext == ".csv":
        df = pd.read_csv(fpath, parse_dates=True, index_col=0)
    elif ext == ".parquet":
        df = pd.read_parquet(fpath)
    else:
        continue

    # Drop NaNs
    df_clean = df.dropna(how="any")

    # Save as both CSV + Parquet
    df_clean.to_csv(os.path.join(cleaned_path, f"{base}_clean.csv"))
    df_clean.to_parquet(os.path.join(cleaned_path, f"{base}_clean.parquet"))

    print(f"✅ Cleaned {fname}")


✅ Cleaned market_daily.csv
✅ Cleaned market_daily.parquet
✅ Cleaned market_extended_spx.csv
✅ Cleaned market_extended_spx.parquet
✅ Cleaned market_extended_spy.csv
✅ Cleaned market_extended_spy.parquet


  df = pd.read_csv(fpath, parse_dates=True, index_col=0)


✅ Cleaned options_snapshot_spx.csv
✅ Cleaned options_snapshot_spx.parquet


  df = pd.read_csv(fpath, parse_dates=True, index_col=0)


✅ Cleaned options_snapshot_spy.csv
✅ Cleaned options_snapshot_spy.parquet


  df = pd.read_csv(fpath, parse_dates=True, index_col=0)


✅ Cleaned vol_surface_spx.csv
✅ Cleaned vol_surface_spx.parquet


  df = pd.read_csv(fpath, parse_dates=True, index_col=0)


✅ Cleaned vol_surface_spy.csv
✅ Cleaned vol_surface_spy.parquet


In [17]:
import pandas as pd
import os

cleaned_path = "data/processed/cleaned"
aligned_path = os.path.join("data", "processed", "aligned")
os.makedirs(aligned_path, exist_ok=True)

dfs = {}

for fname in os.listdir(cleaned_path):
    if fname.endswith(".csv") and "clean" in fname:
        key = fname.replace("_clean.csv", "")
        fpath = os.path.join(cleaned_path, fname)

        # Load file
        df = pd.read_csv(fpath)

        # If a "date" column exists, parse it
        if "date" in df.columns:
            df["date"] = pd.to_datetime(df["date"], errors="coerce")
            df = df.set_index("date")
        else:
            # Otherwise assume first column is the index
            df = pd.read_csv(fpath, parse_dates=True, index_col=0)
            df.index = pd.to_datetime(df.index, errors="coerce")

        # Drop NaT (invalid dates) from index
        df = df[~df.index.isna()]

        # Drop duplicate dates
        df = df[~df.index.duplicated(keep="first")]

        dfs[key] = df
        print(f"Loaded {key}: {df.index.min()} → {df.index.max()}, {len(df)} rows")

# Align all datasets on their common overlapping dates
final_df = pd.concat(dfs.values(), axis=1, join="inner")

# Prefix columns with dataset names
final_df.columns = [f"{k}__{col}" for k, df in dfs.items() for col in df.columns]

# Save final aligned dataset
final_df.to_csv(os.path.join(aligned_path, "final_aligned.csv"))
final_df.to_parquet(os.path.join(aligned_path, "final_aligned.parquet"))

print("\n✅ Final aligned dataset saved in:", aligned_path)
print("Shape:", final_df.shape)
print("Date range:", final_df.index.min(), "to", final_df.index.max())


Loaded market_daily: 1996-01-04 00:00:00 → 2025-08-29 00:00:00, 10831 rows
Loaded market_extended_spx: 1996-01-04 00:00:00 → 2023-08-31 00:00:00, 6963 rows
Loaded market_extended_spy: 2005-01-10 00:00:00 → 2023-08-31 00:00:00, 4693 rows
Loaded options_snapshot_spx: 1996-01-04 00:00:00 → 2023-08-31 00:00:00, 6963 rows
Loaded options_snapshot_spy: 2005-01-10 00:00:00 → 2023-08-31 00:00:00, 4692 rows
Loaded vol_surface_spx: 1996-01-04 00:00:00 → 2023-08-31 00:00:00, 6963 rows
Loaded vol_surface_spy: 2005-01-10 00:00:00 → 2023-08-31 00:00:00, 4693 rows

✅ Final aligned dataset saved in: data\processed\aligned
Shape: (4692, 78)
Date range: 2005-01-10 00:00:00 to 2023-08-31 00:00:00
