In [1]:
import sys
import pandas as pd

# Environment report: which interpreter the kernel runs on and which pandas it sees.
print("Kernel Python:", sys.executable)
print("Pandas:", pd.__version__)

# Probe for pyarrow. Any failure is reported rather than raised so the
# notebook keeps running without it.
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    print("pyarrow:", pa.__version__)
except Exception as err:
    print("pyarrow import failed:", repr(err))


Kernel Python: c:\.projects\stock-direction-ml\.venv\Scripts\python.exe
Pandas: 2.3.2
pyarrow import failed: ModuleNotFoundError("No module named 'pyarrow'")


In [2]:
import os, sys
from importlib import reload

# make repo root importable (this notebook lives in notebooks/)
# The path is resolved relative to the current working directory. The
# membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
repo_root = os.path.abspath("..")
if repo_root not in sys.path:
    sys.path.append(repo_root)

# local modules (no market/news here)
from src import data as data_mod, features as features_mod, utils as utils_mod
# Re-execute the already-imported modules so the current source on disk is used.
reload(data_mod); reload(features_mod); reload(utils_mod)

# Imported after the reload so these names bind to the reloaded modules' objects.
from src.data import get_data
from src.features import add_features
from src.utils import make_labels


In [3]:
# Data selection: symbol and date range handed to get_data.
TICKER = "AAPL"
START = "2015-01-01"
END = "2023-12-31"

# labeling (forwarded to make_labels as tau= / dead_zone=)
TAU = 0.001
DEAD_ZONE = True

# Echo the configuration as one space-separated line.
print(" ".join(map(str, (TICKER, START, END, TAU, DEAD_ZONE))))


AAPL 2015-01-01 2023-12-31 0.001 True


In [4]:
import pandas as pd

# Build the modelling frame by running the three project helpers in sequence:
# get_data -> add_features -> make_labels (all imported from src; their
# internals are not visible in this notebook).
df = make_labels(
    add_features(get_data(TICKER, start=START, end=END)),
    tau=TAU,
    dead_zone=DEAD_ZONE,
)

# Clean & order: only when a "date" column exists, drop rows without a date,
# sort chronologically, and renumber the index from 0.
if "date" in df.columns:
    df = df.dropna(subset=["date"])
    df = df.sort_values("date")
    df = df.reset_index(drop=True)

# Every column not named here is treated as a model feature.
exclude = {"date", "open", "high", "low", "close", "volume", "ret_next", "y"}
feat_cols = [name for name in df.columns if name not in exclude]

# Cell output: row count, feature count, and a preview of up to ten feature names.
len(df), len(feat_cols), feat_cols[:10]


(2078,
 8,
 ['ret1', 'ret5', 'ret10', 'vol10', 'volz', 'rsi14', 'macd', 'macd_signal'])

In [5]:
import numpy as np

# Chronological 70% / 15% / 15% partition into train / validation / test.
# Rows are taken in their existing order by position, with no shuffling.
n = len(df)
i1, i2 = int(0.70 * n), int(0.85 * n)

# Independent copies of each positional slice.
df_tr, df_va, df_te = (
    part.copy() for part in (df.iloc[:i1], df.iloc[i1:i2], df.iloc[i2:])
)

# Feature matrix and label vector ("y" column) for each split, as raw arrays.
(Xtr, ytr), (Xva, yva), (Xte, yte) = (
    (part[feat_cols].values, part["y"].values) for part in (df_tr, df_va, df_te)
)

# Cell output: the three split sizes.
len(df_tr), len(df_va), len(df_te)


(1454, 312, 312)

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np

# scaler + LR
# The scaler is fitted on the training split only and then applied unchanged
# to validation and test, so no statistics from those splits reach the model.
scaler = StandardScaler().fit(Xtr)
Xtr_s, Xva_s, Xte_s = scaler.transform(Xtr), scaler.transform(Xva), scaler.transform(Xte)

lr = LogisticRegression(max_iter=2000)
lr.fit(Xtr_s, ytr)
# Column 1 of predict_proba is the probability of the second entry in
# lr.classes_ (presumably the positive label; assumes a binary y, TODO confirm).
p_va_lr = lr.predict_proba(Xva_s)[:,1]
p_te_lr = lr.predict_proba(Xte_s)[:,1]

print("AUC (VAL) — LR:", roc_auc_score(yva, p_va_lr))
print("AUC (TEST) — LR:", roc_auc_score(yte, p_te_lr))

# optional XGB (skip if xgboost not installed)
# Only the import is guarded. Previously the whole block sat inside the try,
# so any genuine fit/predict/scoring error was reported as "XGBoost not
# available" and silently swallowed. Such errors now surface normally.
try:
    from xgboost import XGBClassifier
except Exception as e:
    # Missing package or broken installation: report it and skip this model.
    print("XGBoost not available:", e)
else:
    xgb = XGBClassifier(
        n_estimators=300, max_depth=3, subsample=0.9, colsample_bytree=0.9,
        learning_rate=0.05, random_state=42, n_jobs=0, reg_lambda=1.0
    )
    # Fitted on the unscaled features, unlike the logistic regression above.
    xgb.fit(Xtr, ytr)
    p_va_xgb = xgb.predict_proba(Xva)[:,1]
    p_te_xgb = xgb.predict_proba(Xte)[:,1]
    print("AUC (VAL) — XGB:", roc_auc_score(yva, p_va_xgb))
    print("AUC (TEST) — XGB:", roc_auc_score(yte, p_te_xgb))


AUC (VAL) — LR: 0.4847037509778893
AUC (TEST) — LR: 0.4734255626677679
AUC (VAL) — XGB: 0.5282661506155556
AUC (TEST) — XGB: 0.47003923188106544


In [7]:
import json, os

# Output folders, created relative to the current working directory.
for folder in ("data", "artifacts"):
    os.makedirs(folder, exist_ok=True)

# Records which on-disk format was actually written for the dataset.
storage_meta = {"format": None}

# Try Parquet first; on any failure (e.g. no Parquet engine installed)
# report the error and write CSV instead.
try:
    df.to_parquet("data/df_nb02.parquet", index=False)
    storage_meta.update(format="parquet")
    print("Saved Parquet -> data/df_nb02.parquet")
except Exception as err:
    print("Parquet save failed:", err)
    df.to_csv("data/df_nb02.csv", index=False)
    storage_meta.update(format="csv")
    print("Saved CSV -> data/df_nb02.csv")

# Save features + storage format.
# Note: the feature names are written in sorted order, not in the column
# order used when building the training matrices.
with open("artifacts/feature_list.json", "w", encoding="utf-8") as fh:
    fh.write(json.dumps(sorted(feat_cols), indent=2))
with open("data/storage_format.json", "w", encoding="utf-8") as fh:
    fh.write(json.dumps(storage_meta))

print("Wrote: artifacts/feature_list.json and data/storage_format.json")


Parquet save failed: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.
Saved CSV -> data/df_nb02.csv
Wrote: artifacts/feature_list.json and data/storage_format.json
