# HDFC Bank diagnostics: 10-year return modeling with LSTM + LightGBM

This notebook pulls data for a single company from MongoDB, builds the same features used by the API (MACD, RSI, MAs, momentum, volatility, proximity), appends exogenous signals from database collections, and trains:

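- A return-based LightGBM model on the last 10 years (with StandardScaler); if LightGBM is not installed or there are too few samples, a linear regression is used instead
- A return-based LSTM (only if TensorFlow is available; otherwise the LSTM component is skipped)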

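Both models are fit on the 5-day forward return. Results are reported for horizons h in {1, 5, 10} and re-anchored to the latest close; the volatility band used for clamping widens with h. Final outputs include model components and a volatility-clamped ensemble estimate.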



In [9]:
# Setup
import os
import json
import math
import numpy as np
import pandas as pd
from pymongo import MongoClient
from datetime import datetime, timedelta

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

try:
    import lightgbm as lgb
except Exception:
    lgb = None

try:
    import tensorflow as tf
    from tensorflow import keras
    TF_AVAILABLE = True
except Exception:
    TF_AVAILABLE = False

# Connection settings are read from the environment, with local-development defaults.
MONGO_URI = os.environ.get("MONGODB_URI", "mongodb://localhost:27017")
DB_CACHE = os.environ.get("MONGODB_CACHE_DB", "investment_advisor_cache")
client = MongoClient(MONGO_URI)
db = client[DB_CACHE]

# Company under study; LNAME is the trimmed, lower-cased form used in the Mongo queries.
NAME = "Hdfc Bank"
LNAME = NAME.strip().lower()
# Horizons iterated by the ensemble cell.
HORIZONS = [1, 5, 10]



In [10]:
# Load 10-year historical series and signals

def extract_price_series(doc: dict) -> pd.DataFrame:
    """Turn a historical-data document into a date-indexed frame of closes.

    The document is expected to carry a ``datasets`` list. The first dataset
    whose ``metric`` equals ``"price"`` or whose ``label`` starts with
    ``"price"`` (both case-insensitive) is used; its ``values`` entries are
    read as ``[date, close, ...]`` rows.

    Malformed input is tolerated instead of aborting the whole load: rows that
    are not lists/tuples of length >= 2 are skipped, unparseable dates become
    NaT and unparseable prices become NaN, and both are dropped.

    Args:
        doc: Mongo document (or ``None``/empty dict).

    Returns:
        A DataFrame indexed by ``date`` (ascending) with one ``close`` column,
        or a completely empty DataFrame when no price dataset is present.
    """
    datasets = (doc or {}).get("datasets") or []
    price_ds = next(
        (
            ds
            for ds in datasets
            if isinstance(ds, dict)
            and (
                str(ds.get("metric", "")).lower() == "price"
                or str(ds.get("label", "")).lower().startswith("price")
            )
        ),
        None,
    )
    if not price_ds:
        return pd.DataFrame()

    records = []
    # `or []` covers a dataset stored with "values": null.
    for row in price_ds.get("values") or []:
        if not isinstance(row, (list, tuple)) or len(row) < 2:
            continue
        # errors="coerce" maps an unparseable date to NaT so a single bad row
        # cannot raise and discard the entire series.
        stamp = pd.to_datetime(row[0], errors="coerce")
        try:
            close = float(row[1])
        except (TypeError, ValueError):
            close = np.nan
        records.append((stamp, close))

    frame = pd.DataFrame(records, columns=["date", "close"]).dropna()
    return frame.sort_values("date").set_index("date")

# Every per-company lookup filters on the normalised company name.
_by_name = {"_norm_name": LNAME}

# Prefer the 10-year price document; otherwise take any document for the company.
hist = (
    db.historical_data.find_one({**_by_name, "_period": "10yr", "_filter": "price"})
    or db.historical_data.find_one(_by_name)
    or {}
)
prices = extract_price_series(hist)
print("History rows:", len(prices))

# Auxiliary collections (an empty dict stands in for a missing document)
fifty = db.fiftytwo_week.find_one({}) or {}
ann = db.recent_announcements.find_one(_by_name) or {}
corp = db.corporate_actions.find_one(_by_name) or {}
details = db.stock_details.find_one(_by_name) or {}
stats_doc = db.historical_stats.find_one(_by_name) or {}



ServerSelectionTimeoutError: localhost:27017: [Errno 61] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 689cdbb01846c4a61e6499f8, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 61] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [7]:
# Feature builder (match backend logic with extras)

def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive technical-indicator columns from a frame holding a ``close`` column.

    Added columns, in order: 1-day return, 20/50/100/200-row moving averages,
    20-row return volatility, 10-row momentum, close-to-moving-average ratios,
    60-row rolling max/min and the position of the close inside that range,
    a 14-row simple-average RSI, and MACD (12/26 EMAs, 9-span signal, histogram).

    Infinite values are turned into NaN and every row containing a NaN is
    dropped, so the warm-up rows of the longest window disappear. A 14-row
    window with no down moves has its average loss replaced by NaN, which
    makes RSI NaN there and removes that row as well.

    An empty input frame is returned as-is (same object).
    """
    if df.empty:
        return df

    out = df.copy()
    close = out["close"]
    ma_windows = (20, 50, 100, 200)

    out["ret_1d"] = close.pct_change()
    for window in ma_windows:
        out[f"dma_{window}"] = close.rolling(window).mean()
    out["vol_20"] = out["ret_1d"].rolling(20).std()
    out["mom_10"] = close.pct_change(10)
    for window in ma_windows:
        out[f"close_over_dma{window}"] = close / out[f"dma_{window}"]

    # Position of the close relative to the midpoint of its 60-row range.
    out["roll_max_60"] = close.rolling(60).max()
    out["roll_min_60"] = close.rolling(60).min()
    range_width = (out["roll_max_60"] - out["roll_min_60"]).replace(0, np.nan)
    range_mid = (out["roll_min_60"] + out["roll_max_60"]) / 2
    out["prox_roll_60"] = (close - range_mid) / range_width

    # RSI 14 (simple rolling means of gains and losses)
    change = close.diff()
    avg_gain = change.clip(lower=0.0).rolling(14).mean()
    avg_loss = (-change.clip(upper=0.0)).rolling(14).mean().replace(0, np.nan)
    out["rsi_14"] = 100 - (100 / (1 + (avg_gain / avg_loss)))

    # MACD
    fast_ema = close.ewm(span=12, adjust=False).mean()
    slow_ema = close.ewm(span=26, adjust=False).mean()
    out["macd"] = fast_ema - slow_ema
    out["macd_signal"] = out["macd"].ewm(span=9, adjust=False).mean()
    out["macd_hist"] = out["macd"] - out["macd_signal"]

    finite_only = out.replace([np.inf, -np.inf], np.nan)
    return finite_only.dropna()

# Indicator frame shared by every later cell (supervised matrices, LSTM, ensemble).
feats = build_features(df=prices)
print("Features shape:", feats.shape)



NameError: name 'prices' is not defined

In [8]:
# Build exogenous vector from DB signals (snapshot style)

def safe_num(x, default=0.0):
    """Coerce ``x`` to a finite float, returning ``default`` otherwise.

    ``default`` is returned when ``float(x)`` raises or when the converted
    value is NaN or infinite.
    """
    try:
        number = float(x)
    except Exception:
        return default
    return number if math.isfinite(number) else default

# 52-week proximity (if present)
prox_52w = 0.0
_sections = ("BSE_52WeekHighLow", "NSE_52WeekHighLow", "bse_52weekhighlow", "nse_52weekhighlow")
_categories = ("high52Week", "low52Week", "high52week", "low52week")
_name_prefix = LNAME[:6]  # rows are matched on the first six characters of the name
for sec in _sections:
    block = fifty.get(sec, {}) if isinstance(fifty, dict) else {}
    for cat in _categories:
        rows = block.get(cat, []) if isinstance(block, dict) else []
        for row in rows:
            if not isinstance(row, dict):
                continue
            if not str(row.get("company", "")).strip().lower().startswith(_name_prefix):
                continue
            price = safe_num(row.get("price") or row.get("Price"))
            high_52 = safe_num(row.get("52_week_high") or row.get("high"), price)
            low_52 = safe_num(row.get("52_week_low") or row.get("low"), price)
            if high_52 and low_52:
                prox_52w = 2 * (price - (low_52 + high_52) / 2) / max(1e-6, (high_52 - low_52))
            # Leaves only this row list; the remaining categories/sections are still
            # scanned, so a later match overwrites an earlier one.
            break

# Risk meter: either a dict carrying a "score" entry or a bare value.
_risk_raw = details.get("riskMeter")
if isinstance(_risk_raw, dict):
    _risk_raw = _risk_raw.get("score")
risk = safe_num(_risk_raw, 0.0)

# Sales growth slope (approx): relative change across the last (up to) six entries
sales_growth = 0.0
if isinstance(stats_doc, dict):
    q = stats_doc.get("quarter_results") or stats_doc.get("Sales") or stats_doc.get("quarterResults")
else:
    q = {}
if isinstance(q, dict):
    try:
        if "Sales" in q:
            series = list(q.get("Sales", {}).items())
        else:
            series = list(q.items())
        vals = [safe_num(value) for _, value in series[-6:]]
        if len(vals) >= 2 and abs(vals[0]) > 1e-6:
            sales_growth = (vals[-1] - vals[0]) / abs(vals[0])
    except Exception:
        # Best effort: an unexpected document shape keeps the 0.0 default.
        pass

# Corporate events count: total entries under every "*meetings" key
corp_events = 0
if isinstance(corp, dict):
    corp_events = sum(
        len(section.get("data", []))
        for key, section in corp.items()
        if key.endswith("meetings")
        and isinstance(section, dict)
        and isinstance(section.get("data", []), list)
    )

# Snapshot values are clipped to fixed ranges before being used as constant features.
exog = {
    "feat_prox_52w": float(np.clip(prox_52w, -3.0, 3.0)),
    "feat_risk_meter": float(np.clip(risk, 0.0, 10.0)),
    "feat_sales_growth": float(np.clip(sales_growth, -3.0, 3.0)),
    "feat_corp_events": float(corp_events),
}

exog


NameError: name 'fifty' is not defined

In [None]:
# Supervised matrices for returns

def append_exog(df: pd.DataFrame, exog: dict) -> pd.DataFrame:
    """Return a copy of ``df`` with one constant column per exogenous signal.

    Each value in ``exog`` is converted with ``float``; a value that cannot be
    converted is stored as 0.0. ``None`` or an empty mapping adds no columns.
    The input frame is left unmodified.
    """
    out = df.copy()
    for name, raw in (exog or {}).items():
        try:
            value = float(raw)
        except Exception:
            value = 0.0
        out[name] = value
    return out


def prepare_supervised(feats: pd.DataFrame, h: int, exog: dict):
    """Build the feature matrix and the ``h``-row-ahead return target.

    The target on each row is ``close[t + h] / close[t] - 1``. Rows containing
    any NaN are dropped, which removes the final ``h`` rows because their
    future close is unknown.

    Returns:
        ``(X, y, columns, data)``: the feature frame (every column except
        ``target``), the target as a NumPy array, the feature column names,
        and the full frame including ``target``.
    """
    frame = append_exog(feats, exog)
    future_close = frame["close"].shift(-h)
    frame["target"] = (future_close / frame["close"]) - 1.0
    frame = frame.dropna()

    X = frame.drop(columns=["target"])  # keep full feature list
    y = frame["target"].values
    feature_names = list(X.columns)
    return X, y, feature_names, frame

# 5-row-ahead return matrices; these feed both the LightGBM and the LSTM cells.
X5, y5, cols, data5 = prepare_supervised(feats, h=5, exog=exog)
print(len(data5), "samples", len(cols), "features")



In [None]:
# Train LightGBM return model (or linear fallback)

# Chronological 80/20 split. The scaler is fitted on the training rows only so
# that validation statistics do not leak into the features; this matches the
# LSTM cell, which also normalises with training statistics only.
split = int(len(X5) * 0.8)
scaler = StandardScaler().fit(X5.values[:split])
Xs = scaler.transform(X5.values)
X_train, X_valid = Xs[:split], Xs[split:]
y_train, y_valid = y5[:split], y5[split:]

if lgb is not None and len(Xs) > 150:
    train = lgb.Dataset(X_train, label=y_train)
    valid = lgb.Dataset(X_valid, label=y_valid)
    params = {"objective": "regression", "metric": "l1", "verbosity": -1, "num_leaves": 63, "learning_rate": 0.05, "feature_fraction": 0.9}
    lgbm = lgb.train(params, train, num_boost_round=600, valid_sets=[valid])
else:
    # LightGBM missing or too few samples: fall back to ordinary least squares.
    lgbm = Pipeline([("lin", LinearRegression())]).fit(X_train, y_train)

# Only score when there is a validation slice; predicting on zero rows is
# pointless and may raise depending on the estimator.
if len(X_valid) > 0:
    preds = lgbm.predict(X_valid)
    mae_val = float(mean_absolute_error(y_valid, preds))
else:
    preds = np.array([])
    mae_val = float("nan")
mae_val


In [None]:
# Train simple LSTM on returns (optional)

def build_seq(data: pd.DataFrame, cols: list[str], seq_len: int):
    """Slice a frame into fixed-length feature windows with one label each.

    For every row position ``i`` from ``seq_len`` to the end, the window is the
    ``cols`` values of rows ``i - seq_len`` .. ``i - 1`` and the label is the
    ``target`` value stored on row ``i``.

    Returns:
        ``(X, y)`` as NumPy arrays. With at least one window ``X`` has shape
        ``(n_windows, seq_len, len(cols))``; with none, both arrays are empty.
    """
    matrix = data[cols + ["target"]].values
    end_positions = range(seq_len, len(matrix))
    windows = [matrix[end - seq_len:end, :-1] for end in end_positions]
    labels = [matrix[end, -1] for end in end_positions]
    return np.array(windows), np.array(labels)

seq_len = 30
lstm_mae = np.nan
lstm_pred = np.nan
if TF_AVAILABLE and len(data5) > seq_len + 50:
    # normalize by training stats only
    scaler_lstm = StandardScaler().fit(data5[cols].iloc[:split].values)
    data5n = pd.DataFrame(scaler_lstm.transform(data5[cols].values), index=data5.index, columns=cols)
    data5n["target"] = y5

    Xtr, ytr = build_seq(data5n.iloc[:split], cols, seq_len)
    Xva, yva = build_seq(data5n.iloc[split:], cols, seq_len)

    # A validation slice of seq_len rows or fewer yields no windows. In that case
    # skip the LSTM and leave lstm_mae / lstm_pred as NaN, which the ensemble cell
    # treats as "no LSTM component".
    if len(Xtr) > 0 and len(Xva) > 0:
        model = keras.Sequential([
            keras.layers.LSTM(128, return_sequences=True, input_shape=(seq_len, len(cols))),
            keras.layers.Dropout(0.2),
            keras.layers.LSTM(64),
            keras.layers.Dropout(0.2),
            # tanh followed by the 0.15 scale bounds the predicted return to (-0.15, 0.15)
            keras.layers.Dense(1, activation="tanh"),
            keras.layers.Lambda(lambda x: 0.15 * x),
        ])
        model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mae", metrics=[keras.metrics.MAE])
        model.fit(Xtr, ytr, validation_data=(Xva, yva), epochs=20, verbose=0)
        lstm_mae = float(model.evaluate(Xva, yva, verbose=0)[1])

        # Forecast from the most recent seq_len feature rows. data5 cannot be used
        # here: prepare_supervised dropped its final 5 rows (their target is
        # unknown), so a window taken from data5 would end 5 rows before the close
        # that the prediction is anchored to.
        latest_feats = append_exog(feats, exog)[cols]
        last_seq = scaler_lstm.transform(latest_feats.iloc[-seq_len:].values)
        ret_hat = float(model.predict(last_seq[np.newaxis, :, :], verbose=0)[0][0])
        last_close = float(feats["close"].iloc[-1])
        lstm_pred = last_close * (1.0 + ret_hat)

lstm_mae, lstm_pred


In [None]:
# Compose ensemble for horizons and print diagnostics

# Anchor price and a recent daily-volatility estimate for the clamp band.
last_close = float(feats["close"].iloc[-1])
ret_series = feats["close"].pct_change().dropna()
daily_vol = float(ret_series.rolling(20).std().iloc[-1]) if len(ret_series) > 20 else float(ret_series.std())

# Predict from the newest feature row. The supervised matrices cannot supply it:
# prepare_supervised drops the final h rows (unknown target), so their last row
# is h rows older than the close the forecast is anchored to.
latest_X = append_exog(feats, exog)[cols].iloc[[-1]]
latest_scaled = scaler.transform(latest_X.values)
ret_hat_lgbm = float(lgbm.predict(latest_scaled)[0]) if hasattr(lgbm, "predict") else 0.0
pt_lgbm = last_close * (1.0 + ret_hat_lgbm)

# Inverse-error weights (validation MAE expressed in price units).
w_lgbm = 1.0 / max(1e-6, mae_val * last_close)
w_lstm = 1.0 / max(1e-6, lstm_mae * last_close) if TF_AVAILABLE and math.isfinite(lstm_mae) else 0.0
if not math.isfinite(w_lstm):
    w_lstm = 0.0
if not math.isfinite(w_lgbm):
    w_lgbm = 1.0
w_sum = w_lgbm + w_lstm if (w_lgbm + w_lstm) > 0 else 1.0
lstm_component = lstm_pred if math.isfinite(lstm_pred) else pt_lgbm
p50_unclamped = (w_lgbm * pt_lgbm + w_lstm * lstm_component) / w_sum

# NOTE(review): `lgbm` and the LSTM were both trained on the 5-row-ahead return,
# so the model components are identical for every horizon; only the volatility
# band, and therefore the clamped p50, depends on h.
results = {}
for h in HORIZONS:
    # clamp to +/- 2.5 * daily_vol * sqrt(h) around the last close
    band = last_close * (daily_vol * max(1.0, (h ** 0.5)) * 2.5)
    lo = max(0.0, last_close - band)
    hi = last_close + band
    p50 = float(np.clip(p50_unclamped, lo, hi))
    results[h] = {"last_close": last_close, "p50": p50, "lgbm": pt_lgbm, "lstm": float(lstm_pred) if math.isfinite(lstm_pred) else None, "vol_band": [lo, hi], "daily_vol": daily_vol, "mae_lgbm": mae_val, "mae_lstm": float(lstm_mae) if math.isfinite(lstm_mae) else None}

results
