In [5]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from ta.momentum import RSIIndicator
from ta.trend import MACD, SMAIndicator
from ta.volatility import BollingerBands
import joblib

# 📂 Load Data
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
data_path = os.path.join(project_root, "data", "historical_stock_data_15min_1year.csv")

if not os.path.exists(data_path):
    raise FileNotFoundError(f"❌ Error: {data_path} not found. Ensure fetch script has been run.")

print(f"✅ Loading data from: {data_path}")
df = pd.read_csv(data_path)

# 🕒 Ensure proper datetime format & sorting
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.sort_values(by=["symbol", "timestamp"]).reset_index(drop=True)

# 🔹 Feature Engineering: Adding Technical Indicators
def add_indicators(df):
    df["returns"] = df["close"].pct_change()  # Price % Change

    # 🟢 Relative Strength Index (RSI)
    df["rsi"] = df.groupby("symbol", group_keys=False)["close"].apply(lambda x: RSIIndicator(x, window=14).rsi()).reset_index(level=0, drop=True)

    # 🔵 Moving Averages (SMA)
    df["sma_20"] = df.groupby("symbol", group_keys=False)["close"].apply(lambda x: SMAIndicator(x, window=20).sma_indicator()).reset_index(level=0, drop=True)
    df["sma_50"] = df.groupby("symbol", group_keys=False)["close"].apply(lambda x: SMAIndicator(x, window=50).sma_indicator()).reset_index(level=0, drop=True)

    # 🟡 MACD Indicator
    df["macd"] = df.groupby("symbol", group_keys=False)["close"].apply(lambda x: MACD(x).macd()).reset_index(level=0, drop=True)

    # 🔴 Bollinger Bands
    df["bollinger_upper"] = df.groupby("symbol", group_keys=False)["close"].apply(lambda x: BollingerBands(x).bollinger_hband()).reset_index(level=0, drop=True)
    df["bollinger_lower"] = df.groupby("symbol", group_keys=False)["close"].apply(lambda x: BollingerBands(x).bollinger_lband()).reset_index(level=0, drop=True)

    return df

df = add_indicators(df)

# 🧹 Drop missing values from technical indicators
df = df.dropna()

# ✅ **Define Features & Target**
features = ["open", "high", "low", "close", "volume", "trade_count", "vwap", "rsi", "sma_20", "sma_50", "macd", "bollinger_upper", "bollinger_lower"]

# ✅ **Set Classification Target (1 = Next Close is Higher, 0 = Lower)**
df["target"] = (df.groupby("symbol")["close"].shift(-1) > df["close"]).astype(int)

# Drop last row per symbol (since `shift(-1)` creates NaN in last row)
df = df.dropna().reset_index(drop=True)

# ✅ **Split Train/Test Per Symbol to Prevent Leakage**
train_dfs, test_dfs = [], []
for symbol in df["symbol"].unique():
    symbol_df = df[df["symbol"] == symbol].sort_values("timestamp")

    if len(symbol_df) < 51:  # Ensure enough data for sequences
        continue  # Skip symbols with insufficient data

    split_idx = int(0.8 * len(symbol_df))  # 80% train, 20% test
    train_dfs.append(symbol_df.iloc[:split_idx])
    test_dfs.append(symbol_df.iloc[split_idx:])

# ✅ **Fit Scaler on Training Data ONLY**
all_train_df = pd.concat(train_dfs)  # Combine all training data
scaler = MinMaxScaler().fit(all_train_df[features])  # Fit only on training data

# ✅ **Scale Train/Test Data**
for i in range(len(train_dfs)):
    train_dfs[i].loc[:, features] = scaler.transform(train_dfs[i][features])
    test_dfs[i].loc[:, features] = scaler.transform(test_dfs[i][features])

# 💾 **Save Scaler for Later Inference**
scaler_path = os.path.join(project_root, "models", "xgboost_scaler.pkl")
joblib.dump(scaler, scaler_path)
print("✅ Scaler saved successfully!")

# ✅ **Prepare Data for Training**
X_train = pd.concat(train_dfs)[features].values
y_train = pd.concat(train_dfs)["target"].values

X_test = pd.concat(test_dfs)[features].values
y_test = pd.concat(test_dfs)["target"].values

print(f"✅ Training Data: {X_train.shape}, Testing Data: {X_test.shape}")

# 🎯 **Train XGBoost Classifier**
model = xgb.XGBClassifier(
    n_estimators=500,  # More boosting rounds
    max_depth=8,  # Higher depth for capturing complex patterns
    learning_rate=0.05,  # Lower learning rate
    subsample=0.8,  # Uses 80% of data per tree
    colsample_bytree=0.8,  # Uses 80% of features per tree
    gamma=0.2,  # Reduces overfitting
    random_state=42
)

model.fit(X_train, y_train)

# ✅ **Predict Next Price Direction (UP/DOWN)**
y_pred = model.predict(X_test)

# ✅ **Evaluate Model Accuracy**
accuracy = accuracy_score(y_test, y_pred)
print(f"📊 **Model Accuracy:** {accuracy:.4f}")

# 💾 **Save Model**
models_dir = os.path.join(project_root, "models")
os.makedirs(models_dir, exist_ok=True)  # Ensure 'models' directory exists
model_path = os.path.join(models_dir, "xgboost_model.pkl")

joblib.dump(model, model_path)
print(f"✅ Model saved successfully at: {model_path}")


✅ Loading data from: /Users/sohammandal/Developer/trading-system/data/historical_stock_data_15min_1year.csv
✅ Scaler saved successfully!
✅ Training Data: (521468, 13), Testing Data: (130394, 13)
📊 **Model Accuracy:** 0.5224
✅ Model saved successfully at: /Users/sohammandal/Developer/trading-system/models/xgboost_model.pkl
