In [1]:
from typing import Optional, Dict, Any, List, Type

import numpy as np
import optuna
import pandas as pd
from stable_baselines3 import PPO, A2C, SAC

from quantrl_lab.data import DataSourceRegistry, DataProcessor
from quantrl_lab.data.indicators.technical_indicators import *
from quantrl_lab.backtesting import BacktestRunner
from quantrl_lab.custom_envs.stock import SingleStockTradingEnv, SingleStockEnvConfig
from quantrl_lab.custom_envs.stock.strategies.actions.types import StandardMarketActionStrategy
from quantrl_lab.custom_envs.stock.strategies.observations import PortfolioWithTrendObservation
from quantrl_lab.custom_envs.stock.strategies.rewards import (
    PortfolioValueChangeReward,
    InvalidActionPenalty,
    TrendFollowingReward,
    HoldPenalty,
    PositionSizingRiskReward,
    WeightedCompositeReward
)

from quantrl_lab.tuning.optuna_runner import (
    OptunaRunner,
    create_sac_search_space,
    create_ppo_search_space,
    create_a2c_search_space
)

In [2]:
# One registry instance serves both the price request and the news request.
data_loader = DataSourceRegistry()

# Both requests cover the same date range, so it is declared once and unpacked into each call.
date_range = {"start": "2022-01-01", "end": "2025-07-31"}

# The timeframe can be adjusted based on the data granularity needed.
olhcv_df = data_loader.get_historical_ohlcv_data(symbols="MU", timeframe="1d", **date_range)

# News for the same ticker over the same period.
news_df = data_loader.get_news_data("MU", **date_range)

Output()

In [3]:
# Per-indicator settings, keyed by the `<NAME>_params` keyword that the pipeline call receives.
# The window sizes can be adjusted based on the signals from feature importance analysis.
indicator_params = {
    "SMA_params": {"window": 20},
    "EMA_params": {"window": 9},
    "RSI_params": {"window": 7},
    "MACD_params": {"fast": 12, "slow": 26, "signal": 9},
    "ATR_params": {"window": 14},
    "BB_params": {"window": 20, "num_std": 2},
    "STOCH_params": {"k_window": 14, "d_window": 3, "smooth_k": 1},
    # OBV is requested below without an `OBV_params` override.
}

# The processor receives both the price frame and the news frame loaded earlier.
data_processor = DataProcessor(olhcv_data=olhcv_df, news_data=news_df)

processed_data = data_processor.data_processing_pipeline(
    indicators=["SMA", "EMA", "RSI", "MACD", "ATR", "BB", "STOCH", "OBV"],
    fillna_strategy="neutral",
    **indicator_params,
)

# Preview the first rows of the processed frame.
processed_data.head()

Device set to use cpu


Unnamed: 0,Open,High,Low,Close,Volume,Trade_count,VWAP,SMA_20,EMA_9,RSI_7,...,MACD_signal_9,ATR_14,BB_middle_20,BB_upper_20_2,BB_lower_20_2,BB_bandwidth_20,STOCH_%K_14_1,STOCH_%D_3,OBV,sentiment_score
0,78.96,82.31,78.02,82.27,25587263.0,177832.0,81.23175,89.427,83.289881,38.54667,...,-2.957446,3.908338,89.427,103.057519,75.796481,0.304841,29.887093,17.256112,29104168.0,0.615621
1,82.2,82.39,80.42,81.45,16677706.0,156159.0,81.282006,88.712,82.921905,36.195812,...,-3.168442,3.769885,88.712,102.445824,74.978176,0.309627,26.256365,24.249133,12426462.0,0.861282
2,82.92,84.7,82.54,84.51,21429140.0,174863.0,83.93071,88.1205,83.239524,49.58255,...,-3.272308,3.73275,88.1205,101.485089,74.755911,0.303325,39.80518,31.982879,33855602.0,0.0
3,82.67,85.18,81.71,81.97,21519262.0,183404.0,83.199568,87.499,82.985619,41.209527,...,-3.336647,3.713982,87.499,100.78991,74.20809,0.303796,29.438613,31.833386,12336340.0,0.0
4,81.222,82.19,79.84,81.17,16827299.0,153181.0,81.113986,86.775,82.622495,38.801794,...,-3.377998,3.616555,86.775,99.770654,73.779346,0.299525,27.655409,32.299734,-4490959.0,0.0


In [4]:
# Positional 60/20/20 split in row order: the first block trains, the second block is used
# for evaluation, and whatever remains (20% plus any rounding leftover) is the test set.
n_rows = len(processed_data)
train_size = int(n_rows * 0.6)
eval_size = int(n_rows * 0.2)
eval_end = train_size + eval_size

train_data_df = processed_data.iloc[:train_size]
eval_data_df = processed_data.iloc[train_size:eval_end]
test_data_df = processed_data.iloc[eval_end:]

In [5]:
# --- Action and observation strategies ---
# 7 actions: buy, sell, hold, limit buy, limit sell, take profit, stop loss
action_strategy = StandardMarketActionStrategy()
observation_strategy = PortfolioWithTrendObservation()

# --- Building blocks for the composite reward strategy ---
# Components constructed with their default settings.
portfolio_reward = PortfolioValueChangeReward()
trend_reward = TrendFollowingReward()
position_sizing_reward = PositionSizingRiskReward()

# Penalty components; both penalty values are free to adjust.
invalid_penalty = InvalidActionPenalty(penalty=-1.0)
hold_penalty = HoldPenalty(penalty=-0.5)

In [7]:
# Each reward component is declared next to its weight so the two lists stay aligned.
weighted_reward_components = [
    (portfolio_reward, 1.2),
    (invalid_penalty, 4.0),
    (trend_reward, 0.1),
    (hold_penalty, 0.05),
    (position_sizing_reward, 1.0),
]

conservative_reward_strategy = WeightedCompositeReward(
    strategies=[component for component, _ in weighted_reward_components],
    weights=[weight for _, weight in weighted_reward_components],
)

# Environment config used while tuning: the evaluation split is passed as `test_data`,
# which keeps the real test split untouched during hyperparameter search.
sample_env_config = BacktestRunner.create_env_config_factory(
    train_data=train_data_df,
    test_data=eval_data_df,
    action_strategy=action_strategy,
    observation_strategy=observation_strategy,
    reward_strategy=conservative_reward_strategy,
    initial_balance=10000,
    transaction_cost_pct=0.001,
    window_size=20,
)

The following tuning example is for illustration only; for a real search you should probably run it on a more powerful machine or cluster.

In [None]:
# verbose=1 makes the runner print its logs.
runner = BacktestRunner(verbose=1)

# The tuner wraps the existing runner. The storage URL is optional; it persists the study
# in a local SQLite file for future reference.
tuner = OptunaRunner(runner=runner, storage_url="sqlite:///optuna_studies.db")

# Example: basic hyperparameter optimization for SAC.
print("Starting SAC hyperparameter optimization...")

# Pre-defined SAC search space; a custom one can be supplied instead.
sac_search_space = create_sac_search_space()

# Very few trials are requested so the example finishes quickly.
study = tuner.optimize_hyperparameters(
    algo_class=SAC,
    env_config=sample_env_config,  # the env config built in the previous cell
    search_space=sac_search_space,
    fixed_params={"verbose": 0},  # optional: parameters held constant across trials
    study_name="sac_optimization",
    direction="maximize",
    optimization_metric="test_avg_return_pct",  # or "test_avg_reward"
    n_trials=3,
    total_timesteps=50000,
    num_eval_episodes=3,
)



In [9]:
# Report the study's best hyperparameters and best objective value, one per line.
print(
    f"✅ Best parameters: {study.best_params}",
    f"✅ Best value: {study.best_value:.4f}",
    sep="\n",
)

✅ Best parameters: {'learning_rate': 7.476312062252303e-05, 'batch_size': 64, 'gamma': 0.9455613914232819, 'tau': 0.07873242017790835, 'train_freq': 8, 'gradient_steps': 8, 'target_update_interval': 1}
✅ Best value: 18.4322


In [11]:
# Reload the persisted study from storage so its trials can be inspected without
# re-running the optimization. `optuna` is imported in the notebook's import cell.
# The study name and storage URL must match the ones used by the tuning cell.
study = optuna.load_study(
    study_name="sac_optimization",
    storage="sqlite:///optuna_studies.db",  # Path to your SQLite DB (change as needed)
)

# Get all trials as a DataFrame
trials_df = study.trials_dataframe()
print(trials_df.head())

# Access best parameters
print(f"Best parameters: {study.best_params}")
print(f"Best value: {study.best_value}")

   number      value             datetime_start          datetime_complete  \
0       0  -1.208242 2025-09-01 19:41:19.274992 2025-09-01 19:42:38.102367   
1       1  18.432171 2025-09-01 19:42:38.116379 2025-09-01 19:43:55.785223   
2       2 -96.862244 2025-09-01 19:43:55.792446 2025-09-01 19:47:17.900588   
3       3 -92.218117 2025-09-05 22:23:07.226468 2025-09-05 22:24:29.984176   
4       4        NaN 2025-09-05 22:24:29.999399 2025-09-05 22:25:43.946036   

                duration  params_batch_size  params_gamma  \
0 0 days 00:01:18.827375                 64      0.915584   
1 0 days 00:01:17.668844                 64      0.945561   
2 0 days 00:03:22.108142                256      0.990841   
3 0 days 00:01:22.757708                 64      0.915584   
4 0 days 00:01:13.946637                 64      0.945561   

   params_gradient_steps  params_learning_rate  params_target_update_interval  \
0                      1              0.000133                              4   
1 

In [None]:
# Build the SAC algorithm configuration from the study's best hyperparameters.
custom_sac_config = BacktestRunner.create_custom_config(
    SAC,
    **study.best_params
)

# Rebuild the environment config for the final run. The only difference from the tuning
# config is `test_data`, which now points at the held-out test split.
# NOTE: this rebinds `sample_env_config`; re-running the tuning cell after this one
# would tune against the test split.
sample_env_config = BacktestRunner.create_env_config_factory(
    train_data=train_data_df,
    test_data=test_data_df,  # Change back to test set for final evaluation
    action_strategy=action_strategy,
    reward_strategy=conservative_reward_strategy,
    observation_strategy=observation_strategy,
    initial_balance=10000,
    transaction_cost_pct=0.001,
    window_size=20,
)


# Run a single experiment with the tuned configuration.
# `runner` is the BacktestRunner created in the tuning cell.
results = runner.run_single_experiment(
    SAC,
    sample_env_config,         # the test-split env config built just above
    config=custom_sac_config,  # Custom algorithm configuration, this is an optional parameter
    total_timesteps=50000,     # Total timesteps for training
    num_eval_episodes=3
)

print("✅ Single experiment completed!")