In [1]:
import numpy as np
import gymnasium as gym
from quantrl_lab.custom_envs.stock import (
    SingleStockTradingEnv,
    SingleStockEnvConfig,
    )
from quantrl_lab.custom_envs.stock.strategies.actions.types.standard_market_action_strategy import (
    StandardMarketActionStrategy
)
from quantrl_lab.custom_envs.stock.strategies.rewards import (
    PortfolioValueChangeReward,
    InvalidActionPenalty,
    TrendFollowingReward,
    HoldPenalty,
    WeightedCompositeReward,
    BaseRewardStrategy
)
from quantrl_lab.custom_envs.stock.strategies.observations import (
    BaseObservationStrategy, 
    PortfolioWithTrendObservation
)

from stable_baselines3 import PPO, A2C, SAC
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.env_checker import check_env

In [2]:
# Individual reward signals that get blended into a single scalar reward.
portfolio_reward = PortfolioValueChangeReward()
invalid_penalty = InvalidActionPenalty(penalty=-1.0)
trend_reward = TrendFollowingReward()
hold_penalty = HoldPenalty(penalty=-0.005)

# Parallel lists: presumably weights are matched to strategies by position
# (confirm against WeightedCompositeReward), so keep both in the same order.
reward_components = [portfolio_reward, invalid_penalty, trend_reward, hold_penalty]
component_weights = [1.0, 1.0, 0.5, 0.1]

# Shared by the test functions defined further down in this notebook.
composite_reward_strategy = WeightedCompositeReward(
    weights=component_weights,
    strategies=reward_components,
)

In [3]:
def run_simple_test(seed=None, num_random_steps=50, data_size=100):
    """Smoke-test ``SingleStockTradingEnv`` by driving it with random actions.

    Builds a synthetic ``(data_size, 5)`` float32 array whose price column is
    an upward drift plus Gaussian noise, creates the environment with the
    module-level ``composite_reward_strategy``, then samples random actions
    and prints the rendered state, the reward and the decoded action after
    every step.

    Args:
        seed: Optional integer seed applied to the synthetic data, to
            ``env.reset`` and to the action-space sampler. ``None`` (the
            default) leaves the run non-deterministic.
        num_random_steps: Maximum number of random actions to take; the loop
            stops earlier if the episode terminates or is truncated.
        data_size: Number of rows of synthetic data to generate.

    Returns:
        None. Everything is reported through ``print`` / ``env.render``.
    """
    print("=== 1. Setting up configuration and data ===")

    PRICE_COLUMN_INDEX = 3

    # Create dummy data for the environment. A local Generator keeps the run
    # reproducible when a seed is supplied, without touching global RNG state.
    rng = np.random.default_rng(seed)
    my_data = rng.random((data_size, 5)).astype(np.float32)
    my_data[:, PRICE_COLUMN_INDEX] = (
        50 + np.arange(data_size) * 0.2 + rng.standard_normal(data_size) * 0.5
    )

    # Create the Pydantic config object
    env_configuration = SingleStockEnvConfig(
        price_column_index=PRICE_COLUMN_INDEX,
        window_size=10,
        initial_balance=10000.0,
        transaction_cost_pct=0.001,
        slippage=0.0005,
    )

    # Instantiate the desired action strategy
    standard_market_action_strategy = StandardMarketActionStrategy()

    print("Configuration, data, and strategy are ready.\n")

    # === 2. Initialize the Environment ===
    print("--- 2. Initializing the environment ---")
    try:
        env = SingleStockTradingEnv(
            data=my_data,
            config=env_configuration,
            action_strategy=standard_market_action_strategy,
            reward_strategy=composite_reward_strategy,
            observation_strategy=PortfolioWithTrendObservation(),
        )
        # Report the strategy that is really in use instead of a hard-coded
        # name that can drift out of sync with the code above.
        strategy_name = type(standard_market_action_strategy).__name__
        print(f"✅ Environment created successfully with {strategy_name}!")
        print(f"Action Space: {env.action_space}")
        print(f"Observation Space Shape: {env.observation_space.shape}\n")
    except Exception as e:
        # Deliberate best-effort: a construction failure is reported, not raised.
        print(f"❌ Failed to create environment: {e}")
        return

    # From here on the environment exists, so it must be closed even if a
    # step/render call raises or the cell is interrupted.
    try:
        # === Run a few steps with random actions ===
        print("--- 3. Running a few steps with random actions ---")
        if seed is not None:
            # Makes action_space.sample() reproducible as well.
            env.action_space.seed(seed)
        env.reset(seed=seed)

        print("\n>>> Initial State after reset <<<")
        env.render(mode="human")

        for i in range(num_random_steps):
            random_action = env.action_space.sample()

            print(f"\n{'='*40}")
            print(f"Step {i+1}/{num_random_steps} - Taking a random action")
            print(f"Random Action Array: {random_action}")
            print(f"{'='*40}")

            # Perform the step (the observation itself is not inspected here)
            _, reward, terminated, truncated, info = env.step(random_action)

            # Render the new state
            print("\n>>> State after action <<<")
            env.render(mode="human")

            # Print the key results from the info dictionary
            print(f"Reward received: {reward:.4f}")
            print(f"Decoded Action Info: {info['action_decoded']}")

            if terminated or truncated:
                print("\nEpisode finished early.")
                break

        print("\n\n=== Test finished successfully! ===")
    finally:
        env.close()


In [4]:
# Smoke-test the environment end to end with random actions (no learning involved).
run_simple_test()

=== 1. Setting up configuration and data ===
Configuration, data, and strategy are ready.

--- 2. Initializing the environment ---
✅ Environment created successfully with FullActionStrategy!
Action Space: Box([0.  0.  0.9], [6.  1.  1.1], (3,), float32)
Observation Space Shape: (59,)

--- 3. Running a few steps with random actions ---

>>> Initial State after reset <<<
----------------------------------------
Step:         10/99
Current Price:          51.65
Balance:             10000.00
Shares Held:                0 (Free)
Total Shares:               0 (Free + Reserved)
Portfolio Val:       10000.00
----------------------------------------
Active Orders:
 Pending Limit:    0
  Stop Loss:        0
  Take Profit:      0
----------------------------------------

Step 1/50 - Taking a random action
Random Action Array: [2.5259373  0.00304347 1.01022   ]

>>> State after action <<<
----------------------------------------
Step:         11/99
Current Price:          52.46
Balance:           

In [5]:
def test_full_sb3_integration():
    print("=== Testing full SB3 integration with model training and evaluation ===")
    
    # Same environment setup as before
    data_size = 500  # Larger dataset for training
    PRICE_COLUMN_INDEX = 3
    test_data = np.random.rand(data_size, 5).astype(np.float32)
    test_data[:, PRICE_COLUMN_INDEX] = 50 + np.arange(data_size) * 0.2 + np.random.randn(data_size) * 0.5
    
    config = SingleStockEnvConfig(
        price_column_index=PRICE_COLUMN_INDEX,
        window_size=10,
        initial_balance=10000.0
    )
    
    # Create vectorized environment (recommended for SB3)
    env = make_vec_env(
        lambda: SingleStockTradingEnv(
            data=test_data,
            config=config,
            action_strategy=StandardMarketActionStrategy(),
            reward_strategy=composite_reward_strategy,
            observation_strategy=PortfolioWithTrendObservation()
        ), 
        n_envs=2  # Use 2 parallel environments
    )
    
    try:
        # Train a model
        model = PPO("MlpPolicy", env, verbose=1, 
                   learning_rate=0.0003,
                   n_steps=64,
                   batch_size=32)
        
        print("Training model for 10,000 steps...")
        model.learn(total_timesteps=10000)
        
        # Test the trained model
        obs = env.reset()
        total_reward = 0
        
        print("Evaluating trained model...")
        for _ in range(100):
            action, _states = model.predict(obs, deterministic=True)
            obs, rewards, dones, info = env.step(action)
            total_reward += rewards[0]  # Just use the first environment's reward
            
            if dones[0]:
                break
                
        print(f"Total reward during evaluation: {total_reward:.2f}")
        print("✅ Full SB3 integration test completed successfully!")
        
    except Exception as e:
        print(f"❌ SB3 integration test failed: {e}")
    
    env.close()

In [6]:
# Train and evaluate PPO on the vectorized environment; PPO is created with verbose=1, so SB3 prints its training tables.
test_full_sb3_integration()

=== Testing full SB3 integration with model training and evaluation ===
Using cpu device
Training model for 10,000 steps...
-----------------------------
| time/              |      |
|    fps             | 3481 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 128  |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1886        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 256         |
| train/                  |             |
|    approx_kl            | 0.010733332 |
|    clip_fraction        | 0.123       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.26       |
|    explained_variance   | -1.82       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0132      |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.034  