In [None]:
print("--- [1] Installing all required packages ---")

!pip install -q "stable-baselines3[extra]" gymnasium torch

print("\n--- [2] Importing all libraries ---")

import pandas as pd
import numpy as np
import os
import math
from datetime import datetime
from tqdm.notebook import tqdm
import warnings
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam

# Notebook-wide configuration: plotting theme, warning filter, wide-frame display.
plt.style.use('seaborn-v0_8-whitegrid')
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)

# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("\nSetup complete. All libraries are ready.")
print(f"Using device: {DEVICE}")


In [None]:
print("\n--- [2] Loading, Merging, and Engineering All Features ---")

# Fraction of the chronologically ordered rows used to FIT the feature scaler.
# It mirrors the 0.8 train/test split applied later in the notebook, so the
# scaler statistics never include the held-out tail of the data.
SCALER_FIT_FRACTION = 0.8

try:
    print("  - Loading trading_data_with_sides.csv and orderbook_data.csv...")
    df_trading = pd.read_csv('trading_data_with_sides.csv')
    df_orderbook = pd.read_csv('orderbook_data.csv')

    df_trading['Open time'] = pd.to_datetime(df_trading['Open time'])
    df_orderbook['system_time'] = pd.to_datetime(df_orderbook['system_time'])

    print("  - Engineering summary features from order book data...")
    # Technical indicators need chronological order.
    df_orderbook = df_orderbook.sort_values('system_time').reset_index(drop=True)
    df_orderbook['EMA_14'] = df_orderbook['midpoint'].ewm(span=14).mean()

    # RSI_14 from simple 14-sample rolling means of gains and losses.
    delta = df_orderbook['midpoint'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df_orderbook['RSI_14'] = 100 - (100 / (1 + rs))

    # Notional imbalance over the first five book levels, in [-1, 1].
    # The 1e-9 term avoids a division by zero on an empty book.
    bid_notional_top = df_orderbook[[f'bids_notional_{i}' for i in range(5)]].sum(axis=1)
    ask_notional_top = df_orderbook[[f'asks_notional_{i}' for i in range(5)]].sum(axis=1)
    df_orderbook['volume_imbalance'] = (bid_notional_top - ask_notional_top) / \
                                       (bid_notional_top + ask_notional_top + 1e-9)
    df_orderbook = df_orderbook.set_index('system_time')

    print("  - Resampling order book features to 1-minute intervals...")
    # Per-level depth columns, in the exact order used for the state vector:
    # bids_distance_0..10, bids_notional_0..10, asks_distance_0..10, asks_notional_0..10
    depth_cols = [f'{prefix}_{i}'
                  for prefix in ('bids_distance', 'bids_notional', 'asks_distance', 'asks_notional')
                  for i in range(11)]
    orderbook_features = ['midpoint', 'spread', 'buys', 'sells',
                          'EMA_14', 'RSI_14', 'volume_imbalance'] + depth_cols

    depth_features_resampled = df_orderbook[orderbook_features].resample('1min').mean()
    print("  - Merging dataframes...")
    df_trading = df_trading.set_index('Open time')
    # sort_index: pct_change / rolling windows below assume chronological rows.
    df_master = pd.merge(df_trading, depth_features_resampled,
                         left_index=True, right_index=True, how='inner').sort_index()
    df_master = df_master.dropna()

    print("  - Normalizing all features...")
    df_processed = df_master.copy()
    price_cols = ['Open', 'High', 'Low', 'Close']
    for col in price_cols:
        df_processed[f'{col}_norm'] = df_processed[col].pct_change()
    df_processed['Volume_norm'] = np.log1p(df_processed['Volume'])

    other_feature_cols = ['midpoint_drift', 'spread', 'volume_imbalance', 'Volume_norm',
                          'midpoint', 'buys', 'sells', 'EMA_14', 'RSI_14'] + depth_cols

    # 'midpoint_drift' is not engineered in this cell; it is presumably supplied
    # by trading_data_with_sides.csv (TODO confirm). Fail with a clear message
    # instead of an opaque KeyError inside the scaler call.
    missing_cols = [c for c in other_feature_cols if c not in df_processed.columns]
    if missing_cols:
        raise KeyError(f"Missing required feature columns after merge: {missing_cols}")

    # Fit the scaler on the leading (training) portion only, then transform
    # every row, to avoid leaking test-period statistics into training.
    scaler = StandardScaler()
    n_fit_rows = max(1, int(len(df_processed) * SCALER_FIT_FRACTION))
    scaler.fit(df_processed[other_feature_cols].iloc[:n_fit_rows])
    df_processed[other_feature_cols] = scaler.transform(df_processed[other_feature_cols])
    # Drops the first row, whose pct_change values are NaN.
    df_processed = df_processed.dropna()

    final_feature_columns = [
        'Open_norm', 'High_norm', 'Low_norm', 'Close_norm', 'Volume_norm',
        'midpoint_drift', 'spread', 'volume_imbalance', 'midpoint', 'buys', 'sells',
        'EMA_14', 'RSI_14'
    ] + depth_cols

    # One list of feature values per row; consumed by the env and the pre-training windows.
    df_processed['state_features'] = df_processed[final_feature_columns].values.tolist()

    print("  - Generating labels for pre-training...")
    PRETRAIN_HORIZON = 20
    PRETRAIN_THRESHOLD = 0.0001
    # Relative change between the mean close of the next HORIZON bars and the
    # mean close of the last HORIZON bars (current bar included).
    future_price = df_processed['Close'].rolling(window=PRETRAIN_HORIZON).mean().shift(-PRETRAIN_HORIZON)
    past_price = df_processed['Close'].rolling(window=PRETRAIN_HORIZON).mean()
    l_values = (future_price - past_price) / past_price

    # 2 = up, 0 = down, 1 = flat. Rows whose look-back or look-ahead window is
    # incomplete have NaN in l_values; they are marked NaN so the dropna below
    # removes them instead of silently labelling them "flat".
    label_values = np.select(
        [l_values > PRETRAIN_THRESHOLD, l_values < -PRETRAIN_THRESHOLD],
        [2.0, 0.0],
        default=1.0,
    )
    df_processed['label'] = np.where(l_values.isna(), np.nan, label_values)
    df_processed = df_processed.dropna(subset=['label'])
    df_processed['label'] = df_processed['label'].astype(int)

    print("\nData preprocessing complete.")
    print(f"Final DataFrame shape: {df_processed.shape}")
    display(df_processed.head(3))

except FileNotFoundError as e:
    print(f"ERROR: {e}. Please ensure both CSV files are uploaded.")
    # Re-raise: every later cell depends on df_processed, so continuing would
    # only surface as confusing NameErrors (or stale data) further down.
    raise
except Exception as e:
    print(f"An unexpected error occurred during preprocessing: {e}")
    raise


In [None]:
print("\n--- [ENV] Defining the Market Making Environment (Shape Bug Fixed) ---")

class MarketMakingEnv(gym.Env):
    """Market-making environment replayed over a DataFrame of historical bars.

    Each step the agent sends a 2-vector action in [0, 1]:
      * action[0] ("bias control") scales how far the reservation price is
        skewed against the current inventory (up to ``max_bias_bps``).
      * action[1] ("spread control") scales the quoted spread
        (up to ``max_spread_bps``, floored at 0.01).

    Quotes are placed around the last close of the observation window and are
    matched against the High/Low of the following bar. At most one side fills
    per step, and the ask side is checked first.

    Observation (Dict):
      * ``market_state``: ``(time_window, n_features)`` window of ``state_features``.
      * ``agent_state``: ``[inventory / max_inventory, step / max_episode_steps]``.

    Reward: ``pnl - max(0, 0.5 * pnl)`` (gains are halved, losses count in full)
    minus ``inventory_penalty * (inventory / max_inventory) ** 2``.

    Args:
        df: frame with columns ``state_features`` (equal-length list per row),
            ``Close``, ``High`` and ``Low``. The data is cached as arrays at
            construction, so later mutation of ``df`` is not picked up.
        time_window: number of rows per market observation.
        initial_cash: starting cash.
        trade_size: units bought/sold per fill.
        max_inventory: absolute inventory cap; the side that would exceed it is not quoted.
        inventory_penalty: weight of the quadratic inventory penalty.
        max_spread_bps: spread (in basis points of the reference price) at action[1] == 1.
        max_bias_bps: skew (in basis points of the reference price) at action[0] == 1.
    """
    metadata = {'render_modes': ['human']}

    def __init__(self, df, time_window=50, initial_cash=100000.0, trade_size=1, max_inventory=10,
                 inventory_penalty=0.01, max_spread_bps=100, max_bias_bps=20):
        super().__init__()
        self.df = df
        self.time_window = time_window
        self.initial_cash = initial_cash
        self.trade_size = trade_size
        self.max_inventory = max_inventory
        self.inventory_penalty = inventory_penalty
        self.max_spread_bps = max_spread_bps
        self.max_bias_bps = max_bias_bps
        self._max_episode_steps = len(self.df) - self.time_window - 1
        self.action_space = spaces.Box(low=0.0, high=1.0, shape=(2,), dtype=np.float32)

        # Cache features and prices as contiguous arrays once. Slicing a pandas
        # object column and doing row-wise iloc on every step dominates runtime.
        if len(df) > 0:
            n_features = len(df['state_features'].iloc[0])
            self._features = np.array(df['state_features'].tolist(), dtype=np.float32)
            self._close = df['Close'].to_numpy(dtype=np.float64)
            self._high = df['High'].to_numpy(dtype=np.float64)
            self._low = df['Low'].to_numpy(dtype=np.float64)
        else:
            # 57 = 13 summary features + 4 * 11 depth levels, the width of the
            # state vector built by the preprocessing cell.
            n_features = 57
            self._features = np.zeros((0, n_features), dtype=np.float32)
            self._close = self._high = self._low = np.zeros(0, dtype=np.float64)

        self.market_obs_shape = (time_window, n_features)
        self.agent_state_shape = (2,)
        self.observation_space = spaces.Dict({
            "market_state": spaces.Box(low=-np.inf, high=np.inf, shape=self.market_obs_shape, dtype=np.float32),
            "agent_state": spaces.Box(low=-np.inf, high=np.inf, shape=self.agent_state_shape, dtype=np.float32)
        })

    def _get_observation(self):
        """Return the observation dict for the window starting at ``current_step``."""
        end_idx = self.current_step + self.time_window
        # Copy so callers can never mutate the cached feature matrix.
        market_obs = self._features[self.current_step:end_idx].copy()
        # max(1, ...) guards against division by zero on a frame that is only
        # just long enough for a single window.
        progress_denominator = max(1, self._max_episode_steps)
        agent_state = np.array([
            self.inventory / self.max_inventory,
            self.current_step / progress_denominator
        ], dtype=np.float32)
        return {"market_state": market_obs, "agent_state": agent_state}

    def reset(self, seed=None, options=None):
        """Reset cash, inventory and the step counter; return ``(observation, info)``."""
        super().reset(seed=seed)
        self.cash, self.inventory = self.initial_cash, 0
        self.portfolio_value, self.current_step = self.initial_cash, 0
        return self._get_observation(), {}

    def step(self, action):
        """Quote around the last observed close and match against the next bar.

        Returns ``(obs, reward, terminated, truncated, info)`` where ``truncated``
        is always False and ``info['pnl']`` is the raw change in portfolio value.
        """
        decision_idx = self.current_step + self.time_window - 1
        ref_price = float(self._close[decision_idx])

        # Enforce the declared Box bounds and work in Python floats so the
        # price arithmetic is not silently carried out in float32.
        clipped = np.clip(np.asarray(action, dtype=np.float64), 0.0, 1.0)
        bias_ctrl, spread_ctrl = float(clipped[0]), float(clipped[1])

        # Skew the reservation price against the current inventory.
        bias = bias_ctrl * (self.max_bias_bps * 0.0001 * ref_price)
        res_price = ref_price - (self.inventory / self.max_inventory) * bias
        spread = max(spread_ctrl * (self.max_spread_bps * 0.0001 * ref_price), 0.01)
        ask_p, bid_p = res_price + spread / 2, res_price - spread / 2

        # A price of 0 means "do not quote this side" (inventory limit reached).
        if self.inventory >= self.max_inventory:
            bid_p = 0
        if self.inventory <= -self.max_inventory:
            ask_p = 0

        next_idx = decision_idx + 1
        next_high = float(self._high[next_idx])
        next_low = float(self._low[next_idx])
        next_close = float(self._close[next_idx])

        # Ask is checked first; only one side can fill per step.
        fill_price, fill_volume = 0, 0
        if ask_p > 0 and ask_p <= next_high:
            fill_price, fill_volume = ask_p, -self.trade_size
        elif bid_p > 0 and bid_p >= next_low:
            fill_price, fill_volume = bid_p, self.trade_size

        prev_val = self.portfolio_value
        if fill_volume != 0:
            self.cash -= fill_price * fill_volume
            self.inventory += fill_volume
        # Mark the position to the close of the bar the quotes were matched against.
        self.portfolio_value = self.cash + self.inventory * next_close
        pnl = float(self.portfolio_value - prev_val)

        # Asymmetric PnL term (profits halved, losses in full) minus inventory penalty.
        reward = float((pnl - max(0, 0.5 * pnl))
                       - (self.inventory_penalty * (self.inventory / self.max_inventory) ** 2))

        self.current_step += 1
        terminated = self.current_step >= self._max_episode_steps
        if terminated:
            # No further window exists; return a correctly shaped all-zero observation.
            obs = {
                "market_state": np.zeros(self.market_obs_shape, dtype=np.float32),
                "agent_state": np.zeros(self.agent_state_shape, dtype=np.float32)
            }
        else:
            obs = self._get_observation()
        return obs, reward, terminated, False, {'pnl': pnl}

print("MarketMakingEnv class definition updated (Final Shape Bug Fixed).")


In [None]:
print("\n--- [4] Splitting Data into Training and Testing Sets ---")

# Chronological split: the leading 80% of rows train, the trailing 20% test.
split_fraction = 0.8
split_index = int(len(df_processed) * split_fraction)

train_df, test_df = df_processed.iloc[:split_index], df_processed.iloc[split_index:]

train_start, train_end = train_df.index[0], train_df.index[-1]
test_start, test_end = test_df.index[0], test_df.index[-1]

print("Data has been split successfully.")
print(f"Training set shape:   {train_df.shape}")
print(f"Testing set shape:    {test_df.shape}")
print(f"Training period:      {train_start} to {train_end}")
print(f"Testing period:       {test_start} to {test_end}")


In [None]:
print("\n--- [5] Defining and Pre-training the Attn-LOB Feature Extractor ---")

import torch.nn.functional as F

class AttnLOB(nn.Module):
    """Convolution + self-attention network over a window of feature vectors.

    Pipeline (input is ``(batch, time, features)``):
      1. 1x1 convolution mixing the input features into 32 channels (ReLU).
      2. Three parallel "same"-padded temporal convolutions (kernel 3, 5, 7)
         whose outputs are concatenated to 96 channels.
      3. 1x1 convolution down to 64 channels (ReLU).
      4. 4-head self-attention across the time axis.
      5. Flatten to ``64 * time_window`` features.

    ``extract_features`` returns the flattened representation; ``forward``
    additionally applies a linear head producing 3 logits.
    """

    def __init__(self, input_features, time_window):
        super().__init__()
        # Attribute names are part of the saved state_dict; keep them stable.
        self.conv1 = nn.Conv1d(in_channels=input_features, out_channels=32, kernel_size=1)
        self.inception_k3 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3, padding='same')
        self.inception_k5 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=5, padding='same')
        self.inception_k7 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=7, padding='same')
        self.conv2 = nn.Conv1d(in_channels=96, out_channels=64, kernel_size=1)
        self.attention = nn.MultiheadAttention(embed_dim=64, num_heads=4, batch_first=True)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(64 * time_window, 3)

    def forward(self, x):
        """Return ``(batch, 3)`` class logits for a ``(batch, time, features)`` input."""
        return self.fc(self.extract_features(x))

    def extract_features(self, x):
        """Return the flattened attention output, shape ``(batch, 64 * time)``.

        Raises:
            ValueError: if ``x`` is not a 3-D tensor.
        """
        if x.dim() != 3:
            raise ValueError(f"Expected 3D input (batch, time, features), got shape: {x.shape}")

        # Conv1d expects channels first: (batch, features, time).
        channels_first = x.transpose(1, 2)
        mixed = F.relu(self.conv1(channels_first))

        branches = [
            self.inception_k3(mixed),
            self.inception_k5(mixed),
            self.inception_k7(mixed),
        ]
        merged = F.relu(self.conv2(torch.cat(branches, dim=1)))

        # Back to (batch, time, channels) for batch_first attention.
        sequence = merged.transpose(1, 2)
        attended, _ = self.attention(sequence, sequence, sequence)
        return self.flatten(attended)

print("  - Preparing data for pre-training...")
TIME_WINDOW = 50
# Single source of truth for the epoch count, used by the loop and the progress label.
PRETRAIN_EPOCHS = 20

try:
    # Build all sliding windows in one vectorised step instead of nested
    # Python lists (which are slow to create and to convert to a tensor).
    feature_matrix = np.asarray(train_df['state_features'].tolist(), dtype=np.float32)
    n_windows = len(train_df) - TIME_WINDOW
    if feature_matrix.ndim != 2 or n_windows <= 0:
        raise ValueError(
            f"Need a 2-D feature matrix with more than {TIME_WINDOW} rows, "
            f"got shape {feature_matrix.shape}"
        )

    # sliding_window_view appends the window axis last:
    # (rows - TIME_WINDOW + 1, features, TIME_WINDOW) -> transpose to (N, TIME_WINDOW, features).
    # Only the first n_windows windows are kept, matching range(len(train_df) - TIME_WINDOW).
    all_windows = np.lib.stride_tricks.sliding_window_view(feature_matrix, TIME_WINDOW, axis=0)
    X_train = torch.from_numpy(np.ascontiguousarray(all_windows[:n_windows].transpose(0, 2, 1)))

    # Each window is labelled with the label of its last row.
    label_values = train_df['label'].to_numpy()
    y_train = torch.as_tensor(
        label_values[TIME_WINDOW - 1:TIME_WINDOW - 1 + n_windows], dtype=torch.long
    )

    print(f"  - Training data shape: {X_train.shape}")
    print(f"  - Training labels shape: {y_train.shape}")

    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(dataset=train_dataset, batch_size=256, shuffle=True)

    print("  - Instantiating and training the pre-training network...")
    input_features = X_train.shape[2]
    pretrain_net = AttnLOB(input_features, TIME_WINDOW).to(DEVICE)
    optimizer = Adam(pretrain_net.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(PRETRAIN_EPOCHS):
        total_loss = 0
        pretrain_net.train()
        for x_batch, y_batch in tqdm(train_loader, desc=f"  Epoch {epoch+1}/{PRETRAIN_EPOCHS}"):
            x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
            optimizer.zero_grad()
            outputs = pretrain_net(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"    Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

    torch.save(pretrain_net.state_dict(), 'pretrained_attn_lob_extractor.pth')
    print("\nPre-trained Attn-LOB feature extractor saved successfully.")

except Exception as e:
    # NOTE: this fallback saves an UNTRAINED network so that later cells can
    # still run. Results obtained after hitting this path are not meaningful.
    print(f"Error during pre-training: {e}")
    print("Creating a dummy pre-trained network for testing...")
    input_features = len(train_df['state_features'].iloc[0])
    pretrain_net = AttnLOB(input_features, TIME_WINDOW).to(DEVICE)
    torch.save(pretrain_net.state_dict(), 'pretrained_attn_lob_extractor.pth')
    print("Dummy pre-trained network saved.")


In [None]:
print("\n--- [6] Defining the Final RL Agent Network Wrapper (All Errors Fixed) ---")

class PreTrainedAttnLOBExtractor(BaseFeaturesExtractor):
    """SB3 features extractor combining a frozen AttnLOB with the agent state.

    The ``market_state`` window is encoded by an ``AttnLOB`` whose weights are
    loaded from ``pretrained_attn_lob_extractor.pth`` and kept frozen (no
    gradients, eval mode). The ``agent_state`` vector goes through a small MLP.
    Both encodings are concatenated and projected to ``features_dim`` (256).

    No device is hard-coded: modules are created on the CPU and follow whatever
    device the owning policy is moved to, and ``forward`` moves observations to
    the device of the module's own parameters. This keeps the extractor usable
    when the algorithm runs on a device other than the "best available" one.

    Args:
        observation_space: Dict space with a 2-D ``market_state`` box
            ``(time_window, n_features)`` and a 1-D ``agent_state`` box.
    """

    def __init__(self, observation_space: spaces.Dict):
        features_dim = 256
        super().__init__(observation_space, features_dim=features_dim)

        market_obs_shape = observation_space["market_state"].shape
        agent_obs_shape = observation_space["agent_state"].shape

        time_window, input_features = market_obs_shape

        print(f"  - Market observation shape: {market_obs_shape}")
        print(f"  - Agent observation shape: {agent_obs_shape}")

        try:
            print("  - Loading pre-trained AttnLOB network...")
            self.market_feature_extractor = AttnLOB(input_features, time_window)
            # Load onto the CPU; the policy's .to(device) relocates the weights later.
            self.market_feature_extractor.load_state_dict(
                torch.load('pretrained_attn_lob_extractor.pth', map_location='cpu')
            )
            print("  - Pre-trained network loaded and frozen successfully.")

            # Probe the encoder once to learn the size of its flattened output.
            with torch.no_grad():
                dummy_input = torch.randn(1, time_window, input_features)
                dummy_features = self.market_feature_extractor.extract_features(dummy_input)
                market_features_size = dummy_features.shape[1]

        except Exception as e:
            print(f"  - Warning: Could not load pre-trained network ({e})")
            print("  - Using randomly initialized network instead...")
            # Fresh instance: a failed load may have copied some of the weights.
            self.market_feature_extractor = AttnLOB(input_features, time_window)
            market_features_size = 64 * time_window

        # forward() runs the encoder under no_grad, so it is never trained in
        # either branch; make that explicit on the parameters as well.
        for param in self.market_feature_extractor.parameters():
            param.requires_grad = False
        self.market_feature_extractor.eval()

        self.agent_processor = nn.Sequential(
            nn.Linear(agent_obs_shape[0], 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU()
        )

        combined_size = market_features_size + 32
        self.feature_combiner = nn.Sequential(
            nn.Linear(combined_size, features_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(features_dim, features_dim),
            nn.ReLU()
        )

        print(f"  - Combined feature size: {combined_size} -> {features_dim}")

    @property
    def device(self) -> torch.device:
        """Device the extractor's parameters currently live on."""
        return next(self.parameters()).device

    def train(self, mode: bool = True):
        """Switch train/eval mode while keeping the frozen encoder in eval mode."""
        super().train(mode)
        self.market_feature_extractor.eval()
        return self

    def forward(self, observations: dict) -> torch.Tensor:
        """Encode a batch of observations into ``(batch, features_dim)`` features."""
        device = self.device
        market_obs = observations["market_state"].to(device)

        # The market encoder is frozen: no gradients are tracked through it.
        with torch.no_grad():
            market_features = self.market_feature_extractor.extract_features(market_obs)

        agent_obs = observations["agent_state"].to(device)
        agent_features = self.agent_processor(agent_obs)

        combined_features = torch.cat([market_features, agent_features], dim=1)
        final_features = self.feature_combiner(combined_features)

        return final_features

print("PreTrainedAttnLOBExtractor class defined successfully (All Errors Fixed).")


In [None]:
print("\n--- [7] Training the Final PPO Agent with Pre-trained Extractor ---")

# Single source of truth for the training budget (used in the log line and in learn()).
TOTAL_TIMESTEPS = 100000

try:
    print("  - Creating training environment...")
    vec_train_env = make_vec_env(MarketMakingEnv, n_envs=1, env_kwargs=dict(df=train_df))

    print("  - Testing environment...")
    obs = vec_train_env.reset()
    print(f"  - Environment test successful. Observation space verified.")

    policy_kwargs = dict(
        features_extractor_class=PreTrainedAttnLOBExtractor,
        net_arch=dict(pi=[128, 64], vf=[128, 64]),
    )

    print("  - Initializing PPO model...")
    final_model = PPO(
        "MultiInputPolicy",
        vec_train_env,
        policy_kwargs=policy_kwargs,
        n_steps=1024,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        learning_rate=3e-4,
        clip_range=0.2,
        verbose=1,
        tensorboard_log="./final_ppo_tensorboard/",
        device=DEVICE
    )

    print(f"  - PPO model initialized successfully on device: {DEVICE}")
    print("Starting FINAL training run...")
    print(f"  - Training for {TOTAL_TIMESTEPS:,} steps (reduced for stability)...")

    final_model.learn(
        total_timesteps=TOTAL_TIMESTEPS,
        progress_bar=True,
        log_interval=10
    )
    print("\nFinal model training finished successfully.")

    final_model.save("final_agent_model")
    print("Model saved as final_agent_model.zip")

    print("\n  - Running quick evaluation...")
    obs = vec_train_env.reset()
    total_reward = 0
    steps_taken = 0
    for _ in range(100):
        action, _ = final_model.predict(obs, deterministic=True)
        obs, reward, done, info = vec_train_env.step(action)
        total_reward += reward[0]
        steps_taken += 1
        if done[0]:
            break
    # Divide by the steps actually executed: the loop may stop early on `done`.
    print(f"  - Average reward over {steps_taken} steps: {total_reward/steps_taken:.4f}")

except Exception as e:
    print(f"Error during training: {e}")
    print("This might be due to environment or model configuration issues.")
    print("Please check the data preprocessing and environment setup.")

    # Best-effort fallback: a small default-extractor PPO on the CPU. If the
    # environment itself could not be created, this fails too and is reported below.
    print("Creating a simple fallback PPO model...")
    try:
        simple_policy_kwargs = dict(net_arch=[64, 64])
        fallback_model = PPO(
            "MultiInputPolicy",
            vec_train_env,
            policy_kwargs=simple_policy_kwargs,
            n_steps=512,
            batch_size=32,
            n_epochs=5,
            verbose=1,
            device="cpu"
        )
        fallback_model.learn(total_timesteps=5000, progress_bar=True)
        fallback_model.save("fallback_agent_model")
        print("Fallback model created and saved successfully.")
    except Exception as fallback_error:
        print(f"Fallback model also failed: {fallback_error}")
        print("Please check your data and environment setup.")


In [None]:
# ==============================================================================
# COMPREHENSIVE VISUALIZATION AND ANALYSIS
# ==============================================================================
print("\n--- [8] Running Comprehensive Analysis and Visualization ---")

import json
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling for the analysis section: matplotlib defaults with the "husl" palette.
plt.style.use('default')
sns.set_palette("husl")
fig_size = (15, 10)

# ==============================================================================
# 1. TRAINING PERFORMANCE ANALYSIS
# ==============================================================================
print("\n1. Analyzing Training Performance...")

def analyze_training_logs(log_dir="./final_ppo_tensorboard"):
    """Look for PPO tensorboard event files and report how many were found.

    Args:
        log_dir: directory containing the ``PPO_*`` run folders. Defaults to
            the ``tensorboard_log`` location used when training the agent.

    Returns:
        Sorted list of matching event-file paths (empty when none exist or the
        directory cannot be scanned).
    """
    import glob
    import os

    pattern = os.path.join(log_dir, "PPO_*", "events.out.tfevents.*")
    try:
        log_files = sorted(glob.glob(pattern))
    except OSError:
        # Only filesystem problems are tolerated here; anything else is a bug
        # and should surface instead of being swallowed by a bare except.
        print("  - Manual analysis mode")
        return []

    if log_files:
        print(f"  - Found {len(log_files)} tensorboard log files")
    else:
        print("  - No tensorboard logs found, using manual tracking")
    return log_files

# Training-performance dashboard: a 2x3 grid, one panel per PPO metric.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Training Performance Analysis', fontsize=16, fontweight='bold')

# Metrics transcribed by hand from a recorded training log (they are NOT read
# from the current run), sampled every 10 iterations.
iterations = [10, 20, 30, 40, 50, 60, 70, 80, 90]
ep_rewards = [-4.25e5, -3.13e5, -2.53e5, -2.53e5, -2.53e5, -2.2e5, -1.98e5, -1.86e5, -1.86e5]
value_losses = [1.24e6, 1.27e5, 2.26e4, 2.03e4, 4.87e5, 1.55e4, 4.86e3, 2.78e4, 9.62e4]
policy_losses = [-0.00272, -0.00631, 1.07e-05, 0.000121, -0.000925, -0.000841, -0.00131, -0.000337, -0.00244]
entropy_losses = [-2.79, -2.7, -2.59, -2.51, -2.55, -2.56, -2.57, -2.57, -2.57]
approx_kls = [0.0044852616, 0.008950443, 0.0082018655, 0.0026824707, 0.00050469517, 0.0016959151, 0.014334275, 0.0010052079, 0.0030986865]
clip_fractions = [0.0283, 0.0897, 0.0429, 0.0085, 0, 0.000488, 0.0589, 0, 0.00244]


def _label_metric_panel(ax, title, ylabel):
    """Apply the title, axis labels and light grid shared by every metric panel."""
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Iterations')
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)


# Panel 1: episode rewards (scientific notation on the y axis)
reward_ax = axes[0, 0]
reward_ax.plot(iterations, ep_rewards, 'b-o', linewidth=2, markersize=6)
_label_metric_panel(reward_ax, 'Episode Rewards Over Training', 'Average Episode Reward')
reward_ax.ticklabel_format(style='scientific', axis='y', scilimits=(0,0))

# Panel 2: value loss on a logarithmic y axis
value_ax = axes[0, 1]
value_ax.semilogy(iterations, value_losses, 'r-o', linewidth=2, markersize=6)
_label_metric_panel(value_ax, 'Value Function Loss', 'Value Loss (log scale)')

# Panel 3: policy-gradient loss
policy_ax = axes[0, 2]
policy_ax.plot(iterations, policy_losses, 'g-o', linewidth=2, markersize=6)
_label_metric_panel(policy_ax, 'Policy Gradient Loss', 'Policy Loss')

# Panel 4: entropy loss
entropy_ax = axes[1, 0]
entropy_ax.plot(iterations, entropy_losses, 'purple', marker='o', linewidth=2, markersize=6)
_label_metric_panel(entropy_ax, 'Entropy Loss (Exploration)', 'Entropy Loss')

# Panel 5: approximate KL divergence with a reference line at 0.01
kl_ax = axes[1, 1]
kl_ax.plot(iterations, approx_kls, 'orange', marker='o', linewidth=2, markersize=6)
kl_ax.axhline(y=0.01, color='red', linestyle='--', alpha=0.7, label='Typical KL Target')
_label_metric_panel(kl_ax, 'Approximate KL Divergence', 'KL Divergence')
kl_ax.legend()

# Panel 6: clip fraction
clip_ax = axes[1, 2]
clip_ax.plot(iterations, clip_fractions, 'brown', marker='o', linewidth=2, markersize=6)
_label_metric_panel(clip_ax, 'Clipping Fraction', 'Fraction of Clipped Actions')

plt.tight_layout()
plt.show()

# ==============================================================================
# 2. ENVIRONMENT AND AGENT TESTING
# ==============================================================================
print("\n2. Testing Agent Performance...")

def test_agent_performance(model, env, num_episodes=5):
    """Test the trained agent and collect detailed metrics"""
    results = {
        'episode_rewards': [],
        'episode_lengths': [],
        'portfolio_values': [],
        'inventory_history': [],
        'action_history': [],
        'pnl_history': []
    }

    for episode in range(num_episodes):
        obs = env.reset()
        episode_reward = 0
        episode_length = 0
        episode_portfolio = []
        episode_inventory = []
        episode_actions = []
        episode_pnl = []

        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)

            episode_reward += reward[0]
            episode_length += 1
            episode_actions.append(action[0])

            if len(info) > 0 and 'pnl' in info[0]:
                episode_pnl.append(info[0]['pnl'])

            if done[0]:
                break

        results['episode_rewards'].append(episode_reward)
        results['episode_lengths'].append(episode_length)
        results['action_history'].append(episode_actions)
        results['pnl_history'].append(episode_pnl)

    return results

# Test the trained model
# Defined up front so later diagnostics can check it even when loading or
# evaluation fails (it then stays None instead of being undefined).
test_results = None
try:
    print("  - Loading and testing the trained model...")
    loaded_model = PPO.load("final_agent_model", device=DEVICE)
    # Evaluate on the held-out split rather than on the data the agent was trained on.
    vec_test_env = make_vec_env(MarketMakingEnv, n_envs=1, env_kwargs=dict(df=test_df))
    test_results = test_agent_performance(loaded_model, vec_test_env, num_episodes=3)

    # Plot test results
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Agent Performance Testing', fontsize=16, fontweight='bold')

    # Episode rewards
    axes[0,0].bar(range(len(test_results['episode_rewards'])), test_results['episode_rewards'])
    axes[0,0].set_title('Episode Rewards')
    axes[0,0].set_xlabel('Episode')
    axes[0,0].set_ylabel('Total Reward')

    # Episode lengths
    axes[0,1].bar(range(len(test_results['episode_lengths'])), test_results['episode_lengths'])
    axes[0,1].set_title('Episode Lengths')
    axes[0,1].set_xlabel('Episode')
    axes[0,1].set_ylabel('Steps')

    # Action distribution (first episode): column 0 is bias control, column 1 spread control
    if test_results['action_history']:
        actions = np.array(test_results['action_history'][0])
        if len(actions) > 0:
            axes[1,0].hist2d(actions[:,0], actions[:,1], bins=20, alpha=0.7)
            axes[1,0].set_title('Action Distribution (Spread vs Bias)')
            axes[1,0].set_xlabel('Bias Control')
            axes[1,0].set_ylabel('Spread Control')

    # PnL progression (first episode)
    if test_results['pnl_history'] and len(test_results['pnl_history'][0]) > 0:
        cumulative_pnl = np.cumsum(test_results['pnl_history'][0])
        axes[1,1].plot(cumulative_pnl)
        axes[1,1].set_title('Cumulative PnL (First Episode)')
        axes[1,1].set_xlabel('Steps')
        axes[1,1].set_ylabel('Cumulative PnL')
        axes[1,1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

except Exception as e:
    print(f"  - Error testing model: {e}")

# ==============================================================================
# 3. DATA AND FEATURE ANALYSIS
# ==============================================================================
print("\n3. Analyzing Input Data and Features...")

# 2x3 overview figure: time series on the top row, return histograms below.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Market Data Analysis', fontsize=16, fontweight='bold')

# Top row: first 1000 samples of close price, volume and spread.
# Each entry is (column, panel title, y-axis label).
head_index = df_processed.index[:1000]
series_panels = [
    ('Close', 'Price Evolution (First 1000 samples)', 'Close Price'),
    ('Volume', 'Volume Evolution', 'Volume'),
    ('spread', 'Bid-Ask Spread', 'Spread'),
]
for ax, (column, title, ylabel) in zip(axes[0], series_panels):
    ax.plot(head_index, df_processed[column].iloc[:1000])
    ax.set_title(title)
    ax.set_xlabel('Time')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

# Bottom row: histograms of the first three normalised price columns
# (the row has three panels, so 'Close_norm' is not drawn).
feature_cols = ['Open_norm', 'High_norm', 'Low_norm', 'Close_norm']
for ax, col in zip(axes[1], feature_cols):
    if col not in df_processed.columns:
        continue
    ax.hist(df_processed[col].dropna(), bins=50, alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(f'{col}')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# ==============================================================================
# 4. DIAGNOSTIC ANALYSIS
# ==============================================================================
print("\n4. Running Diagnostic Analysis...")

# The evaluation step may have failed before assigning test_results; treat a
# missing name as "no evaluation results" instead of crashing with NameError.
try:
    evaluation_results = test_results
except NameError:
    evaluation_results = None

# Report the save status from the file actually on disk rather than asserting
# success unconditionally (training errors are caught and only printed).
model_saved = os.path.exists("final_agent_model.zip")

print("\n=== TRAINING DIAGNOSTICS ===")
print(f"• Initial Episode Reward: {ep_rewards[0]:,.0f}")
print(f"• Final Episode Reward: {ep_rewards[-1]:,.0f}")
print(f"• Improvement: {ep_rewards[-1] - ep_rewards[0]:,.0f}")
print(f"• Value Loss Trend: {'Decreasing' if value_losses[-1] < value_losses[0] else 'Increasing'}")
print(f"• Final KL Divergence: {approx_kls[-1]:.6f} ({'Good' if approx_kls[-1] < 0.01 else 'High'})")

print("\n=== POTENTIAL ISSUES IDENTIFIED ===")
issues = []

if all(r < -100000 for r in ep_rewards):
    issues.append("⚠️  Very negative rewards - check reward function scaling")

if max(value_losses) > 1e5:
    issues.append("⚠️  High value losses - consider reducing learning rate")

if max(approx_kls) > 0.01:
    issues.append("⚠️  High KL divergence detected - policy changing too quickly")

if evaluation_results and all(r == 0 for r in evaluation_results['episode_rewards']):
    issues.append("⚠️  Zero test rewards - agent may not be learning meaningful policy")

if len(issues) == 0:
    issues.append("✅ No major issues detected")

for issue in issues:
    print(f"  {issue}")

print("\n=== RECOMMENDATIONS ===")
recommendations = [
    "🔧 Consider adjusting reward function scaling (multiply by 1e-3 or 1e-4)",
    "🔧 Try reducing learning rate (e.g., 1e-4 instead of 3e-4)",
    "🔧 Increase training steps (current: 100k, try: 500k+)",
    "🔧 Tune environment parameters (inventory penalty, spread limits)",
    "🔧 Check if market data has sufficient signal for profitable trading"
]

for rec in recommendations:
    print(f"  {rec}")

print("\n=== MODEL SUMMARY ===")
print(f"• Training completed successfully: {'✅' if model_saved else '❌'}")
print(f"• Model saved: {'final_agent_model.zip' if model_saved else 'not found'}")
# The duration below is a figure recorded from an earlier run, not a measurement.
print(f"• Total training time: ~7 minutes")
print(f"• Final model performance: Needs improvement")
print(f"• Next steps: Hyperparameter tuning and longer training")

print("\nVisualization and analysis complete!")