In [None]:
# Make CA15 package importable when running this notebook
import sys
import os
# Add current directory and parent dir to sys.path (helps when launching notebook from different working directories)
sys.path.insert(0, os.path.abspath("."))
sys.path.insert(0, os.path.abspath(".."))
print("Configured sys.path for CA15 imports:", sys.path[0:2])


In [None]:
# Quick smoke test for imports — run this cell to check compatibility
try:
    import CA15
    print("CA15 package version:", CA15.get_version())
    from CA15.environments.grid_world import SimpleGridWorld
    from CA15.model_based_rl.algorithms import DynamicsModel
    print("Imported symbols:", SimpleGridWorld.__name__, DynamicsModel.__name__)
except Exception as e:
    import traceback
    traceback.print_exc()
    print("Import test failed:", e)


# Use canonical implementations from the CA15 package
from CA15.model_based_rl.algorithms import (
    DynamicsModel, ModelEnsemble, ModelPredictiveController, DynaQAgent,
)
from CA15.hierarchical_rl.algorithms import (
    Option, HierarchicalActorCritic, GoalConditionedAgent, FeudalNetwork,
)
from CA15.planning.algorithms import (
    MCTSNode, MonteCarloTreeSearch, ModelBasedValueExpansion, LatentSpacePlanner, WorldModel,
)
from CA15.environments.grid_world import SimpleGridWorld
from CA15.training_examples import ReplayBuffer, PrioritizedReplayBuffer, RunningStats

print("Imported core CA15 algorithm and environment classes")


## Import Required Libraries

We'll import essential libraries for implementing model-based and hierarchical RL algorithms.

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical, Normal

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import deque, namedtuple
import random
import copy
import math
import gym
from typing import List, Dict, Tuple, Optional, Union
import warnings
warnings.filterwarnings('ignore')

from . import (
    DynamicsModel,
    ModelEnsemble,
    ModelPredictiveController,
    DynaQAgent,
    
    Option,
    HierarchicalActorCritic,
    GoalConditionedAgent,
    FeudalNetwork,
    HierarchicalRLEnvironment,
    
    MCTSNode,
    MonteCarloTreeSearch,
    ModelBasedValueExpansion,
    LatentSpacePlanner,
    WorldModel,
    
    SimpleGridWorld,
    
    ExperimentRunner,
    HierarchicalRLExperiment,
    PlanningAlgorithmsExperiment,
    
    ReplayBuffer,
    PrioritizedReplayBuffer,
    RunningStats,
    Logger,
    NeuralNetworkUtils,
    VisualizationUtils,
    EnvironmentUtils,
    ExperimentUtils,
    set_device,
    get_device,
    to_tensor
)

np.random.seed(42)
torch.manual_seed(42)
random.seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

MODEL_BASED_CONFIG = {
    'model_lr': 1e-3,
    'planning_horizon': 10,
    'model_ensemble_size': 5,
    'imagination_rollouts': 100,
    'model_training_freq': 10
}

HIERARCHICAL_CONFIG = {
    'num_levels': 3,
    'option_timeout': 20,
    'subgoal_threshold': 0.1,
    'meta_controller_lr': 3e-4,
    'controller_lr': 1e-3
}

PLANNING_CONFIG = {
    'mcts_simulations': 100,
    'exploration_constant': 1.4,
    'planning_depth': 5,
    'beam_width': 10
}

print("🚀 Libraries imported successfully!")
print("📦 CA15 modular package loaded!")
print("📊 Configurations loaded for Model-Based and Hierarchical RL")


ImportError: attempted relative import with no known parent package

# Section 1: Model-based Reinforcement Learning

Model-Based RL learns an explicit model of the environment dynamics and uses it for planning and control.

## 1.1 Theoretical Foundation

### Environment Dynamics Model
The goal is to learn a transition model $p(s*{t+1}, r*t | s*t, a*t)$ that predicts next states and rewards.

**Key Components:**
- **Deterministic Model**: $s*{t+1} = f(s*t, a_t) + \epsilon$
- **Stochastic Model**: $s*{t+1} \sim p(\cdot | s*t, a_t)$
- **Ensemble Methods**: Multiple models to capture uncertainty

### Model-predictive Control (mpc)
Uses the learned model to plan actions by optimizing over a finite horizon:

$$a^**t = \arg\max*{a*t, \ldots, a*{t+H-1}} \sum*{k=0}^{H-1} \gamma^k r*{t+k}$$

where states are predicted using the learned model.

### Dyna-q Algorithm
Combines model-free and model-based learning:
1. **Direct RL**: Update Q-function from real experience
2. **Planning**: Use model to generate simulated experience
3. **Model Learning**: Update dynamics model from real data

### Advantages and Challenges
**Advantages:**
- Sample efficiency through planning
- Can handle sparse rewards
- Enables what-if analysis

**Challenges:**
- Model bias and compounding errors
- Computational complexity
- Partial observability

In [None]:

print("🧠 Model-Based RL components loaded from CA15 package!")
print("📝 Key components:")
print("  • DynamicsModel: Neural network for environment dynamics")
print("  • ModelEnsemble: Multiple models for uncertainty quantification")
print("  • ModelPredictiveController: MPC for action planning")
print("  • DynaQAgent: Dyna-Q algorithm combining model-free and model-based learning")


🧠 Model-Based RL components implemented successfully!
📝 Key components:
  • DynamicsModel: Neural network for environment dynamics
  • ModelEnsemble: Multiple models for uncertainty quantification
  • ModelPredictiveController: MPC for action planning
  • DynaQAgent: Dyna-Q algorithm combining model-free and model-based learning


# Section 2: Hierarchical Reinforcement Learning

Hierarchical RL decomposes complex tasks into simpler subtasks through temporal and spatial abstraction.

## 2.1 Theoretical Foundation

### Options Framework
An **option** is a closed-loop policy for taking actions over a period of time. Formally, an option consists of:
- **Initiation set** $I$: States where the option can be initiated
- **Policy** $\pi$: Action selection within the option
- **Termination condition** $\beta$: Probability of terminating the option

### Semi-markov Decision Process (smdp)
Options extend MDPs to SMDPs where:
- Actions can take variable amounts of time
- Temporal abstraction enables hierarchical planning
- Q-learning over options: $Q(s,o) = r + \gamma^k Q(s', o')$

### Goal-conditioned Rl
Learn policies conditioned on goals: $\pi(a|s,g)$
- **Hindsight Experience Replay (HER)**: Learn from failed attempts
- **Universal Value Function**: $V(s,g)$ for any goal $g$
- **Intrinsic Motivation**: Generate own goals for exploration

### Hierarchical Actor-critic (hac)
Multi-level hierarchy where:
- **High-level policy**: Selects subgoals
- **Low-level policy**: Executes actions to reach subgoals
- **Temporal abstraction**: Different time scales at each level

### Feudal Networks
Hierarchical architecture with:
- **Manager**: Sets goals for workers
- **Worker**: Executes actions to achieve goals
- **Feudal objective**: Manager maximizes reward, Worker maximizes goal achievement

## 2.2 Key Advantages

**Sample Efficiency:**
- Reuse learned skills across tasks
- Faster learning through temporal abstraction

**Interpretability:**
- Hierarchical structure mirrors human thinking
- Decomposable and explainable decisions

**Transfer Learning:**
- Skills transfer across related environments
- Compositional generalization

# Section 3: Advanced Planning and Control

Advanced planning algorithms combine learned models with sophisticated search techniques.

## 3.1 Monte Carlo Tree Search (mcts)

MCTS is a best-first search algorithm that uses Monte Carlo simulations for decision making.

### Mcts Algorithm Steps:
1. **Selection**: Navigate down the tree using UCB1 formula
2. **Expansion**: Add new child nodes to the tree
3. **Simulation**: Run random rollouts from leaf nodes
4. **Backpropagation**: Update node values with simulation results

### Ucb1 Selection Formula:
$$UCB1(s,a) = Q(s,a) + c \sqrt{\frac{\ln N(s)}{N(s,a)}}$$

Where:
- $Q(s,a)$: Average reward for action $a$ in state $s$
- $N(s)$: Visit count for state $s$
- $N(s,a)$: Visit count for action $a$ in state $s$
- $c$: Exploration constant

### Alphazero Integration
Combines MCTS with neural networks:
- **Policy Network**: $p(a|s)$ guides selection
- **Value Network**: $v(s)$ estimates leaf values
- **Self-Play**: Generates training data through MCTS games

## 3.2 Model-based Value Expansion (mve)

Uses learned models to expand value function estimates:

$$V*{MVE}(s) = \max*a \left[ r(s,a) + \gamma \sum_{s'} p(s'|s,a) V(s') \right]$$

### Trajectory Optimization
- **Cross-Entropy Method (CEM)**: Iterative sampling and fitting
- **Random Shooting**: Sample multiple action sequences
- **Model Predictive Path Integral (MPPI)**: Information-theoretic approach

## 3.3 Latent Space Planning

Planning in learned latent representations:

### World Models Architecture:
1. **Vision Model (V)**: Encodes observations to latent states
2. **Memory Model (M)**: Predicts next latent states  
3. **Controller Model (C)**: Maps latent states to actions

### Planet Algorithm:
- **Recurrent State Space Model (RSSM)**:
- Deterministic path: $h*t = f(h*{t-1}, a_{t-1})$
- Stochastic path: $s*t \sim p(s*t | h_t)$
- **Planning**: Cross-entropy method in latent space
- **Learning**: Variational inference for world model

## 3.4 Challenges and Solutions

### Model Bias
- **Problem**: Learned models have prediction errors
- **Solutions**: 
- Model ensembles for uncertainty quantification
- Conservative planning with uncertainty penalties
- Robust optimization techniques

### Computational Complexity
- **Problem**: Planning is computationally expensive
- **Solutions**:
- Hierarchical planning with multiple time scales
- Approximate planning with limited horizons
- Parallel Monte Carlo simulations

### Exploration Vs Exploitation
- **Problem**: Balancing exploration and exploitation in planning
- **Solutions**:
- UCB-based selection in MCTS
- Optimistic initialization
- Information-gain based rewards

# Section 4: Practical Demonstrations and Experiments

This section provides hands-on experiments to demonstrate the concepts and implementations.

## 4.1 Experiment Setup

We'll create practical experiments to showcase:

1. **Model-Based vs Model-Free Comparison**
- Sample efficiency analysis
- Performance on different environments
- Computational overhead comparison

2. **Hierarchical RL Benefits**
- Multi-goal navigation tasks
- Skill reuse and transfer
- Temporal abstraction advantages

3. **Planning Algorithm Comparison**
- MCTS vs random rollouts
- Value expansion effectiveness
- Latent space planning benefits

4. **Integration Study**
- Combining all methods
- Real-world application scenarios
- Performance analysis and trade-offs

## 4.2 Metrics and Evaluation

### Performance Metrics:
- **Sample Efficiency**: Steps to reach performance threshold
- **Asymptotic Performance**: Final average reward
- **Computation Time**: Planning and learning overhead
- **Memory Usage**: Model storage requirements
- **Transfer Performance**: Success on related tasks

### Statistical Analysis:
- Multiple random seeds for reliability
- Confidence intervals and significance tests
- Learning curve analysis
- Ablation studies for each component

## 4.3 Environments for Testing

### Simple Grid World:
- **Purpose**: Basic concept demonstration
- **Features**: Discrete states, clear visualization
- **Challenges**: Navigation, goal reaching

### Continuous Control:
- **Purpose**: Real-world applicability
- **Features**: Continuous state-action spaces
- **Challenges**: Precise control, dynamic systems

### Hierarchical Tasks:
- **Purpose**: Multi-level decision making
- **Features**: Natural task decomposition
- **Challenges**: Long-horizon planning, skill coordination

In [None]:
# Import training and evaluation utilities from the CA15 package
from CA15.training_examples import (
    train_model_based_rl_agent,
    train_hierarchical_rl_agent,
    train_goal_conditioned_agent,
    train_feudal_network_agent,
    train_mcts_agent,
    train_latent_space_planner,
)

print("Imported CA15 training functions")


ModuleNotFoundError: No module named 'CA15'

In [None]:
# Import utilities exposed by the CA15 package
from CA15 import (
    VisualizationUtils,
    Logger,
    to_tensor,
    set_device,
    get_device,
)

print("Imported CA15 utilities and helpers")


NameError: name 'HierarchicalRLExperiment' is not defined

# Code Review and Improvements

## Advanced Model-Based and Hierarchical RL Implementation Analysis

### Strengths of Current Implementation

1. **Comprehensive Algorithm Coverage**: Implementation of all major advanced RL paradigms including model-based learning, hierarchical decomposition, and sophisticated planning algorithms
2. **Modular and Scalable Architecture**: Clean separation between different algorithm families with reusable components and extensible design patterns
3. **Advanced Neural Architectures**: State-of-the-art implementations including world models, feudal networks, and latent space planning systems
4. **Theoretical Rigor**: Strong foundation in both model-based theory (dynamics learning, uncertainty quantification) and hierarchical theory (temporal abstraction, multi-timescale learning)
5. **Practical Evaluation Frameworks**: Comprehensive experimental setups with proper statistical analysis, visualization, and comparative studies

### Areas for Improvement

#### 1. Model-Based RL Enhancements
- **Current Limitation**: Basic dynamics model learning with limited uncertainty handling
- **Improvement**: Advanced model-based techniques:
  - **Probabilistic Models**: Implement Bayesian neural networks for better uncertainty quantification
  - **Model-Based Meta-Learning**: Learn-to-learn dynamics models across tasks
  - **Causal Discovery**: Learn causal relationships in environment dynamics
  - **Multi-Step Prediction**: Long-horizon prediction with temporal hierarchies
  - **Model Regularization**: Advanced regularization techniques for better generalization

#### 2. Hierarchical RL Extensions
- **Current Limitation**: Fixed hierarchy with limited skill discovery
- **Improvement**: More sophisticated hierarchical systems:
  - **Automatic Skill Discovery**: Unsupervised learning of reusable skills
  - **Dynamic Hierarchies**: Adaptive hierarchy depth based on task complexity
  - **Cross-Level Communication**: Better information flow between hierarchy levels
  - **Meta-Hierarchical Learning**: Learning to construct hierarchies
  - **Compositional Skills**: Combining primitive skills into complex behaviors

#### 3. Planning Algorithm Advancements
- **Current Limitation**: Basic MCTS and model-based value expansion
- **Improvement**: Cutting-edge planning techniques:
  - **AlphaZero Integration**: Neural network guided MCTS with self-play
  - **MuZero Architecture**: Unified model-based planning framework
  - **Efficient Planning**: Approximate planning methods for real-time control
  - **Hierarchical Planning**: Multi-level planning with temporal abstraction
  - **Risk-Aware Planning**: Planning under uncertainty with risk measures

### Future Research Directions

1. **Foundation Models for RL**: Large-scale pre-training of universal world models and hierarchical policies
2. **Causal Hierarchical RL**: Learning causal hierarchies for better generalization and interpretability
3. **Neuro-Symbolic Hierarchical Systems**: Combining neural networks with symbolic planning
4. **Multi-Agent Hierarchical RL**: Hierarchical coordination in multi-agent systems
5. **Quantum-Enhanced Planning**: Leveraging quantum computing for exponential planning speedup
6. **Human-AI Hierarchical Collaboration**: Hierarchical systems that collaborate with humans
7. **Energy-Efficient Hierarchical RL**: Optimizing for computational and energy constraints
8. **Robust Hierarchical Systems**: Hierarchies that maintain performance under distribution shifts

