In [2]:
import gymnasium as gym
import numpy as np
import os
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import random

### CartPole Offline RL Dataset Generator

In [3]:
def set_seeds(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seeds()

In [4]:

def collect_transitions(env, policy_fn, n_episodes=100, max_steps_per_episode=500):
    """
    Collect transitions from an environment using a given policy.
    """
    observations = []
    actions = []
    rewards = []
    next_observations = []
    dones = []
    
    # Track episode stats for diagnostics
    episode_rewards = []
    episode_lengths = []
    
    for episode in tqdm(range(n_episodes), desc="Collecting episodes"):
        obs, _ = env.reset()
        episode_reward = 0
        episode_length = 0
        done = False
        
        while not done and episode_length < max_steps_per_episode:
            action = policy_fn(obs)
            
            observations.append(obs)
            actions.append(action)
            
            next_obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            
            rewards.append(reward)
            next_observations.append(next_obs)
            dones.append(done)
            
            obs = next_obs
            episode_reward += reward
            episode_length += 1
        
        episode_rewards.append(episode_reward)
        episode_lengths.append(episode_length)
    
    # Print diagnostic information
    print(f"Collection complete: {len(observations)} transitions, {n_episodes} episodes")
    print(f"Average episode reward: {np.mean(episode_rewards):.2f}")
    print(f"Average episode length: {np.mean(episode_lengths):.2f}")
    
    # Check action distribution in collected data
    action_counts = np.bincount(actions, minlength=2)
    action_distribution = action_counts / len(actions)
    print(f"Action distribution in dataset: {action_distribution}")
    
    return {
        "observations": np.array(observations, dtype=np.float32),
        "actions": np.array(actions, dtype=np.int32),
        "rewards": np.array(rewards, dtype=np.float32),
        "next_observations": np.array(next_observations, dtype=np.float32),
        "dones": np.array(dones, dtype=np.bool_)
    }


In [5]:

def check_action_distribution(model, env, n_samples=200):
    """
    Check if the model takes diverse actions in the environment.
    Returns action distribution statistics.
    """
    print("Checking action distribution...")
    obs = env.reset()[0]
    actions = []
    
    for _ in range(n_samples):
        action, _ = model.predict(obs, deterministic=True)
        actions.append(action)
        obs, _, terminated, truncated, _ = env.step(action)
        if terminated or truncated:
            obs = env.reset()[0]
    
    # Calculate action distribution
    action_counts = np.bincount(actions, minlength=2)  # CartPole has 2 actions
    action_distribution = action_counts / len(actions)
    
    # Check if actions are diverse enough (neither action dominates too much)
    is_diverse = min(action_distribution) > 0.1  # At least 10% of each action
    
    result = {
        'counts': action_counts.tolist(),
        'distribution': action_distribution.tolist(),
        'is_diverse': is_diverse
    }
    
    print(f"Action counts: {action_counts}, Distribution: {action_distribution}")
    print(f"Model takes diverse actions: {is_diverse}")
    
    return result


In [6]:

def train_expert_model():
    """
    Train a DQN agent until it reliably solves CartPole-v1.
    Simplified training loop for better reliability.
    """
    print("Starting expert model training with simplified approach...")
    
    # Create environment
    env = gym.make("CartPole-v1")
    env = Monitor(env)  # Wrap with Monitor for proper stats
    
    # Initialize agent with more aggressive settings
    model = DQN(
        "MlpPolicy",
        env,
        learning_rate=0.0005,  # Reduced learning rate for stability
        buffer_size=100000,    # Larger buffer
        learning_starts=1000,
        batch_size=128,        # Larger batch size
        gamma=0.99,
        target_update_interval=500,  # Much less frequent updates for stability
        exploration_fraction=0.2,    # More exploration
        exploration_final_eps=0.05,
        policy_kwargs=dict(net_arch=[64, 64]),  # Simpler network
        verbose=0
    )
    
    print("Training for 100k timesteps...")
    model.learn(total_timesteps=100000)
    
    # Evaluate the model
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, deterministic=True)
    print(f"Evaluation: Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")
    
    # Check action diversity to ensure we're not just taking one action
    action_dist = check_action_distribution(model, env)
    
    # If model isn't good enough or lacks diversity, retrain with different settings
    if mean_reward < 195 or not action_dist['is_diverse']:
        print("First attempt didn't produce a good policy. Retraining with different settings...")
        
        # Close and remake environment
        env.close()
        env = gym.make("CartPole-v1")
        env = Monitor(env)
        
        model = DQN(
            "MlpPolicy",
            env,
            learning_rate=0.001,  # Higher learning rate 
            buffer_size=100000,
            learning_starts=5000,  # More initial random actions
            batch_size=64,
            gamma=0.99,
            target_update_interval=250,
            exploration_fraction=0.3,  # Even more exploration
            exploration_final_eps=0.1,  # Higher final exploration
            policy_kwargs=dict(net_arch=[128, 128]),  # Larger network
            verbose=0
        )
        
        print("Training for 150k timesteps...")
        model.learn(total_timesteps=150000)
        
        # Evaluate again
        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, deterministic=True)
        print(f"Second evaluation: Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")
        
        action_dist = check_action_distribution(model, env)
    
    # Save the model if it's good enough
    if mean_reward >= 195 and action_dist['is_diverse']:
        print("Saving expert model that achieves good performance with diverse actions!")
        model.save("expert_dqn.zip")
    else:
        # If we still don't have a good model, last resort approach
        print("Still struggling to train a good model. Using last-resort approach...")
        
        env.close()
        env = gym.make("CartPole-v1")
        env = Monitor(env)
        
        # Try a completely different approach with PPO for last resort
        from stable_baselines3 import PPO
        
        model = PPO(
            "MlpPolicy", 
            env,
            learning_rate=0.0003,
            n_steps=2048,
            batch_size=64,
            gae_lambda=0.95,
            gamma=0.99,
            n_epochs=10,
            ent_coef=0.01,  # Encourage exploration
            verbose=0
        )
        
        print("Training PPO for 200k timesteps...")
        model.learn(total_timesteps=200000)
        
        # Final evaluation
        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, deterministic=True)
        print(f"Final evaluation: Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")
        
        action_dist = check_action_distribution(model, env)
        
        if mean_reward >= 195 and action_dist['is_diverse']:
            print("Success with PPO! Saving expert model.")
            model.save("expert_dqn.zip")
        else:
            raise Exception("Failed to train a good expert policy after multiple attempts.")
    
    env.close()
    return model


In [7]:

def generate_expert_dataset():
    """
    Generate expert dataset, ensuring we have a good expert policy first.
    """
    model_path = "expert_dqn.zip"
    
    # Train or load the expert model
    if not os.path.exists(model_path):
        train_expert_model()
    
    # Load the trained model
    if "PPO" in open(model_path, 'rb').read(10).decode('utf-8', errors='ignore'):
        from stable_baselines3 import PPO
        model = PPO.load(model_path)
        print("Loaded PPO model")
    else:
        model = DQN.load(model_path)
        print("Loaded DQN model")
    
    # Create environment
    env = gym.make("CartPole-v1")
    
    # Verify model performance
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20, deterministic=True)
    print(f"Expert model performance: {mean_reward:.2f} ± {std_reward:.2f}")
    
    if mean_reward < 195:
        print("Loaded model doesn't perform well. Retraining...")
        train_expert_model()
        
        # Reload model
        if os.path.exists(model_path):
            if "PPO" in open(model_path, 'rb').read(10).decode('utf-8', errors='ignore'):
                from stable_baselines3 import PPO
                model = PPO.load(model_path)
            else:
                model = DQN.load(model_path)
        else:
            raise FileNotFoundError("Expert model file not found after training")
    
    # Define expert policy function
    def expert_policy(obs):
        action, _ = model.predict(obs, deterministic=True)
        return action
    
    # Collect expert transitions
    print("Generating expert dataset...")
    expert_data = collect_transitions(env, expert_policy, n_episodes=100)
    
    # Save the dataset
    np.savez_compressed(
        "expert_data.npz",
        observations=expert_data["observations"],
        actions=expert_data["actions"],
        rewards=expert_data["rewards"],
        next_observations=expert_data["next_observations"],
        dones=expert_data["dones"]
    )
    
    # Create metadata
    episode_ends = np.where(expert_data["dones"])[0]
    episode_lengths = np.diff(np.concatenate([[0], episode_ends + 1]))
    rewards_per_episode = []
    start_idx = 0
    
    for end_idx in episode_ends:
        episode_reward = np.sum(expert_data["rewards"][start_idx:end_idx+1])
        rewards_per_episode.append(float(episode_reward))
        start_idx = end_idx + 1
    
    metadata = {
        "dataset_type": "expert",
        "num_transitions": len(expert_data["observations"]),
        "num_episodes": len(episode_ends),
        "avg_episode_length": float(np.mean(episode_lengths)),
        "avg_episode_reward": float(np.mean(rewards_per_episode)),
        "action_distribution": {
            str(i): float(np.mean(expert_data["actions"] == i)) 
            for i in range(env.action_space.n)
        }
    }
    
    # Save metadata
    with open("expert_data_metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)
    
    print(f"Expert dataset saved to expert_data.npz")
    print(f"Dataset stats: {len(expert_data['observations'])} transitions, " 
          f"{metadata['avg_episode_reward']:.1f} avg reward/episode")
    
    env.close()
    return expert_data


In [8]:

def generate_random_dataset():
    """
    Generate a dataset of transitions using random actions.
    """
    # Create environment
    env = gym.make("CartPole-v1")
    env.reset(seed=42)
    
    # Define random policy function
    def random_policy(obs):
        return env.action_space.sample()
    
    # Collect random transitions
    print("Generating random dataset...")
    random_data = collect_transitions(env, random_policy, n_episodes=100)
    
    # Save the dataset
    np.savez_compressed(
        "random_data.npz",
        observations=random_data["observations"],
        actions=random_data["actions"],
        rewards=random_data["rewards"],
        next_observations=random_data["next_observations"],
        dones=random_data["dones"]
    )
    
    # Create metadata
    episode_ends = np.where(random_data["dones"])[0]
    episode_lengths = np.diff(np.concatenate([[0], episode_ends + 1]))
    rewards_per_episode = []
    start_idx = 0
    
    for end_idx in episode_ends:
        episode_reward = np.sum(random_data["rewards"][start_idx:end_idx+1])
        rewards_per_episode.append(float(episode_reward))
        start_idx = end_idx + 1
    
    metadata = {
        "dataset_type": "random",
        "num_transitions": len(random_data["observations"]),
        "num_episodes": len(episode_ends),
        "avg_episode_length": float(np.mean(episode_lengths)),
        "avg_episode_reward": float(np.mean(rewards_per_episode)),
        "action_distribution": {
            str(i): float(np.mean(random_data["actions"] == i)) 
            for i in range(env.action_space.n)
        }
    }
    
    # Save metadata
    with open("random_data_metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)
    
    print(f"Random dataset saved to random_data.npz")
    print(f"Dataset stats: {len(random_data['observations'])} transitions, " 
          f"{metadata['avg_episode_reward']:.1f} avg reward/episode")
    
    env.close()
    return random_data


In [9]:

def generate_mixed_dataset(seed=42):
    """
    Generate a mixed dataset by combining expert and random datasets.
    """
    print(f"Generating mixed dataset (seed={seed})...")
    
    # Set seed for reproducibility
    np.random.seed(seed)
    
    # Ensure both datasets exist
    if not os.path.exists("expert_data.npz"):
        print("Expert dataset not found. Generating it first...")
        generate_expert_dataset()
    
    if not os.path.exists("random_data.npz"):
        print("Random dataset not found. Generating it first...")
        generate_random_dataset()
    
    # Load expert and random datasets
    expert_data = np.load("expert_data.npz")
    random_data = np.load("random_data.npz")
    
    # Get dataset sizes
    expert_size = len(expert_data["observations"])
    random_size = len(random_data["observations"])
    
    print(f"Expert dataset: {expert_size} transitions")
    print(f"Random dataset: {random_size} transitions")
    
    # Calculate number of transitions to sample from each dataset (50% each)
    target_size = min(expert_size, random_size)
    n_expert = target_size // 2
    n_random = target_size // 2
    
    print(f"Sampling {n_expert} expert transitions and {n_random} random transitions")
    
    # Sample indices without replacement
    expert_indices = np.random.choice(expert_size, size=n_expert, replace=False)
    random_indices = np.random.choice(random_size, size=n_random, replace=False)
    
    # Extract sampled transitions
    mixed_observations = np.vstack([
        expert_data["observations"][expert_indices],
        random_data["observations"][random_indices]
    ])
    mixed_actions = np.concatenate([
        expert_data["actions"][expert_indices],
        random_data["actions"][random_indices]
    ])
    mixed_rewards = np.concatenate([
        expert_data["rewards"][expert_indices],
        random_data["rewards"][random_indices]
    ])
    mixed_next_observations = np.vstack([
        expert_data["next_observations"][expert_indices],
        random_data["next_observations"][random_indices]
    ])
    mixed_dones = np.concatenate([
        expert_data["dones"][expert_indices],
        random_data["dones"][random_indices]
    ])
    
    # Create a random permutation to shuffle the data
    perm = np.random.permutation(n_expert + n_random)
    
    # Apply the permutation to all arrays
    mixed_data = {
        "observations": mixed_observations[perm],
        "actions": mixed_actions[perm],
        "rewards": mixed_rewards[perm],
        "next_observations": mixed_next_observations[perm],
        "dones": mixed_dones[perm]
    }
    
    # Save the mixed dataset
    np.savez_compressed(
        "mixed_data.npz",
        observations=mixed_data["observations"],
        actions=mixed_data["actions"],
        rewards=mixed_data["rewards"],
        next_observations=mixed_data["next_observations"],
        dones=mixed_data["dones"]
    )
    
    # Create metadata
    metadata = {
        "dataset_type": "mixed",
        "num_transitions": len(mixed_data["observations"]),
        "expert_ratio": 0.5,
        "n_expert_transitions": n_expert,
        "n_random_transitions": n_random,
        "action_distribution": {
            str(i): float(np.mean(mixed_data["actions"] == i)) 
            for i in range(2)  # CartPole has 2 actions
        },
        "seed": seed
    }
    
    # Save metadata
    with open("mixed_data_metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)
    
    print(f"Mixed dataset saved to mixed_data.npz")
    print(f"Dataset size: {len(mixed_data['observations'])} transitions")
    
    return mixed_data


In [10]:
# Function to visualize dataset statistics
def visualize_datasets(expert_path="expert_data.npz", random_path="random_data.npz", 
                     mixed_path="mixed_data.npz"):
    """
    Visualize statistics of the generated datasets.
    
    Args:
        expert_path: Path to the expert dataset
        random_path: Path to the random dataset
        mixed_path: Path to the mixed dataset
    """
    # Load datasets
    expert_data = np.load(expert_path)
    random_data = np.load(random_path)
    mixed_data = np.load(mixed_path)
    
    # Get dataset sizes
    expert_size = len(expert_data["observations"])
    random_size = len(random_data["observations"])
    mixed_size = len(mixed_data["observations"])
    
    # Calculate episode rewards
    def calculate_episode_rewards(rewards, dones):
        episode_rewards = []
        current_reward = 0
        
        for r, d in zip(rewards, dones):
            current_reward += r
            if d:
                episode_rewards.append(current_reward)
                current_reward = 0
        
        if current_reward > 0:  # Add the last episode if not done
            episode_rewards.append(current_reward)
            
        return episode_rewards
    
    expert_episode_rewards = calculate_episode_rewards(expert_data["rewards"], expert_data["dones"])
    random_episode_rewards = calculate_episode_rewards(random_data["rewards"], random_data["dones"])
    mixed_episode_rewards = calculate_episode_rewards(mixed_data["rewards"], mixed_data["dones"])
    
    # Plot episode rewards
    plt.figure(figsize=(12, 6))
    
    plt.subplot(1, 2, 1)
    plt.hist(expert_episode_rewards, alpha=0.7, label="Expert", bins=20)
    plt.hist(random_episode_rewards, alpha=0.7, label="Random", bins=20)
    plt.hist(mixed_episode_rewards, alpha=0.7, label="Mixed", bins=20)
    plt.xlabel("Episode Reward")
    plt.ylabel("Count")
    plt.title("Distribution of Episode Rewards")
    plt.legend()
    
    # Plot action distributions
    plt.subplot(1, 2, 2)
    actions = [0, 1]
    expert_action_freqs = [np.mean(expert_data["actions"] == a) for a in actions]
    random_action_freqs = [np.mean(random_data["actions"] == a) for a in actions]
    mixed_action_freqs = [np.mean(mixed_data["actions"] == a) for a in actions]
    
    x = np.arange(len(actions))
    width = 0.2
    
    plt.bar(x - width, expert_action_freqs, width=width, label="Expert")
    plt.bar(x, random_action_freqs, width=width, label="Random")
    plt.bar(x + width, mixed_action_freqs, width=width, label="Mixed")
    plt.xlabel("Action")
    plt.ylabel("Frequency")
    plt.title("Action Distribution")
    plt.xticks(x, ["0", "1"])
    plt.legend()
    
    plt.tight_layout()
    plt.show()
    
    # Print dataset statistics
    print("Dataset Statistics:")
    print(f"{'Dataset':<10} {'Size':<10} {'Episodes':<10} {'Avg. Reward/Episode':<20} {'Avg. Episode Length':<20}")
    print("-" * 70)
    print(f"{'Expert':<10} {expert_size:<10} {len(expert_episode_rewards):<10} "
          f"{np.mean(expert_episode_rewards):<20.2f} "
          f"{expert_size/len(expert_episode_rewards):<20.2f}")
    print(f"{'Random':<10} {random_size:<10} {len(random_episode_rewards):<10} "
          f"{np.mean(random_episode_rewards):<20.2f} "
          f"{random_size/len(random_episode_rewards):<20.2f}")
    print(f"{'Mixed':<10} {mixed_size:<10} {len(mixed_episode_rewards):<10} "
          f"{np.mean(mixed_episode_rewards):<20.2f} "
          f"{mixed_size/len(mixed_episode_rewards):<20.2f}")



In [11]:
def generate_all_datasets():
    """Generate all three datasets in sequence."""
    print("Generating all datasets...")
    
    # First, ensure we have a good expert model
    print("\n===== STEP 1: GENERATING EXPERT DATASET =====")
    expert_data = generate_expert_dataset()
    
    print("\n===== STEP 2: GENERATING RANDOM DATASET =====")
    random_data = generate_random_dataset()
    
    print("\n===== STEP 3: GENERATING MIXED DATASET =====")
    mixed_data = generate_mixed_dataset()
    
    print("\n===== ALL DATASETS GENERATED SUCCESSFULLY =====")
    return expert_data, random_data, mixed_data

In [12]:
generate_all_datasets()
visualize_datasets()

Generating all datasets...

===== STEP 1: GENERATING EXPERT DATASET =====
Starting expert model training with simplified approach...
Training for 100k timesteps...
Evaluation: Mean reward: 500.00 ± 0.00
Checking action distribution...
Action counts: [100 100], Distribution: [0.5 0.5]
Model takes diverse actions: True
Saving expert model that achieves good performance with diverse actions!
Loaded DQN model




Expert model performance: 500.00 ± 0.00
Generating expert dataset...


Collecting episodes:   0%|          | 0/100 [00:00<?, ?it/s]

Collection complete: 50000 transitions, 100 episodes
Average episode reward: 500.00
Average episode length: 500.00
Action distribution in dataset: [0.5 0.5]


NameError: name 'json' is not defined