In [None]:
! pip install wandb gymnasium tqdm torch

In [None]:
# Install system-level build tooling (build-essential, swig) through apt.
# NOTE(review): presumably needed to build the box2d extra installed in the
# next cell — confirm. Requires a Debian/Ubuntu-style host with apt-get.
! apt-get install -y build-essential swig

In [None]:
! pip install "gymnasium[box2d]"

In [None]:
from IPython import get_ipython
from IPython.display import display

import argparse
import time
import wandb
from tqdm import tqdm

import gymnasium as gym
import numpy as np

import torch

In [None]:
from config import *
from replay_buffer import *
from networks import *
from agent import *

In [None]:
# Disabled single-episode evaluation run. The code is wrapped in a bare string
# literal so none of it executes; as the only expression in the cell, the
# string itself may be echoed as the cell's output.
#
# What the snippet would do if re-enabled: create the environment with
# render_mode="human", build a DDPGAgent, optionally load saved weights when
# PATH_LOAD is not None, then play one episode with a zero noise vector and
# evaluation=True, summing every reward into `score`.
#
# NOTE(review): the snippet reads `device`, which is only assigned in a later
# cell of this notebook, so it must not be enabled at this position as-is.
'''
env = gym.make(ENV_NAME, render_mode="human")
agent = DDPGAgent(env, device)

if PATH_LOAD is not None:
    print("loading weights")
    agent.load_models()

states, _ = env.reset()
done = False
score = 0
noise = np.zeros(agent.actions_dim)
while not done:
    action = agent.get_action(states, noise, evaluation=True)
    new_states, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    score += reward
    states = new_states
print(f"Inference score: {score}")
'''

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

In [None]:
# Create the (non-vectorized) training environment and the agent that will
# act in it on the selected device.
single_env = gym.make(ENV_NAME)
agent = DDPGAgent(single_env, device)

# When experiment tracking is switched on, start a Weights & Biases run and
# attach the hyperparameters plus some descriptive run metadata to it.
if ENABLE_WANDB:
    # Hyperparameters taken from the notebook's configuration constants.
    config = {
        "learning_rate_actor": ACTOR_LR,
        "learning_rate_critic": CRITIC_LR,
        "batch_size": BATCH_SIZE,
    }
    # Descriptive, non-tunable information about this run.
    config.update({"architecture": "DDPG", "infra": "MacOS", "env": ENV_NAME})

    wandb.init(project=f"ddpg_{ENV_NAME.lower()}", config=config)

# Per-episode scores; the training loop appends one entry per game.
scores = []

In [None]:
# Training loop: play MAX_GAMES episodes in the single environment, store every
# transition in the agent's replay buffer, and run one learning step per
# environment step once the buffer holds more than BATCH_SIZE transitions.
# After each episode the score is recorded, metrics are optionally sent to
# Weights & Biases, and the models are checkpointed every SAVE_FREQUENCY games.

# Most recent learner metrics. They stay None until agent.learn() has been
# called at least once, so an episode that finishes before the replay buffer
# is warm does not hit undefined names when logging.
critic_loss = actor_loss = q_value = None

for i in tqdm(range(MAX_GAMES)):
    start_time = time.time()
    states, _ = single_env.reset()
    done = False
    score = 0.0

    # Cap the episode length at MAX_TIMESTEPS environment steps.
    for t in range(MAX_TIMESTEPS):
        actions = agent.get_action(states)
        new_states, rewards, terminated, truncated, _ = single_env.step(actions)
        done = terminated or truncated

        # Count every reward, including the one returned on the final step of
        # the episode, so the reported score covers the whole episode.
        score += rewards

        # NOTE(review): `done` is also True when the episode was merely
        # truncated. If agent.learn() uses this flag to stop bootstrapping,
        # storing `terminated` alone may be more appropriate — confirm against
        # the agent implementation.
        agent.replay_buffer.push(states, actions, rewards, new_states, done)

        states = new_states

        # Only learn once the buffer holds more than one batch of transitions.
        if agent.replay_buffer.buffer_counter > BATCH_SIZE:
            critic_loss, actor_loss, q_value = agent.learn()

        if done:
            break

    scores.append(score)

    if ENABLE_WANDB:
        metrics = {
            'Game number': i + 1,
            'Average reward (last 10 games)': np.mean(scores[-10:]),
            'Time taken': round(time.time() - start_time, 2),
        }
        # Learner metrics exist only after the first learning step.
        if critic_loss is not None:
            metrics['Critic Loss'] = critic_loss
            metrics['Actor Loss'] = actor_loss
            metrics['Average Q Value'] = q_value
        wandb.log(metrics)

    # Periodic checkpoint.
    if (i + 1) % SAVE_FREQUENCY == 0:
        print("saving...")
        agent.save_models()
        print("saved")

# Final checkpoint once all games have been played.
agent.save_models()