In [6]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# Neural Network for DQN
class DQNNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQNNetwork, self).__init__()
        # Assuming action_size is 1 for continuous action space
        self.fc1 = nn.Linear(state_size, 24)
        self.fc2 = nn.Linear(24, 48)
        self.fc3 = nn.Linear(48, action_size)

    def forward(self, state, action):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        q_value = self.fc3(x)
        return torch.sum(q_value * action, dim=1)  # Element-wise product and sum for Q-value

# Priority Replay Buffer
class PriorityReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
        self.priorities = deque(maxlen=capacity)

    def add(self, experience, priority):
        self.buffer.append(experience)
        self.priorities.append(priority)

    def sample(self, batch_size):
        probabilities = np.array(self.priorities) / sum(self.priorities)
        indices = np.random.choice(range(len(self.buffer)), batch_size, p=probabilities)
        experiences = [self.buffer[idx] for idx in indices]
        return experiences

    def size(self):
        return len(self.buffer)
    
    
    def get_importance_weights(self, b=0.4, epsilon=1e-5):
        sampling_probabilities = np.array(self.priorities) / sum(self.priorities)
        importance_weights = (1 / (len(self.buffer) * sampling_probabilities + epsilon)) ** b
        return importance_weights

# Agent
class Agent:
    def __init__(self, state_size, action_size, batch_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = PriorityReplayBuffer(10000)
        self.batch_size = batch_size
        self.gamma = 0.99  # discount factor
        self.epsilon = 1.0  # exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.model = DQNNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

    def select_action(self, state, noise_scale=0.2):
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            # Get the action from the network
            action = self.model(state, torch.FloatTensor([[1]])).item()
        # Add noise for exploration
        action += noise_scale * np.random.randn()
        # Clip the action to be within the valid range
        return np.clip(action, -2.0, 2.0)

    def store_experience(self, state, action, reward, next_state, done):
        # Calculate TD error
        current_q = self.model(torch.FloatTensor(state).unsqueeze(0))[0][action].item()
        next_q = max(self.model(torch.FloatTensor(next_state).unsqueeze(0)).detach().numpy()[0])
        td_error = abs(reward + (self.gamma * next_q * (not done)) - current_q)

        self.memory.add((state, action, reward, next_state, done), td_error)

    def update_network(self):
        if self.memory.size() < self.batch_size:
            return
        experiences = self.memory.sample(self.batch_size)
        importance_weights = self.memory.get_importance_weights()

        for i, (state, action, reward, next_state, done) in enumerate(experiences):
            target = reward
            if not done:
                next_state = torch.FloatTensor(next_state).unsqueeze(0)
                target = reward + self.gamma * torch.max(self.model(next_state)).item()
            state = torch.FloatTensor(state).unsqueeze(0)
            expected = self.model(state)[0][action]
            loss = (expected - target) ** 2 * importance_weights[i]  # Adjusting loss with importance weights
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

def calculate_priority(experience):
    # Implement your priority calculation here
    return 1  # Placeholder

# Training the agent
def train(num_episodes=1000):
    env = gym.make('Pendulum-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = Agent(state_size, action_size, 64)

    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0
        while True:
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.store_experience(state, action, reward, next_state, done)
            agent.update_network()
            state = next_state
            total_reward += reward
            if done:
                break
        print(f"Episode: {episode}, Total Reward: {total_reward}")

if __name__ == "__main__":
    train()


  state = torch.FloatTensor(state).unsqueeze(0)


ValueError: expected sequence of length 3 at dim 1 (got 0)