In [53]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# Neural Network for DQN
class DQNNetwork(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Layer widths are ``state_size -> 24 -> 48 -> action_size`` with ReLU after
    the first two layers and no activation on the output layer.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        first_hidden, second_hidden = 24, 48
        # Layers are created in fc1, fc2, fc3 order so seeded initialisation
        # is reproducible.
        self.fc1 = nn.Linear(state_size, first_hidden)
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.fc3 = nn.Linear(second_hidden, action_size)

    def forward(self, state):
        """Return the Q-values for every action given ``state``."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        # The final layer is linear: Q-values are unbounded.
        return self.fc3(hidden)
        
# Priority Replay Buffer
class PriorityReplayBuffer:
    """Bounded replay buffer that samples entries in proportion to their priority.

    Experiences and priorities live in two parallel deques with the same
    ``maxlen``, so once ``capacity`` is reached the oldest experience and its
    priority are evicted together.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` experiences."""
        self.buffer = deque(maxlen=capacity)
        self.priorities = deque(maxlen=capacity)

    def add(self, experience, priority):
        """Append ``experience`` together with its sampling ``priority``."""
        self.buffer.append(experience)
        self.priorities.append(priority)

    def _sampling_probabilities(self):
        """Return the per-entry sampling probability as a float array.

        Probabilities are the priorities normalised to sum to one. If the
        priorities sum to zero (e.g. every entry has priority 0) the
        normalisation would produce NaN, so a uniform distribution is used
        instead.
        """
        priorities = np.asarray(self.priorities, dtype=np.float64)
        total = priorities.sum()
        if priorities.size and total == 0:
            return np.full(priorities.size, 1.0 / priorities.size)
        return priorities / total

    def sample(self, batch_size):
        """Draw ``batch_size`` experiences (with replacement) by priority.

        Returns:
            list: the sampled experiences, in draw order.

        Raises:
            ValueError: raised by ``np.random.choice`` when the buffer is
                empty and ``batch_size`` is non-zero.
        """
        probabilities = self._sampling_probabilities()
        indices = np.random.choice(len(self.buffer), batch_size, p=probabilities)
        return [self.buffer[idx] for idx in indices]

    def size(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def get_importance_weights(self, b=0.4, epsilon=1e-5):
        """Return one importance-sampling weight per stored experience.

        The weight of entry ``i`` is ``(1 / (N * P(i) + epsilon)) ** b`` where
        ``N`` is the current buffer size and ``P(i)`` its sampling probability.
        The returned array is aligned with buffer positions, not with any
        particular sampled batch.

        Args:
            b: exponent applied to the inverse probability.
            epsilon: small constant added to the denominator to avoid
                division by zero.
        """
        sampling_probabilities = self._sampling_probabilities()
        return (1 / (len(self.buffer) * sampling_probabilities + epsilon)) ** b

# Agent
class Agent:
    """Epsilon-greedy DQN agent driving a 1-D continuous action via a discrete grid.

    The Q-network has ``action_size`` outputs. Output ``k`` stands for the k-th
    of ``action_size`` evenly spaced values in ``[ACTION_LOW, ACTION_HIGH]``.
    ``select_action`` returns that continuous value (wrapped in a list), and
    ``store_experience`` maps it back to ``k`` so the replayed transition can
    be matched to a Q-network output.
    """

    # Bounds of the continuous action; the original code used [-2, 2].
    ACTION_LOW = -2.0
    ACTION_HIGH = 2.0

    def __init__(self, state_size, action_size, batch_size):
        """Build the replay memory, Q-network and optimiser.

        Args:
            state_size: length of the observation vector.
            action_size: number of discrete action levels (Q-network outputs).
            batch_size: number of transitions replayed per update.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = PriorityReplayBuffer(10000)
        self.batch_size = batch_size
        self.gamma = 0.99  # discount factor
        self.epsilon = 1.0  # exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.model = DQNNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)

    @staticmethod
    def _as_observation(state):
        """Return ``state`` as a float32 array, unwrapping an ``(obs, info)`` tuple."""
        if isinstance(state, tuple):
            state = state[0]
        return np.asarray(state, dtype=np.float32)

    def _index_to_action(self, index):
        """Map a discrete action index to its value in [ACTION_LOW, ACTION_HIGH]."""
        if self.action_size <= 1:
            # A single level has no spread; use the midpoint of the range.
            return (self.ACTION_LOW + self.ACTION_HIGH) / 2.0
        fraction = index / (self.action_size - 1)
        return float(self.ACTION_LOW + fraction * (self.ACTION_HIGH - self.ACTION_LOW))

    def _action_to_index(self, action):
        """Map a continuous action (scalar or 1-element sequence) to the nearest index."""
        if self.action_size <= 1:
            return 0
        value = float(np.asarray(action, dtype=np.float64).reshape(-1)[0])
        fraction = (value - self.ACTION_LOW) / (self.ACTION_HIGH - self.ACTION_LOW)
        index = int(round(fraction * (self.action_size - 1)))
        return min(max(index, 0), self.action_size - 1)

    def select_action(self, state):
        """Choose an action epsilon-greedily and decay epsilon by one step.

        Args:
            state: observation array, or an ``(obs, info)`` tuple.

        Returns:
            list: one float in ``[ACTION_LOW, ACTION_HIGH]`` that lies on the
            discrete action grid.
        """
        if np.random.rand() > self.epsilon:
            # Exploitation: the network is only evaluated when its output is used.
            state_tensor = torch.as_tensor(
                self._as_observation(state).reshape(1, -1), dtype=torch.float32
            )
            with torch.no_grad():
                q_values = self.model(state_tensor)
            index = int(torch.argmax(q_values).item())
        else:
            # Exploration stays on the grid so every executed action
            # corresponds to exactly one Q-network output.
            index = int(np.random.randint(self.action_size))

        # Epsilon decays on every call, i.e. once per environment step.
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

        return [self._index_to_action(index)]

    def store_experience(self, state, action, reward, next_state, done):
        """Record one transition in the replay memory.

        ``action`` is the value returned by ``select_action``; it is stored as
        its discrete index. New transitions get the largest priority currently
        in memory (1.0 when memory is empty) so they are likely to be replayed.
        """
        transition = (
            self._as_observation(state),
            self._action_to_index(action),
            float(reward),
            self._as_observation(next_state),
            bool(done),
        )
        priority = max(self.memory.priorities, default=1.0)
        self.memory.add(transition, priority)

    def _sample_batch(self):
        """Draw a prioritised batch and the importance weight of each draw.

        Indices are drawn here rather than through ``memory.sample`` because
        the weights must be looked up at the sampled buffer positions, and
        ``sample`` does not expose them.
        """
        priorities = np.asarray(self.memory.priorities, dtype=np.float64)
        probabilities = priorities / priorities.sum()
        indices = np.random.choice(len(probabilities), self.batch_size, p=probabilities)
        weights = np.asarray(self.memory.get_importance_weights())[indices]
        experiences = [self.memory.buffer[idx] for idx in indices]
        return experiences, weights

    def update_network(self):
        """Run one importance-weighted Q-learning step on a replayed batch.

        Does nothing until at least ``batch_size`` transitions are stored.
        """
        if self.memory.size() < self.batch_size:
            return

        experiences, weights = self._sample_batch()
        states, action_indices, rewards, next_states, dones = zip(*experiences)

        state_batch = torch.as_tensor(np.stack(states), dtype=torch.float32)
        next_state_batch = torch.as_tensor(np.stack(next_states), dtype=torch.float32)
        action_batch = torch.as_tensor(action_indices, dtype=torch.int64).unsqueeze(1)
        reward_batch = torch.as_tensor(rewards, dtype=torch.float32)
        not_done = 1.0 - torch.as_tensor(dones, dtype=torch.float32)
        weight_batch = torch.as_tensor(weights, dtype=torch.float32)

        # Targets are constants: no gradient flows through the bootstrap term.
        with torch.no_grad():
            next_q = self.model(next_state_batch).max(dim=1).values
        targets = reward_batch + self.gamma * next_q * not_done

        predicted = self.model(state_batch).gather(1, action_batch).squeeze(1)
        loss = (weight_batch * (predicted - targets) ** 2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

def calculate_priority(experience):
    """Return the replay priority for ``experience``.

    Stub: the argument is ignored and every experience gets priority 1.
    """
    # Implement your priority calculation here
    placeholder_priority = 1  # Placeholder
    return placeholder_priority


In [54]:
if __name__ == "__main__":
    # Train the DQN agent on Pendulum-v1 and print the return of every episode.
    env = gym.make('Pendulum-v1', g=9.81, render_mode="rgb_array")
    state_size = env.observation_space.shape[0]
    action_size = 30  # Number of discrete levels the continuous action is split into

    agent = Agent(state_size, action_size, batch_size=64)

    num_episodes = 1000  # Number of episodes to train
    max_steps = 200  # Maximum steps in an episode

    try:
        for episode in range(num_episodes):
            # Gymnasium's reset() returns (observation, info).
            state, _ = env.reset()
            total_reward = 0

            for t in range(max_steps):
                # No env.render() here: in "rgb_array" mode it only builds a
                # frame that would be thrown away, slowing every step.
                action = agent.select_action(state)
                # step() returns (obs, reward, terminated, truncated, info);
                # the action is passed as a float32 array of shape (1,).
                next_state, reward, terminated, truncated, _ = env.step(
                    np.asarray(action, dtype=np.float32)
                )
                # Only true termination is stored as `done`; a time-limit
                # truncation does not make the next state terminal.
                agent.store_experience(state, action, reward, next_state, terminated)
                agent.update_network()
                state = next_state
                total_reward += reward

                if terminated or truncated:
                    break

            print(f"Episode {episode+1}/{num_episodes}, Total Reward: {total_reward}")
    finally:
        # Release the environment even if training is interrupted.
        env.close()

Episode 1/1000, Total Reward: -1413.9347359477272
Episode 2/1000, Total Reward: -1553.2451194850046
Episode 3/1000, Total Reward: -1848.1158795689653
Episode 4/1000, Total Reward: -1191.8535815939351
Episode 5/1000, Total Reward: -923.0170103866361
Episode 6/1000, Total Reward: -1478.782590907086
Episode 7/1000, Total Reward: -1680.7193607095646
Episode 8/1000, Total Reward: -1543.3256042381183
Episode 9/1000, Total Reward: -1529.2908664532779
Episode 10/1000, Total Reward: -1466.1125997897989
Episode 11/1000, Total Reward: -1475.1712755798885
Episode 12/1000, Total Reward: -1517.7993526551234
Episode 13/1000, Total Reward: -1168.7559972419558
Episode 14/1000, Total Reward: -1330.2246415272555
Episode 15/1000, Total Reward: -1294.615569702171
Episode 16/1000, Total Reward: -1561.9860420201585
Episode 17/1000, Total Reward: -1848.5275189082838
Episode 18/1000, Total Reward: -1240.3353716701276
Episode 19/1000, Total Reward: -1561.943235354493


KeyboardInterrupt: 