### Imports

In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

: 

### Actor Definition

In [110]:
class Actor(nn.Module):
    def __init__(self, state_dimensions, action_dimensions, max_action):
        super(Actor, self).__init__()

        # first layer of the neural network, state_dimension amt of neurons to 256 neurons
        self.first_layer = nn.Linear(state_dimensions, 256)


        self.second_layer = nn.Linear(256, 256)

        # 256 neurons to action_dimension amt of neurons
        self.third_layer = nn.Linear(256, action_dimensions)

        # the scale of the action (-1, 1) for our purposes
        self.max_action = max_action


    def forward(self, state):
        # returns a "tensor" (a matrix with a single column) representing the output of the first layer
        x = torch.relu(self.first_layer(state))
        x = torch.relu(self.second_layer(x))

        # returns a tensor with one column and eight rows, each item represents the
        # continous action to take on a given control (left axis, right axis, etc)
        action = torch.tanh(self.third_layer(x)) * self.max_action
        return action

### Critic Definition

In [111]:
class Critic(nn.Module):
    def __init__(self, state_dimensions, action_dimensions):
        super(Critic, self).__init__()

        # a tad different from the neural network setup in the actor, instead of taking in states and outputting actions
        # we are taking in a state and an action while outputting a single number (the Q-value)
        self.first_layer = nn.Linear(state_dimensions + action_dimensions, 256)
        self.second_layer = nn.Linear(256, 256)
        self.third_layer = nn.Linear(256, 1)

    def forward(self, state, action):

        # takes the 8 pieces of the state and the 2 pieces of the action and shapes it into a tensor with 10 neurons
        # these neurons altogether tell the network the state and the action taken in that state
        x = torch.cat([state, action], 1)

        # then we run through the neural network
        x = torch.relu(self.first_layer(x))
        x = torch.relu(self.second_layer(x))

        # eventually ending up with a single value (the Q-value)
        value = self.third_layer(x)
        return value

### Replay Buffer Definition

In [112]:
# the purpose of this replay buffer is to further randomize data
# primarily for the purpose of eliminating learning by correlation,
# but also it serves as quite the computational speedup, as the "batches"
# that we run through the critic and the actor get parallelized
class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # we gather a batch_size amount of random snapshots from the buffer
        batch = random.sample(self.buffer, batch_size)

        # and we return them as numpy arrays to be tested on
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(dones)

### Hyper Parameters

In [113]:
batch_size = 64 # the amount of snapshots to be run through the critic at once
gamma = 0.99  # the discount factor
update_rate = 0.001  # the learning rate
target_update_rate = 0.0005  # the update rate rate of the target network
exploration_noise = 0.1 # the rate at which random actions occur

### Initializations

In [114]:
# initialize environment
env = gym.make("LunarLanderContinuous-v2")
state_dimensions = env.observation_space.shape[0]
action_dimensions = env.action_space.shape[0]
max_action = env.action_space.high[0]

# initialize networks
actor = Actor(state_dimensions, action_dimensions, max_action)
critic = Critic(state_dimensions, action_dimensions)
target_actor = Actor(state_dimensions, action_dimensions, max_action)
target_critic = Critic(state_dimensions, action_dimensions)

# copy weights to target networks
target_actor.load_state_dict(actor.state_dict())
target_critic.load_state_dict(critic.state_dict())

# initialize optimizers, to tune the weights of the networks
actor_optimizer = optim.Adam(actor.parameters(), lr=update_rate)
critic_optimizer = optim.Adam(critic.parameters(), lr=update_rate)

# initialize replay buffer
replay_buffer = ReplayBuffer(max_size=1000000)

### Main Training Loop

In [115]:
'''### for graphs and stuff
episode_rewards = []
current_Q_means = []
target_Q_means = []
actor_losses = []
critic_losses = []
###

# we are going to run this many attempts at landing the lunar lander
num_episodes = 1000
episode_reward_dict = {}
for episode in range(num_episodes):

    # each of those steps will start the same way
    state, _ = env.reset()
    episode_reward = 0
    done = False

    # while the lander hasnt landed
    while not done:

        # we have the actor select an action
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        action = actor(state_tensor).detach().numpy()[0]

        # add a bit of randomness to it
        action = action + np.random.normal(0, exploration_noise, size=action_dimensions)

        # make sure it is still between -1 and 1
        action = np.clip(action, -max_action, max_action)

        # and take that action
        next_state, reward, done, _, info = env.step(action)
        episode_reward += reward

        # that snapshot (state, action, reward, net_state, done) then gets stored in the buffer
        replay_buffer.add(state, action, reward, next_state, done)

        # and we enter that state
        state = next_state

        # once we have a predetermined amount of snapshots
        if len(replay_buffer.buffer) > batch_size:

            # we random select a predetermined amount of snapshots
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

            # and turn them into tensors, so they can be used as input to the neural networks
            states = torch.FloatTensor(states)
            actions = torch.FloatTensor(actions)
            rewards = torch.FloatTensor(rewards).unsqueeze(1)
            next_states = torch.FloatTensor(next_states)
            dones = torch.FloatTensor(dones).unsqueeze(1)

            # we then see what the target critic deems these [state, action] pairs' q-values
            with torch.no_grad(): # ensures we dont update the target
                next_actions = target_actor(next_states)
                target_Q = target_critic(next_states, next_actions)
                target_Q = rewards + (1 - dones) * gamma * target_Q
                target_Q_means.append(target_Q.mean().item()) # for graphs!

            # and we see what the current critic things those q-values are
            current_Q = critic(states, actions)
            current_Q_means.append(current_Q.mean().item()) # for graphs!

            # using the current critic's Q-value and the target critic's Q-value, we caluate the loss
            critic_loss = nn.MSELoss()(current_Q, target_Q)
            critic_losses.append(critic_loss.item()) # for graphs!

            # and we use backpropogation to update the critic
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # then we use the critic's feedback on the actor's action to calculate the loss
            actor_loss = -critic(states, actor(states)).mean()
            actor_losses.append(actor_loss.item()) # for graphs!

            # and we use backpropogation to update the actor
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # after that, we provide a smaller update to the target actor and target critic
            for target_param, param in zip(target_critic.parameters(), critic.parameters()):
                target_param.data.copy_(target_update_rate * param.data + (1 - target_update_rate) * target_param.data)
            for target_param, param in zip(target_actor.parameters(), actor.parameters()):
                target_param.data.copy_(target_update_rate * param.data + (1 - target_update_rate) * target_param.data)

    episode_rewards.append(episode_reward) # for graphs!
    print(f"curr episode: {episode}, reward: {episode_reward}")
    episode_reward_dict[episode] = reward

env.close()
'''


'### for graphs and stuff\nepisode_rewards = []\ncurrent_Q_means = []\ntarget_Q_means = []\nactor_losses = []\ncritic_losses = []\n###\n\n# we are going to run this many attempts at landing the lunar lander\nnum_episodes = 1000\nepisode_reward_dict = {}\nfor episode in range(num_episodes):\n\n    # each of those steps will start the same way\n    state, _ = env.reset()\n    episode_reward = 0\n    done = False\n\n    # while the lander hasnt landed\n    while not done:\n\n        # we have the actor select an action\n        state_tensor = torch.FloatTensor(state).unsqueeze(0)\n        action = actor(state_tensor).detach().numpy()[0]\n\n        # add a bit of randomness to it\n        action = action + np.random.normal(0, exploration_noise, size=action_dimensions)\n\n        # make sure it is still between -1 and 1\n        action = np.clip(action, -max_action, max_action)\n\n        # and take that action\n        next_state, reward, done, _, info = env.step(action)\n        episode_r

### Pack It Up

In [116]:
'''import pickle 

# save off the model
torch.save(actor.state_dict(), 'base_run4.pth')

# pack up the graph data
data_to_save = {
    "episode_rewards": episode_rewards,
    "current_Q_means": current_Q_means,
    "target_Q_means": target_Q_means,
    "actor_losses": actor_losses,
    "critic_losses": critic_losses
}

# save off the graph data
with open('base_run4.pkl', 'wb') as f:
    pickle.dump(data_to_save, f)'''

'import pickle \n\n# save off the model\ntorch.save(actor.state_dict(), \'base_run4.pth\')\n\n# pack up the graph data\ndata_to_save = {\n    "episode_rewards": episode_rewards,\n    "current_Q_means": current_Q_means,\n    "target_Q_means": target_Q_means,\n    "actor_losses": actor_losses,\n    "critic_losses": critic_losses\n}\n\n# save off the graph data\nwith open(\'base_run4.pkl\', \'wb\') as f:\n    pickle.dump(data_to_save, f)'