In [1]:
import gymnasium as gym
import torch.nn as nn
import torch.optim as optim 

### Q value algorithm
+ The basic idea of the Q-value (action-value function) is to estimate the value of taking action $a_t$ in state $s_t$ at step t, using the reward received and the value of the next state that results from that action.
  + Q($s_t$, $a_t$) = $r_{t+1}$ + gamma * max(Q($s_{t+1}$, $a_{t+1}$)). The right side is called TD target
  + The Bellman update is applied iteratively, moving the old estimate toward the TD target with a learning rate alpha
    + $Q_{new}$($s_t$, $a_t$)
       = (1-alpha) * $Q_{old}$($s_t$, $a_t$) + alpha * ($r_{t+1}$ + gamma * max($Q_{old}$($s_{t+1}$, $a_{t+1}$)))
      
+ in deep RL, instead of using the iterative process, we use a deep neural network to learn for a given input state, its output corresponding to each possible action
  + the input has the dimension of the state dimension, and output has the dimension of number of actions
  + a good example is the LunarLander environment from gym where a state is defined as an eight-dimension vector corresponding to x position, y position, x velocity, y velocity, lander angle, angle velocity, left foot contacts with land (yes/no as 1/0) and right foot contacts with land (yes/no as 1/0)
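  + for the neural network, the input and output dimensions are therefore 8 and 4 respectively (the four discrete actions are: do nothing, fire left orientation engine, fire main engine, fire right orientation engine)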
#### Barebone Deep Q-network (DQN)
+ action selection
  + select the action whose Q-value output is the largest (greedy selection)
+ loss function
   + instead of the iterative update, the loss is defined directly as the squared difference between the TD target and the Q-value at (s, a), as below:
     + [($r_{t+1}$ + gamma * max(Q($s_{t+1}$, $a_{t+1}$))) - Q($s_t$, $a_t$)]$^2$
+  

In [12]:
# Create the Gymnasium "LunarLander-v3" environment that the later cells reset and step through.
# NOTE(review): `torch.nn` is already imported as `nn` in the first cell, so the re-import below is redundant.
import torch.nn as nn

env = gym.make("LunarLander-v3")

class Network(nn.Module):
    """Small fully connected Q-network: state vector in, one Q-value per action out.

    Architecture: Linear(dim_inputs, 64) -> ReLU -> Linear(64, dim_outputs).

    Args:
        dim_inputs: Number of features in the state vector.
        dim_outputs: Number of discrete actions (one output per action).
    """

    def __init__(self, dim_inputs, dim_outputs):
        super(Network, self).__init__()
        # Hidden layer followed by the output (Q-value) layer
        self.fc1 = nn.Linear(dim_inputs, 64)
        self.fc2 = nn.Linear(64, dim_outputs)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the Q-value estimates for every action given state(s) `x`."""
        # The non-linearity belongs between the two layers; without it the two
        # linear layers collapse into a single affine map.
        hidden = self.relu(self.fc1(x))
        # No activation on the output: Q-values are unbounded and may be
        # negative, so clamping them with ReLU would make them unlearnable.
        return self.fc2(hidden)

# Build a Q-network with 8 state inputs and 4 action outputs
network = Network(dim_inputs=8, dim_outputs=4)

# Adam optimizer over this network's weights, using a small learning rate.
# Note: it only updates `network`'s parameters; any other model instance
# needs an optimizer constructed from its own parameters.
optimizer = optim.Adam(params=network.parameters(), lr=1e-4)

In [13]:
import torch.nn as nn
import torch

def select_action(q_network, state):
    """Pick the greedy action for `state`.

    Args:
        q_network: Callable mapping a state tensor to a tensor of Q-values,
            one entry per action.
        state: State tensor passed to `q_network`.

    Returns:
        int: Index of the largest Q-value.
    """
    # Action selection is pure inference, so skip building an autograd graph.
    with torch.no_grad():
        q_values = q_network(state)
    return torch.argmax(q_values).item()

def calculate_loss(q_network, state, action, next_state, reward, done, gamma=0.99):
    """Squared TD error for a single transition (s, a, r, s').

    The TD target is ``reward + gamma * max_a' Q(s', a') * (1 - done)`` and the
    loss is the mean squared error between Q(s, a) and that target.

    Args:
        q_network: Callable mapping a state tensor to a 1-D tensor of Q-values.
        state: Current state tensor.
        action: Index of the action taken in `state`.
        next_state: State tensor observed after taking `action`.
        reward: Scalar reward received for the transition.
        done: Truthy when no bootstrap from `next_state` should be used.
        gamma: Discount factor.

    Returns:
        Scalar loss tensor; gradients flow only through Q(state, action).
    """
    current_state_q_value = q_network(state)[action]
    # The target is treated as a constant (semi-gradient TD): computing it
    # without gradient tracking stops the optimizer from also moving
    # Q(next_state) towards the current estimate.
    with torch.no_grad():
        next_state_q_value = q_network(next_state).max()
        target_q_value = reward + gamma * next_state_q_value * (1 - done)
    loss = nn.MSELoss()(current_state_q_value, target_q_value)

    return loss

def describe_episode(episode, episode_reward, step):
    """Print a one-line summary of an episode.

    Args:
        episode: Episode index shown in the summary.
        episode_reward: Total return shown in the summary.
        step: Number of steps shown as the episode duration.
    """
    summary = f"| Episode:   {episode} | Duration: {step} steps | Return: {episode_reward} |"
    print(summary)

# Initialize the Q-network used by the sanity check below and by the training loop
q_network = Network(8, 4)

# Bind the optimizer to the network that is actually trained. The optimizer
# created earlier was built over `network.parameters()`, so stepping it would
# never change `q_network`'s weights.
optimizer = optim.Adam(q_network.parameters(), lr=0.0001)

# Simulate a random state and next_state as 8-element vectors
state = torch.rand(8)
next_state = torch.rand(8)
action = select_action(q_network, state)
reward = 1
gamma = .99
done = False

# Sanity check: the loss for one made-up transition should be a scalar tensor
calculate_loss(q_network, state, action, next_state, reward, done, gamma)

tensor(0.9176, grad_fn=<MseLossBackward0>)

In [14]:
# Online training: one gradient step per environment step, for 10 episodes.
# NOTE(review): `optimizer.step()` only trains `q_network` if `optimizer` was
# constructed from `q_network.parameters()` - confirm in the setup cells.
for episode in range(10):
    state, info = env.reset()
    done = False
    step = 0
    episode_reward = 0
    state = torch.tensor(state)

    # Run through steps until the episode ends
    while not done:
        step += 1
        action = select_action(q_network, state)
        # Take the action
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state = torch.tensor(next_state)
        # The episode loop stops on either signal...
        done = terminated or truncated
        # ...but only a true terminal state removes the bootstrap term. On a
        # time-limit truncation the next state still has value, so the loss
        # receives `terminated` rather than `done`.
        loss = calculate_loss(q_network, state, action, next_state, reward, terminated)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Update the state
        state = next_state
        episode_reward += reward
    describe_episode(episode, episode_reward, step)

| Episode:   0 | Duration: 65 steps | Return: -131.4319903801453 |
| Episode:   1 | Duration: 61 steps | Return: -125.62647767680724 |
| Episode:   2 | Duration: 57 steps | Return: -100.59567031761136 |
| Episode:   3 | Duration: 74 steps | Return: -132.08290585974655 |
| Episode:   4 | Duration: 55 steps | Return: -94.66884058159405 |
| Episode:   5 | Duration: 80 steps | Return: -127.31781506665013 |
| Episode:   6 | Duration: 63 steps | Return: -170.77364464951967 |
| Episode:   7 | Duration: 69 steps | Return: -175.04003372369544 |
| Episode:   8 | Duration: 76 steps | Return: -109.04935455407167 |
| Episode:   9 | Duration: 77 steps | Return: -136.6436698244239 |
