In [47]:
import torch
from torch import nn
from torch import optim
import gym
import matplotlib.pyplot as plt

**Example 1**

In [48]:
from torch import nn
import gym

Write a `DeepQNetwork` for `env`. The hidden size is `128`.

In [49]:
# Create the CartPole-v1 environment that the rest of the notebook interacts with.
env = gym.make('CartPole-v1')

In [50]:
class DeepQNetwork(nn.Module):
    """Fully connected network that maps an observation to one value per action.

    The network is two hidden ``Linear`` + ``ReLU`` layers of equal width
    followed by a linear output layer with no activation.

    Args:
        n_observations: Size of the input observation vector.
        n_actions: Number of output values (one per action).
        hidden_size: Width of both hidden layers. Defaults to 128, the value
            that was previously hard-coded.
    """

    def __init__(self, n_observations, n_actions, hidden_size=128):
        super().__init__()
        # Kept as a single ``nn.Sequential`` stored under ``model`` so that the
        # parameter names (``model.0.weight``, ...) do not change.
        self.model = nn.Sequential(
            nn.Linear(n_observations, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )

    def forward(self, x):
        """Return the per-action predictions for ``x``.

        ``x`` must have ``n_observations`` as its last dimension; any leading
        dimensions are passed through by the linear layers.
        """
        return self.model(x)

In [51]:
# Number of actions, read from the action space's `n` attribute.
n_actions = env.action_space.n
# Length of the first axis of the observation space's shape.
n_observations = env.observation_space.shape[0]

In [52]:
# Build the network with the input/output sizes read from the environment.
# Note: `model` is rebound to a fresh network in the Example 2 setup cell.
model = DeepQNetwork(
    n_observations=n_observations,
    n_actions=n_actions
)

**Example 2**

In [83]:
from torch import nn
from torch import optim

Write a training loop for a Deep Q-Network (no replay buffer) in which the agent always takes the action with the highest predicted reward.

**Hint**
- `env.step()` returns `new_observation, reward, done, truncated, info`

In [84]:
# Step size handed to the Adam optimizer.
LEARNING_RATE = 1e-2
# Number of episodes the training loop runs.
N_EPISODES = 100
# Discount factor applied to the best predicted value of the next state.
GAMMA = 0.9

In [85]:
# Mean-squared error between the predicted value of the chosen action and its target.
loss_func = nn.MSELoss()
# Fresh, untrained network (replaces any `model` created earlier in the notebook).
model = DeepQNetwork(n_observations=n_observations, n_actions=n_actions)
# Adam updates all of the network's parameters.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# The training loop appends one loss value per environment step.
losses = []

In [86]:
# Online Q-learning without a replay buffer: after every environment step the
# network is updated on that single transition, and the agent always acts
# greedily with respect to its current predictions.
for i_episode in range(N_EPISODES):
    # Each episode starts from a freshly reset environment.
    observation, _ = env.reset()
    observation = torch.from_numpy(observation)
    in_progress = True

    while in_progress:
        # One predicted value per action for the current state.
        predicted_reward = model(observation)
        # Greedy policy: pick the action with the highest prediction.
        action = torch.argmax(predicted_reward, dim=-1).item()

        new_observation, reward, done, truncated, info = env.step(action)
        new_observation = torch.from_numpy(new_observation)

        if done:
            # True termination: there is no future value to bootstrap from.
            target_reward = torch.tensor(reward, dtype=predicted_reward.dtype)
        else:
            # Bootstrap from the best next-state prediction; the target is
            # treated as a constant, so no gradient flows through it.
            with torch.no_grad():
                predicted_next_reward = model(new_observation)

            # torch.max with `dim` returns (values, indices); [0] is the value.
            max_predicted_next_reward = torch.max(
                predicted_next_reward, dim=-1
            )
            target_reward = float(reward) + GAMMA * max_predicted_next_reward[0]

        loss = loss_func(predicted_reward[action], target_reward)
        # Keep a detached copy of the loss for later inspection/plotting.
        loss_np = loss.detach().numpy()
        losses.append(loss_np)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Advance the state so the next step acts on what the agent just saw.
        observation = new_observation

        # Stop on termination or on truncation (e.g. the time limit); the
        # environment must be reset before it is stepped again.
        if done or truncated:
            in_progress = False

In [87]:
# Show the three most recently recorded loss values.
losses[-3:]

[array(8.719359, dtype=float32),
 array(15.052817, dtype=float32),
 array(109.01839, dtype=float32)]