In [18]:
import torch
from torch import nn
from torch import optim
import gym
import matplotlib.pyplot as plt

**Example 1**

In [10]:
from torch import nn
import gym

Write a `DeepQNetwork` for `env`. Write hidden size is `128`

In [11]:
env = gym.make('CartPole-v1')

In [12]:
class DeepQNetwork(nn.Module):
    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_observations, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions),
        )
    
    def forward(self, x):
        return self.model(x)

In [13]:
n_actions = env.action_space.n
n_observations = env.observation_space.shape[0]

In [14]:
model = DeepQNetwork(
    n_observations=n_observations,
    n_actions=n_actions
)

##### Example 2

In [82]:
from torch import nn
from torch import optim

Write a training loop for a Deep Q Network (no replay)

**Hint**
- `env.step()` returns `new_observation, reward, done, truncated, info`

In [83]:
model = DeepQNetwork(n_observations=n_observations, n_actions=n_actions)

In [84]:
LEARNING_RATE = 1e-2
N_EPISODES = 100
GAMMA = 0.9

In [85]:
loss_func = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
losses = []

In [86]:
for i_episode in range(N_EPISODES):
    observation, _ = env.reset()
    observation = torch.from_numpy(observation)
    in_progress = True
    
    while in_progress:
        predicted_reward = model(observation)
        action = torch.argmax(predicted_reward, dim=-1).item()
        
        new_observation, reward, done, truncated, info = env.step(action)
        new_observation = torch.from_numpy(new_observation)
        
        predicted_next_reward = model(new_observation)
        max_predicted_next_reward = torch.max(predicted_next_reward, dim=-1)
        
        target_reward = reward + GAMMA * max_predicted_next_reward[0]
        loss = loss_func(target_reward, predicted_reward[action])
        loss_np = loss.detach().numpy() # can be ignore
        losses.append(loss_np)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
        if done == True:
            in_progress = False
            observation, _ = env.reset()

In [87]:
losses[-3:]

[array(0.01225199, dtype=float32),
 array(0.01956193, dtype=float32),
 array(0.01937165, dtype=float32)]