In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import numpy as np

In [2]:
env = gym.make('CartPole-v0')
env.reset()
env._max_episode_steps = 500

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dis = 0.98

In [4]:
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        hidden = 128
        self.loss_list = []
        self.fc1 = nn.Linear(env.observation_space.shape[0], hidden)
        self.fc_policy = nn.Linear(hidden, env.action_space.n)
        self.fc_value = nn.Linear(hidden, 1)
        self.relu = nn.ReLU()
    def forward(self, x):
        x = self.relu(self.fc1(x))
        policy = F.softmax(self.fc_policy(x))
        value = self.fc_value(x)
        return policy, value
    
    def iteration(self, optimizer, loss_list):
        loss = torch.cat(loss_list).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


In [5]:
model = ActorCritic()
optimizer = optim.Adam(model.parameters(), lr=5e-3)
gamma = 0.99

In [6]:
num_eps = 2000

rList = []
for ep in range(num_eps):
    state = env.reset()
    
    loss_list = []
    rAll = 0
    for t in range(500):
        pi, v = model(torch.tensor(state).float())
        m = Categorical(pi)
        action = m.sample()
        
        new_state, reward, done, _ = env.step(action.item())
        _, next_v = model(torch.tensor(new_state).float())
        delta = reward + gamma * next_v - v
        loss = -torch.log(pi[action]) * delta.item() + delta * delta
        loss_list.append(loss.unsqueeze(0))
        
        rAll += reward
        if done:
            break
        
        state = new_state
    
    rList.append(rAll)
    model.iteration(optimizer, loss_list)
    
    if (ep + 1) % 100 == 0:
        print(f"ep: {ep}, reward mean: {np.mean(rList)}")
        rList = []
        

  if sys.path[0] == '':


ep: 99, reward mean: 37.9
ep: 199, reward mean: 41.37
ep: 299, reward mean: 59.63
ep: 399, reward mean: 82.31
ep: 499, reward mean: 268.45
ep: 599, reward mean: 451.0
ep: 699, reward mean: 380.77
ep: 799, reward mean: 447.2
ep: 899, reward mean: 473.55
ep: 999, reward mean: 476.74
ep: 1099, reward mean: 500.0
ep: 1199, reward mean: 493.16
ep: 1299, reward mean: 496.65
ep: 1399, reward mean: 493.26
ep: 1499, reward mean: 496.11
ep: 1599, reward mean: 492.88
ep: 1699, reward mean: 496.53
ep: 1799, reward mean: 500.0


KeyboardInterrupt: 