In [187]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym
from matplotlib import animation
import cv2
import torch
from IPython.display import display,HTML
from collections import namedtuple
import random
from torch import nn
from torch import optim
import torch.nn.functional as F 
import torch.multiprocessing as mp
from datetime import datetime

#https://shmuma.medium.com/speeding-up-dqn-on-pytorch-solving-pong-in-30-minutes-81a1bd2dff55
# import gymnasium as gym

def display_animation(anim):
    plt.close(anim._fig)
    return HTML(anim.to_jshtml())

def display_frames_as_gif(frames):
    """
    Displays a list of frames as a gif, with controls
    """
    plt.figure(figsize=(frames[0].shape[1]/36.0, frames[0].shape[0]/36.0),
               dpi=72)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames),
                                   interval=50)

    anim.save('movie_cartpole.mp4')
    return anim
#     display(display_animation(anim))

In [203]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc. 
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")
# device = torch.device("cpu")
    
# 将图像转换为64*64
class InputWrapper(gym.ObservationWrapper):
    def __init__(self, *args):
        super(InputWrapper, self).__init__(*args)
        old_space = self.observation_space
        self.obervation_space = gym.spaces.Box(
            self.observation(old_space.low),
            self.observation(old_space.high),
            dtype=np.float32
        )
        
    def observation(self, obs):
        new_obs = cv2.resize(
            obs, (64, 64)
        )
        # 转置 (210, 160, 3) -> (3, 210, 160)
        new_obs = np.moveaxis(new_obs, 2, 0)
        return new_obs.astype(np.float32)
    
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

Running on the GPU


In [204]:
# 存储经验
class ReplayMemory:
    def __init__(self, CAPACITY):
        self.capacity = CAPACITY
        self.memory = []
        self.index = 0
        
    def push(self, state, action, state_next, reward):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
            
#         if reward != 0:
#             self.memory[self.index] = Transition(state, action, state_next, reward)
#             self.index = (self.index + 1) % self.capacity
#         elif reward == 0 and 0.6 <= np.random.uniform(0,1):
#             self.memory[self.index] = Transition(state, action, state_next, reward)
#             self.index = (self.index + 1) % self.capacity
#         else:
#             if self.memory[self.index] is None:
        self.memory[self.index] = Transition(state, action, state_next, reward)
        self.index = (self.index + 1) % self.capacity
        
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)


In [205]:
class ImageModel(nn.Module):
    def __init__(self, in_channels=3, num_actions=4):
        super(ImageModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
#         conv_out_size = self._get_conv_out(in_channels)
#         o = self.conv(Variable(torch.zeros(1, *shape)))
#         int(np.prod(o.size()))
        self.fc4 = nn.Linear(1024, 512)
        self.fc5 = nn.Linear(512, num_actions)
        
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc4(x.view(x.size(0), -1)))
        return self.fc5(x)

In [206]:
# 
BATCH_SIZE = 32
CAPACITY = 10000
class Brain:
    def __init__(self, num_state, num_actions):
        self.num_actions = num_actions
        self.memory = ReplayMemory(CAPACITY)
#         print('state {}, action: {}'.format(num_state, num_actions))
#         1, 3, 64, 64]
        self.dqn = ImageModel(num_actions=num_actions).to(device)
        self.target_model = ImageModel(num_actions=num_actions).to(device)
        
        self.optimizer = optim.Adam(self.dqn.parameters(), lr = 0.00025)
        self.gamma = 0.99

    def replay(self):

        if len(self.memory) < BATCH_SIZE:
#             print('minimal batch size：', len(self.memory))
            return 0
        transitions = self.memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        
        state_batch = torch.cat(batch.state).to(device)#.cuda(non_blocking=True)
#         print('batch', state_batch.shape)
        action_batch =torch.cat(batch.action).to(device)#.cuda(non_blocking=True)
#         print('reward', batch.reward)
        reward_batch =torch.FloatTensor([[r] for r in batch.reward]).to(device)#.cuda(non_blocking=True)
#         print('batch.reward:',batch.reward)
        # 在状态S下执行动作action_batch后得到的新状态next_S
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None]).to(device)#.cuda(non_blocking=True)

#         print('state: ',state_batch.shape, 'next state:', non_final_next_states.shape )
        # 切换推理模式，计算Q值
        self.dqn.eval()
        # 计算Q值:使用gather函数从输出中提取对应于实际采取的动作的小批量变量action的输出
#         dqn_action_values = self.dqn(state_batch)
#         print('batch 2', dqn_action_values.shape)
#         next_action_values = self.target_model(non_final_next_states)
        # 对DQN做正向传播，得到q_j
        state_action_values = self.get_dqn_q(state_batch, action_batch)
#         print('action value', state_action_values.shape)
        # 最大Q值对应的动作
        next_state_action_values = self.get_dqn_q(non_final_next_states, action_batch)
        max_value_actions = self.select_argmax_action(next_state_action_values)
        # 求下一个状态下最大Q值
#         print('next state:', non_final_next_states, max_value_actions.unsqueeze(1))
        self.target_model.eval()
        next_state_values = self.get_target_q(non_final_next_states, max_value_actions.unsqueeze(1))

        expectd_state_action_values = (reward_batch + (self.gamma * next_state_values)).detach()  # Maximize Q
#         print('cur {}, dst: {}'.format(state_action_values, expectd_state_action_values))
#         print('reward_batch:', reward_batch.shape, 'exected: ', next_state_values.shape, 'squeeze:', expectd_state_action_values.unsqueeze(1).shape)
        # 切换到训练模式
        self.dqn.train()

        loss = F.smooth_l1_loss(state_action_values, expectd_state_action_values)
#         loss = F.smooth_l1_loss(state_action_values, expectd_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad() # Backpropagation: clear the tensors from previous gradients calculations.
        loss.backward() # Backpropagation: calculate the gradients.
        self.optimizer.step() # Updating the weights.
        
        self.soft_update()
        return loss

    def decide_action(self, state, episode):
        epsilon = 1/(episode + 1)   # 采用e-贪婪法逐步采用最优动作
        if epsilon < 0.1:
            epsilon = 0.1

        if epsilon <= np.random.uniform(0,1):
            self.dqn.eval()
            with torch.no_grad():
                value = self.dqn(state.to(device))
#                 print('shape:', value.shape, value2)
                action = value.max(1)[1].view(1,1)
        else:
            action = torch.LongTensor([[random.randrange(self.num_actions)]]).to(device)
        
        return action
    
    def get_dqn_q(self, state_batch, action_batch):
        # 计算Q值:使用gather函数从输出中提取对应于实际采取的动作的小批量变量action的输出
        dqn_action_values = self.dqn(state_batch)
        # state_action_values是状态S在网络模型Q下的所有动作的预测值
        return dqn_action_values.gather(1, action_batch)
    
    def get_target_q(self, state_batch, action_batch):
        # 计算Q值:使用gather函数从输出中提取对应于实际采取的动作的小批量变量action的输出
        action_values = self.target_model(state_batch)
#         print('shape:', action_values.shape, action_batch.shape)
        # state_action_values是状态S在网络模型Q下的所有动作的预测值
        return action_values.gather(1, action_batch)
    
    def select_argmax_action(self, q):
        # 选择使q最大的那个动作
        non_final_mask = torch.ByteTensor(tuple(map(lambda s: s is not None, q)))
        next_state_values = torch.zeros(BATCH_SIZE).to(device)
        # 最大q值对应的动作
#         print('max action', q.shape)
        return q.max(1)[1].detach()
#         return q.max(1)[1].detach()
    
    def soft_update(self, tau=0.01):
        for target_param, param in zip(self.target_model.parameters(), self.dqn.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

In [207]:
class Agent:
    def __init__(self, num_state, num_actions):
        self.brain = Brain(num_state, num_actions)
        
    def update_q_function(self):
        return self.brain.replay()
    
    def get_action(self, state, episode):
        return self.brain.decide_action(state, episode)
    
    def memorize(self, state, action, state_next, reward):
        self.brain.memory.push(state, action, state_next, reward)

In [208]:
num_episodes = 100000
max_steps = 2000

class Environment:
    def __init__(self):
        self.env = InputWrapper(gym.make("BreakoutNoFrameskip-v4", render_mode='rgb_array'))
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n
        # 0-不动 1- 重置 2-向右 3- 向左
#         print('action', self.num_actions)
#         print('num_states', self.num_states)
#         print('observation_space', self.env.observation_space)
        self.agent = Agent(self.num_states, self.num_actions)
        
    def run(self):
        record = False
        complete_episode = 0
        max_record = 0

        for episode in range(num_episodes):
            observation = self.env.reset()[0]
#             print('observation', observation.shape)
            state = observation
            state = torch.from_numpy(state).type(torch.FloatTensor)
            state = torch.unsqueeze(state,0)
            self.frames = []
            
            total_reward = 0
            for step in range(0, max_steps):
                self.frames.append(self.env.render())
                action = self.agent.get_action(state, episode)
                observation_next, reward, terminated, done, info = self.env.step(action)
                total_reward += reward

                if done:
                    reward = 100
                else:
                    reward = reward/500
                if info['lives'] >=2:
                    state_next = observation_next
                    state_next = torch.from_numpy(state_next).type(torch.FloatTensor)
                    state_next = torch.unsqueeze(state_next,0)

                    self.agent.memorize(state.to(device), action.to(device), state_next.to(device), reward/50)
                    loss = self.agent.update_q_function()
                    state = state_next            # 保存当前状态到历史记录中
                else:
                    break

            if episode % 50 == 0:
                print('{} {} Episode: finish after {} steps, total reward {}, last loss: {}'.format(datetime.now().strftime('%H:%M:%S'), episode, step + 1, total_reward, loss))
            if total_reward > max_record:
                display_frames_as_gif(self.frames)
                max_record = total_reward

            if complete_episode >= 10 or episode == num_episodes - 2:
                record = True
            
    def display(self):
        gif = display_frames_as_gif(self.frames)
        display(display_animation(gif))

In [None]:
# env = gym.make('GymV26Environment-v0', env_id = "BreakoutNoFrameskip-v4")
# env = gym.make("Breakout-v0")
env = Environment()
env.run()
env.display()

00:13:15 0 Episode: finish after 692 steps, total reward 2.0, last loss: 5.9985344705637544e-05
00:17:50 50 Episode: finish after 392 steps, total reward 0.0, last loss: 5.482451342686545e-06
00:21:29 100 Episode: finish after 439 steps, total reward 0.0, last loss: 7.901838898760616e-07
00:25:24 150 Episode: finish after 399 steps, total reward 0.0, last loss: 2.6894166147567455e-12
00:28:38 200 Episode: finish after 598 steps, total reward 1.0, last loss: 7.078169994567673e-11
00:31:43 250 Episode: finish after 597 steps, total reward 1.0, last loss: 1.0844474623850076e-11


In [None]:
a = torch.randn(2, 1)
# a = torch.randn(2, 4)
a

In [152]:
torch.FloatTensor([[2],[1]]) + 1*a

tensor([[ 1.5453],
        [-0.1009]])

In [151]:
1 -0.4547

0.5453

In [11]:
torch.max(a, 1)

torch.return_types.max(
values=tensor([0.7627, 1.2305]),
indices=tensor([3, 2]))

In [12]:
non_final_mask = torch.ByteTensor(tuple(map(lambda s: s is not None, a)))

In [13]:
tuple(map(lambda s: s is not None, a))

(True, True)

In [14]:
nsv = torch.zeros(2)
# 求下一个状态下最大Q值
nsv[non_final_mask] = torch.max(a, 1)[0].detach()
print(nsv)

tensor([0.7627, 1.2305])


  nsv[non_final_mask] = torch.max(a, 1)[0].detach()


In [15]:
nsv.unsqueeze(1)

tensor([[0.7627],
        [1.2305]])