<a href="https://colab.research.google.com/github/zll134/deeplearning-tutorial/blob/master/DQN_Pong.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Gym.wrapper

将环境打包封装

将对Atari Pong游戏进行实现


In [0]:
import gym
import cv2
import numpy as np 
import gym.spaces
import collections

#实现Fire按钮的实现
class FireResetEnv(gym.Wrapper):
  def __init__(self,env=None):
    super(FireResetEnv,self).__init__(env);
    assert env.unwrapped.get_action_meanings()[1]=="FIRE"
    assert len(env.unwrapped.get_action_meanings())>=3
  def step(self,action):
    return self.env.step(action)
  def reset(self):
    self.env.reset();
    obs,_,done,_=self.env.step(1)
    if done:
      self.env.reset();
    obs,_,done,_=self.env.step(2)
    if done :
      self.env.reset()
    return obs

#实现K帧的合并,
class MaxAndSkipEnv(gym.Wrapper):
  """值返回第 skip th 的帧"""
  def __init__(self,env=None,skip=4):
    super(MaxAndSkipEnv,self).__init__(env);
    #使用最新的观察参数
    self._obs_buffer=collections.deque(maxlen=2)
    self._skip=skip
  def step(self,action):
    total_reward=0.0;
    done=None;
    for _ in range(self._skip):
      obs,reward,done,info =self.env.step(action)
      self._obs_buffer.append(obs)
      total_reward+=reward
      if done :
        break
    #输出两个连续帧中较大的那个
    max_frame=np.max(np.stack(self._obs_buffer),axis=0)
    return max_frame,total_reward,done,info
  def reset(self):
    self._obs_buffer.clear()
    obs = self.env.reset()
    self._obs_buffer.append(obs)
    return obs

#将210*160分辨率的仿真器转化为84*84*1的灰度图
class ProcessFrame84(gym.ObservationWrapper):
  def __init__(self,env=None):
    super(ProcessFrame84,self).__init__(env)
    self.observation_space=gym.spaces.Box(low=0,high=255,shape=[84,84,1],dtype=np.int8)
  def observation(self,obs):
    return ProcessFrame84.process(obs)
  @staticmethod
  def process(frame):
    if frame.size==210*160*3:
      img=np.reshape(frame,[210,160,3]).astype(np.float32)
    elif frame.size==250*160*3:
      img=np.reshape(frame,[250,160,3]).astype(np.float32)
    else :
      assert False,"Unknown Resolution"
    img=img[:,:,0]*0.29+img[:,:,1]*0.59+img[:,:,2]*0.11
    resized_screen=cv2.resize(img,(84,110),interpolation=cv2.INTER_AREA)
    x_t=resized_screen[18:102,:]
    x_t=np.reshape(x_t,[84,84,1])
    return x_t.astype(np.uint8)

#对buffer进行处理
class BufferWrapper(gym.ObservationWrapper):
  def __init__(self,env,n_steps,dtype=np.float32):
    super(BufferWrapper,self).__init__(env)
    self.dtype=dtype
    old_space=env.observation_space
    self.observation_space=gym.spaces.Box(old_space.low.repeat(n_steps,axis=0),old_space.high.repeat(n_steps,axis=0),dtype=dtype)
  def reset(self):
    self.buffer=np.zeros_like(self.observation_space.low,dtype=self.dtype)
    return self.observation(self.env.reset())
  def observation(self,observation):
    self.buffer[:-1]=self.buffer[1:]
    self.buffer[-1] = observation
    return self.buffer

#将图片转化为pytorch 需要的格式
class ImageToPyTorch(gym.ObservationWrapper):
  def __init__(self, env):
    super(ImageToPyTorch, self).__init__(env)
    old_shape = self.observation_space.shape
    self.observation_space = gym.spaces.Box(low=0.0, high=1.0,
    shape=(old_shape[-1], old_shape[0], old_shape[1]),dtype=np.float32)
  def observation(self, observation):
    return np.moveaxis(observation, 2, 0)

class ScaledFloatFrame(gym.ObservationWrapper):
  def observation(self, obs):
    return np.array(obs).astype(np.float32) / 255.0

def make_env(env_name):
  env = gym.make(env_name)
  env = MaxAndSkipEnv(env)
  env = FireResetEnv(env)
  env = ProcessFrame84(env)
  env = ImageToPyTorch(env)
  env = BufferWrapper(env, 4)
  return ScaledFloatFrame(env)




### DQN 模型

神经网络是三个卷积层加上两个全连接层

In [0]:
import torch 
import torch.nn as nn
import numpy as np
import torch.optim as optim

import collections

#DQN 模型
class DQN(nn.Module):
  def __init__(self,input_shape,n_actions):
    super(DQN,self).__init__();
    self.conv=nn.Sequential(
        nn.Conv2d(input_shape[0],32,kernel_size=8,stride=4),
        nn.ReLU(),
        nn.Conv2d(32,64,kernel_size=4,stride=2),
        nn.ReLU(),
        nn.Conv2d(64,64,kernel_size=3,stride=1),
        nn.ReLU()
    )
    
    conv_out_size=self.get_conv_out(input_shape)
    self.fc=nn.Sequential(
        nn.Linear(conv_out_size,512),
        nn.ReLU(),
        nn.Linear(512,n_actions)
    )
  def get_conv_out(self,shape):
    o=self.conv(torch.zeros(1,*shape))
    return int(np.prod(o.size()))
  def forward(self,x):
    conv_out=self.conv(x).view(x.size()[0],-1)
    return self.fc(conv_out)

#超参数
gamma=0.99

#rerplay buffer的代码
Experience=collections.namedtuple('Experience',field_names=['state','action','reward','done','new_state'])

class ExperienceBuffer:
  def __init__(self,capacity):
    self.buffer=collections.deque(maxlen=capacity)
  def __len__(self):
    return len(self.buffer)
  def append(self,experience):
    self.buffer.append(experience)
  def sample(self,batch_size):
    indices=np.random.choice(len(self.buffer),batch_size,replace=False)
    states,actions,rewards,dones,next_states=zip(*[self.buffer[idx] for idx in indices])
    return np.array(states),np.array(actions),np.array(rewards, dtype=np.float32),np.array(dones, dtype=np.int8),np.array(next_states)

#与环境相互作用的agent
class Agent:
  def __init__(self,env,exp_buffer):
    self.env=env;
    self.exp_buffer=exp_buffer
    self._reset();
  def _reset(self):
    self.state=self.env.reset()
    self.total_reward=0.0
  def play_step(self,net,epsilon=0.0,device='cpu'):
    done_reward=None

    #epsilon-greedy算法
    if np.random.random()<epsilon:
      action=env.action_space.sample()
    else:
      state_a=np.array([self.state],copy=False)
      state_v=torch.tensor(state_a).to(device)
      q_vals_v=net(state_v)
      _,action_v=torch.max(q_vals_v,dim=1)
      action=int(action_v.item())

    #产生动作
    new_state,reward,is_done,_=self.env.step(action)
    self.total_reward+=reward
    exp=Experience(self.state,action,reward,is_done,new_state)
    self.exp_buffer.append(exp)
    self.state=new_state
    if is_done:
      done_reward=self.total_reward
      self._reset()
    return done_reward

#计算损失
def calc_loss(batch,net,tgt_net,device='cpu'):
  states,actions,rewards,dones,new_states=batch
  #将numpy array 放入gpu中进行计算
  states_v=torch.tensor(states).to(device)
  actions_v=torch.tensor(actions).to(device)
  rewards_v=torch.tensor(rewards).to(device)
  next_states_v=torch.tensor(new_states).to(device)
  done_mask=torch.BoolTensor(dones).to(device)

  #计算Q值和target
  state_action_values=net(states_v).gather(1,actions_v.unsqueeze(-1)).squeeze(-1)
  next_states_values=tgt_net(next_states_v).max(1)[0]
  next_states_values[done_mask]=0.0#对于已经完成的就算0
  next_states_values=next_states_values.detach()#不进行梯度更新
  expected_state_action_value=next_states_values*gamma+rewards_v#Qlearning的更新规则
  return nn.MSELoss()(state_action_values,expected_state_action_value)

In [0]:
!pip install tensorboardX

### 开始训练

In [0]:
from tensorboardX import SummaryWriter
import time

#环境名字
env_name="PongNoFrameskip-v4"
mean_reward_bound=19.5
device='cuda'

#超参数

replay_size=10000
epsilon_decay_last_frame=10**5
epsilon_start=1.0
epsilon_final=0.02
replay_start_size=10000
learning_rate=1e-4
sync_target_frame=1000
mean_reward_bound=19.5
batch_size=32

#环境与网络

env=make_env(env_name)

net=DQN(env.observation_space.shape,env.action_space.n).to(device)
tgt_net=DQN(env.observation_space.shape,env.action_space.n).to(device)

writer=SummaryWriter(comment='-')
buffer=ExperienceBuffer(replay_size)

agent=Agent(env,buffer)
epsilon=epsilon_start

optimizer = optim.Adam(net.parameters(),lr=learning_rate)
total_rewards=[]
frame_idx=0
ts_frame=0
ts=time.time()
best_mean_reward=None


#开始循环
while True:
  frame_idx+=1
  #epsilon线性递减
  epsilon=max(epsilon_final,epsilon_start-frame_idx/epsilon_decay_last_frame)
  reward = agent.play_step(net, epsilon, device=device)#buffer的操作在agent里面
  if reward is not None:
    total_rewards.append(reward)
    speed = (frame_idx - ts_frame) / (time.time() - ts)#计算每秒能处理多少帧
    ts_frame = frame_idx
    ts = time.time()
    mean_reward = np.mean(total_rewards[-100:])#表示最后100帧的平均回报
    print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
                frame_idx, len(total_rewards), mean_reward, epsilon,
                speed
            ))
    writer.add_scalar("epsilon", epsilon, frame_idx)
    writer.add_scalar("speed", speed, frame_idx)
    writer.add_scalar("reward_100", mean_reward, frame_idx)
    writer.add_scalar("reward", reward, frame_idx)

    #存储最好的模型
    if best_mean_reward is None or best_mean_reward < mean_reward:
      torch.save(net.state_dict(),env_name+"-best.dat")
      if best_mean_reward is not None:
         print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
         best_mean_reward=mean_reward
    if mean_reward > mean_reward_bound:
      print("Solved in %d frames!" % frame_idx)
      break;
  #只有replay buffer 满了才进行梯度更新
  if len(buffer) < replay_start_size:
    continue
  if frame_idx % sync_target_frame == 0:
    tgt_net.load_state_dict(net.state_dict())
  optimizer.zero_grad()
  batch = buffer.sample(batch_size)
  loss_t = calc_loss(batch, net, tgt_net, device=device)
  loss_t.backward()
  optimizer.step()
writer.close()



910: done 1 games, mean reward -21.000, eps 0.99, speed 681.07 f/s
2035: done 2 games, mean reward -20.500, eps 0.98, speed 663.23 f/s
2881: done 3 games, mean reward -20.667, eps 0.97, speed 666.95 f/s
3751: done 4 games, mean reward -20.750, eps 0.96, speed 648.14 f/s
4636: done 5 games, mean reward -20.800, eps 0.95, speed 650.37 f/s
5504: done 6 games, mean reward -20.667, eps 0.94, speed 651.86 f/s
6402: done 7 games, mean reward -20.571, eps 0.94, speed 654.90 f/s
7273: done 8 games, mean reward -20.625, eps 0.93, speed 641.60 f/s
8095: done 9 games, mean reward -20.667, eps 0.92, speed 642.82 f/s
8875: done 10 games, mean reward -20.700, eps 0.91, speed 652.94 f/s
9725: done 11 games, mean reward -20.727, eps 0.90, speed 646.77 f/s
10547: done 12 games, mean reward -20.750, eps 0.89, speed 143.90 f/s
11388: done 13 games, mean reward -20.769, eps 0.89, speed 104.27 f/s
12298: done 14 games, mean reward -20.786, eps 0.88, speed 104.38 f/s
13200: done 15 games, mean reward -20.733

### 运行,测试

In [0]:
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"


env = make_env(args.env)
env = gym.wrappers.Monitor(env, '/tmp')

state = env.reset()
total_reward = 0.0
c = collections.Counter()

while True:
  start_ts = time.time()
  state_v = torch.tensor(np.array([state], copy=False))
  q_vals = net(state_v).data.numpy()[0]
  action = np.argmax(q_vals)
  c[action] += 1
  state, reward, done, _ = env.step(action)
  total_reward += reward
  if done:
    break
env.close()

### python 几个函数详解

##### 1、卷积神经网络的参数计算公式

```
N=(W-F+2P)/S+1
```
其中W表示输入图片的大小，F表示卷积核大小，P表示padding的像素
S表示stride



### 与上面无关，华为云obs管理

In [0]:
!wget https://cnnorth1-modelarts-sdk.obs.cn-north-1.myhwclouds.com/modelarts-1.1.3-py2.py3-none-any.whl
!pip install modelarts-1.1.3-py2.py3-none-any.whl

In [0]:
from modelarts.session import Session
session = Session(access_key='5QEDGVWFKVRYBE3TW0ND',secret_key='l9d7DkG9Nf5cBLxalMjqWRAcU4Q1OIosaieKPC4R', project_id='079c52e0430026042f71c004ba7d3ceb', region_name='cn-north-4')

In [0]:
session.download_data(bucket_path="/aistart/obs aifood baseline codes/", path="/content/drive/My Drive")

Successfully download file aistart/obs aifood baseline codes from OBS to local /content/drive/My Drive
