In [1]:
import gym
import os
import random
import time
import cv2
import copy
import numpy as np
import collections
import matplotlib.pyplot as plt
import json
import scipy
import argparse
from PIL import Image
from collections import deque
from tqdm import tqdm
import ray
from scipy.signal import savgol_filter
# plt.style.use('')


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
import torchvision as tv
from torch.utils.data import Dataset
import pickle

from src.common.atari_wrappers import wrap_deepmind, make_atari
from src.common.utils import LinearSchedule, DataLoaderX, DataPrefetcher, ReplayDataset
from src.common.vec_env import ShmemVecEnv, VecEnvWrapper, DummyVecEnv
from src.agents.model import NatureCNN

In [2]:

# ---------------------------------------------------------------------------
# Hyper-parameters (module-level; read directly by Actor, Agent and train).
# ---------------------------------------------------------------------------

# Parallel data collection
num_actors = 8            # number of training actors launched by train()
num_env = 16              # environments stepped together inside each actor

# Training budget
total_steps = 10_000_000  # total environment transitions
epoches = 1000            # number of logging epochs the budget is split into
steps_per_epoch = total_steps // epoches

# Replay / optimisation
replay_size = 1_000_000   # maxlen of the replay deques
batch_size = 512          # minibatch size handed to the data loader
lr = 1e-3                 # Adam learning rate
discount = 0.99           # reward discount used in the TD target
update_per_data = 8       # not referenced by the code in this notebook

# Intended target-network synchronisation period, in optimizer steps.
target_net_update_freq = 200

# Fraction of total_steps over which epsilon is annealed by the schedule.
exploration_ratio = 0.1


In [3]:
def make_env(game, episode_life=True, clip_rewards=True):
    """Create the wrapped ``<game>NoFrameskip-v4`` environment.

    Args:
        game: Atari game name without suffix, e.g. ``'Breakout'``.
        episode_life: forwarded to ``wrap_deepmind``.
        clip_rewards: forwarded to ``wrap_deepmind``.

    Returns:
        The environment returned by ``wrap_deepmind`` with frame stacking and
        image transposition enabled and scaling disabled.
    """
    env_id = f'{game}NoFrameskip-v4'
    wrapper_options = dict(
        episode_life=episode_life,
        clip_rewards=clip_rewards,
        frame_stack=True,
        scale=False,
        transpose_image=True,
    )
    return wrap_deepmind(make_atari(env_id), **wrapper_options)

In [5]:
@ray.remote(num_gpus=0.125)
class Actor:
    """Ray actor owning ``num_env`` vectorised Atari environments and a policy copy.

    Ranks below ``num_actors`` build training environments with ``make_env``'s
    defaults; any other rank builds environments with ``episode_life`` and
    ``clip_rewards`` both disabled (used by ``train`` as the tester).

    NOTE(review): ``train`` creates ``num_actors + 1`` of these, so at 0.125 GPU
    each they request more than one whole GPU when ``num_actors == 8`` --
    confirm the machine has enough GPU capacity for the tester to be scheduled.
    """

    def __init__(self, rank, game):
        """Create the environments, reset them and build the local network.

        Args:
            rank: actor index; ``rank < num_actors`` selects training envs.
            game: Atari game name passed to ``make_env``.
        """
        if rank < num_actors:
            env_fns = [lambda: make_env(game) for _ in range(num_env)]
        else:
            env_fns = [lambda: make_env(game, False, False) for _ in range(num_env)]
        self.envs = ShmemVecEnv(env_fns, context='fork')

        self.R = np.zeros(num_env)  # running (unfinished) episode return per env
        self.obs = self.envs.reset()

        # Keep the whole shape tuple. The original stored ``shape[0]`` (an int)
        # and then indexed it again, which raises TypeError.
        self.state_shape = self.envs.observation_space.shape
        self.action_dim = self.envs.action_space.n
        self.model = NatureCNN(self.state_shape[0], self.action_dim).cuda()
        self.rank = rank

    def sample(self, epsilon, state_dict):
        """Collect one chunk of epsilon-greedy transitions with the given weights.

        Args:
            epsilon: probability of taking a uniformly random action.
            state_dict: network weights loaded into the local model first.

        Returns:
            ``(local_replay, Rs, Qs, rank)`` where ``local_replay`` is a deque of
            ``(obs, action, reward, obs_next, done)`` tuples, ``Rs`` holds the
            returns of episodes that finished during this call, ``Qs`` holds the
            mean max-Q value of every step, and ``rank`` is this actor's rank.
        """
        self.model.load_state_dict(state_dict)
        steps = steps_per_epoch // (num_env * num_actors)
        Rs, Qs = [], []
        local_replay = deque(maxlen=replay_size)
        for _ in range(steps):
            action_random = np.random.randint(0, self.action_dim, num_env)
            st = torch.from_numpy(np.array(self.obs)).float().cuda() / 255.0
            # Pure inference: no autograd graph is needed here.
            with torch.no_grad():
                qs = self.model(st)
            qs_max, qs_argmax = qs.max(dim=-1)
            action_greedy = qs_argmax.tolist()
            Qs.append(qs_max.mean().item())
            action = [
                act_grd if p > epsilon else act_rnd
                for p, act_rnd, act_grd in zip(np.random.rand(num_env), action_random, action_greedy)
            ]

            obs_next, reward, done, info = self.envs.step(action)
            for entry in zip(self.obs, action, reward, obs_next, done):
                local_replay.append(entry)
            self.obs = obs_next
            self.R += np.array(reward)
            for idx, d in enumerate(done):
                if d:
                    # Episode finished: record its return and restart the tally.
                    Rs.append(self.R[idx])
                    self.R[idx] = 0
        return local_replay, Rs, Qs, self.rank

    

In [6]:
class Agent:
    """Learner holding the online/target networks, the optimizer and the replay buffer."""

    def __init__(self, game):
        """Build the networks for ``game`` and an empty replay buffer.

        Args:
            game: Atari game name without suffix, e.g. ``'Breakout'``.
        """
        # Probe the same wrapped environment the actors use, so the network
        # input channels match what the actors feed it. The original called
        # make_atari with the bare game name and then indexed an int.
        test_env = make_env(game)
        self.state_shape = test_env.observation_space.shape
        self.action_dim = test_env.action_space.n
        test_env.close()

        self.model = NatureCNN(self.state_shape[0], self.action_dim).cuda()
        self.model_target = copy.deepcopy(self.model).cuda()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)
        self.replay = deque(maxlen=replay_size)
        self.update_steps = 0
        self.device = torch.device('cuda:0')
        # Created lazily by train_step once the replay buffer holds data.
        self.prefetcher = None

    def get_datafetcher(self):
        """Return a freshly preloaded prefetcher over the current replay buffer."""
        dataset = ReplayDataset(self.replay)
        dataloader = DataLoaderX(dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
        datafetcher = DataPrefetcher(dataloader, self.device)
        datafetcher.preload()
        return datafetcher

    def append_data(self, data):
        """Append an iterable of transitions to the replay buffer."""
        self.replay.extend(data)

    def train_step(self):
        """Run one double-DQN gradient step on a minibatch from the replay buffer.

        Returns:
            The detached scalar loss tensor of this step.
        """
        data = None
        if self.prefetcher is not None:
            try:
                data = self.prefetcher.next()
            except Exception:
                # NOTE(review): how DataPrefetcher signals exhaustion is not
                # visible here; any failure is treated as "start a new pass",
                # as the original did, but KeyboardInterrupt and SystemExit
                # are no longer swallowed.
                data = None
        if data is None:
            self.prefetcher = self.get_datafetcher()
            data = self.prefetcher.next()

        states, actions, rewards, next_states, terminals = data
        states = states.float() / 255.0
        next_states = next_states.float() / 255.0
        actions = actions.long()
        terminals = terminals.float()
        rewards = rewards.float()

        with torch.no_grad():
            # Double DQN: the online net picks the action, the target net scores it.
            q_next = self.model_target(next_states)
            q_next_online = self.model(next_states)
            q_next = q_next.gather(1, q_next_online.argmax(dim=-1).unsqueeze(-1)).squeeze(-1)
            q_target = rewards + discount * (1 - terminals) * q_next

        q = self.model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        loss = F.smooth_l1_loss(q, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.update_steps += 1

        # Sync the target network using the configured period instead of a
        # hard-coded literal that ignored the config cell.
        if self.update_steps % target_net_update_freq == 0:
            self.model_target.load_state_dict(self.model.state_dict())
        return loss.detach()

In [7]:
def formated_print(var_name, xs):
    """Print the mean, standard deviation and maximum of ``xs`` on one line.

    Args:
        var_name: label printed at the start of the line.
        xs: sequence of numbers; may be empty.

    An empty ``xs`` (e.g. no episode has finished yet) prints ``nan`` for all
    three statistics instead of raising from ``np.max``.
    """
    if np.size(xs) == 0:
        mean = std = peak = float('nan')
    else:
        mean, std, peak = np.mean(xs), np.std(xs), np.max(xs)
    print("{0}\t {1:10.5f}\t{2:10.5f}\t{3:10.5f}\t".format(var_name, mean, std, peak))

def train(game):
    """Train a double-DQN agent on ``game`` with asynchronous Ray actors.

    ``num_actors`` training actors and one tester actor collect data in
    parallel. A warm-up round (epsilon 1.0 for the training actors) fills the
    replay buffer; its transitions are not counted in ``steps``. Afterwards,
    every finished actor call is folded into the replay buffer, the actor is
    relaunched with the current weights, and the agent takes 20 gradient
    steps. Training stops once ``total_steps`` transitions have been collected.
    Every 10th epoch, statistics are printed and a checkpoint is written to
    ``ckpt/{game}_e{epoch}``.

    Args:
        game: Atari game name without suffix, e.g. ``'Breakout'``.
    """
    # ignore_reinit_error lets the cell be re-run in a live kernel.
    ray.init(ignore_reinit_error=True)
    try:
        # torch.save does not create missing directories.
        os.makedirs('ckpt', exist_ok=True)

        epsilon_schedule = LinearSchedule(1.0, 0.01, int(total_steps * exploration_ratio))
        actors = [Actor.remote(rank, game) for rank in range(num_actors)]
        # The tester takes the first rank past the training actors; it needs
        # the game name too (the original call omitted it).
        tester = Actor.remote(num_actors, game)

        agent = Agent(game)

        def launch_all(eps):
            """Start one sample() call on every training actor and on the tester."""
            weights = agent.model.state_dict()
            ops = [a.sample.remote(eps, weights) for a in actors]
            ops.append(tester.sample.remote(0.01, weights))
            return ops

        TRRs, RRs, QQs, LLs = [], [], [], []
        for local_replay, Rs, Qs, rank in ray.get(launch_all(1.0)):
            if rank < num_actors:
                agent.append_data(local_replay)
                RRs += Rs
                QQs += Qs
            else:
                TRRs += Rs

        # No episode may have finished during the short warm-up; np.max would
        # raise on an empty list.
        for label, values in (("Warming up reward:", RRs), ("Warming up Qmax:", QQs)):
            if values:
                print(label, np.mean(values), np.std(values), np.max(values))
            else:
                print(label, "no data yet")

        # Launch fresh jobs. Reusing the warm-up refs would hand the same
        # results back from ray.wait, duplicating the warm-up data and never
        # re-queuing the tester.
        epsilon = 1.0
        sample_ops = launch_all(epsilon)

        steps = 0
        epoch = 0
        tic = time.time()
        while steps < total_steps:
            done_id, sample_ops = ray.wait(sample_ops)
            local_replay, Rs, Qs, rank = ray.get(done_id[0])

            if rank < num_actors:
                # Actor
                agent.append_data(local_replay)
                steps += len(local_replay)
                epsilon = epsilon_schedule(len(local_replay))
                sample_ops.append(actors[rank].sample.remote(epsilon, agent.model.state_dict()))

                RRs += Rs
                QQs += Qs
            else:
                # Tester
                sample_ops.append(tester.sample.remote(0.01, agent.model.state_dict()))
                TRRs += Rs

            # Trainer
            # NOTE(review): the config cell defines update_per_data = 8, which
            # is unused; confirm whether it was meant to replace this 20.
            for _ in range(20):
                loss = agent.train_step()
                LLs.append(loss)

            if (steps // steps_per_epoch) > epoch:
                if epoch % 10 == 0:
                    toc = time.time()
                    print("=" * 100)
                    print(f"Epoch: {epoch:5d}\t Steps: {steps:10d}\t Average Speed: {steps / (toc - tic):8.2f}\t Epsilon: {epsilon}")
                    print('-' * 100)
                    formated_print("EP Training Reward", RRs[-1000:])
                    # Slice before stacking so only the last 1000 losses are copied.
                    formated_print("EP Loss           ", torch.stack(LLs[-1000:]).tolist())
                    formated_print("EP Test Reward    ", TRRs[-1000:])
                    formated_print("EP Qmax           ", QQs[-1000:])

                    print("=" * 100)
                    print(" " * 100)

                    torch.save({
                        'model': agent.model.state_dict(),
                        'optim': agent.optimizer.state_dict(),
                        'epoch': epoch,
                        'epsilon': epsilon,
                        'steps': steps,
                        'Rs': RRs,
                        'TRs': TRRs,
                        'Qs': QQs,
                        'Ls': LLs,
                    }, f'ckpt/{game}_e{epoch}')

                epoch += 1
    finally:
        # Release the actors and their GPU reservations, including on
        # KeyboardInterrupt, so a re-run can schedule new actors.
        ray.shutdown()

In [8]:
if __name__ == '__main__':
    # Entry point: run the training loop defined in train() on a single game.
    game = 'Breakout'
    train(game)

2020-08-03 05:26:20,060	INFO resource_spec.py:212 -- Starting Ray with 231.01 GiB memory available for workers and up to 103.01 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-08-03 05:26:20,681	INFO services.py:1165 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m
Exception in thread Thread-86:
Traceback (most recent call last):
  File "/home/bzhou/miniconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/bzhou/miniconda3/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/home/bzhou/miniconda3/lib/python3.6/site-packages/torch/utils/data/_utils/pin_memory.py", line 25, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/home/bzhou/miniconda3/lib/python3.6/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/home/bzhou/miniconda3/lib/python3.6/site-p

[2m[36m(pid=46936)[0m Logging to /tmp/openai-2020-08-03-05-26-22-628167
[2m[36m(pid=46936)[0m Creating dummy env object to get spaces
[2m[36m(pid=46948)[0m Logging to /tmp/openai-2020-08-03-05-26-22-626287
[2m[36m(pid=46948)[0m Creating dummy env object to get spaces
[2m[36m(pid=46952)[0m Logging to /tmp/openai-2020-08-03-05-26-22-615337
[2m[36m(pid=46952)[0m Creating dummy env object to get spaces
[2m[36m(pid=46962)[0m Logging to /tmp/openai-2020-08-03-05-26-22-646089
[2m[36m(pid=46962)[0m Creating dummy env object to get spaces
[2m[36m(pid=46960)[0m Logging to /tmp/openai-2020-08-03-05-26-22-625368
[2m[36m(pid=46960)[0m Creating dummy env object to get spaces
[2m[36m(pid=46954)[0m Logging to /tmp/openai-2020-08-03-05-26-22-614172
[2m[36m(pid=46954)[0m Creating dummy env object to get spaces
[2m[36m(pid=46956)[0m Logging to /tmp/openai-2020-08-03-05-26-22-639788
[2m[36m(pid=46956)[0m Creating dummy env object to get spaces
[2m[36m(pid=46958)

KeyboardInterrupt: 

In [None]:
# plt.plot(TRs)


In [None]:
# LLs = []
# RRs = []
# for epoch in range(10):
#     model, model_target, Ls = train(model, model_target)
#     # Rs = test(model)
#     LLs += Ls
#     # RRs += Rs



In [None]:
# plt.plot(Ls)

In [None]:
# import pickle
# with open('replay.pkl', 'wb') as f:
#     pickle.dump(replay, f, pickle.HIGHEST_PROTOCOL)