In [1]:
import gym
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" #### REMOVE THIS LINE WHEN CUDA CONFIG IS FIXED
import tensorflow as tf
import numpy as np
import sys
import time
import gym_snake
import json
import importlib
import random
from PIL import Image
from tqdm import tqdm

sys.path.insert(0, '..')
from utils.Buffer import ReplayBuffer
from rl.models import get_policy_architecture, get_value_architecture
from algos.PPO import PPO_agent
from algos.DQN import DQN_agent

# %load_ext line_profiler
%matplotlib notebook

In [2]:
# tetris = importlib.import_module('pytris-effect.src.gameui')

In [3]:
# Notebook-wide run settings.
# run_name picks the JSON config file (../configs/<run_name>.json) and is also
# compared against 'tetris' when the environment is built.
run_name = 'cartpole'
# Mode selector; later cells compare it against 'train', 'evaluate' and 'test'.
action = 'train'
# Tuple of algorithm tags; later cells test membership of 'DQN' / 'PPO' and the
# whole tuple is forwarded to get_policy_architecture.
algo = ('DQN', 'Dueling')

In [4]:
# Checkpoints are stored one directory above the notebook.
ckpt_folder = os.path.join('..', 'checkpoints')

# Every run has its own JSON config: ../configs/<run_name>.json
cfg_fp = os.path.join('..', 'configs', '{}.json'.format(run_name))
with open(cfg_fp) as f:
    config = json.load(f)

In [5]:
# Build the environment named in the config.
env_name = config['env']
if run_name == 'tetris':
    # Import the Tetris UI lazily, only when it is actually needed: the
    # module-level import is commented out, so without this line the branch
    # would fail with NameError on a fresh kernel.
    tetris = importlib.import_module('pytris-effect.src.gameui')
    env = tetris.GameUI(graphic_mode=False, its_per_sec=2, sec_per_tick=0.5)
else:
    env = gym.make(env_name)
    # Only the *presence* of the 'use_raw_env' key is checked (not its value);
    # when present, the object exposed as `.env` is used instead of the
    # one returned by gym.make.
    if 'use_raw_env' in config:
        env = env.env

In [6]:
# Sanity check: display the shape of the observation returned by env.reset().
env.reset().shape

(4,)

In [7]:
def show_img(arr, scaling=30):
    """Display an RGB image array enlarged by an integer factor.

    Every source pixel is turned into a ``scaling`` x ``scaling`` square of the
    same colour (nearest-neighbour upscaling) and the result is opened with
    ``Image.show``.

    Args:
        arr: Array-like indexed as ``arr[row, col, channel]`` with at least
            three channels. Only the first three channels are used and the
            values are cast to ``uint8``.
        scaling: Integer enlargement factor. Defaults to 30, the value that
            used to be hard-coded.

    Raises:
        ValueError: If ``arr`` is not 3-dimensional or has fewer than three
            channels.
    """
    pixels = np.asarray(arr)
    if pixels.ndim != 3 or pixels.shape[2] < 3:
        raise ValueError(
            "show_img expects an array of shape (rows, cols, >=3), got {}".format(pixels.shape)
        )
    # Cast before enlarging so the big array is only ever materialised as uint8.
    rgb = pixels[:, :, :3].astype(np.uint8)
    # Vectorised replacement for the per-pixel Python loop: repeat rows, then columns.
    data = np.repeat(np.repeat(rgb, scaling, axis=0), scaling, axis=1)
    img = Image.fromarray(data, 'RGB')
    # img.save('my.png')
    img.show()

In [8]:
if action == 'evaluate':
    %lprun -f env.drawMatrix env.drawMatrix()

In [9]:
#show_img(env.reset())

In [10]:
# Manual single-step debugging aid; flip the condition to True to use it.
if False:
    # Use a dedicated name: `action` is the notebook-wide mode selector
    # ('train' / 'evaluate' / 'test'), and overwriting it with an integer would
    # make the later mode checks silently fall through.
    debug_action = 1
    obs, reward, dn, info = env.step(debug_action)
    show_img(obs)
    print(reward, dn, info)

In [11]:
def do_step(n_actions=7):
    """Advance the global ``env`` by one uniformly random action.

    Intended as a small unit of work for timing/profiling the environment.
    When the step reports the episode as done, the environment is reset so the
    function can be called repeatedly.

    Args:
        n_actions: Number of discrete actions to sample from; an integer in
            ``range(n_actions)`` is passed to ``env.step``. Defaults to 7, the
            value that used to be hard-coded.
    """
    _, _, dn, _ = env.step(random.randrange(n_actions))
    if dn:
        env.reset()

In [12]:
#%timeit env.reset()

In [13]:
#%timeit do_step()

In [14]:
#%lprun -f env.get_obs do_step()

In [15]:
# Build the main network for this environment and algorithm variant.
model = get_policy_architecture(env_name, algo=algo)
if 'DQN' in algo:
    # DQN also gets a second network with the same architecture, passed to the
    # agent as `target`.
    # NOTE(review): per the Keras docs, clone_model creates newly initialised
    # weights rather than copying them — confirm DQN_agent syncs target with model.
    target = tf.keras.models.clone_model(model)
else:
    # Any non-DQN algo gets a separate value network (passed to PPO_agent in the next cell).
    value = get_value_architecture(env_name)

In [16]:
# Instantiate the agent matching the selected algorithm.
if 'DQN' in algo:
    agent = DQN_agent(
        model,
        # (TODO): Move args for ReplayBuffer into DQN
        # Buffer capacity comes from the config, falling back to 20000.
        ReplayBuffer(config.get("max_buf_size", 20000), mode='proportional'),
        target=target,
        env=env,
        mode=('DDQN', 'PER'), # 'PER'
        learning_rate=config['learning_rate'],
        batch_size=config['batch_size'],
        update_steps=1,
        multistep=10,
        alpha=1.0,
        beta=1.0,
        gamma=0.99,
        delta=0.0005,
        env_name=config['env_name'],
        algo_name='DQN',
        ckpt_folder=ckpt_folder,
        # NOTE(review): this name is hard-coded and does not follow `run_name`;
        # confirm that is intended when switching to another run.
        run_name='cartpole-test'
    )
elif 'PPO' in algo:
    agent = PPO_agent(
        model,
        value,
        env=env,
        learning_rate=config['learning_rate'],
        minibatch_size=config['minibatch_size'],
        epsilon=0.1,
        gamma=1.0,
        env_name=config['env_name'],
        #run_name='snake-PPO-mpi8-09-01-21-run2',
        ckpt_folder=ckpt_folder
    )
else:
    # Fail here rather than leaving `agent` undefined (or stale from an earlier
    # run of this cell), which would only surface later as an unrelated error.
    raise ValueError(
        "Unsupported algo {!r}: expected it to contain 'DQN' or 'PPO'".format(algo)
    )

In [17]:
# Read from the config and forwarded as `t_max` to collect_rollout / train below
# (presumably the per-rollout step limit — TODO confirm against the agent code).
t_max = config['t_max']

In [18]:
# Ask the agent to restore its state from a checkpoint.
# NOTE(review): behaviour when no checkpoint exists depends on the agent
# implementation — confirm it is safe on a first run.
agent.load_from_checkpoint()
# Accumulates whatever agent.train returns in the DQN training branch.
hist = []

In [19]:
# Training entry point: nothing runs unless the notebook is in 'train' mode.
if action == 'train' and 'DQN' in algo:
    def random_policy(x):
        """Ignore the observation and pick action 0 or 1 uniformly at random."""
        # NOTE(review): the 2 is hard-coded; confirm it matches the action
        # count of the environment in use.
        return np.random.choice(2)

    # fill buffer with some random samples
    for rollout_idx in tqdm(range(25)):
        agent.collect_rollout(t_max=t_max, policy=random_policy, train=False, display=False)
    # Keep whatever train() returns so it can be inspected after the run.
    hist += agent.train(epochs=config['train_epochs'], t_max=t_max, display=False)
elif action == 'train' and 'PPO' in algo:
    agent.train(epochs=config['train_epochs'], t_max=t_max, buf_size=3000, min_buf_size=600, display=False)

100%|██████████| 25/25 [00:00<00:00, 2390.84it/s]


Training epochs:   0%|          | 0/200 [00:00<?, ?it/s]

[5] Average reward: 9.8
Predicted reward: [[0.35074022 0.2276392 ]]
Buffer size: 591
Saving to checkpoint...
[10] Average reward: 10.2
Predicted reward: [[0.96714383 0.5862954 ]]
Buffer size: 642
Saving to checkpoint...
[15] Average reward: 10.4
Predicted reward: [[1.9587462 1.3376465]]
Buffer size: 694
Saving to checkpoint...
[20] Average reward: 10.2
Predicted reward: [[3.3864245 2.6292496]]
Buffer size: 745
Saving to checkpoint...
[25] Average reward: 11.2
Predicted reward: [[4.9704704 4.432596 ]]
Buffer size: 801
Saving to checkpoint...
[30] Average reward: 41.4
Predicted reward: [[8.857255 8.812306]]
Buffer size: 1008
Saving to checkpoint...
[35] Average reward: 92.2
Predicted reward: [[10.041965 10.195121]]
Buffer size: 1469
Saving to checkpoint...
[40] Average reward: 176.0
Predicted reward: [[10.967134 11.793234]]
Buffer size: 2349
Saving to checkpoint...
[45] Average reward: 185.6
Predicted reward: [[14.380096 14.279826]]
Buffer size: 3277
Saving to checkpoint...
[50] Average 

KeyboardInterrupt: 

In [None]:
# Print what agent.get_model returns for a freshly reset, preprocessed
# observation (presumably the network's per-action estimates — TODO confirm).
print(agent.get_model(agent.preprocess(env.reset())))

In [None]:
def test_rollout(t_max, env, close=True):
    import sys
    obs = agent.preprocess(env.reset())
    reward = 0
    for i in range(t_max):
        # print(agent.get_policy(obs))
        # act = agent.get_action(obs, greedy=True)[0]
        act = agent.get_action(obs, mode='greedy')[0][0]
        obs, r, dn, info = env.step(agent.action_wrapper(act))
        env.render()
        print(act, file=sys.stderr)
        time.sleep(0.05)
        obs = agent.preprocess(obs)
        reward += r
        if dn:
            break

    print("Total reward: {}".format(reward), file=sys.stderr)
    if close: env.close()

In [None]:
# In 'test' mode, run one rendered rollout of up to 10000 steps; close=False
# leaves the environment open afterwards.
if action == 'test':
    test_rollout(10000, env, close=False)

In [None]:
# agent.train(4, t_max=500, min_buf_size=10)

In [None]:
# %lprun -f agent.train agent.train(1, t_max=500, buf_size=2000, min_buf_size=10)