In [None]:
import gym
import tensorflow as tf
import numpy as np
from algo.PPO import PPO_agent
import time
import gym_snake
import json
import os
import importlib
import random
from PIL import Image

%load_ext line_profiler
%matplotlib notebook

In [None]:
# The package path contains a hyphen ('pytris-effect'), which a plain `import`
# statement cannot spell, so the module is loaded by its dotted name as a string.
tetris = importlib.import_module('pytris-effect.src.gameui')

In [None]:
# run_name selects the JSON file under configs/ and, when it is 'tetris',
# the custom GameUI environment instead of a gym environment.
# action gates the later cells: 'evaluate' (profile drawing), 'train' or 'test'.
run_name = 'tetris'
action = 'train'

In [None]:
# Load the run configuration from configs/<run_name>.json.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the file is
# decoded the same way on every platform instead of using the locale default.
cfg_fp = os.path.join('configs', run_name + '.json')
with open(cfg_fp, 'r', encoding='utf-8') as f:
    config = json.load(f)

In [None]:
# Create the environment for this run.
# The tetris run uses the headless GameUI from pytris-effect; every other run
# goes through gym.make. When the config contains the key 'use_raw_env'
# (only the key's presence is tested, not its value) the `.env` attribute of
# the object returned by gym.make is used instead of that object itself.
env_name = config['env']
if run_name == 'tetris':
    env = tetris.GameUI(graphic_mode=False, its_per_sec=2, sec_per_tick=0.5)
else:
    env = gym.make(env_name)
    if 'use_raw_env' in config:
        env = env.env

In [None]:
# Sanity check: display the shape of the observation returned by reset().
# Note that evaluating this cell calls env.reset() as a side effect.
env.reset().shape

In [None]:
def show_img(arr, scaling=30):
    """Upscale an RGB observation and open it in the default image viewer.

    Every source pixel is blown up into a ``scaling`` x ``scaling`` square so
    that small grids become visible.

    Args:
        arr: Array indexable as ``[row, col, channel]`` with at least three
            channels; only the first three are used. Values are cast to
            ``uint8``.
        scaling: Integer upscale factor applied to both image axes.

    Raises:
        ValueError: If ``arr`` has fewer than three channels.
    """
    pixels = np.asarray(arr)[:, :, :3].astype(np.uint8)
    if pixels.shape[2] != 3:
        raise ValueError(
            "show_img expects at least 3 channels, got {}".format(pixels.shape[2])
        )
    # Vectorised nearest-neighbour upscaling: repeat rows, then columns. This
    # replaces a per-pixel Python loop over the enlarged image.
    data = pixels.repeat(scaling, axis=0).repeat(scaling, axis=1)
    img = Image.fromarray(data, 'RGB')
    img.show()

In [None]:
# In 'evaluate' mode, line-profile one call to env.drawMatrix using the
# %lprun magic from the line_profiler extension loaded in the first cell.
# NOTE(review): drawMatrix is presumably specific to the tetris GameUI env —
# confirm before running this with a gym environment.
if action == 'evaluate':
    %lprun -f env.drawMatrix env.drawMatrix()

In [None]:
#show_img(env.reset())

In [None]:
# Manual single-step smoke test; change the condition to True to run it.
# The chosen action lives in its own variable so that enabling this cell does
# not overwrite the notebook-level `action` mode string that the evaluate /
# train / test cells compare against.
if False:
    debug_action = 1
    obs, reward, dn, info = env.step(debug_action)
    show_img(obs)
    print(reward, dn, info)

In [None]:
def do_step(n_actions=7):
    """Take one uniformly random action in the global ``env``.

    The environment is reset when the step reports that the episode is done.
    Intended as a small unit of work for timing and profiling the environment.

    Args:
        n_actions: Size of the discrete action space; the action is drawn from
            ``0 .. n_actions - 1``. Defaults to 7, the value previously
            hard-coded here.
    """
    _, _, done, _ = env.step(random.randrange(n_actions))
    if done:
        env.reset()

In [None]:
#%timeit env.reset()

In [None]:
#%timeit do_step()

In [None]:
#%lprun -f env.get_obs do_step()

In [None]:
def _dense_stack(units, activation):
    """Return a fresh list of Dense hidden layers, one per entry in ``units``."""
    return [tf.keras.layers.Dense(n, activation=activation) for n in units]


def _mlp_trunk(units, activation='relu'):
    """Return a zero-argument factory producing the hidden layers of an MLP."""
    def make():
        return _dense_stack(units, activation)
    return make


def _conv_trunk(convs, units, activation):
    """Return a zero-argument factory producing Conv2D layers, Flatten, then Dense layers.

    ``convs`` is a sequence of ``(filters, kernel_size, padding)`` tuples.
    """
    def make():
        layers = [
            tf.keras.layers.Conv2D(filters, kernel, activation=activation, padding=padding)
            for filters, kernel, padding in convs
        ]
        layers.append(tf.keras.layers.Flatten())
        layers.extend(_dense_stack(units, activation))
        return layers
    return make


def _build_network(input_shape, trunk, n_outputs, output_activation):
    """Assemble Input -> trunk layers -> Dense output head into a Sequential model.

    ``trunk`` is called on every build so that each network gets its own layer
    objects (the policy and value networks must not share weights).
    """
    return tf.keras.Sequential([
        tf.keras.Input(shape=input_shape),
        *trunk(),
        tf.keras.layers.Dense(n_outputs, activation=output_activation),
    ])


# env_name -> (observation shape, hidden-layer factory, number of discrete actions)
_NETWORK_SPECS = {
    "CartPole-v0": ((4,), _mlp_trunk((16, 16)), 2),
    "MountainCar-v0": ((2,), _mlp_trunk((16, 16)), 3),
    "Acrobot-v1": ((6,), _mlp_trunk((32, 48, 16)), 3),
    "gym_snake:snake-v0": (
        (15, 15, 3),
        _conv_trunk([(32, (3, 3), 'valid')], (256, 64), 'relu'),
        4,
    ),
    # the final raid boss; 7 outputs because NO-OP is an action
    "tetris": (
        (20, 10, 3),
        _conv_trunk([(32, (2, 2), 'same'), (64, (3, 3), 'valid')], (256, 64), 'elu'),
        7,
    ),
}

# Fail fast on an unknown environment instead of leaving `model` / `value`
# undefined (or stale from an earlier kernel state) for the agent cell.
if env_name not in _NETWORK_SPECS:
    raise ValueError(
        "No network architecture defined for env {!r}; known envs: {}".format(
            env_name, sorted(_NETWORK_SPECS)
        )
    )

_input_shape, _trunk, _n_actions = _NETWORK_SPECS[env_name]

# Policy network: softmax over the discrete actions.
model = _build_network(_input_shape, _trunk, _n_actions, 'softmax')
# Value network: same hidden architecture with a single linear output. Building
# it for every environment also gives MountainCar-v0 the value network it
# previously lacked.
value = _build_network(_input_shape, _trunk, 1, None)

In [None]:
# Build the PPO agent from the policy network (`model`), the value network
# (`value`) and the environment created above.
# NOTE(review): minibatch_size is hard-coded to 20, bypassing
# config['minibatch_size'] — confirm this override is still intended.
# NOTE(review): this reads config['env_name'] while the environment itself was
# created from config['env'] — confirm both keys exist and are meant to differ.
# NOTE(review): run_name is a literal here rather than the notebook-level
# `run_name` variable; presumably it names the checkpoint/log location — verify.
agent = PPO_agent(
    model,
    value,
    env=env,
    learning_rate=config['learning_rate'],
    minibatch_size=20, #config['minibatch_size'],
    epsilon=0.1,
    env_name=config['env_name'],
    run_name='tetris-test11' # 'snake-PPO-23-33-21'
)

In [None]:
# Read t_max from the run config; it is passed to agent.train in the training
# cell (presumably a per-rollout step limit — confirm against PPO_agent.train).
t_max = config['t_max']

In [None]:
# Restore the agent from a saved checkpoint before training or testing.
# NOTE(review): this runs unconditionally; what happens when no checkpoint
# exists for this run depends on PPO_agent — confirm it is safe on a fresh run.
agent.load_from_checkpoint()

In [None]:
# Train only in 'train' mode. The epoch count and t_max come from the run
# config, while the buffer sizes are fixed in this call.
if action == 'train':
    agent.train(epochs=config['train_epochs'], t_max=t_max, buf_size=3000, min_buf_size=500)

In [None]:
def test_rollout(t_max, env, close=True):
    import sys
    obs = agent.preprocess(env.reset())
    reward = 0
    for i in range(t_max):
        # print(agent.get_policy(obs))
        act = agent.get_action(obs, greedy=True)[0]
        obs, r, dn, info = env.step(agent.action_wrapper(act))
        env.render()
        print(act, file=sys.stderr)
        time.sleep(0.05)
        obs = agent.preprocess(obs)
        reward += r
        if dn:
            break

    print("Total reward: {}".format(reward), file=sys.stderr)
    if close: env.close()

In [None]:
# In 'test' mode, run one greedy rollout of up to 10000 steps and leave the
# environment open afterwards (close=False).
if action == 'test':
    test_rollout(10000, env, close=False)

In [None]:
# agent.train(4, t_max=500, min_buf_size=10)

In [None]:
%lprun -f agent.train agent.train(1, t_max=500, buf_size=2000, min_buf_size=10)