# First-Visit Visualization

## Import packages

In [None]:
import matplotlib.pyplot as plt
from IPython import display

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import gym
import gym.spaces as spaces
import random
from scipy.interpolate import griddata
from PIL import Image

In [None]:
class Explorer(nn.Module):
    """Three-layer MLP that maps an observation to a probability per action.

    Args:
        obs_space: Number of input features.
        action_space: Number of discrete actions (size of the output).
        hidden_size: Width of the two hidden layers.
    """

    def __init__(self, obs_space, action_space, hidden_size):
        super().__init__()
        # Attribute names define the state_dict keys, so they are kept stable.
        self.fc1 = nn.Linear(obs_space, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_space)
        self.ReLU = nn.ReLU()

    def forward(self, state):
        """Return softmax probabilities over the last dimension of the output."""
        features = state
        for hidden_layer in (self.fc1, self.fc2):
            features = self.ReLU(hidden_layer(features))
        logits = self.fc3(features)
        return F.softmax(logits, dim=-1)

In [None]:
class Agent():
    """Pair a policy network with an Adam optimizer and act/learn/checkpoint helpers.

    Args:
        network: ``nn.Module`` mapping a state tensor to action probabilities.
        lr: Learning rate passed to ``optim.Adam``.
    """

    def __init__(self, network, lr):
        self.network = network
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)

    def forward(self, state):
        """Return the network's action probabilities for ``state``."""
        return self.network(state)

    def learn(self, log_probs):
        """Take one optimizer step that lowers the probability of the taken actions.

        The summed log-probability is minimised directly (no sign flip), so the
        step decreases the likelihood of the actions that produced ``log_probs``.
        """
        loss = log_probs.sum()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def sample(self, state):
        """Pick the most probable action for ``state``.

        Despite the name, the choice is greedy (argmax), not a random draw.

        Returns:
            Tuple ``(action, log_prob)``: the action index as a Python int and
            the differentiable log-probability of that action.
        """
        action_prob = self.network(state)
        action = action_prob.argmax()
        log_prob = Categorical(action_prob).log_prob(action)
        return action.item(), log_prob

    def save(self, PATH):
        """Write the network and optimizer state dicts to ``PATH``."""
        Agent_Dict = {
            "network" : self.network.state_dict(),
            "optimizer" : self.optimizer.state_dict()
        }
        torch.save(Agent_Dict, PATH)

    def load(self, PATH, map_location=None):
        """Restore the network and optimizer state written by ``save``.

        Args:
            PATH: Checkpoint location accepted by ``torch.load``.
            map_location: Optional device remapping forwarded to ``torch.load``,
                e.g. ``"cpu"`` to open a checkpoint saved on a GPU on a machine
                without CUDA. ``None`` keeps ``torch.load``'s default behaviour.
        """
        checkpoint = torch.load(PATH, map_location=map_location)
        self.network.load_state_dict(checkpoint["network"])
        self.optimizer.load_state_dict(checkpoint["optimizer"])


# GridWorld

In [None]:
class GridWorld():
    """Reward-free square grid in which a single agent moves one cell per step.

    Moves that would leave the grid are clamped to the border. An episode is
    reported as done once ``max_steps`` steps have been taken.

    Args:
        size: Number of cells along each side of the grid.
        max_steps: Step budget per episode.
    """

    def __init__(self, size=23, max_steps=5000):
        self.size = size
        self.max_steps = max_steps
        self.action_space = spaces.Discrete(4)
        # The np.int alias was removed in NumPy 1.24; use an explicit integer dtype.
        self.observation_space = gym.spaces.Box(low=0, high=size-1, shape=(2,), dtype=np.int64)
        # Initialises row, col and the step counter to the start configuration.
        self.reset()

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, info)``.

        Actions: 0 = up (row + 1), 1 = down (row - 1), 2 = left (col - 1),
        3 = right (col + 1). Any other value leaves the agent in place but
        still counts as a step. The reward is always 0.
        """
        if action == 0:
            self.row += 1
        elif action == 1:
            self.row -= 1
        elif action == 2:
            self.col -= 1
        elif action == 3:
            self.col += 1

        # Keep the agent inside the grid.
        self.row = min(max(self.row, 0), self.size-1)
        self.col = min(max(self.col, 0), self.size-1)
        self.steps += 1
        info = {}

        return (self.row, self.col), 0, self.steps >= self.max_steps, info

    def reset(self):
        """Move the agent to the middle column of the last row and return ``(row, col)``."""
        self.row = self.size-1
        self.col = self.size//2
        self.steps = 0
        return (self.row, self.col)

In [None]:
# Use the GPU when one is present; otherwise fall back to the CPU so the cell still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per method: list of per-episode first-visit maps, replaced by their mean
# once all episodes of that method have finished.
first_visits = {'ϵ-greedy':[], 'ϵz-greedy':[], 'DAE(ours)':[]}

hidden_size = 512
lr = 1e-4
NUM_EPISODE = 50
grid_size = 23
max_steps = 5000
mu = 2  # parameter of the Zipf distribution used for ϵz-greedy action durations

env = GridWorld(size=grid_size, max_steps=max_steps)
action_space = 4
obs_space = 2
for method in ['ϵ-greedy', 'ϵz-greedy', 'DAE(ours)']:
    for e in range(NUM_EPISODE):
        print(f'epoch {e+1}/{NUM_EPISODE}', end = '    \r')

        # Every episode starts from a freshly initialised policy (only the
        # DAE branch below actually queries and trains it).
        explorer = Explorer(obs_space, action_space, hidden_size).to(device)
        agent = Agent(explorer, lr)

        state = env.reset()
        # Cells that are never reached keep the sentinel value max_steps.
        first_visit = np.full((grid_size, grid_size), max_steps)
        total_step = 1
        duration = 0  # remaining repeats of the current ϵz-greedy action
        while True:
            # Record the step at which each cell is reached for the first time.
            row, col = state
            if first_visit[row][col] == max_steps:
                first_visit[row][col] = total_step

            if method == 'ϵ-greedy':
                # A fresh uniformly random action on every step.
                action = env.action_space.sample()
            elif method == 'ϵz-greedy':
                # Hold a random action for a Zipf-distributed number of steps.
                if duration == 0:
                    action = env.action_space.sample()
                    duration = int(np.random.zipf(mu))
                duration -= 1
            elif method == 'DAE(ours)':
                # Scale the coordinates by the grid size before feeding the network.
                state = (row/grid_size, col/grid_size)
                action, log_prob = agent.sample(torch.FloatTensor(state).to(device))
                agent.learn(log_prob)

            next_state, _, done, _ = env.step(action)
            state = next_state
            total_step += 1
            # Stop at the step limit or as soon as no unvisited cell remains.
            if done or not (first_visit==max_steps).any():
                break
        first_visits[method].append(first_visit)

    # Collapse the episodes into one mean first-visit map for this method.
    first_visits[method] = np.array(first_visits[method])
    first_visits[method] = first_visits[method].mean(0)

In [None]:
# One first-visit contour panel per exploration method, side by side.
contourfs = []
fig = plt.figure(figsize=(9,3))
for i, method in enumerate(['ϵ-greedy', 'ϵz-greedy', 'DAE(ours)']):
    ax = fig.add_subplot(1,3,i+1)
    # Contour lines and the translucent fill share colour map and value range.
    shared = dict(vmin=0, vmax=max_steps, cmap='jet')
    ax.contour(first_visits[method], linewidths=2, **shared)
    c = ax.contourf(first_visits[method], alpha=0.6, **shared)
    contourfs.append(c)

    ax.set_title(method, fontsize=24)
    ax.axis('off')

plt.tight_layout()
plt.savefig('./GridWorld/GridWorld.png', format="png")

## MountainCar

In [None]:
class RunningMeanStd(object):
    """Running mean and variance that are merged batch by batch.

    Uses the parallel-variance update described at
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    """

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        # The count starts at epsilon rather than zero.
        self.count = epsilon

    def update(self, x):
        """Merge a batch ``x`` (samples along axis 0) into the running statistics."""
        self.update_from_moments(np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge precomputed batch moments into the running mean, variance and count."""
        prev_count = self.count
        total = prev_count + batch_count
        shift = batch_mean - self.mean

        merged_mean = self.mean + shift * batch_count / total
        # Combined sum of squared deviations of the old data and the batch.
        sum_sq = (
            self.var * prev_count
            + batch_var * batch_count
            + np.square(shift) * prev_count * batch_count / total
        )

        self.mean = merged_mean
        self.var = sum_sq / total
        self.count = total

In [None]:
# Per method: list of per-episode first-visit maps, replaced by their mean
# once all episodes of that method have finished.
first_visits = {'ϵ-greedy':[], 'ϵz-greedy':[], 'DAE(ours)':[]}

# Value ranges used to bin velocity and position into the first-visit grid.
V_MIN, V_MAX = -0.07, 0.07
X_MIN, X_MAX = -1.2 , 0.6

max_steps = 5000
mu = 2  # parameter of the Zipf distribution used for ϵz-greedy action durations
NUM_BURN_IN = 10
NUM_EPISODE = 50
hidden_size = 512
lr = 1e-4
# Use the GPU when one is present; otherwise fall back to the CPU so the cell still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make('MountainCar-v0')
env._max_episode_steps = max_steps

obs_space = 2
action_space = 3

# Burn-in: run a few episodes to estimate the observation mean and variance
# that are later used to normalise the network input.
obs_rms = RunningMeanStd(shape=(2,))
for e in range(NUM_BURN_IN):
    print(f'burn-in {e+1}/{NUM_BURN_IN}', end = '    \r')

    explorer = Explorer(obs_space, action_space, hidden_size=hidden_size).to(device)
    agent = Agent(explorer, lr=lr)

    state = env.reset()
    total_step = 1
    while True:
        # Fold the observation into the running statistics as a batch of one.
        state = np.expand_dims(np.array(state), 0)
        obs_rms.update(state)
        # Normalise with the statistics gathered so far.
        state -= obs_rms.mean
        state /= np.sqrt(obs_rms.var)
        action, log_prob = agent.sample(torch.FloatTensor(state).to(device))
        agent.learn(log_prob)

        next_state, _, done, _ = env.step(action)
        state = next_state
        total_step += 1
        if done:
            break

print(obs_rms.mean, np.sqrt(obs_rms.var))

for method in ['ϵ-greedy', 'ϵz-greedy', 'DAE(ours)']:
    for e in range(NUM_EPISODE):
        print(f'episode {e+1}/{NUM_EPISODE}', end = '    \r')
        # Every episode starts from a freshly initialised policy (only the
        # DAE branch below actually queries and trains it).
        explorer = Explorer(obs_space, action_space, hidden_size=hidden_size).to(device)
        agent = Agent(explorer, lr=lr)

        state = env.reset()
        # Bins that are never reached keep the sentinel value max_steps.
        first_visit = np.full((12,12), max_steps)
        total_step = 1
        duration = 0  # remaining repeats of the current ϵz-greedy action
        while True:
            # Record the step at which each (position, velocity) bin is first
            # reached; each axis is split into bins of width range/11.
            position, velocity = state
            pos = int((position - X_MIN) / ((X_MAX-X_MIN)/11.))
            vel = int((velocity - V_MIN) / ((V_MAX-V_MIN)/11.))
            if first_visit[pos][vel] == max_steps:
                first_visit[pos][vel] = total_step

            if method == 'ϵ-greedy':
                # A fresh uniformly random action on every step.
                action = env.action_space.sample()
            elif method == 'ϵz-greedy':
                # Hold a random action for a Zipf-distributed number of steps.
                if duration == 0:
                    action = env.action_space.sample()
                    duration = int(np.random.zipf(mu))
                duration -= 1
            elif method == 'DAE(ours)':
                # Normalise a copy of the observation with the burn-in statistics.
                state = np.array(state)
                state -= obs_rms.mean
                state /= np.sqrt(obs_rms.var)
                action, log_prob = agent.sample(torch.FloatTensor(state).to(device))
                agent.learn(log_prob)

            next_state, _, done, _ = env.step(action)
            state = next_state
            total_step += 1
            if done:
                break
        first_visits[method].append(first_visit)

    # Collapse the episodes into one mean first-visit map for this method.
    first_visits[method] = np.array(first_visits[method])
    first_visits[method] = first_visits[method].mean(0)

In [None]:
contourfs = []
fig = plt.figure(figsize=(9,3))

# (position, velocity) coordinate assigned to every first-visit bin, listed in
# the same row-major order that flatten() produces.
pts = [[1.8/11*x-1.2, 0.14/11*v-0.07] for x in range(12) for v in range(12)]
# Regular grid onto which the binned data is interpolated.
xi, yi = np.mgrid[-1.2:0.7:0.1, -0.07:0.07:0.01]

for i, method in enumerate(['ϵ-greedy', 'ϵz-greedy', 'DAE(ours)']):
    fig.add_subplot(1,3,i+1)
    # grid the data.
    zi = griddata(pts, first_visits[method].flatten(), (xi, yi), method='linear')

    # Contour lines and the translucent fill share levels, colour map and range.
    shared = dict(levels=8, vmin=0, vmax=max_steps, cmap='jet')
    plt.contour (xi, yi, zi, linewidths=2, **shared)
    plt.contourf(xi, yi, zi, alpha=0.6, **shared)

    plt.axis('off')
    # Dashed vertical marker at position 0.5.
    plt.plot([0.5, 0.5], [-0.07, 0.07], linestyle = 'dashed', color='black')
    plt.title(method, fontsize=24)

plt.tight_layout()
plt.savefig('./MountainCar/MountainCar.png')

# Labyrinth

In [None]:
import sys
from gym import spaces
from gym.utils import seeding

# Direction codes for moves on the grid.
LEFT, DOWN, RIGHT, UP = range(4)
# Every direction, in the order in which neighbours are examined.
STEPS = [LEFT, DOWN, RIGHT, UP]
# The direction that undoes each move.
OPPOSITE = dict(zip((LEFT, RIGHT, UP, DOWN), (RIGHT, LEFT, DOWN, UP)))


def step_grid(cur, d, size):
    """Return the cell reached from ``cur`` by one move in direction ``d``.

    LEFT/RIGHT change the first coordinate and UP/DOWN the second. A move that
    would leave the ``size`` x ``size`` grid returns ``cur`` itself, and an
    unrecognised direction leaves the coordinates unchanged.
    """
    x, y = cur
    if d in (LEFT, RIGHT):
        x += 1 if d == RIGHT else -1
    elif d in (UP, DOWN):
        y += 1 if d == DOWN else -1

    inside = 0 <= x < size and 0 <= y < size
    if not inside:
        return cur
    return (x, y)


def gen_labyrinth(size, np_random):
    """Carve a random maze on a ``size`` x ``size`` grid with a depth-first walk.

    Args:
        size: Number of cells along each side.
        np_random: Random source providing ``choice``.

    Returns:
        Boolean array of shape ``(size, size, 4)``; entry ``[x, y, d]`` is True
        when the passage from cell ``(x, y)`` in direction ``d`` is open.
    """
    passages = np.zeros((size, size, 4), dtype=bool)
    seen = np.zeros((size, size), dtype=bool)
    pending = [(0, 0)]

    while pending:
        cell = pending.pop()
        seen[cell] = 1
        # Directions leading to a cell not visited yet (blocked moves return
        # the current, already visited cell and are filtered out).
        open_dirs = [d for d in STEPS if not seen[step_grid(cell, d, size)]]
        if not open_dirs:
            continue
        # Revisit this cell later: it may still have other unvisited neighbours.
        pending.append(cell)
        chosen = np_random.choice(open_dirs)
        target = step_grid(cell, chosen, size)
        # Open the passage from both sides.
        passages[cell][chosen] = 1
        passages[target][OPPOSITE[chosen]] = 1
        pending.append(target)
    return passages


class PolEnv(gym.Env):
    """Randomly generated square labyrinth that the agent walks cell by cell.

    Observations are the agent's ``(x, y)`` cell and every step yields a reward
    of -1. An episode ends when all cells have been visited or after
    ``max_steps`` steps.

    Args:
        size: Number of cells along each side of the maze.
        max_steps: Step budget per episode.
    """
    metadata = {"render.modes": ["human"]}

    def __init__(self, size, max_steps):
        self.size = size
        self.action_space = spaces.Discrete(4)
        # step() and reset() return the agent's (x, y) cell, so the declared
        # space is a pair of per-axis indices rather than a single Discrete(4).
        self.observation_space = spaces.Tuple((spaces.Discrete(size), spaces.Discrete(size)))
        self.seed()
        self.max_steps = max_steps
        self.steps = 0

    def seed(self, seed=None):
        """(Re)create the random source used for maze generation; return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        """Try to move in direction ``action``; return ``(pos, reward, done, info)``.

        The agent moves only when the passage in that direction is open;
        otherwise it stays put. The step is counted either way.
        """
        assert action in STEPS
        if self.map[self.pos][action]:
            self.pos = step_grid(self.pos, action, self.size)
        self.visit[self.pos] = 1
        self.steps += 1
        done = self.visit.all() or self.steps >= self.max_steps
        reward = -1.0
        return self.pos, reward, done, {}

    def reset(self):
        """Generate a new maze, place the agent at ``(0, 0)`` and return that cell."""
        self.map = gen_labyrinth(self.size, self.np_random)
        self.visit = np.zeros((self.size, self.size), dtype=bool)
        self.pos = (0,0)
        self.visit[self.pos] = 1
        self.steps = 0
        return self.pos

    def render(self, mode="human"):
        """Print the maze to stdout: ``#`` wall, space open, ``@`` agent."""
        # Canvas of (2*size+1)^2 characters: odd/odd entries are cells, the
        # entries between them hold the passages, everything else is wall (0).
        m2 = np.zeros((self.size * 2 + 1, self.size * 2 + 1), dtype=int)
        m2[1::2, 1::2] = 1
        m2[:-1:2, 1::2] = self.map[:, :, LEFT]
        m2[1::2, :-1:2] = self.map[:, :, UP]
        m2[self.pos[0] * 2 + 1, self.pos[1] * 2 + 1] = 2

        for s in m2.astype(str):
            s = "".join(s)
            s = s.replace("0", "#")
            s = s.replace("1", " ")
            s = s.replace("2", "@")
            sys.stdout.write(s + "\n")

    def close(self):
        """Drop the current maze."""
        self.map = None

In [None]:
# Per method: list of per-episode first-visit maps, replaced by their mean
# once all episodes of that method have finished.
first_visits = {'ϵ-greedy':[], 'ϵz-greedy':[], 'DAE(ours)':[]}

hidden_size = 512
lr = 1e-4
NUM_EPISODE = 50
maze_size = 5
max_steps = 700
mu = 2  # parameter of the Zipf distribution used for ϵz-greedy action durations
# Use the GPU when one is present; otherwise fall back to the CPU so the cell still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = PolEnv(size=maze_size, max_steps=max_steps)
action_space = 4
obs_space = 2
for method in ['ϵ-greedy', 'ϵz-greedy', 'DAE(ours)']:
    for e in range(NUM_EPISODE):
        print(f'epoch {e+1}/{NUM_EPISODE}', end = '    \r')

        # Every episode starts from a freshly initialised policy (only the
        # DAE branch below actually queries and trains it).
        explorer = Explorer(obs_space, action_space, hidden_size).to(device)
        agent = Agent(explorer, lr)

        state = env.reset()
        # Cells that are never reached keep the sentinel value max_steps.
        first_visit = np.full((maze_size, maze_size), max_steps)
        total_step = 1
        duration = 0  # remaining repeats of the current ϵz-greedy action
        while True:
            # Record the step at which each cell is reached for the first time.
            row, col = state
            if first_visit[row][col] == max_steps:
                first_visit[row][col] = total_step

            if method == 'ϵ-greedy':
                # A fresh uniformly random action on every step.
                action = env.action_space.sample()
            elif method == 'ϵz-greedy':
                # Hold a random action for a Zipf-distributed number of steps.
                if duration == 0:
                    action = env.action_space.sample()
                    duration = int(np.random.zipf(mu))
                duration -= 1
            elif method == 'DAE(ours)':
                # Scale the coordinates by the maze size before feeding the network.
                mean = 0
                std  = maze_size
                state = ((row-mean)/std, (col-mean)/std)
                action, log_prob = agent.sample(torch.FloatTensor(state).to(device))
                agent.learn(log_prob)

            next_state, _, done, _ = env.step(action)
            state = next_state
            total_step += 1
            if done:
                break
        first_visits[method].append(first_visit)

    # Collapse the episodes into one mean first-visit map for this method.
    first_visits[method] = np.array(first_visits[method])
    first_visits[method] = first_visits[method].mean(0)

In [None]:
# One first-visit contour panel per exploration method, side by side.
fig = plt.figure(figsize=(9,3))
for i, method in enumerate(['ϵ-greedy', 'ϵz-greedy', 'DAE(ours)']):
    ax = fig.add_subplot(1,3,i+1)
    # Contour lines and the translucent fill share levels, colour map and range.
    shared = dict(levels=6, vmin=0, vmax=max_steps, cmap='jet')
    ax.contour(first_visits[method], linewidths=2, **shared)
    ax.contourf(first_visits[method], alpha=0.6, **shared)
    ax.set_title(method, fontsize=24)
    ax.axis('off')
plt.tight_layout()
plt.savefig('./Labyrinth/Labyrinth.png')

# Minigrid - multi room

In [None]:
from gym_minigrid.wrappers import *

In [None]:
# Per method: list of per-episode first-visit maps, replaced by their mean
# once all episodes of that method have finished.
first_visits = {'ϵ-greedy':[], 'ϵz-greedy':[], 'DAE(ours)':[]}

hidden_size = 512
lr = 1e-4
NUM_EPISODE = 10
max_steps = 20000
mu = 2  # parameter of the Zipf distribution used for ϵz-greedy action durations
# Use the GPU when one is present; otherwise fall back to the CPU so the cell still runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make('MiniGrid-MultiRoom-N6-v0')
env.max_steps = max_steps
height = env.height
width  = env.width
action_space = env.action_space.n
obs_space = 2

for method in ['ϵ-greedy', 'ϵz-greedy', 'DAE(ours)']:
    for e in range(NUM_EPISODE):
        print(f'    epoch {e+1}/{NUM_EPISODE}', end = '    \r')

        # Every episode starts from a freshly initialised policy (only the
        # DAE branch below actually queries and trains it).
        explorer = Explorer(obs_space, action_space, hidden_size).to(device)
        agent = Agent(explorer, lr)
        # Re-seed before every reset; presumably this reproduces the same room
        # layout in each episode — confirm against gym_minigrid.
        env.seed(1)
        env.reset()
        # Cells that are never reached keep the sentinel value max_steps.
        first_visit = np.full((height, width), max_steps)
        total_step = 1
        duration = 0  # remaining repeats of the current ϵz-greedy action
        while True:
            # Record the step at which each cell is reached for the first time.
            # agent_pos is unpacked as (col, row) and the map is indexed [row][col].
            col, row = env.agent_pos
            if first_visit[row][col] == max_steps:
                first_visit[row][col] = total_step

            if method == 'ϵ-greedy':
                # A fresh uniformly random action on every step.
                action = env.action_space.sample()
            elif method == 'ϵz-greedy':
                # Hold a random action for a Zipf-distributed number of steps.
                if duration == 0:
                    action = env.action_space.sample()
                    duration = int(np.random.zipf(mu))
                duration -= 1
            elif method == 'DAE(ours)':
                # Both coordinates are scaled by the grid width before feeding the network.
                mean = 0
                std  = width
                state = ((col-mean)/std, (row-mean)/std)
                action, log_prob = agent.sample(torch.FloatTensor(state).to(device))
                agent.learn(log_prob)

            _, _, done, _ = env.step(action)
            total_step += 1
            if done:
                break
        first_visits[method].append(first_visit)

    # Collapse the episodes into one mean first-visit map for this method.
    first_visits[method] = np.array(first_visits[method])
    first_visits[method] = first_visits[method].mean(0)
    # Reverse the row order; presumably so the contour plot shows the map
    # upright — confirm against the rendered environment.
    first_visits[method] = np.flip(first_visits[method], 0)

In [None]:
# One first-visit contour panel per exploration method, side by side.
fig = plt.figure(figsize=(9,3))
for i, method in enumerate(['ϵ-greedy', 'ϵz-greedy', 'DAE(ours)']):
    ax = fig.add_subplot(1,3,i+1)
    # Contour lines and the translucent fill share colour map and value range.
    shared = dict(vmin=0, vmax=max_steps, cmap='jet')
    ax.contour(first_visits[method], linewidths=2, **shared)
    ax.contourf(first_visits[method], alpha=0.6, **shared)
    ax.set_title(method, fontsize=24)
    ax.axis('off')
plt.tight_layout()
plt.savefig('./MultiRoom/MultiRoom.png')