In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from gridworld import GridworldMdp
from agents import OptimalAgent, MyopicAgent
from fast_agents import FastMyopicAgent, FastOptimalAgent
from mdp_interface import Mdp
from agent_runner import get_reward_from_trajectory, run_agent
import numpy as np
from maia_chess_backend.maia.tfprocess import get_tfp
import tensorflow as tf
from multiprocessing import Pool
import tqdm

In [3]:
np.set_printoptions(precision=5, linewidth=200)

In [4]:
def gen_gridworld_arr(gridworld):
    arr = np.zeros((3,width,height), dtype=np.int8)
    arr[0] = np.array(gridworld.walls)
    
    for (x,y) in gridworld.rewards:
        arr[1,x,y] = gridworld.rewards[(x,y)]
        
    (x,y) = gridworld.get_start_state()
    arr[2,x,y] = 1
    
    return arr

In [5]:
def gen_random_connected(height, width, num_rewards):
    for _ in range(5):
        try:
            return GridworldMdp.generate_random_connected(height=height,width=width,num_rewards=num_rewards,noise=0)
        except:
            pass
    raise ValueError('Could not generate Gridworld')

In [6]:
height=6
width=6
num_rewards=4

num_trials = 1
gamma = 0.9

num_start_states = 1
episode_length = 5

def gen_data(num_grids):
    agent = FastMyopicAgent(horizon=2)
    optimal_agent = FastMyopicAgent(horizon=episode_length)
#     optimal_agent = OptimalAgent()
    mdp_size = width * height * 2
    data = np.zeros((num_grids,4,width,height))
    trial = 0

    # number of times when intervention actually made a difference
    count = 0
    total_actions = 0
    diff_actions = 0
    

    for i in range(num_grids):
#         print(f'IterI {i}', end='\r')
        gridworld = gen_random_connected(height, width, num_rewards)
        mdp = Mdp(gridworld)
        gridworld_arr = gen_gridworld_arr(gridworld)
        # perform intervention with various episode lengths left

        start_state = gridworld.get_random_start_state()
        mdp.gridworld.start_state = start_state
#             print(mdp.gridworld, end='\n\n')

        agent.set_mdp(gridworld)
        optimal_agent.set_mdp(gridworld)

        agent_action = agent.get_action(start_state)
        optimal_action = optimal_agent.get_action(start_state)

        r1,r2 = 0.0,0.0

        total_actions += 1
        if agent_action != optimal_action:
            diff_actions+=1

            agent_trajectory = run_agent(agent,mdp,episode_length=episode_length)
            r1 = get_reward_from_trajectory(agent_trajectory)
            intervened_trajectory = run_agent(agent,mdp,episode_length=episode_length, first_optimal=optimal_agent)
            r2 = get_reward_from_trajectory(intervened_trajectory)

#                 for t1,t2 in zip(agent_trajectory, intervened_trajectory):
#                     print(f'{t1}\t|\t{t2}')

        data[i,:3] = gridworld_arr
        data[i,3] = r2 - r1

#     print()
#     print(data.shape)
#     print(f'Out of {total_actions} actions, {diff_actions} were different')
    return data

In [7]:
%%time
with Pool(32) as p:
    n = 1000
    myopic6x6 = list(tqdm.tqdm(p.imap(gen_data, [1000]*n), total=n))

100%|██████████| 1000/1000 [03:13<00:00,  5.17it/s]

CPU times: user 9.02 s, sys: 4.35 s, total: 13.4 s
Wall time: 3min 13s





In [8]:
myopic6x6 = np.concatenate(myopic6x6)
np.save('/scratch1/fs1/chien-ju.ho/RIS/518/myopic6x6.npy', myopic6x6)
myopic6x6.shape

(1000000, 4, 6, 6)

In [9]:
myopic6x6 = np.load('/scratch1/fs1/chien-ju.ho/RIS/518/myopic6x6.npy')
myopic6x6.shape

(1000000, 4, 6, 6)

In [10]:
x = myopic6x6[:,:3]
x.shape

(1000000, 3, 6, 6)

In [11]:
y = myopic6x6[:,3,0,0]
y.shape

(1000000,)

In [12]:
y = (y>1).astype(int)

In [13]:
y.mean()

0.306218

In [14]:
pos = (y==1)
neg = (y==0)

print(pos.sum(), neg.sum())

n = int(min(pos.sum(), neg.sum()) * 0.5)
div = int(n*0.8)

print(n, div)

xpos = x[pos][:n]
xneg = x[neg][:n]
ypos = y[pos][:n]
yneg = y[neg][:n]

print(xpos.shape, xneg.shape)

xtrain = np.concatenate([xpos[:div],xneg[:div]])
ytrain = np.concatenate([ypos[:div],yneg[:div]])


xeval = np.concatenate([xpos[div:n],xneg[div:n]])
yeval = np.concatenate([ypos[div:n],yneg[div:n]])

xtrain.shape, ytrain.shape, xeval.shape, yeval.shape

306218 693782
153109 122487
(153109, 3, 6, 6) (153109, 3, 6, 6)


((244974, 3, 6, 6), (244974,), (61244, 3, 6, 6), (61244,))

In [15]:
tfp = get_tfp(filters=64, blocks=6, regularizer=False, input_size=3, board_size=6, output_size=1)

optimizer = tfp.optimizer
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
metrics = ['accuracy',tf.keras.metrics.Precision(),tf.keras.metrics.Recall(),tf.keras.metrics.AUC()]
# metrics = ['accuracy']

tfp.model.compile(optimizer, loss, metrics)

In [16]:
## First experiment
tfp.model.fit(xtrain,ytrain)
tfp.model.evaluate(xeval,yeval)



[0.6547573208808899,
 0.5782933831214905,
 0.5771669745445251,
 0.5855920314788818,
 0.6188567280769348]