In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
from gridworld import GridworldMdp
from agents import OptimalAgent, MyopicAgent
from fast_agents import FastMyopicAgent, FastOptimalAgent
from mdp_interface import Mdp
from agent_runner import run_agent, get_reward_from_trajectory
import numpy as np
from multiprocessing import Pool
import tqdm
from maia_chess_backend.maia.tfprocess import get_tfp
import tensorflow as tf

In [3]:
# Display-only setting: show floats with 5 digits of precision and allow up to
# 200 characters per line before NumPy wraps printed arrays.
np.set_printoptions(precision=5, linewidth=200)

In [None]:
def gen_gridworld_arr(gridworld):
    """Encode a gridworld as a stack of three int8 planes.

    The plane size comes from the module-level ``width`` and ``height``
    settings, not from the gridworld itself.

    Plane 0 holds ``gridworld.walls``, plane 1 holds the reward value at each
    rewarded ``(x, y)`` cell, and plane 2 is a one-hot marker of the start
    state. Planes 1 and 2 are indexed ``[x, y]``.

    Returns:
        np.ndarray of shape ``(3, width, height)`` and dtype ``int8``.
    """
    planes = np.zeros((3, width, height), dtype=np.int8)

    # Plane 0: wall layout, copied as-is from the gridworld.
    planes[0] = np.array(gridworld.walls)

    # Plane 1: reward values, keyed by (x, y) position.
    reward_map = gridworld.rewards
    for position in reward_map:
        col, row = position
        planes[1, col, row] = reward_map[position]

    # Plane 2: single 1 at the start position.
    start_col, start_row = gridworld.get_start_state()
    planes[2, start_col, start_row] = 1

    return planes

In [None]:
def gen_random_connected(height, width, num_rewards):
    """Create a random connected gridworld, retrying when generation fails.

    Calls ``GridworldMdp.generate_random_connected`` with ``noise=0`` up to
    five times and returns the first result that does not raise.

    Args:
        height: forwarded as the ``height`` keyword.
        width: forwarded as the ``width`` keyword.
        num_rewards: forwarded as the ``num_rewards`` keyword.

    Returns:
        Whatever ``GridworldMdp.generate_random_connected`` returns.

    Raises:
        ValueError: if all five attempts raise. The last underlying error is
            attached as ``__cause__`` so the real failure is not lost.
    """
    last_error = None
    for _ in range(5):
        try:
            return GridworldMdp.generate_random_connected(
                height=height, width=width, num_rewards=num_rewards, noise=0
            )
        except Exception as err:
            # Only ordinary errors are retried; KeyboardInterrupt / SystemExit
            # propagate so a long generation run can still be stopped.
            last_error = err
    raise ValueError('Could not generate Gridworld') from last_error

In [None]:
# Grid dimensions and the number of reward cells passed to the gridworld generator.
height, width = 6, 6
num_rewards = 4

# Discount factor; not referenced by any other cell visible in this notebook.
gamma = 0.9
# Upper bound on the look-ahead horizon of the "myopic" agent built in gen_data.
myopic_horizon = 2

# Number of steps in each simulated episode.
episode_length = 5
# Amount subtracted from an episode's reward for every step flagged 1 in gen_data.
cost = 1

# change this if adjusting episode_length or num_interventions
# num_trials_per_grid = 512

num_entries_per_trial = 1

def get_minibatch(gridworld, curr_state, action):
    """Apply ``action`` from ``curr_state`` and package the resulting transition.

    A fresh ``Mdp`` is built from ``gridworld`` and ``curr_state`` on every call.

    Returns:
        A pair ``(transition, next_state)`` where ``transition`` is the tuple
        ``(curr_state, action, next_state, reward)``.
    """
    step_result = Mdp(gridworld, curr_state).perform_action(action)
    next_state, reward = step_result
    return (curr_state, action, next_state, reward), next_state

def gen_data(num_grids):
    """Generate ``num_grids`` labelled gridworld samples.

    For every random gridworld, all ``2 ** episode_length`` per-step agent
    schedules are simulated. At each step a schedule uses either a
    short-horizon agent (flag 0, horizon capped at ``myopic_horizon``) or a
    ``FastMyopicAgent`` whose horizon is the number of moves left (flag 1).
    A schedule's score is its trajectory reward minus ``cost`` per flag-1 step.
    The label is the first-step flag of the best-scoring schedule; ties go to
    the earliest schedule in enumeration order.

    Reads the module-level settings ``height``, ``width``, ``num_rewards``,
    ``myopic_horizon``, ``episode_length`` and ``cost``.

    Args:
        num_grids: number of gridworlds to generate.

    Returns:
        float64 array of shape ``(num_grids, 4, width, height)``. Channels 0-2
        are the planes from ``gen_gridworld_arr``; channel 3 is a constant
        plane holding the 0/1 label.
    """
    # gen_gridworld_arr emits planes shaped (width, height). Allocate in the same
    # axis order so non-square grids do not fail to broadcast on assignment.
    # Identical to the previous (height, width) layout whenever the grid is square.
    data = np.zeros((num_grids, 4, width, height))

    def recurse(agent_list, moves_left):
        """Enumerate every schedule extending ``agent_list`` by ``moves_left`` steps."""
        if moves_left == 0:
            return [agent_list]

        myopic_agent = FastMyopicAgent(horizon=min(moves_left, myopic_horizon))
        # Full-remaining-horizon planner, still a FastMyopicAgent.
        optimal_agent = FastMyopicAgent(horizon=moves_left)

        # Flag-0 branch first, so schedules come out in lexicographic flag order.
        without_intervention = recurse(agent_list + [(0, myopic_agent)], moves_left - 1)
        with_intervention = recurse(agent_list + [(1, optimal_agent)], moves_left - 1)
        return without_intervention + with_intervention

    for i in range(num_grids):
        gridworld = gen_random_connected(height, width, num_rewards)
        mdp = Mdp(gridworld)
        gridworld_arr = gen_gridworld_arr(gridworld)
        start_state = gridworld.get_start_state()
        # Pin the start state on the MDP's gridworld so every schedule is
        # simulated from the same cell.
        mdp.gridworld.start_state = start_state

        dummy_agent = FastMyopicAgent(horizon=episode_length)
        dummy_agent.set_mdp(gridworld)

        # Fresh agent objects are built for every grid.
        agent_lists = recurse([], episode_length)
        rewards = []

        for schedule in agent_lists:
            num_ints = sum(flag for flag, _ in schedule)
            agents = [agent for _, agent in schedule]
            trajectory = run_agent(dummy_agent, mdp, episode_length=episode_length, agent_list=agents)
            rewards.append(get_reward_from_trajectory(trajectory) - num_ints * cost)

        idx = np.array(rewards).argmax()

        data[i, :3] = gridworld_arr
        # Scalar label broadcast over the whole fourth plane.
        data[i, 3] = agent_lists[idx][0][0]
    return data

In [None]:
%%time
with Pool(32) as p:
#     experiment2 = p.map(gen_data, [1]*1000)
    n = 4000000
    experiment2_big = list(tqdm.tqdm(p.imap(gen_data, [1]*n), total=n))

In [None]:
# Stitch the per-task arrays (each of shape (1, 4, ...), one grid per task) into a
# single array along the sample axis. The generation cell stores its results in
# `experiment2_big`; `experiment2` does not exist yet at this point.
experiment2 = np.concatenate(experiment2_big)
experiment2.shape

In [4]:
# Load a previously saved dataset from disk rather than regenerating it.
# NOTE(review): the path is absolute and site-specific, and no np.save call is
# visible in this notebook. Confirm this file was produced by gen_data with the
# settings above before relying on the channel layout.
experiment2 = np.load('/scratch1/fs1/chien-ju.ho/RIS/518/experiment2.npy')

In [5]:
# Model inputs: the first three channels of every sample. In gen_data these are
# the planes returned by gen_gridworld_arr (presumably the loaded file follows
# the same layout; verify).
x = experiment2[:,:3]
x.shape

(1000000, 3, 6, 6)

In [6]:
# Labels: gen_data writes one scalar label across the whole fourth channel, so
# reading cell [0, 0] of that channel recovers it for each sample.
y = experiment2[:,3,0,0]
y.shape

(1000000,)

In [7]:
# Mean of the labels; if they are strictly 0/1 (as gen_data produces) this is
# the fraction of samples labelled 1.
y.mean()

0.4913

In [8]:
# Boolean masks selecting the two label classes.
pos, neg = (y == 1), (y == 0)

print(pos.sum(), neg.sum())

# Per-class sample budget: half the size of the rarer class, of which the
# first 80% goes to training and the remainder to evaluation.
n = int(0.5 * min(pos.sum(), neg.sum()))
div = int(0.8 * n)

print(n, div)

# Take the first n samples of each class (no shuffling), so both classes are
# equally represented.
xpos, ypos = x[pos][:n], y[pos][:n]
xneg, yneg = x[neg][:n], y[neg][:n]

print(xpos.shape, xneg.shape)

# Both splits are laid out as all positives followed by all negatives.
xtrain = np.concatenate((xpos[:div], xneg[:div]))
ytrain = np.concatenate((ypos[:div], yneg[:div]))

xeval = np.concatenate((xpos[div:n], xneg[div:n]))
yeval = np.concatenate((ypos[div:n], yneg[div:n]))

xtrain.shape, ytrain.shape, xeval.shape, yeval.shape

491300 508700
245650 196520
(245650, 3, 6, 6) (245650, 3, 6, 6)


((393040, 3, 6, 6), (393040,), (98260, 3, 6, 6), (98260,))

In [13]:
# Build the network wrapper. input_size=3 and board_size=6 line up with the
# three 6x6 input planes, and output_size=1 with the single binary label
# (presumably that is how get_tfp interprets them; verify against tfprocess).
tfp = get_tfp(filters=64, blocks=6, regularizer=False, input_size=3, board_size=6, output_size=1)

optimizer = tfp.optimizer
# from_logits=False assumes the model already outputs probabilities
# (e.g. a sigmoid head) -- TODO confirm against the model definition.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
metrics = ['accuracy',tf.keras.metrics.Precision(),tf.keras.metrics.Recall(),tf.keras.metrics.AUC()]

# Pass by keyword: the positional order of compile() differs between Keras
# versions (the third positional parameter is `loss_weights` in Keras 3), so a
# positional `metrics` can be silently misread.
tfp.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [14]:
## Second experiment
# Train on the balanced training split using fit()'s default settings (no
# epochs, batch size or validation data are specified here).
tfp.model.fit(xtrain,ytrain)
# Evaluate on the held-out split. Assuming tfp.model is a Keras model, the
# returned list is the loss followed by the compiled metrics in order:
# accuracy, precision, recall, AUC.
tfp.model.evaluate(xeval,yeval)



[0.05002376809716225,
 0.9795440435409546,
 0.9772322177886963,
 0.9819661974906921,
 0.9984188675880432]