In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ['PYGLET_HEADLESS'] = 'True'
os.environ['PYGLET_HEADLESS_DEVICE'] = "3"

In [3]:
from copy import deepcopy
import json
import os

import gymnasium
import networkx as nx
from ltl.dfa import DFA
from torch_policies.learning_params import LearningParameters, \
    add_fields_to_parser, get_learning_parameters
from ts_utils.matcher import dfa2graph, get_training_edges, match_remove_edges
from ts_utils.policy_switcher import PolicySwitcher

from ts_utils.ts_policy_bank import TianshouPolicyBank, load_ts_policy_bank
from ts_utils.ts_envs import generate_envs
from ts_utils.ts_argparse import add_parser_cmds

from utils.print_ltl import ltl_to_print

# %%
from tianshou.data import Batch
import numpy as np

from test_utils import Tester, TestingParameters

import argparse
import pickle

from torch.utils.tensorboard import SummaryWriter
from tianshou.utils.logger.tensorboard import TensorboardLogger
import os

import cProfile

  import pkg_resources
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


Renderer Vendor: Mesa/X.org
Renderer Hardware: llvmpipe (LLVM 15.0.6, 256 bits)


In [4]:
rl_algo = "dsac"
game_name = "miniworld_simp_no_vis"
run_subfolder = None
run_id = 0
train_size = 50

save_dpath = os.environ['HOME'] + "/data/shared/ltl-transfer-ts"
domain_name = "minecraft"
train_type = "mixed"
prob = 1.0
algo = "lpopl"
run_prefix = None
no_deterministic_eval = False
task_id = 15
verbose = True
test_type = "mixed"
edge_matcher = "relaxed"
device = "cpu"
render = False
map_id = 13

PARALLEL_TRAIN = False

In [5]:
# Running the experiment

# learning params
learning_params = get_learning_parameters(
    policy_name=rl_algo, 
    game_name=game_name,
    alpha=0.03,
)
testing_params = TestingParameters(custom_metric_folder=run_subfolder)
print("Initialized Learning Params:", learning_params)

train_envs, test_envs = generate_envs(
    game_name=game_name, 
    parallel=PARALLEL_TRAIN, 
    map_id=map_id, 
    seed=run_id,
    no_info=False,
    ltl_progress_is_term=False,
    max_episode_steps=9000
)

# path for logger
print(run_prefix, run_prefix is None)
tb_log_path = os.path.join(
    save_dpath, "results", f"{game_name}_{domain_name}", f"{train_type}_p{prob}", 
    f"{algo}_{rl_algo}", f"map{map_id}", str(run_id), 
    f"alpha={'auto' if learning_params.auto_alpha else learning_params.alpha}",
) if run_prefix is None else run_prefix
print("Loading from", tb_log_path)
print(testing_params.custom_metric_folder)
if testing_params.custom_metric_folder is not None:
    tb_log_path = os.path.join(tb_log_path, testing_params.custom_metric_folder)
print("Loading from", tb_log_path)

# load the proper lp
with open(os.path.join(tb_log_path, "learning_params.pkl"), "rb") as f:
    learning_params = pickle.load(f)

# tester
tester = Tester(
    learning_params=learning_params, 
    testing_params=testing_params,
    map_id=map_id,
    prob=prob,
    train_size=train_size,
    rl_algo=rl_algo,
    tasks_id=task_id,
    dataset_name=domain_name,
    train_type=train_type,
    test_type=test_type,
    edge_matcher=edge_matcher, 
    save_dpath=save_dpath,
    game_name=game_name,
    logger=None
)
tasks = tester.tasks

# sampler
env_size = test_envs.get_env_attr("size", 0)[0]
state_space = gymnasium.spaces.Box(
    low=np.array([1, 1, -180]), 
    high=np.array([env_size - 1, env_size - 1, 180])
)

# run training
policy_bank = load_ts_policy_bank(
    tb_log_path, 
    num_actions=test_envs.action_space[0].n,
    num_features=test_envs.observation_space[0].shape[0],
    hidden_layers=[256, 256, 256],
    learning_params=learning_params,
    device=device,
    verbose=True
)
tasks = tester.get_LTL_tasks()
try:
    ltl = tasks[task_id]
except IndexError:
    print("Task ID", task_id, "not found in the task list.")
    exit(1)

print("Running task", ltl_to_print(ltl))

# reset with the correct ltl
task_params = tester.get_task_params(ltl)
obs, info = test_envs.reset(options=dict(task_params=task_params))
env_state = test_envs.get_env_attr("curr_state", 0)[0]

# set policy to be deterministic if set up to do so
if no_deterministic_eval:
    raise NotImplementedError("Deterministic evaluation not implemented.")

# collect and rollout
results = {}
if render:
    test_envs.render()
    input("Press Enter When Ready...")

# edge matching
if verbose: print("Gathering training edges from the bank...")
train_edges, t_edge2ltls = get_training_edges(policy_bank)
task_dfa: DFA = deepcopy(test_envs.get_env_attr("dfa")[0])
dfa_graph = dfa2graph(test_envs.get_env_attr("dfa")[0])

# look for infeasible edges in the testing ("eval") DFA and remove it
if verbose: print("Matching training/testing edges and removing infeasible edges...")
test2trains = match_remove_edges(
    dfa_graph, train_edges, task_dfa.state, task_dfa.terminal[0], tester.edge_matcher
)


Initialized Learning Params: LearningParameters(lr=0.0001, max_timesteps_per_task=1000000, buffer_size=200000, print_freq=5000, train_freq=12, batch_size=512, learning_starts=30000, gamma=0.99, max_timesteps_per_episode=1000, exploration_fraction=0.2, exploration_final_eps=0.02, target_network_update_freq=1000, pi_lr=0.0001, alpha=0.03, tau=1, auto_alpha=False, target_entropy=0.2772588722239781, non_active_target_entropy=0.2772588722239781, dsac_random_steps=0, cnn_shared_net=True, goal_conditioned=False)
Falling back to num_samples=4
Falling back to num_samples=4


  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Falling back to num_samples=4
Falling back to num_samples=4
None True
Loading from /home/wyc/data/shared/ltl-transfer-ts/results/miniworld_simp_no_vis_minecraft/mixed_p1.0/lpopl_dsac/map13/0/alpha=0.03
None
Loading from /home/wyc/data/shared/ltl-transfer-ts/results/miniworld_simp_no_vis_minecraft/mixed_p1.0/lpopl_dsac/map13/0/alpha=0.03
Results saved at: /home/wyc/data/shared/ltl-transfer-ts/results_icra24/dsac/deterministic/mixed_50_mixed_relaxed/map_13/prob_1.0
Loading policy for LTL:  F(a) & F(c) & F(d) & F(s) & F(a & X(F(c)))
Loading policy for LTL:  F(c) & F(d) & F(s) & F(c)
Loading policy for LTL:  F(a) & F(d) & F(s) & F(a & X(F(c)))
Loading policy for LTL:  F(d) & F(s) & F(c)
Loading policy for LTL:  F(a) & F(c) & F(s) & F(a & X(F(c)))
Loading policy for LTL:  F(c) & F(s) & F(c)
Loading policy for LTL:  F(a) & F(s) & F(a & X(F(c)))
Loading policy for LTL:  F(s) & F(c)
Loading policy for LTL:  F(a) & F(c) & F(d) & F(a & X(F(c)))
Loading policy for LTL:  F(c) & F(d) & F(c)
Loading

In [9]:
# import pickle
# pickle.dump(test2trains, open("test2trains.pkl", "wb"))

In [10]:
from ts_utils.matcher import dfa2graph, get_training_edges, match_remove_edges
from ts_utils.policy_switcher import PolicySwitcher

In [11]:
ltl = tasks[task_id]

In [13]:
# TODO save some metrics
policy_switcher = PolicySwitcher(policy_bank, test2trains, t_edge2ltls, ltl)

if verbose: print("Running the experiment...")
for epi in range(100):
    if verbose: print("Episode", epi)
    # reset
    policy_switcher.reset_excluded_policy()
    obs, info = test_envs.reset()
    if render:
        test_envs.render()
    
    # rollout of one episode
    node2option2prob = {} # node -> (ltl, edge_pair) -> prob
    term = trunc = False
    i1 = 0 # global step counter
    cum_rew = 0

    FAIL_STATUS = ""

    while not term and not trunc:
        # gather the informations
        env_state = info[0]['loc']
        curr_node = info[0]['dfa_state']
        next_node = curr_node

        while next_node == curr_node and not term and not trunc:
            # try every possible option
            env_state = info[0]['loc']
            best_policy, training_edges, ltl, stats = policy_switcher.get_best_policy(curr_node, env_state)
            if best_policy is None:
                FAIL_STATUS = "No path available"
                if verbose: print("No policy available for node", curr_node, "; goal: ", ltl_to_print(info[0]['ltl_goal']))
                break

            if verbose: 
                print("Executing policy", ltl_to_print(ltl), 
                        "with training edges", training_edges,
                        "on node", curr_node,
                        "with env state", env_state
                )
                print("       test stats: prob:", stats[0], "| len:", stats[1])

            for _ in range(500): # option step limit
                a = best_policy.forward(Batch(obs=obs, info=info)).act
                obs, reward, term, trunc, info = test_envs.step(a.numpy())

                next_node = info[0]['dfa_state']
                i1 += 1
                cum_rew += reward

                if render: test_envs.render()
                if next_node != curr_node and verbose: print("Hit proposition:", info[0]['true_props'])
                
                if term or trunc or next_node != curr_node:
                    break
            
            if not term and not trunc and next_node == curr_node:
                # we are stuck in the same node
                policy_switcher.exclude_policy(curr_node, best_policy)
                if verbose: print("   Policy failed to finish. Excluding policy", ltl_to_print(ltl), "on node", curr_node)

        if trunc[0] or term[0] or FAIL_STATUS != "": # env game over
            success = term[0] and info[0]['dfa_state'] != -1 and not trunc[0]
            env_state = test_envs.get_env_attr("curr_state", 0)[0]
            if trunc:
                FAIL_STATUS = "Truncated"
            elif info[0]['dfa_state'] == -1:
                FAIL_STATUS = "DFA Dead end"
            elif info[0]['dfa_state'] != task_dfa.terminal[0]:
                FAIL_STATUS = "DFA not terminal"
            result = {
                "success": success,
                "steps": i1 + 1,
                "message": "success" if success else FAIL_STATUS,
                "final_state": env_state, 
            }
            print(result)
            break

    print("Done!")

Running the experiment...
Episode 0
Executing policy F(d) & F(f) & F(g) & F(f & F(e)) & F(d & F(b)) with training edges ('!a&!c&!d&!g&!h', 'd&!a&!c&!g&!h') on node 0 with env state (1.4424539252092132, 5.9479221546448375, 287.53060781481213)
       test stats: prob: 1 | len: 1
Hit proposition: d
No policy available for node 4 ; goal:  F(a) & F(c) & F(g) & F(h) & F(h & X(F(g))) & F(h & X(F(a))) & F(g & X(F(a)))
{'success': False, 'steps': 350, 'message': 'DFA not terminal', 'final_state': (1.6480115599062084, 8.93560601521519, 317.53060781481213)}
Done!
Episode 1
Executing policy F(a) & F(a & F(b)) with training edges ('!a&!c&!d&!g&!h', 'a&!c&!d&!g&!h') on node 0 with env state (6.523476188168569, 4.296450425646251, 163.65257772866542)
       test stats: prob: 1 | len: 17
Hit proposition: a
Executing policy F(b) & F(d) & F(g) & F(d & F(b)) with training edges ('!c&!d&!g&!h', 'd&!c&!g&!h') on node 1 with env state (3.0279713745733563, 4.123148167575141, 193.65257772866545)
       test st

Running task F(a) & F(c) & F(d) & F(g) & F(h) & F(h & X(F(g))) & F(h & X(F(a))) & F(g & X(F(a)))