In [1]:
#!/usr/bin/env python
# coding: utf-8

from mce_irl_pomdps import parser_pomdp
from mce_irl_pomdps import irl_pomdp_solver as irl_solver
import numpy as np
import stormpy

# Fix NumPy's global seed for reproducibility
np.random.seed(201)


# Build two POMDP models from the same PRISM file; they differ in the memory
# pattern (selective vs. fixed counter) and in the memory length (1 vs. 5)
pomdp_r_1 = parser_pomdp.PrismModel(
    "rocks_5_2.pm",
    counter_type=stormpy.pomdp.PomdpMemoryPattern.selective_counter,
    memory_len=1,
    export=False,
)
pomdp_r_5 = parser_pomdp.PrismModel(
    "rocks_5_2.pm",
    counter_type=stormpy.pomdp.PomdpMemoryPattern.fixed_counter,
    memory_len=5,
    export=False,
)

# Trust-region parameters installed on the solver module:
#   'red' : divides the excess of x over 1 by 1.5
#   'aug' : multiplies the excess of x over 1 by 1.25, capped at 1.5
#   'lim' : the constant 1 + 1e-3 (presumably the smallest trust region the
#           solver accepts -- TODO confirm in irl_pomdp_solver)
irl_solver.trustRegion = {
    'red': lambda x: (x - 1) / 1.5 + 1,
    'aug': lambda x: min(1.5, (x - 1) * 1.25 + 1),
    'lim': 1 + 1e-3,
}

# Solver options shared by both forward problems
options_opt = irl_solver.OptOptions(
    mu=1e3,
    mu_spec=1,
    mu_rew=1.0,
    maxiter=100,
    maxiter_weight=100,
    graph_epsilon=1e-6,
    discount=0.999,
    verbose=True,
    verbose_solver=False,
)

# True reward weights in the POMDP environment, keyed by feature name
weight = dict(finish=100, bad=4, time=1)


# One solver per memory size, both starting from the same trust region
irlPb_1 = irl_solver.IRLSolver(
    pomdp_r_1,
    init_trust_region=1.01,
    max_trust_region=1.5,
    options=options_opt,
)
irlPb_5 = irl_solver.IRLSolver(
    pomdp_r_5,
    init_trust_region=1.01,
    max_trust_region=1.5,
    options=options_opt,
)

In [2]:
# Memory size 1: compute a policy for the true reward and save the policy
# together with its associated performance
pol_val_grb_1 = irlPb_1.from_reward_to_policy_via_scp(
    weight,
    save_info=(20, 'rock_mem1_fwd', weight),
)
# Memory size 5: same computation on the larger-memory model
pol_val_grb_5 = irlPb_5.from_reward_to_policy_via_scp(
    weight,
    save_info=(20, 'rock_mem5_fwd', weight),
)
# Reference policy for an agent with full observability (underlying MDP),
# solved with the same discount factor as the POMDP runs
pol_val_mdp = irlPb_1.from_reward_to_optimal_policy_mdp_lp(
    weight,
    gamma=options_opt.discount,
    save_info=(-1, 'rock_mdp_fwd', weight),
)

Using license file /home/gradandpostdoc/gurobi.lic
Academic license - for non-commercial use only
Initialize Linear subproblem to be solved at iteration k
[Time used to build the full Model : 0.07496929168701172]
[Initialization] Reward attained -116.1782759248895, Spec SAT : 0
[Initialization] Number of steps : 0
[Iter 0]: Reward attained -110.53960578195777, Spec SAT : 0, Trust region : 1.0125
[Iter 0]: Update time : 0.014951705932617188s, Checking time : 0.013142824172973633s, Solve time: 0.16815876960754395s
[Iter 1]: Reward attained -103.87337406075876, Spec SAT : 0, Trust region : 1.015625
[Iter 1]: Update time : 0.02995443344116211s, Checking time : 0.02700042724609375s, Solve time: 0.2840909957885742s
[Iter 2]: Reward attained -96.09654617047042, Spec SAT : 0, Trust region : 1.01953125
[Iter 2]: Update time : 0.04702615737915039s, Checking time : 0.04042959213256836s, Solve time: 0.3970475196838379s
[Iter 3]: Reward attained -87.16992469815973, Spec SAT : 0, Trust region : 1.02

In [3]:
# Generate demonstrations from two experts:
#   * the memory-5 POMDP policy, simulated as an observation-based policy;
#   * the full-observability MDP policy, simulated as a state-based policy.
# The positional arguments 100 and 500 are presumably the number of runs and
# the maximum run length -- TODO confirm against simulate_policy.

# Correct the policy for numerical instabilities before simulating it
pol_val_grb_5 = parser_pomdp.correct_policy(pol_val_grb_5)

obs_based = True
traj_pomdp_mem_5, _ = pomdp_r_5.simulate_policy(
    pol_val_grb_5, weight, 100, 500,
    obs_based=obs_based,
    stop_at_accepting_state=True,
)

obs_based = False
traj_mdp_5, _ = pomdp_r_1.simulate_policy(
    pol_val_mdp, weight, 100, 500,
    obs_based=obs_based,
    stop_at_accepting_state=True,
)


# Compute the feature expectation of each set of trajectories
feat_pomdp_mem5_5 = irlPb_5.compute_feature_from_trajectory(traj_pomdp_mem_5)
feat_mdp_5 = irlPb_1.compute_feature_from_trajectory(traj_mdp_5)

In [4]:
# Show both feature expectations, POMDP expert first, one per line
print(feat_pomdp_mem5_5, feat_mdp_5, sep="\n")

{'bad': -0.7730988349500113, 'time': -9.709853751641345, 'finish': 0.9885254234364631}
{'bad': 0.0, 'time': -5.677936015631063, 'finish': 0.9925156077798876}


In [6]:
# Trust-region parameters for the IRL runs: same 'red' / 'aug' rules as
# before, with 'lim' set to 1 + 1e-4
irl_solver.trustRegion = {
    'red': lambda x: (x - 1) / 1.5 + 1,
    'aug': lambda x: min(1.5, (x - 1) * 1.25 + 1),
    'lim': 1 + 1e-4,
}

# Solver options used for the inverse problem
options_opt = irl_solver.OptOptions(
    mu=1e3,
    mu_spec=1e1,
    mu_rew=1,
    maxiter=100,
    max_update=2,
    maxiter_weight=300,
    rho_weight=1,
    verbose_solver=False,
    graph_epsilon=1e-6,
    discount=0.999,
    verbose=False,
    verbose_weight=True,
)
# Decreasing step size for the gradient updates: 1 / sqrt(iterVal + 1).
# The second argument (diffFeat) is accepted but not used.
irl_solver.gradientStepSize = lambda iterVal, diffFeat: 1 / np.power(iterVal + 1, 0.5)


# Learn from the MDP demonstrations with a single memory node
irlPb_1._options = options_opt
_, pol_mdp_mem1_5 = irlPb_1.solve_irl_pomdp_given_traj(
    feat_mdp_5,
    save_info=(20, 'rock_mem1_trajsize5mdp_irl', weight),
)
# Learn from the MDP demonstrations with memory length 5
irlPb_5._options = options_opt
_, pol_mdp_mem5_5 = irlPb_5.solve_irl_pomdp_given_traj(
    feat_mdp_5,
    save_info=(20, 'rock_mem5_trajsize5mdp_irl', weight),
)

[No Iter 1]: Entropy + spec 131.80888175823765, Reward -96.43796287918215, Spec SAT : 0, Trust region : 1.015625
---------------- Printing visitation iteration 0 ---------------- 
-23.06777993860238 0.0 bad | rmsprop:  53.212247129578635
-73.7721235089089 -5.677936015631063 time | rmsprop:  463.6818370369676
0.4019405683294473 0.9925156077798876 finish | rmsprop:  0.03487788772218891
[No Iter 1]: Entropy + spec 122.807686455922, Reward -289.01374087052477, Spec SAT : 0, Trust region : 1.015625
[Diff with feature matching] : 91.75254247133067 ]
[New weight value] : {'bad': 3.2360679772896814, 'time': 3.2360679774756775, 'finish': 3.2360676569431788} ]
Update time : 0.10203909873962402s, Checking time : 0.054245948791503906s, Solve time: 0.47478437423706055s
---------------- Printing visitation iteration 1 ---------------- 
-21.02728125615087 0.0 bad | rmsprop:  92.10567811914814
-68.6863780933969 -5.677936015631063 time | rmsprop:  814.320030639988
0.40350185077129264 0.9925156077798876

In [7]:
# Learn from the POMDP demonstrations with a single memory node
irlPb_1._options = options_opt
_, pol_pomdp_mem1_5 = irlPb_1.solve_irl_pomdp_given_traj(
    feat_pomdp_mem5_5,
    save_info=(20, 'rock_mem1_trajsize5pomdp_irl', weight),
)
# Learn from the POMDP demonstrations with memory length 5
irlPb_5._options = options_opt
_, pol_pomdp_mem5_5 = irlPb_5.solve_irl_pomdp_given_traj(
    feat_pomdp_mem5_5,
    save_info=(20, 'rock_mem5_trajsize5pomdp_irl', weight),
)

[No Iter 1]: Entropy + spec 131.80888175823765, Reward -96.43796287918215, Spec SAT : 0, Trust region : 1.015625
---------------- Printing visitation iteration 0 ---------------- 
-23.06777993860238 -0.7730988349500113 bad | rmsprop:  49.705280551355386
-73.7721235089089 -9.709853751641345 time | rmsprop:  410.3974406452917
0.4019405683294473 0.9885254234364631 finish | rmsprop:  0.03440817922409187
[No Iter 1]: Entropy + spec 122.80764410543776, Reward -289.0136169199482, Spec SAT : 0, Trust region : 1.015625
[Diff with feature matching] : 86.94353571602694 ]
[New weight value] : {'bad': 3.2360679772748573, 'time': 3.236067977472547, 'finish': 3.236067652567239} ]
Update time : 0.08625555038452148s, Checking time : 0.0559847354888916s, Solve time: 0.46852970123291016s
---------------- Printing visitation iteration 1 ---------------- 
-21.0272553527002 -0.7730988349500113 bad | rmsprop:  85.75783812077208
-68.68636670418155 -9.709853751641345 time | rmsprop:  717.1806045848766
0.403502

In [None]:
# Build the memory-1 model with side information: the same PRISM file plus
# one property string passed as the second positional argument
pomdp_r_1_si = parser_pomdp.PrismModel(
    "rocks_5_2.pm",
    ["P=? [\"notbad\" U \"goal\"]"],
    counter_type=stormpy.pomdp.PomdpMemoryPattern.selective_counter,
    memory_len=1,
    export=False,
)
# Disabled: memory-5 variant with a reachability property
# pomdp_r_5_si = parser_pomdp.PrismModel("rocks_5_2.pm", ["P=? [F \"goal\"]"], counter_type=stormpy.pomdp.PomdpMemoryPattern.fixed_counter, memory_len=5, export=False)


# Fresh options object for the side-information runs
options_opt = irl_solver.OptOptions(
    mu=1e3,
    mu_spec=1e1,
    mu_rew=1,
    maxiter=100,
    max_update=2,
    maxiter_weight=300,
    rho_weight=1,
    verbose_solver=False,
    graph_epsilon=1e-6,
    discount=0.999,
    verbose=False,
    verbose_weight=True,
)
# Solver for the memory-1 model; sat_thresh=0.8 is handed to the solver along
# with the trust-region bounds
irlPb_1_si = irl_solver.IRLSolver(
    pomdp_r_1_si,
    init_trust_region=1.01,
    sat_thresh=0.8,
    max_trust_region=1.5,
    options=options_opt,
)
# Disabled: solver for the memory-5 variant
# irlPb_5_si = irl_solver.IRLSolver(pomdp_r_5_si, init_trust_region=1.01, sat_thresh=0.98, max_trust_region=1.25, options=options_opt)

In [None]:
# Side-information solver, single memory node: learn from the MDP demonstrations
irlPb_1_si._options = options_opt
_, pol_mdp_mem1_5_si = irlPb_1_si.solve_irl_pomdp_given_traj(
    feat_mdp_5,
    save_info=(20, 'rock_mem1_trajsize5mdp_irl_si', weight),
)
# Disabled: the same experiment with memory length 5
# irlPb_5_si._options = options_opt
# _, pol_mdp_mem5_5_si = irlPb_5_si.solve_irl_pomdp_given_traj(feat_mdp_5, save_info=(20, 'rock_mem5_trajsize5mdp_irl_si', weight))

In [None]:
# Learn from the POMDP demonstrations with the side-information solver
# (single memory node).
# NOTE: this mutates the shared `options_opt` object in place, so any cell
# executed afterwards that reads `options_opt` also sees mu = 1e4.
options_opt.mu = 1e4
irlPb_1_si._options = options_opt
# The result is bound to a `_si` name (matching `pol_mdp_mem1_5_si`) so that it
# does not overwrite `pol_pomdp_mem1_5`, the policy learned without side
# information.
_, pol_pomdp_mem1_5_si = irlPb_1_si.solve_irl_pomdp_given_traj(
    feat_pomdp_mem5_5,
    save_info=(20, 'rock_mem1_trajsize5pomdp_irl_si', weight),
)
# Disabled: learn from the POMDP demonstrations with memory length 5
# irlPb_5_si._options = options_opt
# _, pol_pomdp_mem5_5_si = irlPb_5_si.solve_irl_pomdp_given_traj(feat_pomdp_mem5_5, save_info=(20, 'rock_mem5_trajsize5pomdp_irl_si', weight))

In [9]:
# Build the memory-5 model with side information (same property string as the
# memory-1 side-information model) and its solver
pomdp_r_5_si = parser_pomdp.PrismModel(
    "rocks_5_2.pm",
    ["P=? [\"notbad\" U \"goal\"]"],
    counter_type=stormpy.pomdp.PomdpMemoryPattern.fixed_counter,
    memory_len=5,
    export=False,
)
irlPb_5_si = irl_solver.IRLSolver(
    pomdp_r_5_si,
    init_trust_region=1.01,
    sat_thresh=0.9,
    max_trust_region=1.5,
    options=options_opt,
)
# Disabled: learn from the MDP demonstrations with memory length 5
# _, pol_mdp_mem5_5_si = irlPb_5_si.solve_irl_pomdp_given_traj(feat_mdp_5, save_info=(20, 'rock_mem5_trajsize5mdp_irl_si', weight))

# Learn from the POMDP demonstrations with memory length 5.
# The options are assigned once (the original repeated this statement), and the
# result is bound to a `_si` name so that it does not overwrite
# `pol_pomdp_mem5_5`, the policy learned without side information.
irlPb_5_si._options = options_opt
_, pol_pomdp_mem5_5_si = irlPb_5_si.solve_irl_pomdp_given_traj(
    feat_pomdp_mem5_5,
    save_info=(20, 'rock_mem5_trajsize5pomdp_irl_si', weight),
)

('pol_pomdp_mem5_5',
 '=',
 'irlPb_5_si.solve_irl_pomdp_given_traj(feat_pomdp_mem5_5,',
 'save_info=(20,',
 "'rock_mem5_trajsize5pomdp_irl_si',",
 'weight))')