In [1]:
#!/usr/bin/env python
# coding: utf-8

from mce_irl_pomdps import parser_pomdp
from mce_irl_pomdps import irl_pomdp_solver as irl_solver
import numpy as np
import stormpy

# Seed NumPy's global random generator so repeated runs of the notebook agree
np.random.seed(201)

# Parse the same PRISM file twice; the two models differ only in memory_len (1 vs 5)
pomdp_r_1 = parser_pomdp.PrismModel(
    "obstacle_10.pm",
    counter_type=stormpy.pomdp.PomdpMemoryPattern.selective_counter,
    memory_len=1,
    export=False,
)
pomdp_r_5 = parser_pomdp.PrismModel(
    "obstacle_10.pm",
    counter_type=stormpy.pomdp.PomdpMemoryPattern.selective_counter,
    memory_len=5,
    export=False,
)

# Module-level trust-region settings on irl_solver:
#   'red' : maps a radius x to 1 + (x - 1) / 1.5     (moves x towards 1)
#   'aug' : maps a radius x to 1 + (x - 1) * 1.25    (moves x away from 1), capped at 1.5
#   'lim' : the constant 1 + 1e-3 -- presumably the smallest radius the solver
#           accepts before stopping; confirm in irl_solver
irl_solver.trustRegion = dict(
    red=lambda x: ((x - 1) / 1.5 + 1),
    aug=lambda x: min(1.5, (x - 1) * 1.25 + 1),
    lim=1 + 1e-3,
)

# Solver options used for the forward (reward -> policy) problems
options_opt = irl_solver.OptOptions(
    mu=1e3,
    mu_spec=1,
    mu_rew=1.0,
    maxiter=150,
    maxiter_weight=100,
    graph_epsilon=1e-6,
    discount=0.999,
    verbose=True,
    verbose_solver=False,
)

# Ground-truth reward weights of the POMDP environment, keyed by feature name
weight = dict(finish=500, crash_state=300, time=5)

In [2]:
# Build one IRL solver per memory size (memory_len 1 and 5). Both start with a trust
# region of 1.01, are capped at 1.5, and are passed the same options_opt object.
irlPb_1 = irl_solver.IRLSolver(pomdp_r_1, init_trust_region=1.01, max_trust_region=1.5, options=options_opt)
irlPb_5 = irl_solver.IRLSolver(pomdp_r_5, init_trust_region=1.01, max_trust_region=1.5, options=options_opt)

# Get the optimal policy for memory size 1 and save such policy and the associated performances
# NOTE(review): the layout of the save_info tuple (20, name, weight) is defined in
# irl_solver and not visible here -- confirm what the leading integer controls.
pol_val_grb_1 = irlPb_1.from_reward_to_policy_via_scp(weight, save_info=(20, 'obstacle_mem1_fwd', weight))
# Get the optimal policy for memory size 5 and save such policy and the associated performances
pol_val_grb_5 = irlPb_5.from_reward_to_policy_via_scp(weight, save_info=(20, 'obstacle_mem5_fwd', weight))
# Get the optimal policy if the agent has full observability (underlying MDP),
# using the same discount factor as the solver options
pol_val_mdp = irlPb_1.from_reward_to_optimal_policy_mdp_lp(weight, gamma=options_opt.discount, save_info=(-1,'obstacle_mdp_fwd', weight))

# Generate demonstration trajectories from two experts: the observation-based policy of
# the memory-5 POMDP and the state-based policy of the MDP.
# NOTE(review): the positional arguments 10 and 500 look like the number of trajectories
# and the maximum trajectory length -- confirm against simulate_policy.
obs_based = True
pol_val_grb_5 = parser_pomdp.correct_policy(pol_val_grb_5) # Correct the policy for numerical instabilities
traj_pomdp_mem_5, _ = pomdp_r_5.simulate_policy(pol_val_grb_5, weight, 10, 500, obs_based=obs_based, stop_at_accepting_state=True)
obs_based = False
traj_mdp_5, _ = pomdp_r_1.simulate_policy(pol_val_mdp, weight, 10, 500, obs_based=obs_based, stop_at_accepting_state=True)


# Compute the feature expectation of each set of trajectories
feat_pomdp_mem5_5 =irlPb_5.compute_feature_from_trajectory(traj_pomdp_mem_5)
feat_mdp_5 =irlPb_1.compute_feature_from_trajectory(traj_mdp_5)

# Print the feature expectations of the POMDP expert, then of the MDP expert
print(feat_pomdp_mem5_5)
print(feat_mdp_5)

Using license file /home/gradandpostdoc/gurobi.lic
Academic license - for non-commercial use only
Initialize Linear subproblem to be solved at iteration k
[Time used to build the full Model : 0.021997690200805664]
[Initialization] Reward attained -7899.233084972939, Spec SAT : 0
[Initialization] Number of steps : 0
[Iter 0]: Reward attained -7415.842288286331, Spec SAT : 0, Trust region : 1.0125
[Iter 0]: Update time : 0.0049893856048583984s, Checking time : 0.0057163238525390625s, Solve time: 0.04798531532287598s
[Iter 1]: Reward attained -6866.526561767287, Spec SAT : 0, Trust region : 1.015625
[Iter 1]: Update time : 0.010177850723266602s, Checking time : 0.010950326919555664s, Solve time: 0.09343791007995605s
[Iter 2]: Reward attained -6256.984920768995, Spec SAT : 0, Trust region : 1.01953125
[Iter 2]: Update time : 0.012727499008178711s, Checking time : 0.013859748840332031s, Solve time: 0.12291526794433594s
[Iter 3]: Reward attained -5599.275119178309, Spec SAT : 0, Trust region

In [3]:
# Trust region contraction and expansion
#   'red' : x -> 1 + (x - 1) / 1.5   (contraction towards 1)
#   'aug' : x -> 1 + (x - 1) * 1.25  (expansion), capped at 1.5
#   'lim' : 1 + 1e-4 -- presumably the smallest radius accepted by the solver; confirm in irl_solver
# This overwrites the module-level irl_solver.trustRegion, so it affects every solver from here on.
irl_solver.trustRegion = {'red' : lambda x : ((x - 1) / 1.5 + 1),
                          'aug' : lambda x : min(1.5,(x-1)*1.25+1),
                          'lim' : 1+1e-4}

# Solver options for the IRL problems (rebinds the name options_opt to a new object)
options_opt = irl_solver.OptOptions(mu=1e3, mu_spec=1e1, mu_rew=1, maxiter=100, max_update=2, 
                                    maxiter_weight=200, rho_weight= 1, verbose_solver=False,
                                    graph_epsilon=1e-6, discount=0.999, verbose=False, verbose_weight=True)

# Decreasing step size in the gradient updates: 1 / sqrt(iterVal + 1).
# The diffFeat argument is accepted but not used.
irl_solver.gradientStepSize = lambda iterVal, diffFeat : 1 / np.power(iterVal+1, 0.5)


# Learn from the MDP demonstrations on a single memory
irlPb_1._options = options_opt
_, pol_mdp_mem1_5 = irlPb_1.solve_irl_pomdp_given_traj(feat_mdp_5, save_info=(20, 'obstacle_mem1_trajsize10mdp_irl', weight))
# Learn from the MDP demonstrations with memory length 5
irlPb_5._options = options_opt
_, pol_mdp_mem5_5 = irlPb_5.solve_irl_pomdp_given_traj(feat_mdp_5, save_info=(20, 'obstacle_mem5_trajsize10mdp_irl', weight))

# Learn from the POMDP demonstrations on a single memory
# (this re-assigns the same options_opt object that was already installed above)
irlPb_1._options = options_opt
_, pol_pomdp_mem1_5 = irlPb_1.solve_irl_pomdp_given_traj(feat_pomdp_mem5_5, save_info=(20, 'obstacle_mem1_trajsize10pomdp_irl', weight))
# Learn from the POMDP demonstrations with memory length 5
irlPb_5._options = options_opt
_, pol_pomdp_mem5_5 = irlPb_5.solve_irl_pomdp_given_traj(feat_pomdp_mem5_5, save_info=(20, 'obstacle_mem5_trajsize10pomdp_irl', weight))

[No Iter 1]: Entropy + spec 713.3079073601427, Reward -535.8963244750782, Spec SAT : 0, Trust region : 1.015625
---------------- Printing visitation iteration 0 ---------------- 
-22.632616455610627 0.0 crash_state | rmsprop:  51.22353276267768
-513.748959060293 -13.49419179022513 time | rmsprop:  25025.48321764297
0.48525104082525516 0.985505808209775 finish | rmsprop:  0.025025483229094
[No Iter 1]: Entropy + spec 605.0286198852943, Reward -1471.890531266066, Spec SAT : 0, Trust region : 1.015625
[Diff with feature matching] : 523.387638493063 ]
[New weight value] : {'crash_state': 3.236067977281524, 'time': 3.236067977499343, 'finish': 3.2360675307417215} ]
Update time : 0.011650562286376953s, Checking time : 0.011193275451660156s, Solve time: 0.10150313377380371s
---------------- Printing visitation iteration 1 ---------------- 
-19.983964924002585 0.0 crash_state | rmsprop:  86.03706489478647
-435.4188042321512 -13.49419179022513 time | rmsprop:  40324.97275430562
0.56358121176729

In [4]:
# Same obstacle model as before, but parsed together with a PRISM property
# (reach "goal" while staying in "notbad") that is used as side information.
pomdp_r_1_si = parser_pomdp.PrismModel("obstacle_10.pm", ['P=? ["notbad" U "goal"]'], counter_type=stormpy.pomdp.PomdpMemoryPattern.selective_counter, memory_len=1, export=False)
pomdp_r_5_si = parser_pomdp.PrismModel("obstacle_10.pm", ['P=? ["notbad" U "goal"]'], counter_type=stormpy.pomdp.PomdpMemoryPattern.selective_counter, memory_len=5, export=False)


# Settings common to every run of this cell; only maxiter_weight differs between runs.
_si_common_options = dict(mu=1e4, mu_spec=1e1, mu_rew=1, maxiter=100, max_update=2,
                          rho_weight=1, verbose_solver=False,
                          graph_epsilon=1e-6, discount=0.999, verbose=False, verbose_weight=True)
# Default options (200 weight iterations), shared by both solvers below.
options_opt = irl_solver.OptOptions(maxiter_weight=200, **_si_common_options)
# Separate options object for the shorter MDP-demonstration run (100 weight iterations).
# The previous code set `irlPb_1_si._options.maxiter_weight = 100` instead. Because the
# solver was constructed with `options=options_opt`, that write goes through to the shared
# options_opt object if IRLSolver keeps a reference to it (NOTE(review): confirm in
# irl_solver). In that case the later "reset" `irlPb_1_si._options = options_opt` changed
# nothing and irlPb_5_si silently inherited the 100-iteration limit as well.
options_opt_mdp = irl_solver.OptOptions(maxiter_weight=100, **_si_common_options)

# Build the solver for different memory size; sat_thresh=0.9 is the threshold passed
# for the side-information property.
irlPb_1_si = irl_solver.IRLSolver(pomdp_r_1_si, init_trust_region=1.01, sat_thresh=0.9, max_trust_region=1.5, options=options_opt)
irlPb_5_si = irl_solver.IRLSolver(pomdp_r_5_si, init_trust_region=1.01, sat_thresh=0.9, max_trust_region=1.5, options=options_opt)

# Learn from the MDP demonstrations on a single memory (100 weight iterations)
irlPb_1_si._options = options_opt_mdp
_, pol_mdp_mem1_5_si = irlPb_1_si.solve_irl_pomdp_given_traj(feat_mdp_5, save_info=(20, 'obstacle_mem1_trajsize10mdp_irl_si', weight))
# Learn from the POMDP demonstrations on a single memory (back to the default 200 weight iterations)
irlPb_1_si._options = options_opt
_, pol_pomdp_mem1_5_si = irlPb_1_si.solve_irl_pomdp_given_traj(feat_pomdp_mem5_5, save_info=(20, 'obstacle_mem1_trajsize10pomdp_irl_si', weight))

# Disabled: the corresponding runs on the memory-5 model with side information.
# NOTE(review): the first line below targets irlPb_1_si although the runs use irlPb_5_si,
# and the result names say "mem10" for a memory_len=5 model -- fix both before re-enabling.
# irlPb_1_si._options.maxiter_weight = 120
# irlPb_5_si._options = options_opt
# _, pol_mdp_mem10_5_si = irlPb_5_si.solve_irl_pomdp_given_traj(feat_mdp_5, save_info=(20, 'obstacle_mem5_trajsize10mdp_irl_si', weight))
# irlPb_5_si._options = options_opt
# _, pol_pomdp_mem10_5_si = irlPb_5_si.solve_irl_pomdp_given_traj(feat_pomdp_mem5_5, save_info=(20, 'obstacle_mem5_trajsize10pomdp_irl_si', weight))

[No Iter 26]: Entropy + spec 21.878060163663314, Reward -22.24320060250809, Spec SAT : 0.9005007390857727, Trust region : 1.1543209876543212
---------------- Printing visitation iteration 0 ---------------- 
0.0 0.0 crash_state | rmsprop:  0.0
-23.121323566300216 -13.49419179022513 time | rmsprop:  9.26816662339146
0.8781229637921261 0.985505808209775 finish | rmsprop:  0.0011531075275224986
[No Iter 6]: Entropy + spec 21.79787170696459, Reward -71.62247080352013, Spec SAT : 0.899999060889081, Trust region : 1.0020576131687242
[Diff with feature matching] : 9.734514620492734 ]
[New weight value] : {'crash_state': 1.0, 'time': 3.2360679762934734, 'finish': 3.2360582817282313} ]
Update time : 1.021261215209961s, Checking time : 2.0433003902435303s, Solve time: 14.099860668182373s
---------------- Printing visitation iteration 1 ---------------- 
0.0 0.0 crash_state | rmsprop:  0.0
-23.01029007232876 -13.49419179022513 time | rmsprop:  17.396962612517882
0.877732036163924 0.98550580820977