In [9]:
#!/usr/bin/env python
# coding: utf-8

from mce_irl_pomdps import parser_pomdp
from mce_irl_pomdps import irl_pomdp_solver as irl_solver
import numpy as np
import stormpy

# Seed NumPy's global random generator so repeated runs of the notebook
# draw the same random numbers.
np.random.seed(201)

# Ground-truth reward weights of the POMDP environment, one per feature.
weight = dict(crash_state=20, finish=50, avoid=50, time=0.1)

# Parse the PRISM model using the selective-counter memory pattern with a
# memory length of 1 (only this single memory size is built in this cell).
pomdp_r_1 = parser_pomdp.PrismModel(
    "avoid_4_2.pm",
    counter_type=stormpy.pomdp.PomdpMemoryPattern.selective_counter,
    memory_len=1,
    export=False,
)

# Module-level trust-region rules read by the solver.
# NOTE(review): 'red' / 'aug' look like the shrink / grow updates of the
# trust region and 'lim' like its lower bound -- confirm against irl_solver.
irl_solver.trustRegion = dict(
    red=lambda x: ((x - 1) / 1.5 + 1),
    aug=lambda x: min(1.5, (x - 1) * 1.25 + 1),
    lim=1 + 1e-3,
)

# Options handed to the solver created below.
options_opt = irl_solver.OptOptions(
    mu=1e3,
    mu_spec=1,
    mu_rew=1,
    maxiter=300,
    maxiter_weight=100,
    graph_epsilon=1e-6,
    discount=0.999,
    verbose=True,
    verbose_solver=False,
)

# Solver attached to the memory-1 model.
irlPb_1 = irl_solver.IRLSolver(
    pomdp_r_1,
    init_trust_region=1.01,
    max_trust_region=1.5,
    options=options_opt,
)

In [10]:
# Forward problem on the memory-1 POMDP: compute a policy for the true
# reward `weight`; `save_info` carries a number, an output name and the
# weights (NOTE(review): exact meaning of the first entry -- confirm).
pol_val_grb_1 = irlPb_1.from_reward_to_policy_via_scp(
    weight,
    save_info=(20, 'avoid_mem1_fwd', weight),
)

# Reference policy for an agent with full observability (underlying MDP),
# using the same discount factor as the solver options.
pol_val_mdp = irlPb_1.from_reward_to_optimal_policy_mdp_lp(
    weight,
    gamma=options_opt.discount,
    save_info=(-1, 'avoid_mdp_fwd', weight),
)

Initialize Linear subproblem to be solved at iteration k
[Time used to build the full Model : 0.40100836753845215]
[Initialization] Reward attained -3115.579075647687, Spec SAT : 0
[Initialization] Number of steps : 0
[Iter 0]: Reward attained -3001.180331972321, Spec SAT : 0, Trust region : 1.0125
[Iter 0]: Update time : 0.0392756462097168s, Checking time : 0.10593914985656738s, Solve time: 0.26616668701171875s
[Iter 1]: Reward attained -2865.752852930926, Spec SAT : 0, Trust region : 1.015625
[Iter 1]: Update time : 0.10325479507446289s, Checking time : 0.2814624309539795s, Solve time: 0.7367358207702637s
[Iter 2]: Reward attained -2707.633438321607, Spec SAT : 0, Trust region : 1.01953125
[Iter 2]: Update time : 0.17553448677062988s, Checking time : 0.34116172790527344s, Solve time: 1.0782887935638428s
[Iter 3]: Reward attained -2525.9282740465574, Spec SAT : 0, Trust region : 1.0244140625
[Iter 3]: Update time : 0.22276568412780762s, Checking time : 0.44306254386901855s, Solve time

In [15]:
# Simulate demonstrations from two experts: the observation-based policy
# computed on the POMDP and the state-based policy computed on the MDP.
# NOTE(review): the positional arguments 10 and 500 are presumably the number
# of trajectories and the maximum trajectory length -- confirm against
# PrismModel.simulate_policy.

# Clean the POMDP policy of numerical instabilities before simulating it.
# This overwrites pol_val_grb_1, so re-running the cell corrects it again.
pol_val_grb_1 = parser_pomdp.correct_policy(pol_val_grb_1)

obs_based = True
traj_pomdp_mem_1, _ = pomdp_r_1.simulate_policy(
    pol_val_grb_1,
    weight,
    10,
    500,
    obs_based=obs_based,
    stop_at_accepting_state=True,
)

obs_based = False
traj_mdp_1, _ = pomdp_r_1.simulate_policy(
    pol_val_mdp,
    weight,
    10,
    500,
    obs_based=obs_based,
    stop_at_accepting_state=True,
)

# Compute the feature expectations of both sets of trajectories.
feat_pomdp_mem1_15 = irlPb_1.compute_feature_from_trajectory(traj_pomdp_mem_1)
feat_mdp_15 = irlPb_1.compute_feature_from_trajectory(traj_mdp_1)

In [16]:
# Show the feature expectations of the POMDP expert, then of the MDP expert,
# one per line.
print(feat_pomdp_mem1_15, feat_mdp_15, sep="\n")

{'avoid': 0.0, 'time': -105.24038178356223, 'finish': 0.6063181114677711, 'crash_state': 0.0}
{'avoid': 0.0, 'time': -70.20151832621166, 'finish': 0.7978280984782624, 'crash_state': 0.0}


In [19]:
# Options for the inverse (reward-learning) problem; replaces the forward
# options defined earlier in the notebook.
options_opt = irl_solver.OptOptions(
    mu=1e3,
    mu_spec=1e1,
    mu_rew=1,
    maxiter=100,
    max_update=2,
    maxiter_weight=300,
    rho_weight=1,
    verbose_solver=False,
    graph_epsilon=1e-6,
    discount=0.999,
    verbose=False,
    verbose_weight=True,
)

# Gradient step size decaying as 1 / sqrt(iteration + 1); the second
# argument (diffFeat) is accepted but not used.
irl_solver.gradientStepSize = (
    lambda iterVal, diffFeat: 1 / np.power(iterVal + 1, 0.5)
)

# Swap the new options into the existing memory-1 solver.
# NOTE(review): this writes a private attribute; prefer a public setter if
# IRLSolver offers one.
irlPb_1._options = options_opt

# Learn a reward from the POMDP expert's feature expectations.
# Alternative run using the MDP expert's demonstrations instead:
# _, pol_mdp_mem1_15 = irlPb_1.solve_irl_pomdp_given_traj(feat_mdp_15, save_info=(20, 'avoid_mem1_trajsize10mdp_irl', weight))
_, pol_pomdp_mem1_15 = irlPb_1.solve_irl_pomdp_given_traj(
    feat_pomdp_mem1_15,
    save_info=(20, 'avoid_mem1_trajsize10pomdp_irl', weight),
)

[No Iter 1]: Entropy + spec 54.8790361187025, Reward -108.6266674787653, Spec SAT : 0, Trust region : 1.015625
---------------- Printing visitation iteration 0 ---------------- 
-55.72978732500518 0.0 avoid | rmsprop:  310.5809195290307
-47.77470674872589 -105.24038178356223 time | rmsprop:  330.2303807209412
0.9036243714674748 0.6063181114677711 finish | rmsprop:  0.008839101223501135
-6.025797776502422 0.0 crash_state | rmsprop:  3.6310238843301526
[No Iter 1]: Entropy + spec 51.7597757629695, Reward -130.9334266060351, Spec SAT : 0, Trust region : 1.015625
[Diff with feature matching] : 119.51856639634363 ]
[New weight value] : {'avoid': 3.2360679774637915, 'time': -1.2360679774659333, 'finish': -1.23606671262793, 'crash_state': 3.236067974420675} ]
Update time : 0.2404937744140625s, Checking time : 0.2871990203857422s, Solve time: 0.7737977504730225s
---------------- Printing visitation iteration 1 ---------------- 
-51.79072132058719 0.0 avoid | rmsprop:  547.7507090668
-45.127894

In [23]:
# Same PRISM file as before, now parsed together with a temporal-logic
# property (the side information) given as the second positional argument.
pomdp_r_1_si = parser_pomdp.PrismModel(
    "avoid_4_2.pm",
    ["P=? [\"notbad\" U \"goal\"]"],
    counter_type=stormpy.pomdp.PomdpMemoryPattern.selective_counter,
    memory_len=1,
    export=False,
)

# Options for the inverse problem with side information.
options_opt = irl_solver.OptOptions(
    mu=1e4,
    mu_spec=1e1,
    mu_rew=1,
    maxiter=100,
    max_update=20,
    maxiter_weight=300,
    rho_weight=1,
    verbose_solver=False,
    graph_epsilon=1e-6,
    discount=0.999,
    verbose=False,
    verbose_weight=True,
)

# Solver on the memory-1 model with side information and a satisfaction
# threshold of 0.98.
irlPb_1_si = irl_solver.IRLSolver(
    pomdp_r_1_si,
    init_trust_region=1.01,
    sat_thresh=0.98,
    max_trust_region=1.5,
    options=options_opt,
)

# NOTE(review): the same options object was already passed to the
# constructor above, so this assignment is presumably redundant -- confirm.
irlPb_1_si._options = options_opt

# Learn a reward from the MDP expert's feature expectations.
_, pol_mdp_mem1_5_si = irlPb_1_si.solve_irl_pomdp_given_traj(
    feat_mdp_15,
    save_info=(20, 'avoid_mem1_trajsize10mdp_irl_si', weight),
)
# Alternative run on the POMDP expert's demonstrations (disabled).
# NOTE(review): feat_pomdp_15 is not defined in the visible cells;
# feat_pomdp_mem1_15 was presumably intended.
# _, pol_pomdp_mem1_5_si = irlPb_1_si.solve_irl_pomdp_given_traj(feat_pomdp_15, save_info=(20, 'evade_mem1_trajsize10pomdp_irl_si', weight))

[No Iter 36]: Entropy + spec 17.34551481094031, Reward -127.24556408367667, Spec SAT : 0.9806258650013114, Trust region : 1.5
---------------- Printing visitation iteration 0 ---------------- 
0.0 0.0 avoid | rmsprop:  0.0
-127.24556408367667 -70.20151832621166 time | rmsprop:  325.4023156379762
0.0 0.7978280984782624 finish | rmsprop:  0.06365296747214398
0.0 0.0 crash_state | rmsprop:  0.0
[No Iter 19]: Entropy + spec 39.53192700215281, Reward -369.10266895408597, Spec SAT : 0.9808400663482939, Trust region : 1.5
[Diff with feature matching] : 57.84187385594328 ]
[New weight value] : {'avoid': 1.0, 'time': 3.2360679774654315, 'finish': 3.236067801854584, 'crash_state': 1.0} ]
Update time : 2.299907684326172s, Checking time : 1.5960595607757568s, Solve time: 6.46293306350708s
---------------- Printing visitation iteration 1 ---------------- 
0.0 0.0 avoid | rmsprop:  0.0
-114.05899737717398 -70.20151832621166 time | rmsprop:  485.20993094473846
0.0 0.7978280984782624 finish | rmsprop:

AttributeError: Unable to retrieve attribute 'x'

In [None]:
# Replace the reward weights with a new set, one value per feature.
weight = dict(crash_state=10, finish=10, avoid=5, time=0.5)