# Experiments with natural gradient, constrained optimisation, and MCE IRL

This notebook contains the source for Sam's EE227C project. The aim is to apply natural gradient learning (efficiently, somehow) to maximum causal entropy (MCE) inverse reinforcement learning (IRL).

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import scipy

import imitation.tabular_irl as tirl
import imitation.model_env as menv

# Apply seaborn's default theme with the 'notebook' plotting context to all
# figures drawn after this point.
sns.set(context='notebook')

In [None]:
# Build a small random MDP. generator_seed is fixed, presumably so that the
# same MDP is generated on every run -- confirm against menv.RandomMDP.
mdp = menv.RandomMDP(
    n_states=16,
    n_actions=3,
    branch_factor=2,
    horizon=20,
    random_obs=True,
    obs_dim=5,
    generator_seed=42)
# Solve the MDP (no reward override is passed, so presumably its own reward is
# used) and compute the occupancy measures induced by the resulting policy.
V, Q, pi = tirl.mce_partition_fh(mdp)
Dt, D = tirl.mce_occupancy_measures(mdp, pi=pi)
# Demonstration feature counts that the IRL runs in the following cells are
# given as their target.
demo_counts = D @ mdp.observation_matrix
# Draw the initial weights from a dedicated, seeded generator. The global
# np.random state is never seeded in this notebook, so np.random.randn gave a
# different starting point after every kernel restart, which made the
# optimiser comparisons below non-reproducible.
init_rng = np.random.RandomState(42)
init_weights = init_rng.randn(*demo_counts.shape)
# Disabled AMSGrad baseline, kept for reference:
# opt = tirl.AMSGrad(init_weights, alpha=0.1)
# final_weights, final_counts = tirl.maxent_irl(
#     mdp, opt, demo_counts, linf_eps=1e-4)
# print('Final 2-norm of feature count differences:', np.linalg.norm(final_counts - demo_counts))
# print('Final 2-dist in recovered weights & real ones:', np.linalg.norm(final_weights - mdp._reward_weights))

In [None]:
# maxent_irl_ng (presumably the natural-gradient solver -- confirm in
# tabular_irl) driven by plain SGD with step size 0.1.
opt = tirl.SGD(init_weights, alpha=0.1)
ng_result = tirl.maxent_irl_ng(mdp, opt, demo_counts, linf_eps=1e-4)
final_weights, final_counts = ng_result

In [None]:
# Same maxent_irl_ng call as the alpha=0.1 run, but with a larger SGD step
# size of 0.5.
opt = tirl.SGD(init_weights, alpha=0.5)
ng_result = tirl.maxent_irl_ng(mdp, opt, demo_counts, linf_eps=1e-4)
final_weights, final_counts = ng_result

In [None]:
# Comparison run: maxent_irl (not the _ng variant) with SGD and a step size
# of 0.02.
opt = tirl.SGD(init_weights, alpha=0.02)
sgd_result = tirl.maxent_irl(mdp, opt, demo_counts, linf_eps=1e-4)
final_weights, final_counts = sgd_result

In [None]:
# Same experiments, but on grid world
mdp = menv.CliffWorld(
    width=8,
    height=6,
    horizon=9,
    use_xy_obs=True)
# Solve the grid world (no reward override is passed, so presumably its own
# reward is used) and compute the occupancy measures of the resulting policy.
V, Q, pi = tirl.mce_partition_fh(mdp)
Dt, D = tirl.mce_occupancy_measures(mdp, pi=pi)
# Demonstration feature counts handed to the IRL solver as its target.
demo_counts = D @ mdp.observation_matrix
# Draw the initial weights from a dedicated, seeded generator. The global
# np.random state is never seeded in this notebook, so np.random.randn gave a
# different starting point (and different recovered weights) after every
# kernel restart.
init_rng = np.random.RandomState(42)
init_weights = init_rng.randn(*demo_counts.shape)
# Fit reward weights with AMSGrad; final_weights is reused by the plotting
# cells further down.
opt = tirl.AMSGrad(init_weights, alpha=1)
final_weights, final_counts = tirl.maxent_irl(
    mdp, opt, demo_counts, linf_eps=1e-4)

In [None]:
# Sum the first 9 rows of Dt (9 matches the horizon passed to CliffWorld;
# rows are presumably per-timestep occupancies -- confirm) and draw the
# resulting per-state vector on the grid.
true_occupancy = Dt[:9].sum(axis=0)
mdp.draw_value_vec(true_occupancy)
plt.title("True occupancy")
plt.show()

In [None]:
# Re-solve the MDP under the reward that is linear in the observations with
# the recovered weights, then plot the occupancy of the resulting policy.
linear_reward = mdp.observation_matrix @ final_weights
_, _, pi_fake = tirl.mce_partition_fh(mdp, R=linear_reward)
Dt_fake, D_fake = tirl.mce_occupancy_measures(mdp, pi=pi_fake)
fake_occupancy = Dt_fake[:9].sum(axis=0)
mdp.draw_value_vec(fake_occupancy)
plt.title("Occupancy for linear reward function")
plt.show()

In [None]:
# Display the most recently assigned final_weights; when the cells are run in
# order, that is the result of the CliffWorld AMSGrad run.
final_weights