In [3]:
import mdptoolbox as mdpt, numpy as np
import mdptoolbox.example
import MDP

In [18]:
### Generate a bunch of MDPs with different parameters, sparsity

# Experiment dimensions shared by the generator helpers in this notebook.
# NUM_MDPs: default number of MDPs built by generate_tests; it also sizes the
# default sparsity grid (i / NUM_MDPs for i in 0..NUM_MDPs-1).
NUM_MDPs = 100
# NUM_STATES: number of states passed to the transition/reward generators.
NUM_STATES = 10
# NUM_ACTIONS: number of actions passed to the transition/reward generators.
NUM_ACTIONS = 4

def get_transition_matrix(num_states, num_actions, generator = np.random.dirichlet):
    """
    Build a transition tensor by drawing one row per (action, state) pair.

    Args:
        num_states: number of states S
        num_actions: number of actions A
        generator: callable that receives a length-S vector of ones and returns the
            length-S row to store (default: np.random.dirichlet, so each row is a
            Dirichlet sample)
    Returns:
        array of shape (A, S, S); entry [a, s, :] is the row drawn for action a, state s
    """
    transitions = np.zeros((num_actions, num_states, num_states)) # (A, S, S) shape
    # np.ndindex walks action-major / state-minor, i.e. the same draw order as
    # two nested range() loops, so seeded runs produce the same tensor.
    for action, state in np.ndindex(num_actions, num_states):
        transitions[action, state] = generator(np.ones(num_states))
    return transitions

def get_reward_matrix(num_states, num_actions, sparsity = 0.0, generator = np.random.normal):
    """
    Build a (num_states, num_actions) reward matrix in which each entry is zeroed
    with probability `sparsity` and otherwise filled by calling `generator()`.

    Args:
        num_states: number of states S (rows of the result)
        num_actions: number of actions A (columns of the result)
        sparsity: threshold compared against a fresh np.random.rand() draw per entry;
            the entry is set to 0 when the draw is below it
        generator: zero-argument callable producing a single reward value
            (default: np.random.normal)
    Returns:
        array of shape (S, A)
    """
    rewards = np.zeros((num_states, num_actions))
    # Visit entries action-major / state-minor so the sequence of random draws
    # matches a nested "for a: for s:" traversal.
    for action, state in np.ndindex(num_actions, num_states):
        zeroed = np.random.rand() < sparsity
        # generator() is only consulted for entries that are not zeroed out.
        rewards[state, action] = 0 if zeroed else generator()
    return rewards

# Solver settings.
# DISCOUNT: passed as the third positional argument to the MDP solver constructor
# in generate_tests.
DISCOUNT = 0.9
# EPSILON: not referenced in the visible cells — presumably a convergence
# tolerance for an epsilon-based solver; TODO confirm or remove.
EPSILON = 0.01
# MAX_ITER: passed as `max_iter` to the MDP solver constructor in generate_tests.
MAX_ITER = 1000

In [22]:
def generate_tests(num_mdps = NUM_MDPs, sparsity_levels = np.arange(NUM_MDPs) / NUM_MDPs, mdp_generator = mdpt.mdp.PolicyIteration):
    """
    Generate a bunch of MDPs with different sparsity levels, and return the sparsity levels and the MDPs

    The sparsity levels are copied and shuffled, and the i-th MDP is built with the
    i-th shuffled level. If more levels than MDPs are supplied, only the first
    `num_mdps` shuffled levels are used and returned, so both return values always
    have the same length and line up index by index.

    Args:
        num_mdps: number of MDPs to generate
        sparsity_levels: a list of sparsity levels to generate MDPs with; must contain
            at least `num_mdps` entries and support `.copy()` (list or numpy array).
            The input itself is never modified.
        mdp_generator: callable invoked as
            `mdp_generator(transitions, rewards, DISCOUNT, max_iter = MAX_ITER)`
            to build each MDP object
    Returns:
        sparsity_levels: the sparsity levels used to generate the MDPs, in the same order as the MDPs
        MDPS: a 1-D object array of MDPs, of length `num_mdps`
    Raises:
        ValueError: if `num_mdps` is negative or exceeds the number of sparsity levels
    """
    if num_mdps < 0:
        raise ValueError("num_mdps must be non-negative, got %r" % (num_mdps,))
    # Fail fast instead of hitting an IndexError after building many MDPs.
    if num_mdps > len(sparsity_levels):
        raise ValueError(
            "need at least num_mdps = %d sparsity levels, got %d"
            % (num_mdps, len(sparsity_levels))
        )

    sparsity_copy = sparsity_levels.copy() # defensive copy
    np.random.shuffle(sparsity_copy)
    # Keep exactly one level per MDP so the two return values stay aligned.
    sparsity_copy = sparsity_copy[:num_mdps]

    # Fill a pre-allocated object array rather than calling np.array on a list of
    # solver objects: np.array may try to interpret sequence-like objects as nested
    # data, whereas this always yields a 1-D array holding the objects themselves.
    MDPS = np.empty(num_mdps, dtype = object)
    for i in range(num_mdps):
        MDPS[i] = mdp_generator(
            get_transition_matrix(NUM_STATES, NUM_ACTIONS),
            get_reward_matrix(NUM_STATES, NUM_ACTIONS, sparsity_copy[i]),
            DISCOUNT, max_iter = MAX_ITER)
    return sparsity_copy, MDPS

# Build the default test suite, then solve every MDP by calling run() on it.
# Presumably run() populates `mdp.policy` (the debug prints below read it) — confirm
# against the solver's documentation.
# NOTE(review): no random seed is set, so each execution yields different MDPs.
# NOTE(review): the saved cell output looks like one policy tuple per MDP, i.e. it
# appears to come from a run with the debug print enabled — re-run to refresh it.
sparsity_levels, MDPS = generate_tests()
for mdp in MDPS:
    mdp.run()
    # print(mdp.policy) # debug
# print(MDPS[0].policy) # debug

(2, 1, 0, 0, 0, 1, 2, 1, 3, 0)
(2, 1, 0, 1, 3, 0, 1, 0, 0, 1)
(0, 3, 2, 3, 0, 0, 1, 1, 1, 0)
(3, 1, 2, 3, 0, 0, 1, 1, 1, 1)
(3, 1, 0, 0, 0, 1, 0, 0, 2, 0)
(3, 2, 1, 1, 1, 3, 1, 2, 0, 1)
(0, 0, 1, 0, 0, 0, 0, 0, 0, 0)
(3, 0, 2, 1, 3, 3, 3, 1, 2, 2)
(2, 2, 3, 0, 1, 1, 2, 1, 2, 0)
(2, 3, 1, 3, 0, 2, 3, 0, 0, 2)
(2, 2, 1, 2, 3, 3, 0, 1, 0, 1)
(1, 1, 1, 0, 2, 0, 2, 3, 2, 3)
(1, 0, 3, 3, 0, 3, 3, 2, 0, 3)
(0, 2, 3, 3, 1, 2, 1, 2, 2, 2)
(1, 2, 2, 1, 0, 2, 0, 0, 2, 1)
(2, 3, 2, 0, 3, 1, 0, 3, 3, 3)
(2, 0, 1, 1, 1, 1, 3, 1, 0, 0)
(2, 1, 3, 2, 2, 1, 1, 1, 1, 0)
(2, 2, 3, 1, 1, 1, 0, 2, 0, 3)
(2, 1, 2, 0, 2, 0, 0, 3, 3, 1)
(3, 2, 3, 2, 3, 0, 0, 0, 0, 0)
(0, 2, 3, 0, 0, 3, 1, 3, 0, 3)
(0, 0, 3, 1, 0, 3, 3, 1, 0, 1)
(1, 1, 2, 0, 3, 2, 2, 0, 2, 3)
(3, 0, 2, 3, 0, 2, 2, 3, 2, 3)
(3, 0, 1, 2, 1, 1, 0, 0, 3, 1)
(3, 1, 0, 2, 1, 0, 0, 3, 2, 2)
(1, 3, 2, 3, 0, 3, 1, 3, 3, 0)
(1, 0, 2, 2, 2, 2, 2, 3, 1, 1)
(0, 0, 2, 1, 0, 3, 0, 1, 0, 3)
(2, 3, 2, 2, 0, 2, 3, 2, 2, 3)
(0, 2, 3, 0, 0, 0, 2, 0, 3, 2)
(1, 1, 0

In [None]:
### Build a classifier to predict sparsity level from a policy
### Idea 1: hack-y heuristics

def heuristic_classifier(MDP, policy):
    """
    A heuristic classifier that predicts the sparsity level of an MDP's reward function given its
    optimal policy

    Not implemented yet: the heuristic steps are still to be designed, so calling this
    raises instead of silently returning None as if it were a prediction.

    Args:
        MDP: the MDP whose reward sparsity should be predicted.
            NOTE(review): inside this function the parameter name shadows the `MDP`
            module imported at the top of the notebook; the name is kept so keyword
            callers do not break.
        policy: the optimal policy of that MDP
    Returns:
        the predicted sparsity level (once implemented)
    Raises:
        NotImplementedError: always, until the heuristic is written
    """
    # TODO: write down and implement the heuristic steps.
    raise NotImplementedError("heuristic_classifier is not implemented yet")


In [None]:
### Idea 2: neural network

