Goal: measure how correlated certain features of the policy $\pi_0$ are to the value $P(URS | \pi=\pi_0)$, where $URS$ indicates that the policy was optimized for a random reward function $R \in U[-1,1]^{|T|}$ (where $|T|$ is the number of transitions with non-zero probability). For simplicity's sake, we assume that it was either optimized for some $R$ or generated uniformly randomly from the set of all policies, with a 50% chance of each scenario. We also assume that the reward is generated i.i.d. via $R(s, a, s') \sim N(0, 1)$.

In [2]:
import mdptoolbox as mdpt, numpy as np
import mdptoolbox.example

In [3]:
### Generate a bunch of MDPs with different parameters, sparsity
from functools import partial

NUM_MDPs = 100
NUM_STATES = 10
NUM_ACTIONS = 4

def get_transition_matrix(num_states, num_actions, generator = np.random.dirichlet):
    """
    Returns a determinstic transition matrix for a given number of states and actions
    
    Returns:
        P: (num_actions, num_states, num_states) array, where P[a, s, s'] is the probability of 
        transitioning from state s to state s' given action a
    """
    P = np.zeros((num_actions, num_states, num_states)) # (A, S, S) shape
    for a in range(num_actions):
        for s in range(num_states):
            P[a, s, :] = generator(np.ones(num_states))
    return P

def get_reward_matrix(num_states, num_actions, sparsity = 0.0, generator = partial(np.random.uniform, -1, 1), **kwargs):
    """
    Returns a reward matrix for a given number of states and actions
    """
    num_sparse_rewards = int(sparsity * num_actions * num_states ** 2)
    rewards = np.array([(0 if i < num_sparse_rewards else generator(**kwargs)) for i in range(num_actions * num_states ** 2)])
    np.random.shuffle(rewards)
    return rewards.reshape((num_actions, num_states, num_states))

DISCOUNT = 0.9
EPSILON = 0.01 # roughly indicates the "skill level" of the agent
MAX_ITER = 1000

In [4]:
def generate_tests(num_mdps = NUM_MDPs, sparsity_levels: np.ndarray = None, mdp_generator = mdpt.mdp.ValueIteration, P_generator = None, **kwargs):
    """
    Generate a bunch of MDPs with different sparsity levels, and return the sparsity levels and the MDPs

    Args:
        sparsity_levels: a list of sparsity levels to generate MDPs with
    Returns:
        sparsity_levels: the sparsity levels used to generate the MDPs, in the same order as the MDPs
        MDPS: an array of MDPs
    """
    (max_iter, epsilon) = (kwargs['max_iter'], kwargs['epsilon']) if 'max_iter' in kwargs and 'epsilon' in kwargs else (MAX_ITER, EPSILON)
    sparsity_levels = sparsity_levels if sparsity_levels is not None else np.arange(num_mdps) / num_mdps
    sparsity_copy = sparsity_levels.copy() # defensive copy
    np.random.shuffle(sparsity_copy)
    MDPS = np.array([mdp_generator(
        get_transition_matrix(NUM_STATES, NUM_ACTIONS) if P_generator is None else P_generator(NUM_STATES, NUM_ACTIONS), 
        get_reward_matrix(NUM_STATES, NUM_ACTIONS, sparsity_copy[i]), 
        DISCOUNT, max_iter = max_iter) 
        for i in range(num_mdps)
    ])
    for mdp in MDPS:
        if mdp_generator == mdpt.mdp.ValueIteration:
            mdp.epsilon = epsilon
    return sparsity_copy, MDPS

In [48]:
### Generate a bunch of MDPs (with baseline/zero sparsity), solve some of them, 
# generate random policy for others

def transition_function_sparse_loops(states, actions):
    """
    Sparse transition function with guaranteed loops
    """
    transitions = np.zeros((actions, states, states))
    for state in range(states):
        for action in range(actions):
            if action == 0:
                for next_state in range(states):
                    transitions[action, state, next_state] = 1 if next_state == state else 0
            else: # sparse randomness
                transitions[action, state, :] = np.zeros(states)
                transitions[action, state, np.random.randint(0, states)] = 1
    return transitions

NUM_MDPs = 10000
MDPS = generate_tests(NUM_MDPs, sparsity_levels = np.zeros(NUM_MDPs), P_generator = transition_function_sparse_loops)[1]
random_pol_indices = np.random.choice(NUM_MDPs, int(NUM_MDPs / 2), replace = False) # The indices of the MDPs with random policies
# print(random_pol_indices)
for i in range(NUM_MDPs): # 50% RR, 50% random
    MDPS[i].run()
for i in random_pol_indices:
    MDPS[i].policy = np.random.randint(NUM_ACTIONS, size = NUM_STATES)
policies = np.array([mdp.policy for mdp in MDPS])
# print(policies.shape)
random_pol_set = set(random_pol_indices)
random_or_rr = np.array([0 if i in random_pol_set else 1 for i in range(NUM_MDPs)])
# 0 if random, 1 if generated from RR

In [49]:
print(policies[0:10], random_or_rr[0:10])

[[0 3 0 0 3 2 0 2 0 2]
 [2 0 0 0 3 2 3 3 0 3]
 [3 2 3 3 3 2 3 1 2 2]
 [0 3 2 2 2 3 2 1 2 3]
 [1 2 0 1 0 2 1 2 3 1]
 [3 3 2 0 2 3 2 3 2 3]
 [2 1 1 3 1 2 3 2 1 3]
 [2 0 2 0 2 3 1 2 0 3]
 [0 2 2 3 1 2 0 3 2 2]
 [3 1 2 3 3 0 1 1 1 3]] [0 0 1 0 0 0 0 0 0 0]


In [69]:
### Linear Regression
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, log_loss
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras

def regression(X, y, test_size = 0.2, regression = LinearRegression):
    """
    Trains a linear regression model on the given data, and returns the model and the mean squared error
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size)
    model = regression().fit(X_train, y_train)
    return model, model.predict_proba(X_test), y_test

def neural_network(X, y, test_size = 0.2, *args, **kwargs):
    """
    Trains a neural network on the given data, and returns the model and the mean squared error
    """
    def build_model():
        model = keras.Sequential([
            keras.layers.Dense(64, activation = 'relu', input_shape = [X.shape[1]]),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(64, activation = 'relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(1, activation = 'sigmoid')
        ])
        return model
    model = build_model()
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['mae'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size)
    model.fit(X_train, y_train, epochs = 100, validation_split = 0.2, verbose = 1, 
              callbacks = [keras.callbacks.EarlyStopping(patience = 3)])
    return model, model.predict(X_test), y_test

### Generate features
encoder = OneHotEncoder(categories = 'auto', sparse_output = False, drop = 'first')
# Drop first to avoid multicollinearity, large coefficients
# encoder.fit(np.arange(NUM_ACTIONS))
# print(encoder.categories_)
policies_encoded = encoder.fit_transform(policies)
print(policies_encoded.shape) # expected (NUM_MDPs, NUM_ACTIONS * (NUM_STATES - 1))
print(np.array(MDPS[0].P).flatten().shape) # expected (NUM_ACTIONS * NUM_STATES ** 2,)

### Train the model
features = np.array([np.concatenate((np.array(MDPS[i].P).flatten(), policies_encoded[i]), axis = 0)
                     for i in range(NUM_MDPs)])
model, y_pred, y_test = regression(policies_encoded, random_or_rr, regression = LogisticRegression)
print("Average cross-entropy loss:", log_loss(y_test, y_pred, normalize = True))
print("Accuracy:", np.mean([np.round(y_pred[i][0]) != y_test[i] for i in range(len(y_pred))])) 
# if round(y_pred[0]) is 0, then model thinks 1 is more likely; if 1, then 0 is more likely
# print(y_pred)
print("Baseline log loss:", log_loss(y_test, np.full(y_pred.shape, 0.5), normalize = True))
print("Model coefficients:", model.coef_)
print("Sample outputs:", [(y_pred[i], y_test[i]) for i in range(10)])

(10000, 30)
(400,)
Average cross-entropy loss: 0.559703187665407
Accuracy: 0.716
Baseline log loss: 0.6931471805599454
Model coefficients: [[0.93365944 1.09335861 1.08134117 0.9046895  0.86205477 0.9282034
  0.94159757 0.93144888 0.99646272 0.93322143 0.9173951  0.91758552
  1.07091078 1.05837064 1.05053715 0.99545364 0.91679749 0.96035986
  0.95649449 0.8689126  0.8401811  1.06887182 0.98296002 0.95953848
  0.93476001 0.97818676 0.90844333 0.95514768 1.05818958 1.00610408]]
Sample outputs: [(array([0.56976936, 0.43023064]), 1), (array([0.3161201, 0.6838799]), 1), (array([0.54547437, 0.45452563]), 0), (array([0.88607478, 0.11392522]), 0), (array([0.16481902, 0.83518098]), 1), (array([0.34293926, 0.65706074]), 0), (array([0.51744431, 0.48255569]), 1), (array([0.54844328, 0.45155672]), 1), (array([0.30403666, 0.69596334]), 1), (array([0.54423985, 0.45576015]), 0)]


In [70]:
### Grab the five policies with the highest and lowest probabilities of being random

highest_probs = np.argsort(y_pred[:, 1])[-5:]
lowest_probs = np.argsort(y_pred[:, 1])[:5]
#print("Highest probabilities:", [(y_pred[i], y_test[i]) for i in highest_probs])
for i in np.concatenate((highest_probs, lowest_probs)):
    print("Policy:", policies[i], "Probability:", y_pred[i], "Actual:", y_test[i])

Policy: [2 1 3 3 3 0 2 0 3 1] Probability: [0.12215122 0.87784878] Actual: 1
Policy: [2 2 3 2 1 0 0 3 1 2] Probability: [0.12193878 0.87806122] Actual: 1
Policy: [0 3 2 0 3 0 2 0 3 2] Probability: [0.12147113 0.87852887] Actual: 1
Policy: [2 2 3 3 3 1 1 3 3 3] Probability: [0.12060468 0.87939532] Actual: 0
Policy: [0 2 1 3 1 3 1 0 2 3] Probability: [0.11692491 0.88307509] Actual: 1
Policy: [0 0 0 1 2 1 3 0 1 3] Probability: [0.99463203 0.00536797] Actual: 0
Policy: [1 1 3 2 3 0 1 1 2 3] Probability: [0.99380286 0.00619714] Actual: 0
Policy: [1 2 2 1 2 1 1 2 2 1] Probability: [0.99378602 0.00621398] Actual: 0
Policy: [2 2 3 3 1 1 3 1 1 0] Probability: [0.98556498 0.01443502] Actual: 0
Policy: [3 0 1 3 1 0 3 0 0 3] Probability: [0.98546272 0.01453728] Actual: 0


- On a random deterministic MDP(s), it doesn't seem like URS is identifiable, which is perhaps to be expected as every policy is optimal for some (normalized) reward function
    - This also matches our results when looking at the distribution of optimal policies for "cloud"-y MDPs
- Apparently my neural networks aren't predicting very well
- With MDPs with loops, logistic regression achieves ~0.70 accuracy; neural network does slightly better than random?
    - Although the NN's accuracy is basically 0.5
    - This holds true when we use the label predictions for regression (model.predict), as well as the probability prediction (model.predict_proba)
