In [2]:
import mdptoolbox as mdpt, numpy as np
import mdptoolbox.example
import MDP

Suppose, given a transition function and discount rate, we generate a random reward function over all transitions. We then sparsify the reward function by setting some proportion (e.g. 10%) of the transition values to 0. We then generate the optimal policy for said reward function (using, for instance, policy iteration). We now attempt to build a model that can predict the sparsity used to generate the optimal policy given the transition function, discount rate, and policy itself, but *not* the reward function, as otherwise the problem would be trivial.

In [3]:
### Generate a bunch of MDPs with different parameters, sparsity

NUM_MDPs = 100
NUM_STATES = 10
NUM_ACTIONS = 4

def get_transition_matrix(num_states, num_actions, generator = np.random.dirichlet):
    P = np.zeros((num_actions, num_states, num_states)) # (A, S, S) shape
    for a in range(num_actions):
        for s in range(num_states):
            P[a, s, :] = generator(np.ones(num_states))
    return P

def get_reward_matrix(num_states, num_actions, sparsity = 0.0, generator = np.random.normal):
    R = np.zeros((num_states, num_actions))
    for a in range(num_actions):
        for s in range(num_states):
            if np.random.rand() < sparsity:
                R[s, a] = 0
            else:
                R[s, a] = generator()
    return R

DISCOUNT = 0.9
EPSILON = 0.01
MAX_ITER = 1000

The sparsity levels generated by generate_tests are divided using arange from 0 to 1 and then scrambled randomly, meaning that in effect each sparsity level in the training and test sets is sampled uniformly from [0, 1].

In [45]:
def generate_tests(num_mdps = NUM_MDPs, sparsity_levels = None, mdp_generator = mdpt.mdp.PolicyIteration):
    """
    Generate a bunch of MDPs with different sparsity levels, and return the sparsity levels and the MDPs

    Args:
        sparsity_levels: a list of sparsity levels to generate MDPs with
    Returns:
        sparsity_levels: the sparsity levels used to generate the MDPs, in the same order as the MDPs
        MDPS: an array of MDPs
    """
    sparsity_levels = sparsity_levels if sparsity_levels is not None else np.arange(num_mdps) / num_mdps
    sparsity_copy = sparsity_levels.copy() # defensive copy
    np.random.shuffle(sparsity_copy)
    MDPS = np.array([mdp_generator(
        get_transition_matrix(NUM_STATES, NUM_ACTIONS), 
        get_reward_matrix(NUM_STATES, NUM_ACTIONS, sparsity_copy[i]), 
        DISCOUNT, max_iter = MAX_ITER) 
        for i in range(num_mdps)
    ])
    return sparsity_copy, MDPS

sparsity_levels, MDPS = generate_tests()
for mdp in MDPS:
    mdp.run()
    # print(mdp.policy) # debug
# print(MDPS[0].policy) # debug

In [5]:
### Build a classifier to predict sparsity level from a policy
### Idea 1: hack-y heuristics

def heuristic_classifier(MDP, policy):
    """
    A heuristic classifier that predicts the sparsity level of an MDP's reward function given its 
    optimal policy
    1. 
    """
    # TODO: implement this


In [60]:
### Idea 2: neural network
# Thanks again ChatGPT for outlining the code structure

sparsity, MDPs = generate_tests(100000)
# print(np.array(MDPs[0].P).shape)
training_data = [(np.array(mdp.P), mdp.discount, mdp.policy, sparsity[i]) for i, mdp in enumerate(MDPs)]

from sklearn.preprocessing import OneHotEncoder
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings("ignore")

# Step 1: Feature extraction function
def extract_features(transition_function, discount_rate, optimal_policy):
    """
    Extract features from the MDP's transition function, discount rate, and optimal policy
    """
    # opt_policy = optimal_policy.reshape(-1, 1)  # Reshape for sklearn which expects 2D input

    # # Initialize the OneHotEncoder
    # encoder = OneHotEncoder(sparse=False)  # Use sparse=False to get a dense array

    # # Fit and transform
    # opt_policy_one_hot = encoder.fit_transform(opt_policy)
    features = np.concatenate((transition_function.flatten(), [discount_rate], optimal_policy.flatten()))
    # print(features.shape)
    # length 10*10*4 + 1 + 10 = 411

    # Placeholder features
    # features = np.random.rand(411)
    return features

# Step 2: Data preparation (assuming you have your data in an appropriate format)
# This is a placeholder function - you would replace it with actual data loading and processing
def prepare_data(training_data):
    features = []
    labels = []
    for transition_function, discount_rate, optimal_policy, sparsity_level in training_data:
        features.append(extract_features(transition_function, discount_rate, optimal_policy))
        labels.append(sparsity_level)
    return np.array(features), np.array(labels)

# Step 3: Model selection

def build_model(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='linear')  # Linear activation for regression output
    ])

    # Num parameters: 411*64 + 64 + 64*64 + 64 + 64*64 + 64 + 64*1 + 1 = 26497
    
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='mean_squared_error',  # Suitable for regression
                  metrics=['mae'])  # Mean Absolute Error as an additional metric
    # ``loss" refers to training data, ``val_loss" refers to validation data
    return model

features, labels = prepare_data(training_data)
# Example: features shape is (num_samples, num_features), adjust 'input_dim' accordingly
input_dim = features.shape[1]  # Assuming 'features' is already defined and preprocessed

model = build_model(input_dim)

# Training the model
model.fit(features, labels, epochs=100, validation_split=0.2, verbose = 1, 
          callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)])

# Don't forget to preprocess your new data before making predictions
# predicted_sparsity = model.predict(new_features)

# Step 4: Training the model (placeholder for training data)
# training_data = load_your_data_somehow()
# features, labels = prepare_data(training_data)
# model.fit(features, labels)

# Step 5: Prediction function
def predict_sparsity(transition_function, discount_rate, optimal_policy):
    features = extract_features(transition_function, discount_rate, optimal_policy).reshape(1, -1)
    predicted_sparsity = model(features) # more efficient than .predict() for single samples
    return predicted_sparsity

# Note: The actual training step and data preparation would depend on your specific dataset and environment setup.
test_sparsity, test_MDPs = generate_tests()
test_data = [(np.array(mdp.P), mdp.discount, mdp.policy) for mdp in (test_MDPs)]
NUM_TESTS = 1000
mse = np.zeros(NUM_TESTS)

for i in range(min(NUM_TESTS, len(test_data))):
    transition_function, discount_rate, optimal_policy = test_data[i]
    prediction = predict_sparsity(transition_function, discount_rate, optimal_policy)
    mse[i] = (prediction - test_sparsity[i])**2
    # print(f"Predicted sparsity level for MDP {i}: {prediction}, actual sparsity level: {test_sparsity[i]}, Squared error: {mse[i]}")

print(f"Mean squared error: {np.mean(mse)}")
print("Expected squared error: when x, y ~ U[0, 1], E[(x-y)^2] = 1/12 = 0.0833...")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Mean squared error: 0.033806095979522444
Expected squared error: when x, y ~ U[0, 1], E[(x-y)^2] = 1/12 = 0.0833...


- As a control, when the input layer (with same dimension as transition_function + discount rate + optimal policy) is randomized, MSE = ~0.115
- I should also note that I'm choosing hyperparameters here in a rather unprincipled way by guess-timating their effects on the model
- The loss seems to settle around 0.033 after ~20% into each epoch when given 10^5 training points 

In [63]:
### Idea 3: Multiple linear regression 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

sparsity, MDPs = generate_tests(100000)
# print(np.array(MDPs[0].P).shape)
training_data = [(np.array(mdp.P), mdp.discount, mdp.policy, sparsity[i]) for i, mdp in enumerate(MDPs)]
print(sparsity)
features, labels = prepare_data(training_data)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Create a model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean squared error: {mse}")
print("Expected squared error: when x, y ~ U[0, 1], E[(x-y)^2] = 1/12 = 0.0833...")
print(f"Mean absolute error: {mae}")


[0.30965 0.36155 0.52631 ... 0.76905 0.8053  0.19016]
Mean squared error: 0.03445141010174199
Expected squared error: when x, y ~ U[0, 1], E[(x-y)^2] = 1/12 = 0.0833...
Mean absolute error: 0.14673268830908204
