In [1]:
import gym
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torch_geometric.nn.conv import GCNConv
import math

Setup:
- Starting point: just try to train classifier on RL policies

In [13]:
### DQN implementation

# Define the Q-network
class DQN(nn.Module):
    """Fully connected Q-network: maps a state vector to one Q-value per action.

    The layers live in ``self.fc`` (an ``nn.Sequential``): two hidden linear
    layers of width ``hidden_dim``, each followed by a ReLU, and a linear
    output layer of width ``action_dim``.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        stack = [
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the Q-values computed by the MLP for input ``x``."""
        q_values = self.fc(x)
        return q_values

    def get_weights(self):
        """Return the module's ``state_dict`` (parameter name -> tensor)."""
        weights = self.state_dict()
        return weights

# Experience Replay Buffer
class ReplayBuffer:
    """Bounded FIFO store of ``(state, action, reward, next_state, done)`` transitions.

    Backed by a ``deque`` with ``maxlen=capacity``, so the oldest transition is
    dropped once the buffer is full.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Store one transition; both states get a leading axis of length 1."""
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where the two
            state entries are arrays stacked along axis 0 and the remaining
            entries are tuples.
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# DQN Agent
class DQNAgent:
    """Epsilon-greedy DQN agent with an experience-replay buffer.

    No separate target network is used: the bootstrap target is computed with
    the same ``self.model`` that is being optimised.

    Args:
        state_dim: Width of the state vector fed to the Q-network.
        action_dim: Number of discrete actions.
        hidden_dim: Hidden layer width of the Q-network.
        lr: Adam learning rate.
        batch_size: Number of transitions sampled per update.
        gamma: Discount factor.
        replay_size: Capacity of the replay buffer.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64, lr=1e-2, batch_size=64, gamma=0.99, replay_size=1000):
        self.model = DQN(state_dim, action_dim, hidden_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.replay_buffer = ReplayBuffer(replay_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.action_dim = action_dim

    def update(self):
        """Run one gradient step on a sampled mini-batch.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return
        state, action, reward, next_state, done = self.replay_buffer.sample(self.batch_size)

        state = torch.FloatTensor(state)
        next_state = torch.FloatTensor(next_state)
        # A 1-D batch means each state is a single scalar; give it a feature axis.
        if len(state.shape) == 1:
            state = state.reshape(-1, 1)
        if len(next_state.shape) == 1:
            next_state = next_state.reshape(-1, 1)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(done)

        # Q(s, a) for the actions that were actually taken.
        q_value = self.model(state).gather(1, action.unsqueeze(1)).squeeze(1)

        # The target is treated as a constant, so skip building its autograd graph.
        with torch.no_grad():
            next_q_value = self.model(next_state).max(1)[0]
            # (1 - done) removes the bootstrap term on terminal transitions.
            expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        loss = nn.MSELoss()(q_value, expected_q_value)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def act(self, state, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action with the highest predicted Q-value.
        """
        if random.random() > epsilon:
            state = torch.FloatTensor(np.expand_dims(state, 0))
            # Inference only: no gradients are needed to pick the argmax.
            with torch.no_grad():
                q_value = self.model(state)
            action = q_value.max(-1)[1].item()
        else:
            action = random.randrange(self.action_dim)
        return action
    
class QTableAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    The Q-values are stored in ``self.q_table``, a zero-initialised array of
    shape ``(state_dim, action_dim)`` indexed by ``[state, action]``.
    """

    def __init__(self, state_dim, action_dim, lr=1e-2, gamma=0.99):
        self.q_table = np.zeros((state_dim, action_dim))
        self.lr = lr
        self.gamma = gamma
        self.action_dim = action_dim

    def update(self, state, action, reward, next_state, done):
        """Apply one temporal-difference update for a single transition."""
        current = self.q_table[state, action]
        bootstrap = np.max(self.q_table[next_state])
        # (1 - done) removes the bootstrap term on terminal transitions.
        target = reward + self.gamma * bootstrap * (1 - done)
        td_error = target - current
        self.q_table[state, action] += self.lr * td_error

    def act(self, state, epsilon):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        greedy = random.random() > epsilon
        if not greedy:
            return random.randrange(self.action_dim)
        return np.argmax(self.q_table[state])

In [17]:
# Running count of rewards that are not (approximately) zero; updated by train_dqn.
NUM_NON_ZERO_REWARDS = 0
def one_hot_state(state, env):
    """Return a one-hot vector of length ``env.observation_space.n`` with a 1 at ``state``."""
    encoded = np.zeros(env.observation_space.n)
    encoded[state] = 1
    return encoded

def train_dqn(env_name="CartPole-v1", episodes=500, epsilon_start=1.0, epsilon_final=0.01, 
              epsilon_decay=500, reward_function = None, verbose = False, return_reward = False, 
              print_every=50, **kwargs):
    """
    Train a DQN agent on the specified environment.

    Uses the classic gym API: ``env.reset()`` returns the observation and
    ``env.step()`` returns ``(next_state, reward, done, info)``.
    Observations from spaces with an empty ``shape`` (discrete spaces) are
    one-hot encoded before being passed to the agent.

    Args:
        env_name: str
            Name of the environment to train the agent on.
        episodes: int
            Number of episodes to train the agent for.
        epsilon_start: float
            Initial epsilon value for epsilon-greedy action selection.
        epsilon_final: float
            Final epsilon value for epsilon-greedy action selection.
        epsilon_decay: float
            Decay scale for epsilon, measured in episodes (epsilon is
            recomputed once per episode, not per step).
        reward_function: function
            Optional reward function. When given, it replaces the environment
            reward on the terminal step only and is called with the (encoded)
            next state.
        verbose: bool
            Whether to print training progress.
        return_reward: bool
            If True, also return the per-episode total rewards.
        print_every: int
            Progress is printed every ``print_every`` episodes when verbose.
        **kwargs:
            Forwarded to the ``DQNAgent`` constructor.

    Returns:
        DQNAgent: trained DQN agent, or ``(agent, rewards)`` when
        ``return_reward`` is True, where ``rewards`` is an array of length
        ``episodes``.
    """
    global NUM_NON_ZERO_REWARDS
    env = gym.make(env_name)
    # Decide once whether observations need one-hot encoding. Only spaces with
    # an empty shape expose `.n`; reading `.n` on a Box space (e.g. CartPole)
    # raises AttributeError, so it must not be used as the test itself.
    discrete_obs = len(env.observation_space.shape) == 0
    if discrete_obs:
        state_dim = env.observation_space.n
    else:
        state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = DQNAgent(state_dim, action_dim, **kwargs)

    def epsilon_by_episode(episode_idx):
        # Exponential decay from epsilon_start towards epsilon_final.
        return epsilon_final + (epsilon_start - epsilon_final) * np.exp(-1. * episode_idx / epsilon_decay)

    rewards = np.zeros(episodes)
    try:
        for episode in range(episodes):
            state = env.reset()
            if discrete_obs:
                state = one_hot_state(state, env)
            episode_reward = 0
            # Epsilon depends only on the episode index, so compute it once per episode.
            epsilon = epsilon_by_episode(episode)
            while True:
                action = agent.act(state, epsilon)
                next_state, reward, done, _ = env.step(action)
                if discrete_obs:
                    next_state = one_hot_state(next_state, env)

                if reward_function and done:  # custom reward on the terminal step only
                    reward = reward_function(next_state)
                NUM_NON_ZERO_REWARDS += 0 if math.isclose(reward, 0) else 1

                agent.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward

                agent.update()

                if done:
                    break

            if verbose and episode % print_every == print_every - 1:
                print(f"Episode: {episode+1}, Total reward: {episode_reward}, Epsilon: {epsilon:.2f}")
            rewards[episode] = episode_reward
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()
    return agent if not return_reward else (agent, rewards)

# Optional: Function to render the environment with the current policy
def render_env(env, agent):
    """Play one episode greedily (epsilon 0), rendering the environment after each step.

    Observations are passed to ``agent.act`` exactly as the environment
    returns them; no encoding is applied here.
    """
    observation = env.reset()
    finished = False
    while not finished:
        greedy_action = agent.act(observation, 0)
        observation, _reward, finished, _info = env.step(greedy_action)
        env.render()

In [19]:
def train_qtable(env_name="CartPole-v1", episodes=500, epsilon_start=1.0, epsilon_final=0.01, 
              epsilon_decay=500, reward_function = None, verbose = False, return_reward = False, 
              print_every=50, **kwargs):
    """
    Train a Q-table agent on the specified environment.

    Observations are used directly as row indices into the Q-table, and the
    classic gym API is assumed (``reset()`` returns the observation, ``step()``
    returns a 4-tuple). Extra keyword arguments are forwarded to
    ``QTableAgent``. When ``reward_function`` is given, it replaces the
    environment reward on the terminal step only.

    Returns:
        The trained agent, or ``(agent, rewards)`` when ``return_reward`` is
        True, where ``rewards`` holds the total reward of each episode.
    """
    env = gym.make(env_name)
    obs_space = env.observation_space
    state_dim = obs_space.n if len(obs_space.shape) == 0 else obs_space.shape[0]
    agent = QTableAgent(state_dim, env.action_space.n, **kwargs)

    def epsilon_for(episode_idx):
        # Exponential decay from epsilon_start towards epsilon_final.
        return epsilon_final + (epsilon_start - epsilon_final) * np.exp(-1. * episode_idx / epsilon_decay)

    rewards = np.zeros(episodes)
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0
        epsilon = epsilon_for(episode)
        done = False
        while not done:
            action = agent.act(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            if reward_function and done:
                reward = reward_function(next_state)

            agent.update(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward

        if verbose and episode % print_every == print_every - 1:
            print(f"Episode: {episode+1}, Total reward: {episode_reward}, Epsilon: {epsilon:.2f}")
        rewards[episode] = episode_reward

    env.close()
    return (agent, rewards) if return_reward else agent
    

In [4]:
NEAR_ZERO = 1e-9
def test_dqn(env, agent, episodes=10, reward_function=None, verbose = False):
    print(f"Maximum reward: {env.spec.reward_threshold}")
    average_value = 0
    for episode in range(episodes):
        # if episode == 0:
        #     render_env(env, agent)
        state = env.reset()
        if len(env.observation_space.shape) == 0:
            state = one_hot_state(state, env)
        episode_reward = 0
        done = False
        while not done:
            action = agent.act(state, 0)  # Using 0 epsilon for greedy action selection
            next_state, reward, done, _ = env.step(action)
            if len(env.observation_space.shape) == 0:
                next_state = one_hot_state(next_state, env)
            if reward_function and done:
                reward = reward_function(next_state)
            episode_reward += reward
            state = next_state
        if verbose:
            print(f"Episode: {episode+1}, Total reward: {episode_reward}")
        average_value += episode_reward
    average_value /= episodes
    print(f"Average reward: {average_value}")
    

In [24]:
def test_qtable(env, agent, episodes=10, reward_function=None, verbose = False):
    """
    Test a Q-table agent on the specified environment.
    (This is basically test_dqn but without the one-hot encoding.)
    """
    print(f"Maximum reward: {env.spec.reward_threshold}")
    average_value = 0
    for episode in range(episodes):
        state = env.reset()
        episode_reward = 0
        done = False
        while not done:
            action = agent.act(state, 0)  # Using 0 epsilon for greedy action selection
            next_state, reward, done, _ = env.step(action)
            if reward_function and done:
                reward = reward_function(next_state)
            episode_reward += reward
            state = next_state
        if verbose:
            print(f"Episode: {episode+1}, Total reward: {episode_reward}")
        average_value += episode_reward
    average_value /= episodes
    print(f"Average reward: {average_value}")

In [21]:
# Train a tabular Q-learning agent on Taxi-v3 using the environment's own
# reward (no reward_function is passed). `env_name` is reused by later cells.
# epsilon_decay=10 is measured in episodes, so exploration drops to ~0.01
# within the first ~100 episodes (see the printed Epsilon values).
env_name = "Taxi-v3"
agent, rewards = train_qtable(env_name = env_name, episodes = 10000, verbose = True, return_reward = True,
                           epsilon_decay=10, lr=0.1, gamma=0.9)
# rewards

Episode: 50, Total reward: -317, Epsilon: 0.02
Episode: 100, Total reward: -227, Epsilon: 0.01
Episode: 150, Total reward: -326, Epsilon: 0.01
Episode: 200, Total reward: -161, Epsilon: 0.01
Episode: 250, Total reward: -193, Epsilon: 0.01
Episode: 300, Total reward: -60, Epsilon: 0.01
Episode: 350, Total reward: -86, Epsilon: 0.01
Episode: 400, Total reward: -192, Epsilon: 0.01
Episode: 450, Total reward: -295, Epsilon: 0.01
Episode: 500, Total reward: -8, Epsilon: 0.01
Episode: 550, Total reward: -19, Epsilon: 0.01
Episode: 600, Total reward: -51, Epsilon: 0.01
Episode: 650, Total reward: 14, Epsilon: 0.01
Episode: 700, Total reward: -53, Epsilon: 0.01
Episode: 750, Total reward: -13, Epsilon: 0.01
Episode: 800, Total reward: 7, Epsilon: 0.01
Episode: 850, Total reward: 10, Epsilon: 0.01
Episode: 900, Total reward: 8, Epsilon: 0.01
Episode: 950, Total reward: 11, Epsilon: 0.01
Episode: 1000, Total reward: 5, Epsilon: 0.01
Episode: 1050, Total reward: 12, Epsilon: 0.01
Episode: 1100, T

In [26]:
# Greedy evaluation of the Q-table agent on a fresh environment instance,
# averaged over 100 episodes.
test_qtable(gym.make(env_name), agent, episodes = 100)
# agent.model.get_weights()

Maximum reward: 8
Average reward: 7.87


In [7]:
### Coherence classifier

#agent.model.get_weights()

# Define a simple GCN model
from torch_geometric.data import Data
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network producing 2 output features per node.

    The input width is taken from ``data.num_node_features``; the hidden layer
    has 4 features. The ``data`` object passed to the constructor is also kept
    on the instance as ``self.data``.
    """

    def __init__(self, data):
        super().__init__()
        hidden_width, out_width = 4, 2
        self.conv1 = GCNConv(data.num_node_features, hidden_width)
        self.conv2 = GCNConv(hidden_width, out_width)
        self.data = data

    def forward(self, data):
        """Return per-node outputs for the graph described by ``data.x`` / ``data.edge_index``."""
        node_feats = data.x
        connectivity = data.edge_index
        hidden = torch.relu(self.conv1(node_feats, connectivity))
        return self.conv2(hidden, connectivity)
    
    
def nn_to_data(model: nn.Module) -> Data:
    """Convert a feed-forward network of ``nn.Linear`` layers into a graph.

    Every neuron (inputs of the first linear layer plus the outputs of every
    linear layer) becomes a node, and every weight becomes a directed edge
    from its input neuron to its output neuron. Consecutive linear layers are
    assumed to be chained, i.e. each layer's input width equals the previous
    layer's output width.

    Each node gets 4 features: its bias (0 for input neurons) and the mean,
    median and variance of a length ``num_nodes - 1`` row that holds the
    weights of the edges touching the node and zeros elsewhere.

    Args:
        model: Module whose first child is either an ``nn.Sequential`` holding
            the layers, or whose direct children are the layers themselves.
            Non-linear children (e.g. activations) are ignored.

    Returns:
        ``Data`` with ``x`` of shape ``(num_nodes, 4)`` and ``edge_index`` of
        shape ``(2, num_edges)``.

    Raises:
        ValueError: If no ``nn.Linear`` layer is found.
    """
    # Unwrap a leading nn.Sequential container; otherwise use the direct children.
    first_child = next(model.children())
    container = first_child if isinstance(first_child, nn.Sequential) else model
    linear_layers = [layer for layer in container.children() if isinstance(layer, nn.Linear)]
    if not linear_layers:
        raise ValueError("nn_to_data requires a model with at least one nn.Linear layer")

    num_nodes = linear_layers[0].weight.shape[1] + sum(layer.weight.shape[0] for layer in linear_layers)
    # Column 0 holds the bias; the remaining columns hold incident edge weights
    # (zero where there is no edge).
    node_features = torch.zeros(num_nodes, num_nodes)

    edges = []
    idx = 0  # global index of the first input neuron of the current layer
    for layer in linear_layers:
        output_dim, input_dim = layer.weight.shape
        src_rows = slice(idx, idx + input_dim)
        dst_rows = slice(idx + input_dim, idx + input_dim + output_dim)

        edges.extend(
            (idx + i, idx + input_dim + j)
            for i in range(input_dim)
            for j in range(output_dim)
        )

        # detach().cpu() works for parameters on any device (numpy() does not).
        edge_weights = layer.weight.detach().cpu().T  # (input_dim, output_dim)
        if layer.bias is not None:
            node_features[dst_rows, 0] = layer.bias.detach().cpu()
        node_features[src_rows, 1 + idx:1 + idx + output_dim] = edge_weights
        node_features[dst_rows, 1 + idx:1 + idx + input_dim] = edge_weights.T

        idx += input_dim

    # Summarise each node's weight row with three statistics instead of
    # passing the full (num_nodes - 1)-wide row to the GNN.
    weight_rows = node_features[:, 1:]
    row_mean = torch.mean(weight_rows, dim=1)
    row_median = torch.median(weight_rows, dim=1)[0]
    row_var = torch.var(weight_rows, dim=1)
    x = torch.stack([node_features[:, 0], row_mean, row_median, row_var]).T

    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    return Data(x=x, edge_index=edge_index)

# Convert one agent's Q-network to a graph and build a GCN sized to its node features.
# NOTE(review): this needs `agent` to expose `.model` (a DQNAgent). The `agent`
# assigned by the Taxi-v3 Q-table cell is a QTableAgent, which has no `.model`;
# presumably this cell was run while `agent` was a DQNAgent — confirm the
# intended execution order.
data = nn_to_data(agent.model)
gcn = GCN(data)
# data.x.shape, data.edge_index.shape
# print(data.x)

# Debug: every node index in edge_index must be smaller than the number of nodes.
out_of_bounds = data.edge_index >= data.x.shape[0]
if out_of_bounds.any():
    print("Out-of-bounds indices found at locations:")
    # Show only the edges (columns) that contain at least one invalid index.
    print(data.edge_index[:, out_of_bounds.any(dim=0)])

In [8]:
# Dataset generation
# Shared environment instance; later cells read its observation/action spaces.
# Uses whatever `env_name` is currently bound to in the kernel.
env = gym.make(env_name)
NEAR_ZERO = 1e-9  # half-width of the interval used for "sparse" (approximately zero) rewards
NUM_REWARD_CALLS = 0  # number of deterministic_random calls so far
NUM_NON_ZERO_REWARDS = 0  # reset here; incremented inside train_dqn
def deterministic_random(*args, lb = -1, ub = 1, sparsity = 0.0, continuous = False):
    """
    Return a pseudo-random number that is a deterministic function of ``args``.

    Used to generate deterministic reward functions for the coherence
    classifier: the same arguments always produce the same value. The seed is
    derived from the string formatting of ``args``.

    The value is drawn from a private generator, so calling this function does
    not reseed or advance the module-level ``random`` state that the agents use
    for exploration and replay sampling.

    Args:
        *args: Values identifying the draw (e.g. a state).
        lb, ub: Bounds of the uniform distribution for a dense draw.
        sparsity: Probability of returning a value in
            ``[-NEAR_ZERO, NEAR_ZERO]`` instead of a dense draw.
        continuous: Currently unused.

    Returns:
        float: the sampled value.
    """
    global NUM_REWARD_CALLS
    NUM_REWARD_CALLS += 1
    unique_seed = f"{args}".encode("utf-8")
    # Private generator: same values as seeding the global module with
    # `unique_seed`, without clobbering the global random state.
    rng = random.Random(unique_seed)
    if rng.random() > sparsity:
        return rng.uniform(lb, ub)
    return rng.uniform(-NEAR_ZERO, NEAR_ZERO)

NUM_TRAIN_R_FUNCS = 50
NUM_EPS_TRAIN_R = 50
# deterministic_random is a pure function of its arguments, so each reward
# function needs its own identifier in the seed. Without it, all
# NUM_TRAIN_R_FUNCS lambdas would be the same reward function. `_idx=idx`
# binds the index at definition time (avoiding late binding in the
# comprehension), and the "URS"/"USS" tag keeps the two families from sharing
# seeds.
URS_r_funcs = [lambda *args, _idx=idx: deterministic_random("URS", _idx, args)
               for idx in range(NUM_TRAIN_R_FUNCS)]
URS_agents = [train_dqn(env_name = env_name, 
                        episodes=NUM_EPS_TRAIN_R, reward_function=r_func) for r_func in URS_r_funcs]
USS_r_funcs = [lambda *args, _idx=idx: deterministic_random("USS", _idx, args, sparsity=0.999)
               for idx in range(NUM_TRAIN_R_FUNCS)]
# These counters cover the URS training runs only (printed before USS training starts).
print(f"Number of reward function calls: {NUM_REWARD_CALLS}")
print(f"Number of non-zero rewards: {NUM_NON_ZERO_REWARDS}")
USS_agents = [train_dqn(env_name = env_name, 
                        episodes=NUM_EPS_TRAIN_R, reward_function=r_func) for r_func in USS_r_funcs]

Number of reward function calls: 59414
Number of non-zero rewards: 59414


In [9]:
# Test if deterministic_random is deterministic and has the correct sparsity
# Same arguments -> same value.
assert deterministic_random(1, 2, 3, 4) == deterministic_random(1, 2, 3, 4)
# Different arguments -> different value.
assert not deterministic_random(1, 2, 3, 4) == deterministic_random(1, 2, 3, 6)
# With sparsity = 0.5, roughly half of the draws should be within NEAR_ZERO of 0
# (inspect the displayed list by eye).
[deterministic_random(1, 2, 3, i, sparsity = 0.5) for i in range(10)]

[0.6014137224608205,
 5.734868810378968e-10,
 0.18947200717913826,
 -0.11464719428521586,
 1.9375194864306798e-11,
 3.1131227593489704e-10,
 -7.023178277046693e-10,
 2.965355797951794e-10,
 0.41831271768541045,
 -3.207247699354683e-10]

In [13]:
# Test when do USS agents have non-zero rewards
# NOTE(review): despite the USS_ prefix, this reward function uses sparsity=0.0,
# i.e. every draw is dense — confirm that this is intended.
USS_test_r_func = lambda *args: deterministic_random(args, sparsity=0.0)
assert USS_test_r_func(42) == USS_test_r_func(42)
USS_test_agent = train_dqn(env_name = env_name, episodes=500, reward_function=USS_test_r_func, 
                           verbose = True)
# The printed Epsilon is the exploration rate (probability of a random action).
# With the default epsilon_decay=500 it is still about 0.37 after 500 episodes.

Episode: 50, Total reward: -3.423544244882467, Epsilon: 0.91
Episode: 100, Total reward: -2.9740031925320296, Epsilon: 0.82
Episode: 150, Total reward: 0.44985805334502116, Epsilon: 0.74
Episode: 200, Total reward: 1.9841901190163609, Epsilon: 0.67
Episode: 250, Total reward: 1.9401242078162357, Epsilon: 0.61
Episode: 300, Total reward: -1.4982150868261042, Epsilon: 0.55
Episode: 350, Total reward: -3.697850242769119, Epsilon: 0.50
Episode: 400, Total reward: 2.4217055508022867, Epsilon: 0.46
Episode: 450, Total reward: -3.5300506186568104, Epsilon: 0.41
Episode: 500, Total reward: -1.2907808496919664, Epsilon: 0.37


In [11]:
# Scratch code for inspecting the epsilon schedule (kept for reference):
# epsilon_final, epsilon_start, epsilon_decay = 0.01, 1.0, 500
# [epsilon_final + (epsilon_start - epsilon_final) * np.exp(-1. * frame_idx / epsilon_decay) for frame_idx in range(500)]
# Greedy evaluation on CartPole-v1. The terminal-step reward comes from
# USS_test_r_func; all other steps keep the environment reward.
test_dqn(gym.make("CartPole-v1"), USS_test_agent, reward_function=USS_test_r_func)

Maximum reward: 475.0
Episode: 1, Total reward: -15.55250607481602
Episode: 2, Total reward: 7.210228177137941
Episode: 3, Total reward: -8.919077872422985
Episode: 4, Total reward: 3.3986894664967524
Episode: 5, Total reward: -5.9617775051492
Episode: 6, Total reward: -8.334476064077458
Episode: 7, Total reward: 0.05630472186067914
Episode: 8, Total reward: -2.9496726002074958
Episode: 9, Total reward: -17.47453520502488
Episode: 10, Total reward: 27.669555440546127


In [28]:
# UPS agents: freshly initialised DQNAgents that are never trained, i.e. random
# network weights. Requires an observation space with a non-empty shape.
UPS_agents = [DQNAgent(env.observation_space.shape[0], env.action_space.n) for _ in range(NUM_TRAIN_R_FUNCS)]

import torch.nn.functional as F
from torch_geometric.nn import global_mean_pool, GCNConv, GATConv

class GraphLevelGCN(torch.nn.Module):
    """Graph-level binary classifier: two GCN layers, mean pooling, linear head.

    ``forward`` returns a sigmoid output per graph (values in (0, 1)).
    """

    def __init__(self, num_node_features):
        super().__init__()
        width = 16
        self.conv1 = GCNConv(num_node_features, width)
        self.conv2 = GCNConv(width, width)
        self.linear = torch.nn.Linear(width, 1)

    def forward(self, data):
        """Classify the graph(s) in ``data`` (reads ``x``, ``edge_index`` and ``batch``)."""
        node_feats, connectivity, batch = data.x, data.edge_index, data.batch

        hidden = F.relu(self.conv1(node_feats, connectivity))
        hidden = self.conv2(hidden, connectivity)

        # Average node embeddings per graph to get one vector per graph.
        pooled = global_mean_pool(hidden, batch)

        logits = self.linear(pooled)
        return torch.sigmoid(logits)

class GATGraphLevelBinary(torch.nn.Module):
    """Graph-level binary classifier built from two graph-attention layers.

    The first layer uses 8 attention heads of 8 features each (its input to
    the second layer is 64 wide); the second layer uses a single head with 16
    output features. Dropout with p=0.6 is applied to the inputs of both
    layers and inside the attention layers. ``forward`` returns a sigmoid
    output per graph.
    """

    def __init__(self, num_node_features):
        super().__init__()
        heads, per_head, out_width = 8, 8, 16
        self.conv1 = GATConv(num_node_features, per_head, heads=heads, dropout=0.6)
        self.conv2 = GATConv(per_head * heads, out_width, heads=1, concat=False, dropout=0.6)
        # Linear head producing the single graph-level output.
        self.linear = torch.nn.Linear(out_width, 1)

    def forward(self, data):
        """Classify the graph(s) in ``data`` (reads ``x``, ``edge_index`` and ``batch``)."""
        node_feats, connectivity, batch = data.x, data.edge_index, data.batch

        hidden = F.dropout(node_feats, p=0.6, training=self.training)
        hidden = F.elu(self.conv1(hidden, connectivity))
        hidden = F.dropout(hidden, p=0.6, training=self.training)
        hidden = self.conv2(hidden, connectivity)

        # Average node embeddings per graph to get one vector per graph.
        pooled = global_mean_pool(hidden, batch)
        logits = self.linear(pooled)
        return torch.sigmoid(logits)

# Build the graph datasets: convert every agent's Q-network into a graph.
USS_data = [nn_to_data(agent.model) for agent in USS_agents]
URS_data = [nn_to_data(agent.model) for agent in URS_agents]
print(URS_data[0].x.shape)
UPS_data = [nn_to_data(agent.model) for agent in UPS_agents]
# Trained and untrained networks must yield identically shaped node-feature matrices.
assert URS_data[0].x.shape == UPS_data[0].x.shape

torch.Size([134, 4])


In [36]:
# Binary classification between two datasets
dataset1 = URS_data
dataset2 = UPS_data
# Shuffle the union of both datasets; indices < len(dataset1) refer to dataset1.
indices = np.random.permutation(len(dataset1) + len(dataset2))
data = [dataset1[i] if i < len(dataset1) else dataset2[i - len(dataset1)] for i in indices]
for i in range(len(data)):
    # NOTE: this sets `.y` on the very same Data objects held in dataset1/dataset2.
    data[i].y = 1.0 if indices[i] < len(dataset1) else 0.0 # One binary label per graph; 1 = dataset1 (URS), 0 = dataset2 (UPS)
    # Hence roughly speaking, 1 = more coherent, 0 = less coherent

# 80/20 split of the shuffled graphs.
train_data_ratio = 0.8
train_data, test_data = data[:int(train_data_ratio * len(data))], data[int(train_data_ratio * len(data)):]
# Loss and optimizer
num_node_features = data[0].x.shape[1] # Number of features for each node
model = GraphLevelGCN(num_node_features)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

epochs = 40
# Set the number of epochs to wait for early stopping
patience = 3
# Initialize variables for early stopping
best_loss = float('inf')
epochs_without_improvement = 0

for epoch in range(epochs):
    # One optimizer step per graph (effective batch size of 1).
    avg_train_loss = 0
    for datapt in train_data:
        model.train()
        optimizer.zero_grad()

        # datapt.x is [num_nodes, num_node_features]; datapt.edge_index is [2, num_edges]
        out = model.forward(datapt)
        # The target is wrapped as a [1, 1] tensor for BCELoss.
        loss = criterion(out, torch.tensor([[datapt.y]]))
        loss.backward()
        optimizer.step()
        avg_train_loss += loss.item()
    avg_train_loss /= len(train_data)

    # Evaluate on the held-out graphs without tracking gradients.
    avg_test_loss = 0
    for datapt in test_data:
        model.eval()
        with torch.no_grad():
            out = model.forward(datapt)
            loss = criterion(out, torch.tensor([[datapt.y]]))
            avg_test_loss += loss.item()
    avg_test_loss /= len(test_data)
    
    print(f'Epoch {epoch+1}: Average Train Loss: {avg_train_loss}, Average Test Loss: {avg_test_loss}')
    
    # Early Stopping
    # NOTE(review): the stopping criterion monitors the loss on test_data, so
    # test_data doubles as a validation set here. The model is not restored to
    # its best epoch; training just stops.
    if avg_test_loss < best_loss:
        best_loss = avg_test_loss
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break

Epoch 1: Average Train Loss: 0.5860631885792827, Average Test Loss: 0.573617972061038
Epoch 2: Average Train Loss: 0.27878733583838766, Average Test Loss: 0.17967451736330986
Epoch 3: Average Train Loss: 0.18251030254305095, Average Test Loss: 0.08214785424061119
Epoch 4: Average Train Loss: 0.13158310467361503, Average Test Loss: 0.04473840835489682
Epoch 5: Average Train Loss: 0.08827660984985304, Average Test Loss: 0.024071608714803006
Epoch 6: Average Train Loss: 0.05517907837950346, Average Test Loss: 0.01481392509106172
Epoch 7: Average Train Loss: 0.04254065692901765, Average Test Loss: 0.015370194957677085
Epoch 8: Average Train Loss: 0.04110167140394285, Average Test Loss: 0.020746125668010505
Epoch 9: Average Train Loss: 0.04651376320540823, Average Test Loss: 0.047950702946972255
Early stopping at epoch 9


In [37]:
# Test GCN model on a "more powerful" NN
# Sanity check on one example from each class (dataset1 is labelled 1, dataset2 is labelled 0).
# The calls run outside torch.no_grad(), hence the grad_fn in the printed tensors.
print(model.forward(dataset1[0]))
print(model.forward(dataset2[0]))
# Agents trained with the environment's own reward (no reward_function) for
# 5, 15 and 50 episodes, converted to graphs.
powerful_models = [nn_to_data(train_dqn(env_name = env_name, episodes = 5 * i).model) 
                   for i in [1, 3, 10]]
print([model.forward(data) for data in powerful_models])

tensor([[1.0000]], grad_fn=<SigmoidBackward0>)
tensor([[0.0007]], grad_fn=<SigmoidBackward0>)
[tensor([[0.2728]], grad_fn=<SigmoidBackward0>), tensor([[0.9962]], grad_fn=<SigmoidBackward0>), tensor([[1.]], grad_fn=<SigmoidBackward0>)]


- The classifier training process is finicky -- sometimes it overfits, sometimes it underfits -- but it can sometimes reach very low loss (< 0.002)
- Even weak classifiers classify powerful models (i.e. agents trained for more than 15 episodes in CartPole) as having P(URS) = 1, corresponding to coherence ~ $\infty$
- P(USS) / P(URS) still does not work well as a metric; with current methods it seems extremely difficult to detect differences between USS-generated and URS-generated policies
    - We will need some kind of "more advanced" coherence metric to distinguish more advanced policies; TODO: implement UUS somehow
- Adding node weights to every other node to the features passed into the GCN (such that, in CartPole, the data matrix has shape (134, 134) instead of (134, 1)) makes the GCN much worse, probably because of higher dimensionality
    - Using attention in the GNN does not help, and in fact actively overfits when using (134, 1) data
- Even with sparsity = 0.999, USS is still hard to distinguish
- For simpler discrete environments, maybe a Q-table is enough to solve the problem
- Learning an effective DQN policy takes >= 500 episodes and a small epsilon


In [None]:
# Now classifying q-table agents
NUM_EPS_TRAIN_R = 1000
NUM_TRAIN_R_FUNCS = 50
# One agent per reward function. `_idx=idx` gives every lambda its own seed
# component (deterministic_random depends only on its arguments), and the
# "URS"/"USS" tag keeps the two families from sharing seeds.
URS_agents = [train_qtable(env_name = env_name, episodes=NUM_EPS_TRAIN_R,
                           reward_function = lambda *args, _idx=idx: deterministic_random("URS", _idx, args))
              for idx in range(NUM_TRAIN_R_FUNCS)]
USS_agents = [train_qtable(env_name = env_name, episodes=NUM_EPS_TRAIN_R,
                           reward_function = lambda *args, _idx=idx: deterministic_random("USS", _idx, args, sparsity=0.99))
              for idx in range(NUM_TRAIN_R_FUNCS)]
# Untrained (all-zero) Q-tables. Q-tables need a discrete observation space,
# whose size is `.n`; its `shape` is empty, so `shape[0]` would raise IndexError.
UPS_agents = [QTableAgent(env.observation_space.n, env.action_space.n) for _ in range(NUM_TRAIN_R_FUNCS)]

# The Q-Table is already one-hot encoded, so we don't need to convert it to a Data object