In [1]:
import importlib
import rllib
from rllib.mdp import OptimalGoalPolicy
from rllib.shapeworld import GoalWorld
import numpy as np
import pandas as pd
import importlib
from random import sample
from math import log


## Class definitions

In [2]:
from typing import Sequence, Hashable, TypeVar, Generic, Container, Tuple
import random
import numpy as np
import pandas as pd
from random import Random
import pickle
from collections import defaultdict, Counter, namedtuple
from tqdm import tqdm
from typing import TypeVar
from rllib.mdp import MarkovDecisionProcess
# from .distributions import Distribution, DiscreteDistribution, Uniform, Gaussian

# States and actions must be hashable: they are used as dictionary keys
# (e.g. for value-function lookups here and for Q-learning later).
State = TypeVar('State', bound=Hashable)
Action = TypeVar('Action', bound=Hashable)
# A likelihood is a plain number (float or int).
Likelihood = TypeVar('Likelihood', float, int)
# A single shape, described by three named fields.
Shape = namedtuple('Shape', 'sides shade texture')

class GoalSelectionPolicy(Generic[State, Action]):
    '''Base class for policies that choose a goal within an MDP.

    Subclasses are expected to override ``sample_action`` and ``reset``;
    the versions defined here only raise ``NotImplementedError``.
    '''

    def __init__(self, mdp: MarkovDecisionProcess[State, Action]):
        '''Store the MDP and keep a direct reference to its state space.'''
        self.mdp = mdp
        # Shortcut so subclasses can iterate over states without going through the MDP.
        self.state_space = mdp.state_space

    def sample_action(self, rng : Random = random) -> tuple[Action, Likelihood]:
        '''Select a goal according to the policy.

        Args:
            rng: source of randomness; defaults to the ``random`` module.

        Returns:
            A pair of the chosen goal and a likelihood value for that choice.

        Raises:
            NotImplementedError: always, in this base class.
        '''
        raise NotImplementedError()

    def reset(self):
        '''Reset any policy state. Raises NotImplementedError in this base class.'''
        raise NotImplementedError()

class OptimalGoalPolicy(GoalSelectionPolicy[State, Action]):
    '''Selects goals with a softmax over a precomputed value function.

    Note on naming: the ``calc_log_lik*`` methods (and the ``'Log Likelihood'``
    DataFrame column) hold *negative* log-likelihoods, i.e. ``-log p(state)``.
    Smaller values therefore mean more probable states. The names are kept
    for backward compatibility.
    '''

    def __init__(self, mdp: MarkovDecisionProcess[State, Action], value_function: dict[State, float]):
        '''Store the MDP and the value function used to score goal states.

        Args:
            mdp: the MDP whose ``state_space`` defines the candidate goals.
            value_function: mapping from state to its value.
        '''
        super().__init__(mdp)
        # we will use the value function from value iteration to select the goal
        self.value_function = value_function

    @staticmethod
    def _softmax_terms(values) -> tuple[np.ndarray, np.ndarray]:
        '''Return ``(probabilities, negative_log_likelihoods)`` for a softmax over ``values``.

        The maximum is subtracted before exponentiating for numerical stability.
        The negative log-likelihoods are computed in log space
        (``log(sum(exp(v - max))) - (v - max)``) instead of ``-log(p)``, so a
        probability that underflows to zero still gets a finite value rather
        than ``inf`` plus a divide-by-zero warning.

        Raises:
            ValueError: if ``values`` is empty (raised by ``np.max``).
        '''
        shifted = np.asarray(values, dtype=float)
        shifted = shifted - np.max(shifted)
        exp_values = np.exp(shifted)
        total = np.sum(exp_values)
        return exp_values / total, np.log(total) - shifted

    def sample_action(self, rng: random.Random = random) -> tuple[State, float]:
        '''Sample a goal state from the softmax over the value function.

        Candidates are the keys of ``value_function``.

        Args:
            rng: source of randomness exposing ``choices``; defaults to the
                ``random`` module.

        Returns:
            ``(selected_state, nll)`` where ``nll`` is the negative
            log-likelihood of the selected state under the softmax.
        '''
        states = list(self.value_function.keys())
        probabilities, nll = self._softmax_terms([self.value_function[s] for s in states])

        # Sample a position rather than a state: this consumes the same single
        # random draw as sampling the state directly, but avoids a linear
        # ``states.index`` search afterwards.
        chosen = rng.choices(range(len(states)), weights=probabilities)[0]
        return (states[chosen], nll[chosen])

    def calc_log_lik(self, state: State) -> float:
        '''Return the negative log-likelihood of selecting ``state``.

        The softmax is taken over every state in ``state_space``.

        Raises:
            KeyError: if a state in ``state_space`` is missing from ``value_function``.
            ValueError: if ``state`` is not in ``state_space``.
        '''
        states = list(self.state_space)
        _, nll = self._softmax_terms([self.value_function[s] for s in states])
        return nll[states.index(state)]

    def calc_log_likelihood_all(self) -> dict[State, float]:
        '''Return the negative log-likelihood of every state in ``state_space``.

        Returns:
            Mapping ``state -> -log p(state)`` under the softmax.
        '''
        states = list(self.state_space)
        _, nll = self._softmax_terms([self.value_function[s] for s in states])
        # Pair states with their values positionally instead of calling
        # ``state_space.index`` once per state (which was quadratic).
        return dict(zip(states, nll))

    def calc_log_likelihood_all_df(self) -> pd.DataFrame:
        '''Return the per-state negative log-likelihoods as a DataFrame.

        Columns are ``'State'`` and ``'Log Likelihood'`` (the latter holds the
        negative log-likelihood; the column name is kept for compatibility).
        '''
        log_likelihood = self.calc_log_likelihood_all()
        return pd.DataFrame(list(log_likelihood.items()), columns=['State', 'Log Likelihood'])

    def reset(self):
        '''No-op: this policy keeps no per-episode state.'''
        pass

## Optimal goal policy

In [10]:
# set up the goal space
goal_world = GoalWorld()

# Load the value function that the optimal agent uses to score goals.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a 'goal_value_function.pkl' that comes from a trusted source.
with open('goal_value_function.pkl', 'rb') as file:
    value_function = pickle.load(file)

# initialize optimal agent
optimal_planner = OptimalGoalPolicy(
    mdp=goal_world,
    value_function=value_function
)

# Look at the best choices for the agent. The 'Log Likelihood' column holds
# negative log-likelihoods, so an ascending sort puts the most probable
# states first.
log_likelihood_df = optimal_planner.calc_log_likelihood_all_df()
sorted_df = log_likelihood_df.sort_values(by='Log Likelihood').reset_index(drop=True)
value_iteration_states_nll = sorted_df

# We can also compute the value for one specific state, e.g.
#   optimal_planner.calc_log_lik(state)
# (State is re-declared here exactly as in the class-definition cell.)
State = TypeVar('State', bound=Hashable)
shape = Shape(sides='square', shade='medium', texture='not_present')
state: Tuple[Hashable, Hashable, Hashable] = (shape, shape, shape)

# Jupyter only renders the LAST expression of a cell, so the preview has to
# come at the end; previously it sat above an assignment and showed nothing.
sorted_df.head(36)

## PCFG-based policy

In [12]:

class PCFGGoalPolicy(GoalSelectionPolicy[State, Action]):
    '''A policy that uses a probabilistic context-free grammar to generate goals.

    Programs are produced by repeatedly rewriting a string, starting from
    ``'S'``, with productions chosen uniformly at random.

    Attributes (documented here rather than inline):
        NON_TERMINALS: non-terminal symbols, in the order given by ``p_rules``.
        PRODUCTIONS: mapping non-terminal -> list of candidate expansions.
        CAP: rewrite budget; a derivation that still contains non-terminals
            after more than ``CAP`` rewrites is discarded.
        rules: list of ``(program, log_prob)`` tuples from the last call to
            ``generate_rules`` (one entry per successful sample).
    '''

    def __init__(self, mdp: MarkovDecisionProcess[State, Action], p_rules, cap=100):
        '''Build the grammar tables.

        Args:
            mdp: the MDP whose ``state_space`` is scored by this policy.
            p_rules: iterable of ``[non_terminal, [expansion, ...]]`` pairs.
            cap: rewrite budget per derivation (see ``generate_tree``).
        '''
        super().__init__(mdp)
        self.NON_TERMINALS = [rule[0] for rule in p_rules]
        self.PRODUCTIONS = {rule[0]: rule[1] for rule in p_rules}
        self.CAP = cap
        # Same container type that generate_rules() produces (it used to start
        # life as a dict and silently turn into a list later).
        self.rules = []
        # program -> log-probability, refreshed by generate_rules().
        self._rule_log_probs = {}

    def generate_tree(self, logging=True, tree_str='S', log_prob=0., depth=0, rng=None):
        '''Use the production rules to generate one program.

        On each step one of the non-terminal symbols currently present in the
        string is picked uniformly at random, and its first occurrence is
        replaced by one of its productions, also picked uniformly at random.

        Args:
            logging: print the finished program (or a depth warning).
            tree_str: string to start rewriting from.
            log_prob: log-probability accumulated so far.
            depth: number of rewrites already performed.
            rng: object exposing ``sample`` (e.g. ``random.Random(seed)``);
                defaults to the ``random`` module, i.e. the global generator.

        Returns:
            ``(program, log_prob)`` once no non-terminal remains, or ``None``
            if non-terminals remain after ``depth`` has exceeded ``CAP``.
            A ``tree_str`` that contains no non-terminal to begin with is
            returned unchanged (this used to raise ``ValueError``).
        '''
        if rng is None:
            rng = random

        # Iterative form of the former self-recursion: same draws, same
        # results, but no Python stack frame per rewrite.
        pending = [nt for nt in self.NON_TERMINALS if nt in tree_str]
        while pending:
            # sample(..., 1)[0] rather than choice() keeps the sequence of
            # random draws identical to the earlier implementation.
            to_gen_nt = rng.sample(pending, 1)[0]
            options = self.PRODUCTIONS[to_gen_nt]
            leaf = rng.sample(options, 1)[0]

            # Splice the expansion over the first occurrence. Using len() of
            # the symbol (instead of a hard-coded 1) also handles
            # multi-character non-terminals correctly.
            at = tree_str.find(to_gen_nt)
            tree_str = tree_str[:at] + leaf + tree_str[at + len(to_gen_nt):]

            # Each production of a non-terminal is equally likely.
            log_prob += log(1/len(options))
            depth += 1

            pending = [nt for nt in self.NON_TERMINALS if nt in tree_str]
            if pending and depth > self.CAP:
                if logging:
                    print('====DEPTH EXCEEDED!====')
                return None

        if logging:
            print(tree_str, log_prob)
        return tree_str, log_prob

    def generate_rules(self, n_iterations=100000, rng=None) -> pd.DataFrame:
        '''Sample programs from the grammar and summarise them.

        Replaces ``self.rules`` with the ``(program, log_prob)`` pairs of all
        derivations that finished within the rewrite budget.

        Args:
            n_iterations: number of derivations to attempt.
            rng: forwarded to ``generate_tree``.

        Returns:
            One row per distinct program with its mean ``lp`` and its sample
            ``count``, sorted by ``count`` (descending).
        '''
        attempts = (self.generate_tree(logging=False, rng=rng) for _ in range(n_iterations))
        self.rules = [rule for rule in attempts if rule is not None]

        result_df = self._summarise_rules(sort_by='count')
        # Keep a program -> log-probability lookup for calculate_log_likelihood.
        self._rule_log_probs = dict(zip(result_df['program'], result_df['lp']))
        return result_df

    def _summarise_rules(self, sort_by: str) -> pd.DataFrame:
        '''Aggregate ``self.rules`` per program and sort by ``sort_by`` (descending).'''
        df = pd.DataFrame(self.rules, columns=['program', 'lp'])
        grouped_df = df.groupby('program')['lp'].mean().reset_index()
        count_df = df['program'].value_counts().reset_index()
        count_df.columns = ['program', 'count']
        result_df = pd.merge(grouped_df, count_df, on='program')
        # reset_index() without drop=True is kept on purpose: callers receive
        # the same columns (including 'index') as before.
        return result_df.sort_values(by=sort_by, ascending=False).reset_index()

    def rule_applies(self, s: State, rule: str) -> bool:
        '''Check if a rule applies to a state. Not implemented yet.

        TODO: Make a shape wrapper class to evaluate these statements.

        The unreachable ``return exec(...)`` that used to follow the raise was
        removed: ``exec`` always returns ``None``, so it could never have
        produced the boolean this method promises.

        Raises:
            NotImplementedError: always.
        '''
        raise NotImplementedError('Need to implement this function')

    def calculate_log_likelihood(self, s: State, rule: str) -> float:
        '''Return the log-probability of ``rule`` if it applies to ``s``, else 0.

        The value comes from the rules sampled by ``generate_rules``; it is
        already a log-probability, so it is returned as is (taking ``log`` of
        it again, as before, fails for any negative value). Previously this
        also indexed the list ``self.rules`` with a string.

        NOTE(review): a rule that does not apply contributes 0 - confirm this
        is the intended scoring.

        Raises:
            NotImplementedError: until ``rule_applies`` is implemented.
            KeyError: if ``rule`` was not produced by ``generate_rules``.
        '''
        if not self.rule_applies(s, rule):
            return 0
        return self._rule_log_probs[rule]

    def calculate_log_likelihood_all(self) -> dict[State, float]:
        '''Score every state in the state space against the sampled rules.

        Returns:
            Mapping ``state -> score``, where the score sums, over every
            sampled ``(program, lp)`` pair, ``lp`` times
            ``calculate_log_likelihood(state, program)``.

        Raises:
            ValueError: if no rules have been generated yet.
        '''
        if len(self.rules) == 0:
            raise ValueError('No rules have been generated yet!')

        log_likelihoods = {}
        for state in self.state_space:
            # TODO: check to make sure this math is right for log lik
            log_likelihoods[state] = sum(
                lp * self.calculate_log_likelihood(state, program)
                for program, lp in self.rules
            )
        return log_likelihoods

    def _get_rules(self) -> list[tuple[str, float]]:
        '''Return the raw ``(program, log_prob)`` samples.'''
        return self.rules

    def _get_rules_df(self) -> pd.DataFrame:
        '''Return the per-program summary sorted by ``lp`` (descending).'''
        return self._summarise_rules(sort_by='lp')



       

In [16]:
# Instantiate the goal space for the PCFG-based policy.
goal_world_pcfg = GoalWorld()

# The rules of the grammar, written as non-terminal -> candidate expansions
# and converted to the [non_terminal, expansions] pairs the policy expects.
productions = [
    [non_terminal, expansions]
    for non_terminal, expansions in {
        'S': ['and(S,S)', 'A'],
        'A': ['same(B,C)', 'unique(B,C)'],
        'B': ['everything', 'D', 'E'],
        'C': ['true', 'color', 'shape', 'texture', 'false'],
        'D': ['one', 'G'],
        'E': ['two', 'H'],
        # NOTE(review): no expansion above or below mentions 'F' - confirm it is meant to be unused.
        'F': ['square', 'circle', 'triangle', 'light', 'medium', 'dark', 'plain', 'stripe'],
        'G': ['a', 'b', 'c'],
        'H': ['ab', 'ac', 'bc'],
    }.items()
]

pcfg_policy = PCFGGoalPolicy(
    mdp=goal_world_pcfg,
    p_rules=productions,
    cap=100,
)

In [17]:
# Sample programs from the grammar and collect the per-program summary table.
pcfg_ll = pcfg_policy.generate_rules(n_iterations=100_000)