### Paper: AZR (Absolute Zero Reasoner)

In [3]:
import random
from typing import Tuple, List, Any, Callable


class Environment:
    """Verifiable environment backed by code execution.

    NOTE: programs are run with ``exec`` in-process with no sandboxing, so
    only trusted program text should ever be passed in.
    """

    def execute(self, p: str, i: Any) -> Any:
        """Run program source ``p`` and return ``f(i)``.

        ``p`` must define a callable named ``f``. Any exception raised while
        defining or calling it (including ``f`` being absent) is swallowed
        and reported as ``None``.
        """
        namespace = {}
        try:
            exec(p, namespace)
            result = namespace['f'](i)
        except Exception:
            # Best-effort by design: a failing program simply yields None.
            return None
        return result

    def verify(self, y_pred, y_true) -> int:
        """Return 1 when the prediction equals the reference, else 0."""
        is_match = y_pred == y_true
        return int(is_match)


class AZR:
    """Toy simulation of the Absolute Zero propose/solve loop (deduction only).

    A task is proposed as a ``(program, input)`` pair, the environment
    executes it to obtain the reference answer, the solver answers it, and
    both roles are rewarded.
    """

    def __init__(self):
        self.env = Environment()
        # One (task, reference answer, prediction, total reward) tuple per
        # completed loop iteration.
        self.buffer_deduction = []

    def π_propose(self, task_type: str, references: List[Tuple]) -> Tuple[str, Any]:
        """Simulate the proposal of a task ``(p, i)``.

        Args:
            task_type: Kind of task to propose; only ``'deduction'`` is
                implemented.
            references: Past tasks to condition on (currently unused).

        Returns:
            The program source and a random integer input in ``[1, 10]``.

        Raises:
            ValueError: If ``task_type`` is not supported. Previously this
                silently returned ``None``, which then failed later when the
                result was unpacked.
        """
        if task_type != 'deduction':
            raise ValueError(f"unsupported task_type: {task_type!r}")
        program = "def f(x): return x * 2"
        i = random.randint(1, 10)
        return program, i

    def f_env(self, τ: Tuple[str, Any]) -> Tuple[Tuple[str, Any], Any]:
        """Build ``(x, y*)`` from the proposed program and input.

        Returns:
            ``((p, i), o)`` where ``o`` is the executed output, or
            ``(None, None)`` when execution yields ``None`` (treated as an
            invalid task).
        """
        p, i = τ
        o = self.env.execute(p, i)
        if o is None:
            return None, None
        return (p, i), o

    def π_solve(self, x: Tuple[str, Any]) -> Any:
        """Solve a deduction task by executing the program on its input."""
        p, i = x
        return self.env.execute(p, i)

    def r_solve(self, y_pred: Any, y_true: Any) -> int:
        """Binary solver reward: 1 if the prediction matches, else 0."""
        return self.env.verify(y_pred, y_true)

    def r_propose(self, τ: Tuple[str, Any], solver: Callable, trials: int = 3) -> float:
        """Proposer reward based on the solver's average success rate.

        Args:
            τ: Proposed ``(program, input)`` pair.
            solver: Callable that maps a task ``x`` to a predicted answer.
            trials: Number of solver rollouts to average over.

        Returns:
            ``0.0`` for an invalid task or when the solver always/never
            succeeds; otherwise ``1 - average success`` (eq. (4) of the
            paper).

        Raises:
            ValueError: If ``trials`` is not positive.
        """
        if trials <= 0:
            raise ValueError("trials must be a positive integer")
        x, y_star = self.f_env(τ)
        if x is None:
            return 0.0
        successes = sum(self.r_solve(solver(x), y_star) for _ in range(trials))
        avg = successes / trials
        # Tasks that are trivially easy or impossible earn no reward.
        if avg == 0 or avg == 1:
            return 0.0
        return 1 - avg

    def absolute_zero_loop(self, λ=1.0):
        """Run one propose -> build -> solve -> reward iteration.

        Prints a summary line and appends ``(x, y*, y_pred, R)`` to
        ``buffer_deduction``. Returns early, recording nothing, when the
        proposed task is invalid.

        Args:
            λ: Weight of the solver reward in the total ``R = rp + λ * rs``.
        """
        # Step 1: proposal
        τ = self.π_propose("deduction", references=[])
        # Step 2: build the task and its reference answer
        x, y_star = self.f_env(τ)
        if x is None:
            return

        # Step 3: solution
        y_pred = self.π_solve(x)

        # Step 4: rewards (r_propose re-derives (x, y*) from τ on its own)
        rp = self.r_propose(τ, self.π_solve)
        rs = self.r_solve(y_pred, y_star)
        R = rp + λ * rs

        print(f"Tarefa: {x}, Gabarito: {y_star}, Predição: {y_pred}, R_propose: {rp:.2f}, R_solve: {rs}, R_total: {R:.2f}")

        self.buffer_deduction.append((x, y_star, y_pred, R))


# 🔁 Run the simulation: one AZR agent, five loop iterations
azr = AZR()
for _ in range(5):
    azr.absolute_zero_loop()


Tarefa: ('def f(x): return x * 2', 6), Gabarito: 12, Predição: 12, R_propose: 0.00, R_solve: 1, R_total: 1.00
Tarefa: ('def f(x): return x * 2', 3), Gabarito: 6, Predição: 6, R_propose: 0.00, R_solve: 1, R_total: 1.00
Tarefa: ('def f(x): return x * 2', 4), Gabarito: 8, Predição: 8, R_propose: 0.00, R_solve: 1, R_total: 1.00
Tarefa: ('def f(x): return x * 2', 5), Gabarito: 10, Predição: 10, R_propose: 0.00, R_solve: 1, R_total: 1.00
Tarefa: ('def f(x): return x * 2', 6), Gabarito: 12, Predição: 12, R_propose: 0.00, R_solve: 1, R_total: 1.00


In [5]:
import random
from typing import List, Tuple, Dict, Any


class TaskBuffers:
    """Buffers D_abd, D_ded and D_ind as in the paper (sec. 3.3.1).

    Args:
        B: First seeding factor.
        S: Second seeding factor. Only the product ``B * S`` is used here,
            as the number of seed entries placed in each task buffer.
    """

    def __init__(self, B=4, S=4):
        self.D_seed = []
        self.D_abduction = []
        self.D_deduction = []
        self.D_induction = []
        self.B = B
        self.S = S

    def init_seeding(self):
        """Seed every buffer with ``B * S`` starter entries.

        The abduction and deduction buffers receive the "zero triplet"
        (identity function); the induction buffer receives a squaring
        example with three I/O pairs. Calling this more than once appends
        the seeds again.
        """
        # "Zero triplet": the identity function applied to "Hello World".
        zero_triplet = ("def f(x): return x", "Hello World", "Hello World")
        count = self.B * self.S
        self.D_seed.append(zero_triplet)
        # The zero triplet is an immutable tuple of strings, so sharing one
        # object across all slots is harmless.
        self.D_abduction.extend([zero_triplet] * count)
        self.D_deduction.extend([zero_triplet] * count)
        # Build each induction entry separately: with list repetition every
        # entry would share a single mutable I/O list, so editing one entry
        # would silently edit all of them.
        self.D_induction.extend(
            ("def f(x): return x**2", [(i, i**2) for i in range(1, 4)], "Quadrado")
            for _ in range(count)
        )


In [7]:
class TaskConstructor:
    """Builds deduction, abduction and induction tasks by executing programs.

    Args:
        env: Object exposing ``execute(p, i)``; a ``None`` result is treated
            as a failed execution throughout this class.
    """

    def __init__(self, env):
        self.env = env

    def construct_deduction_task(self, p: str, i: Any) -> Tuple[str, Any, Any]:
        """Return ``(p, i, o)`` with ``o = execute(p, i)``, or ``None`` on failure."""
        o = self.env.execute(p, i)
        if o is None:
            return None
        return (p, i, o)

    def construct_abduction_task(self, p: str, o_target: Any, candidates=range(10)) -> Tuple[str, Any, Any]:
        """Search for an input that makes program ``p`` produce ``o_target``.

        Args:
            p: Program source.
            o_target: Output that the input must reproduce.
            candidates: Inputs to try, in order. Defaults to ``0..9``.

        Returns:
            ``(p, i, o_target)`` for the first matching input, or ``None``
            when no candidate matches.
        """
        for i in candidates:
            o = self.env.execute(p, i)
            # A failed execution yields None; it must not count as a match
            # even when the requested target is itself None.
            if o is not None and o == o_target:
                return (p, i, o_target)
        return None

    def construct_induction_task(self, p: str, N=4) -> Tuple[str, List[Tuple[Any, Any]], str]:
        """Collect I/O pairs of ``p`` on the inputs ``1..N``.

        Returns:
            ``(p, [(i, o), ...], message)``, or ``None`` as soon as any
            execution fails.
        """
        ios = []
        for i in range(1, N + 1):
            o = self.env.execute(p, i)
            if o is None:
                return None
            ios.append((i, o))
        msg = "Função proposta"
        return (p, ios, msg)


In [9]:
class TaskVerifier:
    """Checks candidate answers by re-executing programs in ``env``."""

    def __init__(self, env):
        self.env = env

    def verify_deduction(self, p, i, o_star) -> bool:
        """True when running ``p`` on ``i`` reproduces the expected output."""
        produced = self.env.execute(p, i)
        return produced == o_star

    def verify_abduction(self, p, o, i_candidate) -> bool:
        """True when the candidate input makes ``p`` produce ``o``."""
        produced = self.env.execute(p, i_candidate)
        return produced == o

    def verify_induction(self, p_candidate, test_io: List[Tuple[Any, Any]]) -> bool:
        """True when the candidate program matches every (input, output) pair.

        Stops at the first mismatch; an empty ``test_io`` verifies trivially.
        """
        mismatch_found = any(
            self.env.execute(p_candidate, given) != expected
            for given, expected in test_io
        )
        return not mismatch_found


In [11]:
class RewardFunctions:
    """Proposer and solver rewards.

    Args:
        env: Execution environment (stored, not used by the methods below).
        solver: Object exposing ``solve(task)``.
        n: Number of Monte Carlo rollouts used by ``r_propose``.
    """

    def __init__(self, env, solver, n=3):
        self.env = env
        self.solver = solver
        self.n = n

    def r_propose(self, τ: Tuple[str, Any], y_star: Any) -> float:
        """Proposal reward derived from the solver's average success.

        The solver is queried ``n`` times on ``τ``. A success rate of exactly
        0 or 1 earns nothing; otherwise the reward is ``1 - rate``
        (eq. (4) of the paper).
        """
        hits = sum(
            1 for _ in range(self.n) if self.solver.solve(τ) == y_star
        )
        rate = hits / self.n
        if rate in (0, 1):
            return 0.0
        return 1.0 - rate

    def r_solve(self, y_pred, y_star) -> int:
        """Binary solution reward (eq. 5): 1 on an exact match, else 0."""
        is_correct = y_pred == y_star
        return int(is_correct)


In [13]:
class Solver:
    """Solver that answers a task by executing it in the environment."""

    def __init__(self, env):
        self.env = env

    def solve(self, task: Tuple[str, Any]) -> Any:
        """Run the task's program on its input and return the result.

        ``task`` is a ``(program, input)`` pair.
        """
        program_src, argument = task
        outcome = self.env.execute(program_src, argument)
        return outcome


In [15]:
# Simple training step in the spirit of Algorithm 1:
# wire the components together, then score a single deduction task.
env = Environment()
buffers = TaskBuffers()
buffers.init_seeding()
solver = Solver(env)
rewards = RewardFunctions(env, solver)
construct = TaskConstructor(env)
verifier = TaskVerifier(env)

# Weight of the solver reward in the total
λ = 1.0

# Deduction task: a program plus the input to run it on
program = "def f(x): return x * 3"
input_x = 4
τ = (program, input_x)

# The task shown to the solver is the proposal itself; the reference
# answer comes from executing the program in the environment.
x = τ
y_star = env.execute(program, input_x)
y_pred = solver.solve(x)

rs = rewards.r_solve(y_pred, y_star)
rp = rewards.r_propose(x, y_star)
R_total = rp + λ * rs

print(f"Tarefa dedução: {x}, gabarito: {y_star}, predição: {y_pred}")
print(f"Recompensas → r_propose: {rp:.2f}, r_solve: {rs}, total: {R_total:.2f}")


Tarefa dedução: ('def f(x): return x * 3', 4), gabarito: 12, predição: 12
Recompensas → r_propose: 0.00, r_solve: 1, total: 1.00
