In [1]:
from google import genai
from google.genai import types

In [2]:
# Create the Gen AI SDK client that the later generate_content request goes through.
# NOTE(review): no credentials are passed here — presumably the SDK picks the API key
# up from the environment; confirm before running on a fresh machine.
client = genai.Client()

In [3]:
# Load the training script as text; it is embedded verbatim in the prompt below.
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# locale default and matches the UTF-8 write performed at the end of the notebook.
with open('rl-train.py', 'r', encoding='utf-8') as file:
    data = file.read()

In [4]:
# Show the loaded script. print() is used so newlines render as line breaks
# instead of the escaped repr a bare expression would display.
print(data)

import random

import numpy as np

import gymnasium as gym

import torch
from torch import nn


device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")


class PolicyNet(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.stack = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.ReLU(),
            nn.Linear(64, act_dim)
        )
    
    def forward(self, x):
        return self.stack(x)
    

def get_new_reward(obs):
    reward = 1
    return reward

env = gym.make("CartPole-v1")

obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.n

model_policy = PolicyNet(obs_dim, act_dim)

optimizer = torch.optim.Adam(model_policy.parameters(), lr=1e-3)

for episode in range(4000):
    obs, _ = env.reset()
    log_probs = []
    rewards = []

    done = False
    while not done:
        obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueez

In [13]:
# Prompt fragments describing the environment. Each string is interpolated
# verbatim into `prompt`, so every character here is part of what the model sees.
# NOTE(review): the wording looks like it was copied from the Gymnasium CartPole
# documentation — confirm the source/version if the environment is upgraded.

# Layout of the `obs` vector: index, meaning and value range of each component,
# plus the narrower ranges that end an episode.
observation_space = """
    ## Observation Space

    The observation is a `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:

    | Num | Observation           | Min                 | Max               |
    |-----|-----------------------|---------------------|-------------------|
    | 0   | Cart Position         | -4.8                | 4.8               |
    | 1   | Cart Velocity         | -Inf                | Inf               |
    | 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
    | 3   | Pole Angular Velocity | -Inf                | Inf               |

    **Note:** While the ranges above denote the possible values for observation space of each element,
        it is not reflective of the allowed values of the state space in an unterminated episode. Particularly:
    -  The cart x-position (index 0) can be take values between `(-4.8, 4.8)`, but the episode terminates
       if the cart leaves the `(-2.4, 2.4)` range.
    -  The pole angle can be observed between  `(-.418, .418)` radians (or **±24°**), but the episode terminates
       if the pole angle is not in the range `(-.2095, .2095)` (or **±12°**)
"""

# High-level statement of the control task (balance the pole on the cart).
description_problem = """
    ## Description

    This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson in
    ["Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problem"](https://ieeexplore.ieee.org/document/6313077).
    A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.
    The pendulum is placed upright on the cart and the goal is to balance the pole by applying forces
     in the left and right direction on the cart.
"""

# Termination and truncation conditions; the prompt tells the model that
# truncation counts as success.
episode_end = """
## Episode End

    The episode ends if any one of the following occurs:

    1. Termination: Pole Angle is greater than ±12°
    2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
    3. Truncation: Episode length is greater than 500 (200 for v0)
"""

In [18]:
# Assemble the user prompt (written in Spanish) sent to the model. It embeds, in order:
#   {description_problem} - the task description,
#   {data}                - the full training script read from rl-train.py,
#   {observation_space}   - the layout of the `obs` argument,
#   {episode_end}         - the termination/truncation rules,
# and asks for a new implementation of get_new_reward(obs).
prompt = f"""
Tengo un problema de reinforcement learning definido como:

{description_problem}

para dicho problema, lo resolvimos usando el siguiente codigo de reinforcement learning:

{data}

quiero que modifiques la funcion llamada get_new_reward y que me des la nueva funcion. 
Esta funcion recibe como parametro la variable obs, que está definida como sigue:

{observation_space}

recuerda que el episodio termina con las siguientes caracteristicas, el caso de truncation, es un caso de éxito del problema:

{episode_end}

"""

In [19]:
# Inspect the fully interpolated prompt before sending it to the model.
print(prompt)


Tengo un problema de reinforcement learning definido como:


    ## Description

    This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson in
    ["Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problem"](https://ieeexplore.ieee.org/document/6313077).
    A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track.
    The pendulum is placed upright on the cart and the goal is to balance the pole by applying forces
     in the left and right direction on the cart.


para dicho problema, lo resolvimos usando el siguiente codigo de reinforcement learning:

import random

import numpy as np

import gymnasium as gym

import torch
from torch import nn


device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")


class PolicyNet(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__

In [20]:
# System instructions: persona, expertise and output-format rules for the model.
# NOTE(review): "respond with code only" and "always ask clarifying questions"
# pull in opposite directions, and the parsing cell further down assumes the
# reply is nothing but a fenced code block — decide which rule should win.
system_instruction = [
    "You are a Reinforcement Learning Reward Function Design Specialist.",
    "Your mission is to help design, analyze, and improve reward functions for reinforcement learning agents across various environments and tasks.",
    "You are highly skilled in identifying sparse, deceptive, or poorly shaped rewards and proposing alternatives that improve learning efficiency and agent behavior.",
    "You understand how to align reward signals with long-term objectives, avoid reward hacking, and incorporate domain knowledge into reward shaping.",
    "You are familiar with a wide range of RL algorithms such as PPO, DDPG, SAC, A3C, Q-learning, and understand how reward design interacts with each algorithm's learning dynamics.",
    "You always respond with code only — no explanations, no comments, no additional text — just the raw code necessary to implement the reward function or solution requested.",
    "Always ask clarifying questions if the problem is underspecified or if the task goals are ambiguous.",
]

# Generation settings: the instructions above plus a low sampling temperature.
generation_config = types.GenerateContentConfig(
    system_instruction=system_instruction,
    temperature=0.1,
)

# Single request to the model with the assembled prompt; the reply is kept in
# `response` for the cells that print and parse it.
response = client.models.generate_content(
    model="gemini-2.5-flash-lite",
    contents=prompt,
    config=generation_config,
)

In [21]:
# Show the raw reply text, including any Markdown code fence the model added.
print(response.text)

```python
def get_new_reward(obs):
    cart_position, cart_velocity, pole_angle, pole_velocity = obs
    
    # Penalize for being close to the termination boundaries
    # Cart position penalty
    cart_pos_penalty = 0
    if abs(cart_position) > 2.0:  # Closer to the edge
        cart_pos_penalty = (abs(cart_position) - 2.0) * 0.5 
        
    # Pole angle penalty
    pole_angle_penalty = 0
    if abs(pole_angle) > 0.15: # Closer to the termination angle
        pole_angle_penalty = (abs(pole_angle) - 0.15) * 1.0

    # Penalize for high velocities (less stable)
    velocity_penalty = abs(cart_velocity) * 0.1 + abs(pole_velocity) * 0.2

    # Base reward for staying alive
    reward = 1.0 - cart_pos_penalty - pole_angle_penalty - velocity_penalty
    
    # Ensure reward is not negative, though the penalties are designed to be small
    reward = max(0, reward) 
    
    return reward
```


In [22]:
def extract_code_block(text):
    """Return the source code contained in a model reply.

    The reply is expected to wrap the code in a Markdown fence (a line that
    starts with three backticks, optionally followed by a language tag).
    Instead of blindly dropping the first and last line, the fence lines are
    located explicitly, so text before or after the fence, or a reply with no
    fence at all, does not corrupt the extracted code.

    Args:
        text: Raw reply text. May be None or empty when the model returned
            no text.

    Returns:
        The lines between the first fence line and the next fence line (or the
        end of the reply if the fence is never closed), joined with newlines
        and with their indentation untouched. If the reply has no fence, the
        whole reply with surrounding whitespace stripped.

    Raises:
        ValueError: If ``text`` is None or contains only whitespace.
    """
    if not text or not text.strip():
        raise ValueError("The model response contains no text to extract code from.")

    lines = text.splitlines()
    # Only the fence test ignores leading whitespace; code lines keep theirs.
    fence_rows = [row for row, line in enumerate(lines) if line.lstrip().startswith("```")]
    if not fence_rows:
        # The model returned raw code without a fence: keep all of it.
        return text.strip()

    first_code_row = fence_rows[0] + 1
    closing_row = fence_rows[1] if len(fence_rows) > 1 else len(lines)
    return "\n".join(lines[first_code_row:closing_row])


# Source of the generated reward function, without the Markdown fence.
codigo = extract_code_block(response.text)

In [23]:
# Check the extracted source before it is written to disk.
print(codigo)

def get_new_reward(obs):
    cart_position, cart_velocity, pole_angle, pole_velocity = obs
    
    # Penalize for being close to the termination boundaries
    # Cart position penalty
    cart_pos_penalty = 0
    if abs(cart_position) > 2.0:  # Closer to the edge
        cart_pos_penalty = (abs(cart_position) - 2.0) * 0.5 
        
    # Pole angle penalty
    pole_angle_penalty = 0
    if abs(pole_angle) > 0.15: # Closer to the termination angle
        pole_angle_penalty = (abs(pole_angle) - 0.15) * 1.0

    # Penalize for high velocities (less stable)
    velocity_penalty = abs(cart_velocity) * 0.1 + abs(pole_velocity) * 0.2

    # Base reward for staying alive
    reward = 1.0 - cart_pos_penalty - pole_angle_penalty - velocity_penalty
    
    # Ensure reward is not negative, though the penalties are designed to be small
    reward = max(0, reward) 
    
    return reward


In [None]:

# Name of the Python module that will hold the generated reward function.
nombre_archivo = "mi_reward.py"

# Write the extracted source to that file, replacing any previous content.
with open(nombre_archivo, mode="w", encoding="utf-8") as out_file:
    out_file.write(codigo)
