In [3]:
import copy

import gymnasium as gym
import torch
import torch.nn as nn
from torch.optim import Optimizer

from models import RosReinforceActor

In [4]:
#ROS optimizer from original paper
class AvgAccumGradOptimizer(Optimizer):
    """Plain gradient descent along a weighted running average of gradients.

    For every parameter that has a gradient, ``step`` blends the stored
    running average with the current gradient,

        avg = (old_weight * avg + new_weight * grad) / (old_weight + new_weight)

    and moves the parameter by ``-lr * avg``. The running average is kept per
    parameter in ``self.state[p]['avg_grad']`` and starts at 0.
    """

    def __init__(self, params, lr):
        """
        Args:
            params: iterable of parameters (or parameter-group dicts) to optimize.
            lr: step size applied to the averaged gradient.
        """
        self.lr = lr
        # 'avg_grad' stays in the defaults so existing param_groups keep their
        # layout; the actual running average lives in self.state[p].
        super().__init__(params, {'lr': lr, 'avg_grad': 0})

    @torch.no_grad()
    def step(self, old_weight: float, new_weight: float, update_avg_grad=True):
        """Apply one update using the blended average gradient.

        Args:
            old_weight: weight given to the stored running average.
            new_weight: weight given to the current ``p.grad``.
            update_avg_grad: when True, the blended average is written back to
                the optimizer state; when False the stored average is left
                untouched (the parameters are still moved).

        Returns:
            Mean absolute value of the blended gradient over all elements of
            the parameters that had a gradient, or ``0.0`` if none had one.

        Raises:
            ValueError: if ``old_weight + new_weight`` is zero, which would
                otherwise turn the average (and the parameters) into NaN/inf.
        """
        total_weight = old_weight + new_weight
        if total_weight == 0:
            raise ValueError('old_weight + new_weight must be non-zero')

        grad_sum = 0.0
        grad_num = 0
        for group in self.param_groups:
            lr = group['lr']
            for p in group['params']:
                if p.grad is None:
                    continue
                state = self.state[p]
                prev_avg = state.setdefault('avg_grad', 0)
                # Always produces a new tensor, so the stored average never
                # aliases p.grad.
                avg_grad = (old_weight * prev_avg + new_weight * p.grad) / total_weight
                if update_avg_grad:
                    state['avg_grad'] = avg_grad
                if lr > 0:
                    p.add_(avg_grad, alpha=-lr)

                grad_sum += avg_grad.abs().sum().item()
                grad_num += avg_grad.numel()

        # No parameter carried a gradient: report a zero magnitude instead of
        # dividing by zero.
        if grad_num == 0:
            return 0.0
        return grad_sum / grad_num
        
# ROS sampling agent adapted from original paper
class RobustOnPolicySampler():
    """Sampling agent that acts with a behaviour policy ``pi_b`` derived from ``pi_e``.

    ``pi_b`` starts as a deep copy of ``pi_e``. Each call to :meth:`update`
    resets ``pi_b`` to ``pi_e``'s current parameters and then applies one
    ``AvgAccumGradOptimizer`` step, so ``pi_b`` always sits one step away from
    ``pi_e`` along the running average of the gradients seen so far.
    """

    def __init__(self, pi_e, env: gym.Env, max_time_step: int, **kwargs):
        """
        Args:
            pi_e: policy to copy; it must expose ``parameters()``, ``eval()``
                and ``get_action_and_value(state[, action])``.
            env: environment handle, stored as-is.
            max_time_step: stored as-is.
            **kwargs: ``ros_lr`` (required) is the optimizer step size;
                ``ros_dynamic_lr`` (optional, defaults to False) is stored as
                ``self.dynamic_lr``. Other keys are ignored.
        """
        self.pi_e = pi_e
        self.step = 0  # number of update() calls so far
        self.env = env
        self.max_time_step = max_time_step

        self.lr = kwargs['ros_lr']
        self.pi_b = copy.deepcopy(pi_e)

        self.dynamic_lr = kwargs.get('ros_dynamic_lr', False)
        self.optimizer = AvgAccumGradOptimizer(self.pi_b.parameters(), self.lr)

    def recover_parameters(self):
        """Overwrite every parameter of ``pi_b`` with the matching one from ``pi_e``."""
        # In-place copy under no_grad so the reset itself is never tracked by autograd.
        with torch.no_grad():
            for pi_b_param, pi_e_param in zip(self.pi_b.parameters(), self.pi_e.parameters()):
                pi_b_param.copy_(pi_e_param)

    def act(self, state):
        """Return the action chosen by the behaviour policy ``pi_b`` for ``state``."""
        action, _, _, _ = self.pi_b.get_action_and_value(state)
        return action

    def update(self, state, action):
        """Fold the gradient for ``(state, action)`` into the running average and re-derive ``pi_b``.

        The second value returned by ``pi_b.get_action_and_value(state, action)``
        is used as the loss to differentiate.
        NOTE(review): presumably a log-probability-based loss -- confirm against
        the policy class.
        """
        self.step += 1
        self.pi_b.eval()  # fix BN layer
        self.recover_parameters()  # reset pi_b to pi_e
        # Callers may run inside torch.no_grad(); gradients are needed here.
        with torch.enable_grad():
            _, loss, _, _ = self.pi_b.get_action_and_value(state, action)
        self.optimizer.zero_grad()
        loss.backward()
        # Weights (step - 1) : 1 make the stored average the plain mean of all
        # gradients seen so far.
        self.optimizer.step(old_weight=self.step - 1, new_weight=1)



In [8]:
# create agents
CHECKPOINT_PATH = 'policies/model_CartPole_1000.pt'
MAX_TIME_STEP = 500
ROS_LR = 0.001

device = torch.device('cpu')
env = gym.make("CartPole-v1")

# The checkpoint is loaded as a whole object and used directly as the policy,
# so no fresh RosReinforceActor has to be built first. map_location keeps the
# tensors on `device` even if the checkpoint was saved from another device.
# NOTE(review): torch.load unpickles arbitrary objects -- only load trusted
# checkpoints; the pickled policy class must also be importable under the
# module path it was saved with.
eval_policy = torch.load(CHECKPOINT_PATH, map_location=device)

ros_agent = RobustOnPolicySampler(
    eval_policy,
    env,
    MAX_TIME_STEP,
    ros_lr=ROS_LR,
    ros_dynamic_lr=False,
    ros_first_layer=False,
    ros_only_weight=False,
)

ModuleNotFoundError: No module named 'models.actor'; 'models' is not a package