In [1]:
from collections import defaultdict
from collections import deque, OrderedDict
from itertools import islice
class BaseSampler(object):
    """Base class for objects that collect experience from an environment.

    Holds references to an environment, a policy and a replay pool, plus a
    bounded history of the most recently collected paths. Subclasses must
    implement `sample`.
    """

    def __init__(self,
                 max_path_length,
                 environment=None,
                 policy=None,
                 pool=None,
                 store_last_n_paths=10):
        """Create a sampler.

        Args:
            max_path_length: Stored as `_max_path_length` for subclasses.
            environment: Optional environment; can be set later through
                `initialize`.
            policy: Optional policy; can be set later through `initialize`
                or `set_policy`.
            pool: Optional replay pool; can be set later through
                `initialize`.
            store_last_n_paths: Capacity of the path-history deque.
        """
        self._max_path_length = max_path_length
        self._store_last_n_paths = store_last_n_paths
        self._last_n_paths = deque(maxlen=store_last_n_paths)

        self.environment = environment
        self.policy = policy
        self.pool = pool

    def initialize(self, environment, policy, pool):
        """Attach the environment, policy and pool in one call."""
        self.environment = environment
        self.policy = policy
        self.pool = pool

    def reset(self):
        """Hook for subclasses; the base implementation does nothing."""
        pass

    def set_policy(self, policy):
        """Replace the policy used by this sampler."""
        self.policy = policy

    def clear_last_n_paths(self):
        """Drop every path from the path history."""
        self._last_n_paths.clear()

    def get_last_n_paths(self, n=None):
        """Return up to `n` paths from the front of the history as a tuple.

        Args:
            n: Number of paths to return. Defaults to `store_last_n_paths`.
        """
        if n is None:
            n = self._store_last_n_paths

        last_n_paths = tuple(islice(self._last_n_paths, None, n))

        return last_n_paths

    def sample(self):
        """Collect one sample. Must be implemented by subclasses."""
        raise NotImplementedError

    def terminate(self):
        """Close the attached environment, if there is one."""
        # `environment` is None by default and after unpickling (see
        # `__setstate__`); in that case there is nothing to close.
        if self.environment is not None:
            self.environment.close()

    def get_diagnostics(self):
        """Return an `OrderedDict` with the current pool size."""
        diagnostics = OrderedDict({'pool-size': self.pool.size})
        return diagnostics

    def __getstate__(self):
        """Return the picklable state.

        The environment, policy, pool, path history and per-episode
        attributes are left out of the state.
        """
        excluded_keys = (
            'environment',
            'policy',
            'pool',
            '_last_n_paths',
            '_current_observation',
            '_current_path',
            '_is_first_step',
        )
        state = {
            key: value for key, value in self.__dict__.items()
            if key not in excluded_keys
        }

        return state

    def __setstate__(self, state):
        """Restore pickled state; external references are reset to None."""
        self.__dict__.update(state)

        self.environment = None
        self.policy = None
        self.pool = None
        # TODO(hartikainen): Maybe try restoring these from the pool?
        self._last_n_paths = deque(maxlen=self._store_last_n_paths)
import numpy as np
import tree
from dataclasses import dataclass
from typing import Union, Callable
from numbers import Number
import gzip
import pickle

import numpy as np
import tensorflow as tf
import tree
import abc


In [2]:
# Replay buffer setup

@dataclass
class Field:
    """Specification of one named array stored in a replay pool."""
    # Name of the field.
    name: str
    # Element dtype; passed as `dtype=` to `initializer` when the pool
    # allocates storage for this field.
    dtype: Union[str, np.dtype, tf.DType]
    # Per-sample shape; the pool prepends its capacity as the leading axis.
    shape: Union[tuple, tf.TensorShape]
    # Callable invoked as `initializer(full_shape, dtype=dtype)` to allocate
    # the backing array.
    initializer: Callable = np.zeros
    # Fill value used when a batch of samples does not provide this field
    # (the pool uses it for the episode-index fields).
    default_value: Number = 0.0

# Bookkeeping fields that every pool stores next to the user-defined fields:
# each sample's position within its episode, counted from the episode start
# (`forwards`) and from the episode end (`backwards`).
INDEX_FIELDS = {
    index_field_name: Field(
        name=index_field_name,
        dtype='uint64',
        shape=(1, ),
        default_value=0,
    )
    for index_field_name in (
        'episode_index_forwards',
        'episode_index_backwards',
    )
}

class ReplayPool(object):
    """A class used to save and replay data.

    Interface for replay pools. The class derives from plain `object`, so
    the `abc.abstractmethod` markers are documentation only: subclasses that
    do not override a method inherit the no-op implementation below.
    """

    @abc.abstractmethod
    def add_sample(self, sample):
        """Add a transition tuple."""
        pass

    @abc.abstractmethod
    def terminate_episode(self):
        """Clean up pool after episode termination."""
        pass

    @property
    @abc.abstractmethod
    def size(self, **kwargs):
        """Number of samples currently stored in the pool."""
        pass

    # `add_path` takes an argument and is called as a method, so it must not
    # be wrapped in `property`.
    @abc.abstractmethod
    def add_path(self, path):
        """Add a rollout to the replay pool."""
        pass

    @abc.abstractmethod
    def random_batch(self, batch_size):
        """Return a random batch of size `batch_size`."""
        pass

class FlexibleReplayPool(ReplayPool):
    """Fixed-capacity circular replay pool storing one array per field.

    Every field in `fields`, plus the two episode-index fields from
    `INDEX_FIELDS`, is backed by an array whose leading dimension is
    `max_size`. New samples are written at `self._pointer`, which wraps
    around so that old samples are overwritten once the pool is full.
    """

    def __init__(self, max_size, fields):
        """Allocate storage for all fields.

        Args:
            max_size: Capacity of the pool; converted with `int()`.
            fields: Mapping from field name to `Field`, describing the dtype
                and per-sample shape of each stored quantity.
        """
        super(FlexibleReplayPool, self).__init__()

        max_size = int(max_size)
        self._max_size = max_size

        self.fields = {**fields, **INDEX_FIELDS}
        self.data = tree.map_structure(self._initialize_field, self.fields)

        self._pointer = 0
        self._size = 0
        self._samples_since_save = 0

    @property
    def size(self):
        """Number of valid samples currently held (at most `max_size`)."""
        return self._size

    def _initialize_field(self, field):
        """Allocate the backing array for `field`.

        `field.shape` is the shape of a single stored sample; the pool
        capacity is prepended as the leading axis.
        """
        field_shape = (self._max_size, *field.shape)
        field_values = field.initializer(
            field_shape, dtype=field.dtype)

        return field_values

    def _advance(self, count=1):
        """Handles bookkeeping after adding samples to the pool.

        * Moves the pointer (`self._pointer`)
        * Updates the size (`self._size`)
        * Fixes the `episode_index_forwards` field, which might have become
          out of date when the pool is full and we start overriding old
          samples.
        """
        self._pointer = (self._pointer + count) % self._max_size
        self._size = min(self._size + count, self._max_size)

        # A non-zero forward index at the pointer means the head of that
        # episode was overwritten: renumber the surviving tail from zero.
        if self.data['episode_index_forwards'][self._pointer] != 0:
            episode_tail_length = int(self.data[
                                          'episode_index_backwards'
                                      ][self._pointer, 0] + 1)
            self.data[
                'episode_index_forwards'
            ][np.arange(
                self._pointer, self._pointer + episode_tail_length
            ) % self._max_size] = np.arange(episode_tail_length)[..., None]

        self._samples_since_save += count

    def add_sample(self, sample):
        """Add a single transition.

        Args:
            sample: Structure of per-field values WITHOUT a batch axis.
        """
        # Prepend a batch axis of length 1 so each value has the
        # `(num_samples, *field.shape)` layout that `add_samples` expects.
        samples = tree.map_structure(
            lambda x: np.asarray(x)[np.newaxis, ...], sample)
        self.add_samples(samples)

    def add_samples(self, samples):
        """Write a batch of samples at the current pointer and advance it.

        Args:
            samples: Mapping from field name to an array whose leading
                dimension is the number of samples. When the episode-index
                fields are absent they are filled with the default value
                of the corresponding `Field`.
        """
        num_samples = tree.flatten(samples)[0].shape[0]

        assert (('episode_index_forwards' in samples.keys())
                is ('episode_index_backwards' in samples.keys()))
        if 'episode_index_forwards' not in samples.keys():
            # Copy first so the caller's mapping is not mutated when the
            # index fields are injected.
            samples = samples.copy()
            samples['episode_index_forwards'] = np.full(
                (num_samples, *self.fields['episode_index_forwards'].shape),
                self.fields['episode_index_forwards'].default_value,
                dtype=self.fields['episode_index_forwards'].dtype)
            samples['episode_index_backwards'] = np.full(
                (num_samples, *self.fields['episode_index_backwards'].shape),
                self.fields['episode_index_backwards'].default_value,
                dtype=self.fields['episode_index_backwards'].dtype)

        # Target slots, wrapping around the end of the buffer.
        index = np.arange(
            self._pointer, self._pointer + num_samples) % self._max_size

        def write_field(path, data, new_values, field):
            assert new_values.shape[0] == num_samples, (
                new_values.shape, num_samples)
            data[index] = new_values

        tree.map_structure_with_path(
            write_field, self.data, samples, self.fields)

        self._advance(num_samples)

    def add_path(self, path):
        """Add one complete path (episode) to the pool.

        The path is copied and extended with `episode_index_forwards`
        (`0, 1, ..., path_length - 1`) and `episode_index_backwards` (the
        same values in reverse order) before being stored.
        """
        path = path.copy()
        path_length = tree.flatten(path)[0].shape[0]
        path.update({
            'episode_index_forwards': np.arange(
                path_length,
                dtype=self.fields['episode_index_forwards'].dtype
            )[..., np.newaxis],
            'episode_index_backwards': np.arange(
                path_length,
                dtype=self.fields['episode_index_backwards'].dtype
            )[::-1, np.newaxis],
        })

        return self.add_samples(path)

    def random_indices(self, batch_size):
        """Draw `batch_size` uniform indices in `[0, size)`.

        Returns an empty index array when the pool is empty.
        """
        if self._size == 0: return np.arange(0, 0)
        return np.random.randint(0, self._size, batch_size)

    def random_batch(self, batch_size, field_name_filter=None, **kwargs):
        """Return a batch of `batch_size` uniformly sampled transitions."""
        random_indices = self.random_indices(batch_size)
        return self.batch_by_indices(
            random_indices, field_name_filter=field_name_filter, **kwargs)

    def random_sequence_batch(self, batch_size, **kwargs):
        """Return sequences starting at uniformly sampled indices."""
        random_indices = self.random_indices(batch_size)
        return self.sequence_batch_by_indices(random_indices, **kwargs)

    def last_n_batch(self, last_n, field_name_filter=None, **kwargs):
        """Return the most recently written `last_n` samples (capped at size)."""
        last_n_indices = np.arange(
            self._pointer - min(self.size, int(last_n)), self._pointer,
            dtype=int
        ) % self._max_size

        return self.batch_by_indices(
            last_n_indices, field_name_filter=field_name_filter, **kwargs)

    def last_n_sequence_batch(self, last_n, **kwargs):
        """Return sequences starting at each of the last `last_n` samples."""
        last_n_indices = np.arange(
            self._pointer - min(self.size, int(last_n)), self._pointer,
            dtype=int
        ) % self._max_size

        return self.sequence_batch_by_indices(last_n_indices, **kwargs)

    def filter_fields(self, field_names, field_name_filter):
        """Return the entries of `field_names` accepted by the filter.

        Args:
            field_names: Iterable of candidate field names.
            field_name_filter: A single name, a list/tuple of names, or a
                predicate called with each field name.
        """
        if isinstance(field_name_filter, str):
            field_name_filter = [field_name_filter]

        if isinstance(field_name_filter, (list, tuple)):
            field_name_list = field_name_filter

            def filter_fn(field_name):
                return field_name in field_name_list

        else:
            filter_fn = field_name_filter

        filtered_field_names = [
            field_name for field_name in field_names
            if filter_fn(field_name)
        ]

        return filtered_field_names

    def batch_by_indices(self,
                         indices,
                         field_name_filter=None,
                         validate_index=True):
        """Gather the samples stored at `indices` for every field.

        Args:
            indices: Integer array of slots; taken modulo `max_size`.
            field_name_filter: Not supported yet; must be None.
            validate_index: When True, reject indices that point at slots
                that have not been written yet.

        Raises:
            ValueError: If validation is on and an index is out of range.
            NotImplementedError: If `field_name_filter` is not None.
        """
        if validate_index and np.any(self.size <= indices % self._max_size):
            raise ValueError(
                "Tried to retrieve batch with indices greater than current"
                " size")

        if field_name_filter is not None:
            raise NotImplementedError("TODO(hartikainen)")

        batch = tree.map_structure(
            lambda field: field[indices % self._max_size], self.data)
        return batch

    def sequence_batch_by_indices(self,
                                  indices,
                                  sequence_length,
                                  field_name_filter=None):
        """Gather `sequence_length` consecutive samples per start index.

        A boolean `'mask'` entry is added to the batch. It is True for the
        leading positions of each sequence up to (and including) the first
        position after which `episode_index_forwards` stops increasing.

        Raises:
            ValueError: If a start index is out of range, or if the pool
                already has a field called `'mask'`.
        """
        if np.any(self.size <= indices % self._max_size):
            raise ValueError(
                "Tried to retrieve batch with indices greater than current"
                " size")
        if indices.size < 1:
            return self.batch_by_indices(indices)

        sequence_indices = (
                indices[:, None] + np.arange(sequence_length)[None])
        # Later positions may run past `size`; the mask computed below is
        # what marks which of them are usable.
        sequence_batch = self.batch_by_indices(
            sequence_indices, validate_index=False)

        if 'mask' in sequence_batch:
            raise ValueError(
                "sequence_batch_by_indices adds a field 'mask' into the batch."
                " There already exists a 'mask' field in the batch. Please"
                " remove it before using sequence_batch. TODO(hartikainen):"
                " Allow mask name to be configured.")

        # Step-to-step differences of the forward episode index; a value
        # below 1 means the next slot does not continue the same episode.
        forward_diffs_0 = np.diff(
            sequence_batch['episode_index_forwards'].astype(np.int64), axis=1)
        # Pad with -1 so every sequence has at least one cut position.
        forward_diffs_1 = np.pad(
            forward_diffs_0, ([0, 0], [0, 1], [0, 0]),
            mode='constant',
            constant_values=-1)
        cut_and_pad_sample_indices = (
                np.argmax(forward_diffs_1[:, ::1, :] < 1, axis=1)
                + 1)[..., 0]

        sequence_batch['mask'] = np.where(
            np.arange(sequence_length)[None, ...]
            < cut_and_pad_sample_indices[..., None],
            True,
            False)

        return sequence_batch

    def save_latest_experience(self, pickle_path):
        """Pickle (gzip-compressed) the samples added since the last save."""
        latest_samples = self.last_n_batch(self._samples_since_save)

        with gzip.open(pickle_path, 'wb') as f:
            pickle.dump(latest_samples, f)

        self._samples_since_save = 0

    def load_experience(self, experience_path):
        """Load gzip-pickled samples from `experience_path` into the pool."""
        # NOTE(security): `pickle.load` can execute arbitrary code; only
        # load experience files that come from a trusted source.
        with gzip.open(experience_path, 'rb') as f:
            latest_samples = pickle.load(f)

        num_samples = tree.flatten(latest_samples)[0].shape[0]

        def assert_shape(data):
            assert data.shape[0] == num_samples, data.shape

        tree.map_structure(assert_shape, latest_samples)

        self.add_samples(latest_samples)
        self._samples_since_save = 0

# The format of the initial observation and action must be specified here
class SimpleReplayPool(FlexibleReplayPool):
    """Replay pool with observation, action, reward and terminal fields.

    The observation and action formats are derived from an example built
    from the environment's current observation, not from the environment's
    own observation/action spaces.
    """

    def __init__(self,
                 environment,
                 *args,
                 extra_fields=None,
                 **kwargs):
        """Build the field specification and allocate the pool.

        Args:
            environment: Object providing `get_obs()`, whose result exposes
                a `rho` array.
            *args: Forwarded to `FlexibleReplayPool.__init__`.
            extra_fields: Optional mapping of additional `Field`s; entries
                with the same name override the standard ones.
            **kwargs: Forwarded to `FlexibleReplayPool.__init__`.
        """
        if not extra_fields:
            extra_fields = {}

        # Example of the processed observation: [max(rho), min(rho)].
        obs = environment.get_obs()
        observation_space = np.concatenate(([obs.rho.max()],[obs.rho.min()]))
        # Example action: a scalar action index.
        action_space = np.array(0)

        self._environment = environment
        self._observation_space = observation_space
        self._action_space = action_space

        # (name, dtype, per-sample shape) for each standard field.
        # `terminals[i]` means a terminal was received at time i.
        field_specs = (
            ('observations',
             observation_space.dtype, observation_space.shape),
            ('next_observations',
             observation_space.dtype, observation_space.shape),
            ('actions', action_space.dtype, action_space.shape),
            ('rewards', 'float32', (1, )),
            ('terminals', 'bool', (1, )),
        )
        fields = {
            field_name: Field(
                name=field_name, dtype=field_dtype, shape=field_shape)
            for field_name, field_dtype, field_shape in field_specs
        }
        fields.update(extra_fields)

        super(SimpleReplayPool, self).__init__(
            *args, fields=fields, **kwargs)

In [3]:
# Trajectory sampler

class BaseSampler(object):
    """Base class for objects that collect experience from an environment.

    Holds references to an environment, a policy and a replay pool, plus a
    bounded history of the most recently collected paths. Subclasses must
    implement `sample`.
    """

    def __init__(self,
                 max_path_length,
                 environment=None,
                 policy=None,
                 pool=None,
                 store_last_n_paths=10):
        """Create a sampler.

        Args:
            max_path_length: Stored as `_max_path_length` for subclasses.
            environment: Optional environment; can be set later through
                `initialize`.
            policy: Optional policy; can be set later through `initialize`
                or `set_policy`.
            pool: Optional replay pool; can be set later through
                `initialize`.
            store_last_n_paths: Capacity of the path-history deque.
        """
        self._max_path_length = max_path_length
        self._store_last_n_paths = store_last_n_paths
        self._last_n_paths = deque(maxlen=store_last_n_paths)

        self.environment = environment
        self.policy = policy
        self.pool = pool

    def initialize(self, environment, policy, pool):
        """Attach the environment, policy and pool in one call."""
        self.environment = environment
        self.policy = policy
        self.pool = pool

    def reset(self):
        """Hook for subclasses; the base implementation does nothing."""
        pass

    def set_policy(self, policy):
        """Replace the policy used by this sampler."""
        self.policy = policy

    def clear_last_n_paths(self):
        """Drop every path from the path history."""
        self._last_n_paths.clear()

    def get_last_n_paths(self, n=None):
        """Return up to `n` paths from the front of the history as a tuple.

        Args:
            n: Number of paths to return. Defaults to `store_last_n_paths`.
        """
        if n is None:
            n = self._store_last_n_paths

        last_n_paths = tuple(islice(self._last_n_paths, None, n))

        return last_n_paths

    def sample(self):
        """Collect one sample. Must be implemented by subclasses."""
        raise NotImplementedError

    def terminate(self):
        """Close the attached environment, if there is one."""
        # `environment` is None by default and after unpickling (see
        # `__setstate__`); in that case there is nothing to close.
        if self.environment is not None:
            self.environment.close()

    def get_diagnostics(self):
        """Return an `OrderedDict` with the current pool size."""
        diagnostics = OrderedDict({'pool-size': self.pool.size})
        return diagnostics

    def __getstate__(self):
        """Return the picklable state.

        The environment, policy, pool, path history and per-episode
        attributes are left out of the state.
        """
        excluded_keys = (
            'environment',
            'policy',
            'pool',
            '_last_n_paths',
            '_current_observation',
            '_current_path',
            '_is_first_step',
        )
        state = {
            key: value for key, value in self.__dict__.items()
            if key not in excluded_keys
        }

        return state

    def __setstate__(self, state):
        """Restore pickled state; external references are reset to None."""
        self.__dict__.update(state)

        self.environment = None
        self.policy = None
        self.pool = None
        # TODO(hartikainen): Maybe try restoring these from the pool?
        self._last_n_paths = deque(maxlen=self._store_last_n_paths)

# The observation needs to be preprocessed before use
class SimpleSampler(BaseSampler):
    """Sampler that steps the environment one transition at a time.

    Transitions are accumulated into the current path; when the episode
    terminates or reaches `max_path_length`, the whole path is pushed to
    the pool and a new episode is started on the next `sample()` call.
    """

    def __init__(self, **kwargs):
        """Create the sampler; all arguments are forwarded to `BaseSampler`."""
        super(SimpleSampler, self).__init__(**kwargs)

        self._last_path_return = 0
        self._max_path_return = -np.inf
        self._n_episodes = 0
        self._total_samples = 0

        self._is_first_step = True

    def __setstate__(self, state):
        """Restore pickled state and arrange for a fresh episode.

        `__getstate__` drops `_is_first_step` together with the per-episode
        attributes. Without restoring it, `sample()` would raise
        `AttributeError` on an unpickled sampler; setting it to True makes
        the next `sample()` call `reset()`.
        """
        super(SimpleSampler, self).__setstate__(state)
        self._is_first_step = True

    @staticmethod
    def _extract_observation(obs):
        """Reduce a raw environment observation to `[max(rho), min(rho)]`."""
        return np.concatenate(([obs.rho.max()],[obs.rho.min()]))

    def reset(self):
        """Reset the policy, the per-path counters and the environment."""
        if self.policy is not None:
            self.policy.reset()

        self._path_length = 0
        self._path_return = 0
        self._current_path = []
        # The raw observation is replaced by its processed form.
        obs = self.environment.reset()
        self._current_observation = self._extract_observation(obs)

    @property
    def _policy_input(self):
        """The current (processed) observation."""
        return self._current_observation

    def _process_sample(self,
                        observation,
                        action,
                        reward,
                        terminal,
                        next_observation):
        """Pack one transition into a dict keyed by pool field name.

        `reward` and `terminal` are promoted to 1-D arrays.
        """
        processed_observation = {
            'observations': observation,
            'actions': action,
            'rewards': np.atleast_1d(reward),
            'terminals': np.atleast_1d(terminal),
            'next_observations': next_observation,
        }

        return processed_observation

    def sample(self):
        """Take one environment step and record the transition.

        Returns:
            Tuple `(next_observation, reward, terminal, info)`, where
            `next_observation` is the processed observation.
        """
        if self._is_first_step:
            self.reset()

        # The policy receives the environment itself and returns the object
        # passed to `environment.step`. The action recorded in the pool is
        # the placeholder index 0, regardless of `act`.
        act = self.policy.action(self.environment)
        action = 0

        next_obs, reward, terminal, info = self.environment.step(
            act)
        next_observation = self._extract_observation(next_obs)
        self._path_length += 1
        self._path_return += reward
        self._total_samples += 1

        processed_sample = self._process_sample(
            observation=self._current_observation,
            action=action,
            reward=reward,
            terminal=terminal,
            next_observation=next_observation,
        )
        # Debug output: prints every recorded transition.
        print("processed_sample:{}".format(processed_sample))

        self._current_path.append(processed_sample)

        if terminal or self._path_length >= self._max_path_length:
            # Debug output: prints the whole finished path.
            print("self._current_path:{}".format(self._current_path))
            # Stack the per-step dicts into one array per field.
            last_path = tree.map_structure(
                lambda *x: np.stack(x, axis=0), *self._current_path)

            self.pool.add_path({
                key: value
                for key, value in last_path.items()
                if key != 'infos'
            })

            self._last_n_paths.appendleft(last_path)

            self._max_path_return = max(self._max_path_return,
                                        self._path_return)
            self._last_path_return = self._path_return
            self._n_episodes += 1

            self.pool.terminate_episode()

            self._is_first_step = True
            # Reset is done in the beginning of next episode, see above.

        else:
            self._current_observation = next_observation
            self._is_first_step = False

        return next_observation, reward, terminal, info

    def get_diagnostics(self):
        """Extend the base diagnostics with return and sample counters."""
        diagnostics = super(SimpleSampler, self).get_diagnostics()
        diagnostics.update({
            'max-path-return': self._max_path_return,
            'last-path-return': self._last_path_return,
            'episodes': self._n_episodes,
            'total-samples': self._total_samples,
        })

        return diagnostics


In [4]:
class agent_policy:
    """Stateless policy that always requests the same, empty-spec action."""

    def action(self, env):
        """Return `env.action_space` called with an empty dict.

        NOTE(review): presumably this is grid2op's "do nothing" action;
        confirm against the grid2op documentation.
        """
        empty_action_spec = {}
        return env.action_space(empty_action_spec)

    def reset(self):
        """Nothing to reset; the policy keeps no state."""
        return None

In [5]:
# Load the environment
import grid2op
from grid2op.Reward import BaseReward, RedispReward, L2RPNSandBoxScore
import numpy as np
from grid2op.Parameters import Parameters

# Extra reward computed alongside the main reward, keyed by name.
other_rewards = {"tmp_score_codalab": L2RPNSandBoxScore}

# Dataset location, relative to the notebook's working directory.
input_dir = '../input_data_local'

# Environment parameters overriding the grid2op defaults.
parameters = Parameters()
parameters.HARD_OVERFLOW_THRESHOLD = 3.0
parameters.NB_TIMESTEP_OVERFLOW_ALLOWED = 4
parameters.MAX_SUB_CHANGED = 6
parameters.MAX_LINE_STATUS_CHANGED = 100

env = grid2op.make(
    input_dir,
    param=parameters,
    reward_class=RedispReward,
    other_rewards=other_rewards,
)

# Seed, select the first chronic and reset for a reproducible start.
env.seed(10)
env.set_id(0)
obs = env.reset()

In [6]:
# Paths are capped at 4 steps, and the pool holds exactly one such path.
path_length = 4
agent = agent_policy()
pool = SimpleReplayPool(env, max_size=path_length)
sampler = SimpleSampler(
    max_path_length=path_length,
    environment=env,
    policy=agent,
    pool=pool,
)


In [7]:
# Nothing has been sampled yet, so the pool starts out empty.
print(sampler.pool.size)


0


In [8]:
# Take 6 environment steps. The sampler pushes a path to the pool once it
# reaches `max_path_length` steps (or a terminal); steps taken after that
# belong to a new path that is not in the pool until it completes too.
for i in range(6):
    sampler.sample()

processed_sample:{'observations': array([0.6382429 , 0.01134157], dtype=float32), 'actions': 0, 'rewards': array([937.0288], dtype=float32), 'terminals': array([False]), 'next_observations': array([0.6378086 , 0.00742103], dtype=float32)}
processed_sample:{'observations': array([0.6378086 , 0.00742103], dtype=float32), 'actions': 0, 'rewards': array([938.5002], dtype=float32), 'terminals': array([False]), 'next_observations': array([0.63766575, 0.01766751], dtype=float32)}
processed_sample:{'observations': array([0.63766575, 0.01766751], dtype=float32), 'actions': 0, 'rewards': array([938.37775], dtype=float32), 'terminals': array([False]), 'next_observations': array([0.6369229 , 0.01489069], dtype=float32)}
processed_sample:{'observations': array([0.6369229 , 0.01489069], dtype=float32), 'actions': 0, 'rewards': array([939.53784], dtype=float32), 'terminals': array([False]), 'next_observations': array([0.63768613, 0.01580404], dtype=float32)}
self._current_path:[{'observations': array

In [9]:
# Inspect the pool after sampling: number of stored samples and raw arrays.
print(sampler.pool.size)
print(sampler.pool.data)

4
{'observations': array([[0.6382429 , 0.01134157],
       [0.6378086 , 0.00742103],
       [0.63766575, 0.01766751],
       [0.6369229 , 0.01489069]], dtype=float32), 'next_observations': array([[0.6378086 , 0.00742103],
       [0.63766575, 0.01766751],
       [0.6369229 , 0.01489069],
       [0.63768613, 0.01580404]], dtype=float32), 'actions': array([0, 0, 0, 0]), 'rewards': array([[937.0288 ],
       [938.5002 ],
       [938.37775],
       [939.53784]], dtype=float32), 'terminals': array([[False],
       [False],
       [False],
       [False]]), 'episode_index_forwards': array([[0],
       [1],
       [2],
       [3]], dtype=uint64), 'episode_index_backwards': array([[3],
       [2],
       [1],
       [0]], dtype=uint64)}


In [10]:
def rollout(environment, policy, path_length,
            replay_pool_class=SimpleReplayPool,
            sampler_class=SimpleSampler,
            break_on_terminal=True):
    """Collect one path of up to `path_length` steps and return it.

    A fresh pool (capacity `path_length`) and sampler are created for the
    rollout. The returned batch holds every sample in the pool plus an
    `'infos'` entry mapping each info key to the list of its per-step values.

    Args:
        environment: Environment to step.
        policy: Policy handed to the sampler; `reset()` is called on it
            whenever a terminal is observed.
        path_length: Maximum number of steps to take.
        replay_pool_class: Called as `cls(environment, max_size=path_length)`.
        sampler_class: Called with `environment`, `policy`, `pool` and
            `max_path_length` keyword arguments.
        break_on_terminal: Stop stepping at the first terminal when True.
    """
    replay_pool = replay_pool_class(environment, max_size=path_length)
    path_sampler = sampler_class(
        environment=environment,
        policy=policy,
        pool=replay_pool,
        max_path_length=path_length)

    collected_infos = defaultdict(list)

    step = 0
    for step in range(path_length):
        _, _, done, step_info = path_sampler.sample()
        for info_key, info_value in step_info.items():
            collected_infos[info_key].append(info_value)

        if not done:
            continue

        policy.reset()
        if break_on_terminal:
            break

    # Every step taken must have ended up in the pool.
    assert replay_pool._size == step + 1

    path = replay_pool.batch_by_indices(np.arange(replay_pool._size))
    path['infos'] = collected_infos

    return path


def rollouts(n_paths, *args, **kwargs):
    """Run `rollout(*args, **kwargs)` `n_paths` times; return the paths as a list."""
    collected_paths = []
    for _ in range(n_paths):
        collected_paths.append(rollout(*args, **kwargs))
    return collected_paths

In [11]:
# Pull the stored observations and actions out of the pool.
observations = sampler.pool.data['observations']
actions = sampler.pool.data['actions']
print("observations:{}".format(observations))
print("actions:{}".format(actions))

# Build one example (observation, action) input row of shape (1, n) from
# the first stored sample.
observation, action = observations[0], [actions[0]]
print("observation:{},action:{}".format(observation,action))
Input_Demo = np.concatenate((observation, action), axis=-1).reshape((1, -1))
print(Input_Demo)


observations:[[0.6382429  0.01134157]
 [0.6378086  0.00742103]
 [0.63766575 0.01766751]
 [0.6369229  0.01489069]]
actions:[0 0 0 0]
observation:[0.6382429  0.01134157],action:[0]
[[0.6382429  0.01134157 0.        ]]


In [12]:
from q_value_nwtwork import double_feedforward_Q_function
# Build the Q-function(s) from the example (observation, action) input row.
# NOTE(review): `double_feedforward_Q_function` is defined outside this
# notebook; it is iterated over below, presumably yielding two Q-networks.
Qs = double_feedforward_Q_function(input_demo=Input_Demo,hidden_layer_sizes=(50,50),name = 'Double_Q_Value_Function')

hidden_layer_sizes:(50, 50),output_shape:[1],output_size:1
hidden_layer_sizes:(50, 50),output_shape:[1],output_size:1


In [13]:
# Evaluate every Q-function on the observations and actions from the pool.
for Q in Qs:
    print("Q.values(observation,action):{}".format(Q.values(observations,actions)))

inputs:[[0.6382429  0.01134157 0.        ]
 [0.63780862 0.00742103 0.        ]
 [0.63766575 0.01766751 0.        ]
 [0.6369229  0.01489069 0.        ]]
Q.values(observation,action):[[-0.05132855]
 [-0.05176813]
 [-0.05051375]
 [-0.05078914]]
inputs:[[0.6382429  0.01134157 0.        ]
 [0.63780862 0.00742103 0.        ]
 [0.63766575 0.01766751 0.        ]
 [0.6369229  0.01489069 0.        ]]
Q.values(observation,action):[[0.03838278]
 [0.03934149]
 [0.03675331]
 [0.03740422]]


In [15]:
from discret_policy import GaussianPolicy
# Size of the discrete action set and width of each hidden layer.
Action_Num = 360
M = 50

# Example policy input: the single observation as a (1, n) row.
Input_Demo = observation.reshape((1,len(observation)))
policy = GaussianPolicy(
    input_demo=Input_Demo,
    output_shape=Action_Num,
    hidden_layer_sizes=(M, M),
    observation_keys=None,
    preprocessors=None,
)

# Query the policy for the single example row and for the whole stored batch.
print(policy.actions(Input_Demo))
print(policy.actions(observations))


hidden_layer_sizes:(50, 50),output_shape:(360,),output_size:360
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
tf.Tensor([198], shape=(1,), dtype=int64)
tf.Tensor([198 198 198 198], shape=(4,), dtype=int64)
