# Baseline is taken from https://github.com/alirezamika/bipedal-es

TODO: this baseline needs to be rewritten to use an actor-critic strategy.

In [18]:
# rewritten

import numpy as np


class Model(object):
    """Purely linear actor network plus a critic network that is not used yet.

    The actor maps a 24-value observation to a 4-value action through three
    weight matrices (24 -> 16 -> 16 -> 4) with no bias terms and no
    activation functions.  The critic matrices (24 -> 12 -> 4 -> 1) are
    stored and exchanged with the optimiser, but ``predict`` does not read
    them.
    """

    def __init__(self):
        # All weights start at zero, so an untrained model always outputs zeros.
        self.actor_weights = [np.zeros(shape=(24, 16)), np.zeros(shape=(16, 16)), np.zeros(shape=(16, 4))]
        self.critic_weights = [np.zeros(shape=(24, 12)), np.zeros(shape=(12, 4)), np.zeros(shape=(4, 1))]

    def predict(self, inp):
        """Return the actor's output for one observation.

        Args:
            inp: array-like observation; it is flattened, so any shape whose
                total size matches the first actor layer (24) is accepted.

        Returns:
            1-D numpy array with one value per column of the last actor layer.
        """
        out = np.expand_dims(np.asarray(inp).flatten(), 0)
        # Scale the observation to unit L2 norm.  An all-zero observation has
        # norm 0; dividing by it would fill the action with NaN, so it is left
        # unscaled instead.
        norm = np.linalg.norm(out)
        if norm > 0:
            out = out / norm
        for layer in self.actor_weights:
            out = np.dot(out, layer)
        # TODO: critic-scaled exploration noise was sketched here but is
        # disabled: propagate the raw observation through critic_weights and
        # add np.random.normal(size=out.shape) * <critic output> to `out`.
        return out[0]

    def get_weights(self):
        """Return all layer matrices as one flat list: actor layers, then critic layers.

        The layers have different shapes, so they cannot be combined with
        ``np.stack``: recent NumPy versions raise ``ValueError`` for such
        ragged input and older ones build an object array instead.
        NOTE(review): a flat list of 2-D arrays is presumably what the
        optimiser expects (one noise array per layer) -- confirm against
        evostra.
        """
        return list(self.actor_weights) + list(self.critic_weights)

    def set_weights(self, weights):
        """Install new weights.

        Args:
            weights: either the flat list produced by ``get_weights`` (actor
                layers followed by critic layers) or the older nested form
                ``[actor_layers, critic_layers]``.
        """
        if len(weights) == 2:
            # Nested layout.  A flat layout always has more than two entries
            # because the actor and critic together hold six layers.
            actor, critic = weights[0], weights[1]
        else:
            split = len(self.actor_weights)
            actor, critic = weights[:split], weights[split:]
        self.actor_weights = list(actor)
        self.critic_weights = list(critic)

In [19]:
# Scratch check: stacking two length-1 vectors along a new leading axis gives a (2, 1) array.
np.stack((np.ones(1), np.zeros(1)))

array([[ 1.],
       [ 0.]])

In [20]:
import random

try:
    import cPickle as pickle  # Python 2: C-accelerated pickle
except ImportError:
    import pickle  # Python 3: cPickle no longer exists as a separate module

import gym
import numpy as np
from evostra import EvolutionStrategy


class Agent:
    """Trains and replays a BipedalWalker policy with an evolution strategy.

    The agent owns a gym environment, a ``Model`` and an
    ``EvolutionStrategy`` optimiser.  ``get_reward`` is handed to the
    optimiser as its fitness callback; ``train`` runs the optimiser and
    ``play`` replays episodes with the optimiser's current weights.
    """

    AGENT_HISTORY_LENGTH = 1  # not referenced anywhere in this class
    POPULATION_SIZE = 20  # third argument to EvolutionStrategy
    EPS_AVG = 1  # episodes averaged per fitness evaluation
    SIGMA = 0.1  # fourth argument to EvolutionStrategy
    LEARNING_RATE = 0.01  # fifth argument to EvolutionStrategy
    INITIAL_EXPLORATION = 1.0  # starting probability of taking a random action
    FINAL_EXPLORATION = 0.0  # floor for that probability
    EXPLORATION_DEC_STEPS = 1000000  # env steps to decay from initial to final

    def __init__(self):
        self.env = gym.make('BipedalWalker-v2')
        self.model = Model()
        self.es = EvolutionStrategy(self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE)
        self.exploration = self.INITIAL_EXPLORATION

    def get_predicted_action(self, sequence):
        """Return the model's action for the observation ``sequence``."""
        return self.model.predict(np.array(sequence))

    def load(self, filename='weights.pkl'):
        """Load pickled weights from ``filename`` into the model and the optimiser.

        WARNING: unpickling can execute arbitrary code; only load weight
        files you created yourself or otherwise trust.
        """
        with open(filename, 'rb') as fp:
            self.model.set_weights(pickle.load(fp))
        self.es.weights = self.model.get_weights()

    def save(self, filename='weights.pkl'):
        """Pickle the optimiser's current weights to ``filename``."""
        with open(filename, 'wb') as fp:
            pickle.dump(self.es.get_weights(), fp)

    def play(self, episodes, render=True):
        """Run ``episodes`` episodes with the optimiser's weights, printing each total reward.

        Args:
            episodes: number of episodes to run.
            render: when true, render the environment before every step.
        """
        self.model.set_weights(self.es.weights)
        for _ in range(episodes):
            total_reward = 0
            observation = self.env.reset()
            done = False
            while not done:
                if render:
                    self.env.render()
                action = self.get_predicted_action(observation)
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward
            # Single pre-formatted argument: prints the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print("total reward: %s" % (total_reward,))

    def train(self, iterations):
        """Run the evolution strategy for ``iterations`` iterations, reporting every one."""
        self.es.run(iterations, print_step=1)

    def get_reward(self, weights):
        """Fitness callback: mean episode reward obtained with ``weights``.

        Installs ``weights`` into the model, plays ``EPS_AVG`` episodes and
        returns the average total reward.  At every step the exploration
        probability is first decayed linearly towards ``FINAL_EXPLORATION``;
        with that probability a random action is sampled instead of the
        model's prediction.
        NOTE(review): random actions make the returned fitness partly
        independent of ``weights`` while exploration is high -- confirm this
        is intended for an ES fitness function.
        """
        total_reward = 0.0
        self.model.set_weights(weights)
        # Hoisted loop invariant; float() keeps this a true division on
        # Python 2 even if both constants are later changed to ints.
        decay = self.INITIAL_EXPLORATION / float(self.EXPLORATION_DEC_STEPS)

        for _ in range(self.EPS_AVG):
            observation = self.env.reset()
            done = False
            while not done:
                self.exploration = max(self.FINAL_EXPLORATION, self.exploration - decay)
                if random.random() < self.exploration:
                    action = self.env.action_space.sample()
                else:
                    action = self.get_predicted_action(observation)
                observation, reward, done, _ = self.env.step(action)
                total_reward += reward

        return total_reward / self.EPS_AVG

In [21]:
# Build the agent; its constructor creates the gym environment, the Model and the EvolutionStrategy.
agent = Agent()

In [22]:
# Run the evolution strategy for 1000 iterations; Agent.train passes print_step=1 to the optimiser.
agent.train(1000)

iter 0. reward: -120.552532
iter 1. reward: -115.421500
iter 2. reward: -119.123694
iter 3. reward: -112.726177
iter 4. reward: -63.199820
iter 5. reward: -115.041277


KeyboardInterrupt: 

In [None]:
# Replay one episode (rendered, since render defaults to True) with the optimiser's current weights.
agent.play(1)