In [4]:
import wandb
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda

import gym

import numpy as np
import matplotlib.pyplot as plt


In [5]:
class Pendulum:
    """Roll out an actor's policy in a gym environment and log episode rewards.

    The class caches a few properties of the environment's spaces and runs
    complete episodes with ``self.actor``. It assumes the classic gym API
    where ``env.reset()`` returns the observation and ``env.step(action)``
    returns ``(next_state, reward, done, info)``.
    """

    def __init__(self, env, iden=0, actor=None):
        """Store the environment and cache the sizes of its spaces.

        Args:
            env: Environment exposing ``observation_space.shape``,
                ``action_space.shape``, ``action_space.high``, ``reset()``
                and ``step(action)``.
            iden: Identifier appended to the wandb metric name so several
                runners can log side by side.
            actor: Object with ``get_action(state)`` returning
                ``(log_old_policy, action)``. It may also be attached later
                by assigning ``self.actor``.
        """
        self.env = env
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        # Upper limit of the first action component.
        self.action_bound = self.env.action_space.high[0]
        # Stored for use by a policy; not read anywhere in this class.
        self.std_bound = [1e-2, 1.0]

        self.iden = iden
        self.actor = actor

    def train(self, max_episodes=1000):
        """Run ``max_episodes`` episodes and log each episode's total reward.

        No policy update happens here: each episode is rolled out with
        ``self.actor`` and its summed reward is printed and sent to wandb
        under the key ``'Reward<iden>'``.

        Args:
            max_episodes: Number of episodes to run.

        Raises:
            AttributeError: If no actor has been provided.
        """
        if self.actor is None:
            raise AttributeError(
                "Pendulum.actor is not set; pass actor=... to the constructor "
                "or assign it before calling train()"
            )

        for ep in range(max_episodes):
            episode_reward, done = 0, False

            state = self.env.reset()

            while not done:
                # self.env.render()
                _log_old_policy, action = self.actor.get_action(state)

                next_state, reward, done, _ = self.env.step(action)

                episode_reward += reward
                # Advance, otherwise the actor keeps acting on the reset state.
                state = next_state

            print('EP{} EpisodeReward={}'.format(ep, episode_reward))
            wandb.log({'Reward' + str(self.iden): episode_reward})

In [6]:
class Agent:
    """Linear-feedback controller for a gym environment, probed by random perturbations.

    The control law is ``action = Kp @ state + gain``. ``simulate`` runs one
    episode with a given parameter pair and ``learn`` evaluates two
    symmetrically perturbed parameter pairs to form a two-point
    finite-difference gradient estimate of the episode reward.

    The classic gym API is assumed: ``env.reset()`` returns the observation
    and ``env.step(action)`` returns ``(next_state, reward, done, info)``.
    """

    def __init__(self, env, Kp, gain, iden=0):
        """Store the controller parameters and cache environment properties.

        Args:
            env: Environment exposing ``observation_space.shape``,
                ``action_space.shape``, ``action_space.high``, ``reset()``
                and ``step(action)``.
            Kp: Proportional gain as a numpy array; must be compatible with
                ``Kp @ state`` for a flat state of length ``state_dim``.
            gain: Constant offset as a numpy array, added to ``Kp @ state``.
            iden: Identifier for this agent (kept for logging).
        """
        # Nominal ("real") controller parameters.
        # These need to be numpy arrays.
        self.Kp_r = Kp
        self.gain_r = gain

        # Trial ("test") parameters; learn() overwrites them while probing.
        self.Kp_t = self.Kp_r
        self.gain_t = self.gain_r

        # Euclidean length of the perturbation applied to the stacked
        # (Kp, gain) parameter vector in learn().
        self.radius = 0.001

        # define the environment
        self.env = env
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        # Upper limit of the first action component.
        self.action_bound = self.env.action_space.high[0]
        # Stored for use by a policy; not read anywhere in this class.
        self.std_bound = [1e-2, 1.0]

        self.iden = iden

    def simulate(self, prop=None, g=None):
        """Run one episode under ``action = prop @ state + g``.

        Args:
            prop: Proportional gain to use; defaults to ``self.Kp_r``.
            g: Constant offset to use; defaults to ``self.gain_r``.

        Returns:
            float: Sum of the rewards collected until the environment
            reports ``done``.
        """
        if prop is None:
            prop = self.Kp_r
        if g is None:
            g = self.gain_r

        episode_reward, done = 0.0, False

        # Keep the state flat so ``prop @ state`` is a plain mat-vec product.
        state = np.reshape(self.env.reset(), [self.state_dim])

        while not done:
            action = prop @ state + g
            next_state, reward, done, _ = self.env.step(action)

            # .item() accepts Python scalars and any size-1 array alike.
            episode_reward += np.asarray(reward).item()
            state = np.reshape(next_state, [self.state_dim])

        return episode_reward

    def learn(self):
        """Probe the reward around the nominal parameters in a random direction.

        A random direction of length ``self.radius`` is drawn in the stacked
        (Kp, gain) space, one episode is simulated at ``+direction`` and one
        at ``-direction``, and the reward difference is turned into a
        two-point gradient estimate. The nominal parameters ``Kp_r`` and
        ``gain_r`` are NOT modified; applying an update step is left to the
        caller.

        Each call to ``simulate`` resets the environment, so if the
        environment randomises its start state the two probes do not share
        an initial condition.

        Returns:
            tuple: ``(grad_Kp, grad_gain)`` with the shapes of ``Kp_r`` and
            ``gain_r``: the estimated gradient of the episode reward.
        """
        n_k = self.Kp_r.size
        dim = n_k + self.gain_r.size

        # Direction drawn from a centred cube and rescaled to length
        # ``radius`` (so it lies on a sphere, though not uniformly).
        vec = np.random.rand(dim) - 0.5
        vec = self.radius * vec / np.linalg.norm(vec)
        vec_k = np.reshape(vec[:n_k], self.Kp_r.shape)
        vec_g = np.reshape(vec[n_k:], self.gain_r.shape)

        # two-point gradient estimate
        self.Kp_t = self.Kp_r + vec_k
        self.gain_t = self.gain_r + vec_g
        reward_plus = self.simulate(self.Kp_t, self.gain_t)

        self.Kp_t = self.Kp_r - vec_k
        self.gain_t = self.gain_r - vec_g
        reward_minus = self.simulate(self.Kp_t, self.gain_t)

        # dim * (J+ - J-) / (2 r) * u, with unit direction u = vec / r.
        scale = dim * (reward_plus - reward_minus) / (2.0 * self.radius ** 2)
        return scale * vec_k, scale * vec_g
        
        

In [None]:


# Agent takes the environment as its first argument, so build it first.
# NOTE(review): the id "Pendulum-v0" is assumed from the Pendulum class
# defined earlier; newer gym releases register it as "Pendulum-v1" — confirm.
env = gym.make("Pendulum-v0")

# Zero-initialised linear controller. Kp gets one entry per observation
# component so that `Kp @ state` is well-defined for this environment.
Kp = np.zeros(env.observation_space.shape[0])
gain = np.zeros(1)

ag = Agent(env, Kp, gain)

# Pass the parameters explicitly: simulate(prop, g) requires both.
ag.simulate(Kp, gain)