In [None]:
# Sandbox for implementing RL with TensorFlow
# Start with simple algorithms and build up from there

In [None]:
from pydantic import BaseModel
import  gymnasium as gym

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Create a LunarLander-v2 environment with render_mode="human".
# NOTE(review): `env` is re-created in later cells, so this instance may be redundant — confirm.
env = gym.make("LunarLander-v2", render_mode="human")

In [None]:
class Agent:
    """Base class for agents that act in, and learn from, an environment.

    It only stores the two spaces it is given. ``policy`` and ``learn`` raise
    ``NotImplementedError`` here, so concrete agents have to override both.
    """

    def __init__(self, action_space: gym.Space, observation_space: gym.Space):
        """Keep references to the action and observation spaces."""
        self.observation_space = observation_space
        self.action_space = action_space

    def policy(self, observation):
        """Choose an action for ``observation``; not implemented in the base class."""
        raise NotImplementedError()

    def learn(self, action, observation, next_observation, reward):
        """Update the agent from transition data; not implemented in the base class."""
        raise NotImplementedError()



class Trainer:
    """Runs an agent in an environment episode by episode and trains it from replayed transitions.

    The transitions of every finished episode are stored in one of two buffers,
    chosen by the episode's total reward ("good" vs. "bad" episodes). Once the
    buffers together hold enough transitions, every environment step triggers
    one ``agent.learn`` call on a batch drawn with replacement from both buffers.

    Attributes:
        params: The ``Params`` instance controlling the loop.
        env: The environment that is reset, stepped and finally closed.
        agent: The agent providing ``policy`` and ``learn``.
        history: One list per finished episode; each entry is an
            ``(observation, info, reward, action)`` tuple. The first entry of an
            episode comes from ``reset`` and carries reward ``0`` and action ``None``.
    """

    class Params(BaseModel):
        """Hyper-parameters of the training loop."""

        # Number of episodes to run. NOTE: despite the name this counts episodes,
        # not environment steps (the name is kept for backward compatibility).
        max_steps: int = 1000
        # An episode whose total reward is strictly above this goes to the "good" buffer.
        good_reward_threshold: float = 100.0
        # Learning starts once both buffers together hold strictly more transitions than this.
        min_buffer_size: int = 100
        # Number of transitions drawn (with replacement) from each non-empty buffer per learn call.
        sample_size: int = 2000

    def __init__(self, params: Params, env: gym.Env, agent: Agent):
        """Store the loop parameters, the environment and the agent."""
        self.params = params
        self.env = env
        self.agent = agent
        self.history = []

    def train(self):
        """Run ``params.max_steps`` episodes, calling ``agent.learn`` on replayed batches.

        Uses ``self.env`` (the environment passed to the constructor) throughout,
        and closes it when training ends, including when an exception or a
        keyboard interrupt aborts the loop. Episodes completed before an abort
        remain available in ``self.history``.
        """
        self.history = []

        # Each buffer holds (action, observation, next_observation, reward) tuples.
        good_buffer = []
        bad_buffer = []

        try:
            for epoch in range(self.params.max_steps):
                observation, info = self.env.reset()
                history = [(observation, info, 0, None)]
                episode_transitions = []
                total_reward = 0

                while True:
                    action = self.agent.policy(observation)
                    next_observation, reward, terminated, truncated, info = self.env.step(action)
                    total_reward += reward

                    episode_transitions.append((action, observation, next_observation, reward))
                    observation = next_observation
                    history.append((observation, info, reward, action))

                    # Only transitions of *finished* episodes are in the buffers, so
                    # nothing is learned during the first episode(s).
                    if len(good_buffer) + len(bad_buffer) > self.params.min_buffer_size:
                        batch = self._sample(good_buffer) + self._sample(bad_buffer)
                        # Unzip the tuples back into the four parallel lists learn() expects.
                        actions, observations, next_observations, rewards = (
                            list(column) for column in zip(*batch)
                        )
                        self.agent.learn(actions, observations, next_observations, rewards)

                    if terminated or truncated:
                        break

                self.history.append(history)
                # Render the episode that just finished (before any reset replaces it).
                self.render(history)
                print("epoch : {}, total reward : {}".format(epoch, total_reward))

                if total_reward > self.params.good_reward_threshold:
                    good_buffer.extend(episode_transitions)
                else:
                    bad_buffer.extend(episode_transitions)
        finally:
            # Release the environment (and any render window) even on errors/interrupts.
            self.env.close()

    def _sample(self, buffer):
        """Draw ``params.sample_size`` transitions from ``buffer`` with replacement.

        Returns an empty list when ``buffer`` is empty.
        """
        if not buffer:
            return []
        indices = np.random.randint(0, len(buffer), self.params.sample_size)
        return [buffer[i] for i in indices]

    def render(self, history):
        """Plot the per-step reward (index 2 of every entry) of one episode history."""
        plt.plot([h[2] for h in history])

In [None]:
# random agent
class RandomAgent(Agent):
    """Agent that ignores its observations and never learns."""

    def policy(self, observation):
        """Return an action sampled from the stored action space."""
        sampled_action = self.action_space.sample()
        return sampled_action

    def learn(self, action, observation, next_observation, reward):
        """Accept transition data and do nothing with it."""
        return None

# Baseline run: build a fresh LunarLander-v2 environment (render_mode="human"),
# wire it to a RandomAgent through a Trainer with max_steps=1000, and start training.
env = gym.make("LunarLander-v2", render_mode="human")
trainer = Trainer(Trainer.Params(max_steps=1000), env, RandomAgent(env.action_space, env.observation_space))
trainer.train()

In [None]:
class FCModel(tf.keras.Model):
    """Three stacked dense layers: two 32-unit ReLU layers followed by a linear output layer."""

    def __init__(self, output_dim: int):
        """Create the layers; ``output_dim`` is the number of units of the last layer."""
        super().__init__()
        hidden_units = 32
        self.fc1 = tf.keras.layers.Dense(hidden_units, activation="relu")
        self.fc2 = tf.keras.layers.Dense(hidden_units, activation="relu")
        self.fc3 = tf.keras.layers.Dense(output_dim)

    def call(self, x):
        """Feed ``x`` through ``fc1``, ``fc2`` and ``fc3`` in that order and return the result."""
        return self.fc3(self.fc2(self.fc1(x)))

# deep q network
class DQNAgent(Agent):
    """Epsilon-greedy Q-learning agent backed by a small fully connected network.

    The network maps an observation to one value per discrete action. ``learn``
    regresses the value of the taken action towards the one-step target
    ``reward + gamma * max_a Q(next_observation, a)``.

    NOTE(review): ``learn`` receives no terminal flag, so the target also
    bootstraps from the observation that ended an episode. Fixing that requires
    extending the ``Agent.learn`` interface.
    """

    def __init__(self, action_space: gym.Space, observation_space: gym.Space, gamma: float = 0.99):
        """Build the Q-network and optimizer.

        Args:
            action_space: Discrete action space; ``action_space.n`` sets the output size.
            observation_space: Observation space of the environment.
            gamma: Discount factor applied to the bootstrapped next-state value.
                It is kept below 1 by default because, without terminal masking,
                an undiscounted target has no finite fixed point.
        """
        super().__init__(action_space, observation_space)
        self.model = FCModel(action_space.n)
        self.optimizer = tf.keras.optimizers.Adam(1e-3)
        self.epsilon = 0.2
        # `count` / `factor` linearly reduce epsilon; see the note at the end of learn().
        self.count = 0
        self.factor = 1000000
        self.gamma = gamma

    def _q_values(self, observation):
        """Return the network output for one observation or a batch of observations.

        A 1-D observation is promoted to a batch of size one before the forward pass.
        """
        observation = np.array(observation)
        if len(observation.shape) == 1:
            observation = observation[None]
        return self.model(tf.convert_to_tensor(observation, dtype=tf.float32))

    def policy(self, observation, return_softmax=False, training=True):
        """Select an action, or return the raw network output.

        Args:
            observation: A single observation or a batch of observations.
            return_softmax: If true, return the network output for the whole batch
                instead of an action. NOTE: despite the name no softmax is applied;
                the raw per-action values are returned.
            training: If true, act epsilon-greedily; otherwise act greedily.

        Returns:
            The network output tensor when ``return_softmax`` is true, otherwise the
            action chosen for the first observation of the batch.
        """
        q_values = self._q_values(observation)
        if return_softmax:
            return q_values

        # Explore with probability (epsilon - count / factor); the random draw is
        # only made in training mode.
        explore = training and np.random.rand() <= self.epsilon - self.count / self.factor
        if explore:
            return self.action_space.sample()
        return tf.argmax(q_values, axis=-1).numpy()[0]

    def learn(self, action, observation, next_observation, reward):
        """Take one gradient step on the squared one-step TD error of a batch.

        Args:
            action: Actions taken (indices into the action space).
            observation: Observations the actions were taken in.
            next_observation: Observations that followed.
            reward: Rewards received for the transitions.
        """
        oh_action = tf.one_hot(action, self.action_space.n)
        rewards = tf.convert_to_tensor(np.asarray(reward, dtype=np.float32))

        # The bootstrap target is computed outside the tape so that it is treated
        # as a constant: gradients must only flow through Q(observation, action).
        q_next = tf.reduce_max(self._q_values(next_observation), axis=-1)
        target = rewards + self.gamma * q_next

        with tf.GradientTape() as tape:
            q = tf.reduce_sum(self._q_values(observation) * oh_action, axis=-1)
            loss = tf.reduce_mean(tf.square(q - target))

        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
        # Epsilon decay is currently disabled: `count` is never incremented.
        #self.count += 1

In [None]:
# sarsa

In [None]:
# Train a DQN agent. The environment is created first so that the agent is built
# from the spaces of the environment it is actually trained on, not from whatever
# `env` an earlier cell happened to leave behind.
env = gym.make("LunarLander-v2", render_mode="ansi")
agent = DQNAgent(env.action_space, env.observation_space)
trainer = Trainer(Trainer.Params(max_steps=1000), env, agent)
try:
    trainer.train()
except KeyboardInterrupt:
    # Interrupting the kernel is the supported way to stop training early; the
    # episodes recorded so far stay in trainer.history for the cells below.
    # Any other exception is a real error and is allowed to propagate.
    print("training interrupted; keeping the history collected so far")
finally:
    # Closing an already closed gymnasium environment is harmless.
    env.close()

In [None]:
# Total reward per recorded episode (index 2 of each history entry is the step reward).
plt.plot([sum([d[2] for d in h]) for h in trainer.history])

In [None]:
# Per-step reward across all recorded episodes, concatenated in recording order.
plt.plot([d[2] for h in trainer.history  for d in h])

In [None]:
# Inspect a single recorded entry: index 8 of the first episode's history.
# NOTE(review): raises IndexError if that episode recorded fewer than 9 entries.
trainer.history[0][8]

In [None]:
# Collect the per-step rewards of every recorded episode, then plot the
# second-to-last episode with a grid.
data = [[d[2] for d in record] for record in trainer.history]
plt.plot(data[-2])
plt.grid()

In [None]:
# Number of episode histories recorded by the trainer.
len(trainer.history)

In [None]:
#