<a href="https://colab.research.google.com/github/yueguo1997/Apply_transformer_in_reinforcement_learning/blob/main/Project_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

class TransformerLayer(layers.Layer):
    """Self-attention block followed by a two-layer feed-forward block.

    Each sub-block's output goes through dropout, is added back to that
    sub-block's input, and the sum is layer-normalized.

    Args:
        num_heads: Number of attention heads.
        hidden_size: Passed as ``key_dim`` to the attention layer and used as
            the output width of the feed-forward block (its inner width is
            ``4 * hidden_size``).
        dropout_rate: Rate for the two dropout layers.
    """

    def __init__(self, num_heads, hidden_size, dropout_rate=0.1):
        super().__init__()
        inner_width = hidden_size * 4

        self.multi_head_attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=hidden_size,
        )
        self.norm1 = layers.LayerNormalization()
        self.dropout1 = layers.Dropout(dropout_rate)

        expand = layers.Dense(inner_width, activation='relu')
        project = layers.Dense(hidden_size)
        self.feed_forward = tf.keras.Sequential([expand, project])

        self.norm2 = layers.LayerNormalization()
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, inputs, training=False):
        """Apply the attention and feed-forward sub-blocks to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention.
                The second residual sum adds a ``hidden_size``-wide tensor to
                it, so its last dimension is presumably ``hidden_size`` --
                TODO confirm against callers.
            training: Forwarded to the two dropout layers.

        Returns:
            The layer-normalized output of the feed-forward sub-block.
        """
        # Attention sub-block: the input attends to itself.
        attended = self.multi_head_attention(inputs, inputs)
        attended = self.dropout1(attended, training=training)
        residual = self.norm1(inputs + attended)

        # Feed-forward sub-block on top of the normalized attention result.
        projected = self.feed_forward(residual)
        projected = self.dropout2(projected, training=training)
        return self.norm2(residual + projected)

class TransformerNet(tf.keras.Model):
    """Actor-critic network built from a stack of ``TransformerLayer`` blocks.

    The input is linearly embedded, passed through ``num_layers`` transformer
    blocks, flattened, and fed to two heads: a softmax policy head and a
    scalar value head.

    Args:
        num_layers: Number of ``TransformerLayer`` blocks to stack.
        num_heads: Attention heads per block.
        hidden_size: Width of the embedding and of every block.
        dropout_rate: Dropout rate forwarded to every block.
        num_actions: Number of outputs of the policy head. Defaults to 2,
            the value that was previously hard-coded.
    """

    def __init__(self, num_layers, num_heads, hidden_size, dropout_rate=0.1,
                 num_actions=2):
        super().__init__()
        self.embedding = layers.Dense(hidden_size)
        # NOTE: `layers` is a reserved, read-only property of tf.keras.Model
        # (it lists the tracked sub-layers). Assigning to `self.layers` raises
        # AttributeError, so the stack is stored under a different name.
        self.transformer_layers = [
            TransformerLayer(num_heads, hidden_size, dropout_rate)
            for _ in range(num_layers)
        ]
        self.flatten = layers.Flatten()
        self.policy = layers.Dense(num_actions, activation='softmax')
        self.value = layers.Dense(1)

    def call(self, inputs, training=False):
        """Compute action probabilities and a value estimate.

        Args:
            inputs: Input tensor; assumes a layout the attention blocks
                accept, e.g. (batch, seq_len, features) -- TODO confirm
                against callers.
            training: Forwarded to every transformer block (controls dropout).

        Returns:
            A ``(policy, value)`` tuple: softmax probabilities over
            ``num_actions`` and a single value output per batch element.
        """
        x = self.embedding(inputs)
        for layer in self.transformer_layers:
            x = layer(x, training=training)
        x = self.flatten(x)
        policy = self.policy(x)
        value = self.value(x)
        return policy, value


In [None]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

class PPOAgent:
    def __init__(self, env, net, clip_ratio=0.2, gamma=0.99, entropy_coefficent=0.01,
                 value_coefficent=0.5, optimizer=None):
        self.env = env
        self.net = net
        self.clip_ratio = clip_ratio
        self.gamma = gamma
        self.entropy_coefficent = entropy_coefficent
        self.value_coefficent = value_coefficent
        self.optimizer = optimizer or tf.keras.optimizers.Adam(learning_rate=0.001)
        
    def train(self, episodes, batch_size):
        # Collect experience
        states = []
        actions = []
        rewards = []
        next_states = []
        dones = []
        episode_reward = 0
        state = self.env.reset()
        for episode in range(episodes):
            for step in range(batch_size):
                action_probs, value = self.net(tf.constant([state]))
                action = np.random.choice(2, p=action_probs[0])
                next_state, reward, done, _ = self.env.step(action)
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                next_states.append(next_state)
                dones.append(done)
                episode_reward += reward
                state = next_state
                if done:
                    state = self.env.reset()
                    print("Episode {}: Reward = {}".format(episode, episode_reward))
                    episode
