In [None]:
# Load a pretrained LLM (or even an SFT LLM) from Hugging Face - let's call it the BaseLLM
# Have a dataset and evaluate the BaseLLM on the dataset - define the metrics/benchmarks that you want to use
# RLHF setup
    # Setup #1 - Rule based reward model with PPO
    # Setup #2 - Model based reward model with PPO
       # We have to collect a dataset for training the reward model - this dataset is also known as the preference dataset



In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch

In [4]:
# PPO is on-policy: it learns from experience collected with the current policy.
# In contrast, DQN is off-policy: it also learns from past experiences.

class RobotEnv():
    """
    One-dimensional toy environment: a robot starts at position 0 and tries to
    get within one unit of ``goal_position``.

    The state is the robot's scalar position, an action is a displacement that
    is added to the position, and the reward is the negative distance to the
    goal (so it gets larger as the robot gets closer).
    """

    def __init__(self, goal_position=10, max_steps=100):
        self.position = 0 # initial position of the robot
        self.goal_position = goal_position # goal position of the robot
        self.max_steps = max_steps
        self.current_step = 0      # indicates number of steps taken by the robot

    def reset(self):
        """Reset the position and the step counter; return the initial state (0)."""
        self.position = 0
        self.current_step = 0
        return self.position

    def step(self, action):
        """
        Apply ``action`` (a displacement added to the current position).

        Args:
            action: Amount to move the robot by; any real value, positive or negative.

        Returns:
            A tuple ``(position, reward, done)``.
        """
        # compute next state based on actor's current action
        self.position += action
        self.current_step += 1

        # the closer the robot is to the goal position, the higher the reward
        reward = -abs(self.position - self.goal_position)

        # the episode ends once the robot has used up max_steps
        # or has reached the vicinity (< 1 unit) of the goal position
        done = self.current_step >= self.max_steps or (abs(self.position - self.goal_position) < 1)

        return self.position, reward, done

    def get_experiences_from_curr_policy(self, actor, device, num_episodes, max_steps):
        """
        Collects experiences from the environment using the current policy.

        Args:
            actor: Callable mapping a ``(1, 1)`` state tensor to ``(mean, stddev)``
                of a Normal action distribution with a single action component.
            device: The device to run the computations on (e.g., 'cpu', 'cuda', 'mps').
            num_episodes: The number of episodes to collect data from.
            max_steps: The maximum number of steps per episode (the environment's
                own ``self.max_steps`` limit also applies through ``step``).

        Returns:
            A list of episodes where each episode is a dict of parallel lists:
            "states" (``(1, 1)`` float tensors on ``device``), "actions" (NumPy arrays
            of shape ``(1,)``), "rewards", "next_states", "dones" and "log_probs"
            (NumPy float scalars).
        """
        episodes = []
        for _ in range(num_episodes):
            episode_data = {
                "states": [],
                "actions": [],
                "rewards": [],
                "next_states": [],
                "dones": [],
                "log_probs": [],
            }
            state = self.reset()       # start from the beginning
            done = False
            step = 0

            while not done and step < max_steps:
                # Build the (batch=1, state_dim=1) input explicitly. The legacy
                # torch.FloatTensor(0) constructor treats an int as a *size* and
                # would return an empty tensor for the initial state.
                state_tensor = torch.tensor([[float(state)]], dtype=torch.float32, device=device)

                # Collection needs no gradients: everything stored below is detached.
                with torch.no_grad():
                    mean, stddev = actor(state_tensor)                       # parameters of the action distribution
                    distribution = torch.distributions.Normal(mean, stddev)
                    action = distribution.sample()
                    # Components of a diagonal Normal are independent, so the joint
                    # log-probability is the sum over the action dimension.
                    log_prob = distribution.log_prob(action).sum(dim=1)

                action_np = action.detach().cpu().numpy()[0]

                # The environment is scalar: hand it a plain Python float so that
                # position, reward and done stay scalars instead of arrays.
                # (.item() raises if the actor emits more than one action component.)
                next_state, reward, done = self.step(action_np.item())

                episode_data["states"].append(state_tensor)
                episode_data["actions"].append(action_np)
                episode_data["rewards"].append(reward)
                episode_data["next_states"].append(next_state)
                episode_data["dones"].append(done)
                episode_data["log_probs"].append(log_prob.detach().cpu().numpy()[0])

                # update state
                state = next_state
                step += 1

            episodes.append(episode_data)

        return episodes

# Actor & Critic Networks

*1. Both of these can be any neural network as long as they are fulfilling the main purpose of the Actor and Critic.*<br>
*2. Actor uses the current policy to generate the next action.*<br>
*3. Critic estimates the expected reward of the current state*

In [None]:

class Actor(nn.Module):
    """
    Policy network: maps a state to the mean and standard deviation of the
    action distribution (two 128-unit ReLU layers followed by two linear heads).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_dim = 128
        # shared trunk
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        # one head per distribution parameter
        self.mean_layer = nn.Linear(hidden_dim, action_dim)
        self.std_layer = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``."""
        features = F.relu(self.fc2(F.relu(self.fc1(state))))
        action_mean = self.mean_layer(features)
        # softplus keeps the standard deviation positive
        action_std = F.softplus(self.std_layer(features))
        return action_mean, action_std

class Critic(nn.Module):
    """
    Value network: maps a state to a single scalar value estimate
    (two 128-unit ReLU layers followed by a linear output).
    """

    def __init__(self, state_dim):
        # the critic is a state-value function, so the state size is its only input dimension
        super().__init__()
        hidden_dim = 128
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.value_layer = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        """Return the estimated value of ``state``; the last dimension has size 1."""
        first_hidden = F.relu(self.fc1(state))
        second_hidden = F.relu(self.fc2(first_hidden))
        return self.value_layer(second_hidden)

In [None]:
"""
Temporal Difference Error:
Difference b/w Actual Reward and Expected Reward of the current state
td_error = r_t + gamma * V(t+1) - V(t)

Monte Carlo Return:
Difference b/w Actual Reward (collected for all the steps so far) and the Expected Reward of the current state
mc_return = r_t + gamma * r_t+1 + gamma^2 * r_t+2 + ... + gamma^(T-t) * r_T - V(t)

Generalized Advantage Return


"""

In [5]:
def calc_advantage(rewards, values, dones, gamma=0.99, tau=0.95):
    """
    Generalized Advantage Estimation (GAE)
    Calculates how much better an action is wrt the average action in a given state

    For every step t (processed from the last step backwards):
        delta_t = r_t + gamma * V(t+1) * (1 - done_t) - V(t)
        A_t     = delta_t + gamma * tau * (1 - done_t) * A_{t+1}

    Args:
        rewards: Sequence of per-step rewards, length T.
        values: Sequence of value estimates, length T + 1; the extra last entry
            is the bootstrap value of the state reached after the final step.
        dones: Sequence of per-step terminal flags, length T.
        gamma: Discount factor.
        tau: GAE smoothing factor (often written lambda).

    Returns:
        A list of T advantages, aligned with ``rewards``.
    """

    advantages = []
    gae = 0
    for i in reversed(range(len(rewards))):
        # 0 on terminal steps: neither the next state's value nor later
        # advantages may leak across an episode boundary.
        not_done = 1 - dones[i]
        delta = rewards[i] + gamma * values[i + 1] * not_done - values[i]
        gae = delta + gamma * tau * not_done * gae

        advantages.append(gae)

    # built back-to-front, so flip once instead of inserting at the head each step
    advantages.reverse()
    return advantages
    


In [None]:
def ppo_update_loop():
    """
    Collect experiences with a freshly initialised actor and compute GAE
    advantages for every collected episode.

    The actual PPO policy/critic update is not implemented yet; only the
    experience collection and the advantage computation are performed.
    """

    num_episodes = 50
    max_steps = 10


    # setup device -> gpu or mps or cpu
    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

    # initalise environment
    env = RobotEnv(goal_position=10, max_steps=100)

    # initialise actor network
    actor = Actor(state_dim=1, action_dim=1).to(device) # state_dim = 1 since the robot's position is a single value

    # initialise critic network
    critic = Critic(state_dim=1).to(device)     # state_dim = 1 since the robot's position is a single value

    # collect experiences (bound method: env is already passed as self)
    episode_experiences = env.get_experiences_from_curr_policy(actor, device, num_episodes, max_steps)

    for episode_data in episode_experiences:
        if not episode_data["states"]:
            continue      # nothing was collected for this episode

        # Stack the visited states into one (T, state_dim) batch and append the
        # state reached after the last step, because calc_advantage needs
        # T + 1 value estimates (the last one is the bootstrap value).
        states = torch.cat(episode_data["states"], dim=0)
        last_next_state = torch.as_tensor(
            episode_data["next_states"][-1], dtype=torch.float32
        ).reshape(1, -1).to(device)

        with torch.no_grad():
            values = critic(torch.cat([states, last_next_state], dim=0)).squeeze(-1).tolist()

        # Process each episode's data
        advantages = calc_advantage(episode_data["rewards"], values, episode_data["dones"])
        # ... (rest of PPO update)

In [None]:
import torch.optim as optim


def main():
    # setup device -> gpu or mps or cpu
    device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

    # initalise environment
    env = RobotEnv(goal_position=10, max_steps=100)

    # initialise actor network
    actor = Actor(state_dim=1, action_dim=1).to(device) # state_dim = 1 since the robot's position is a single value

    # initialise critic network
    critic = Critic(state_dim=1).to(device)     # state_dim = 1 since the robot's position is a single value

    optimizer_actor = optim.Adam(actor.parameters(), lr=1e-3)
    optimizer_critic = optim.Adam(critic.parameters(), lr=1e-3)

    NUM_EPISODES = 1000
    max_steps = 100

    for episode in range(NUM_EPISODES):
        experiences = env.get_experiences_from_curr_policy(actor, device, )    