Imports

In [1]:
import gym
import numpy as np

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 10)

from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam
from torch.distributions import Categorical
from torch.utils.data import TensorDataset, DataLoader

Our functions

In [2]:
from Train_policy_func import Policy, load_policy, evaluate_all_policies
from Generate_traj_func import generate_trajectory
from Plot_Functions import plot_suboptimality, plot_trajectory_performance, plot_Scores, plot_suboptimality_three_policies, plot_scores_RLHF
from OPPO import baseline_CartPole_v0_Fla, OPPO_update, set_seed
from PPO import evaluate_policy
from pairs_generator import sample_preference_pairs
from RLHF import RewardModel, train_policy_from_rollouts_n_updates



`beta` is set to 0.5 by default (see the RLHF hyperparameters cell below).


## Step 1:
Create the reference policies, here for three different seeds.

For CartPole at least, the policies are optimised with the OPPO algorithm.
They are saved as "pi1\_ref\_{env_name}\_seed\_{seed}.pth" and "pi2\_ref\_{env_name}\_seed\_{seed}.pth".

For Pendulum, the optimisation procedure is not known yet.

In [None]:
# ── Environment: CartPole-v0 ──
env_name = "CartPole-v0"
env = gym.make(env_name)

# Dimensions handed to the policy / reward-model constructors further down.
obs_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Baseline callable and score threshold forwarded to OPPO_update.
baseline = baseline_CartPole_v0_Fla
target_score = 185  # for CartPole-v0

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ── Hyperparams for OPPO ──
lr_oppo = 0.003      # Adam learning rate
n_episodes = 1200    # keep same "number of iterations" for fair comparison
max_t = 200          # cap on steps per episode
print_every = 20     # print (and average the score) every x episodes
gamma = 0.99         # passed as `gamma` to OPPO_update

# ── Run switches ──
load_policies = True   # True: read the saved checkpoints; False: train with OPPO
plot_scores = False    # True: call plot_Scores after each evaluation
n_eval = 100           # number of evaluation episodes per policy

# One [pi_1, pi_2] pair is appended per entry of `seeds_list`.
seeds_list = [35, 42, 100]
Policy_list = []

# Build one [pi_1, pi_2] pair per seed, either from checkpoints or by training.
for seed in seeds_list:
    # Re-seed before anything is created or loaded for this seed.
    set_seed(seed, env)

    if not load_policies:
        print(f"Training pi_1 and pi_2 with seed {seed}")

        # Fresh policy plus its optimiser.
        pi_1 = Policy(state_size=obs_dim, action_size=action_dim).to(device)
        opt1 = torch.optim.Adam(pi_1.parameters(), lr=lr_oppo)

        # The returned scores are kept in `scores_oppo` but not used in this cell.
        oppo_kwargs = dict(
            policy=pi_1,
            optimizer=opt1,
            env=env,
            baseline=baseline,
            n_episodes=n_episodes,
            max_t=max_t,
            gamma=gamma,
            print_every=print_every,
            early_stop=False,
            seed=seed,
            target_score=target_score,
            env_name=env_name,
            display_every=False,
        )
        scores_oppo = OPPO_update(**oppo_kwargs)

        # pi_2 is read back from disk right after training.
        # NOTE(review): presumably OPPO_update writes this checkpoint — confirm.
        pi_2 = load_policy(f"pi2_ref_{env_name}_seed_{seed}.pth", obs_dim, action_dim, device)
    else:
        print(fr"Loading pi_1 and pi_2 with {seed=}")
        pi_1 = load_policy(f"pi1_ref_{env_name}_seed_{seed}.pth", obs_dim, action_dim, device)
        pi_2 = load_policy(f"pi2_ref_{env_name}_seed_{seed}.pth", obs_dim, action_dim, device)

    Policy_list.append([pi_1, pi_2])

print("=================================")
# ── Evaluate the policies ──
# Each (pi_1, pi_2) pair is scored on the shared `env` for `n_eval` episodes.
for seed, (pi_1, pi_2) in zip(seeds_list, Policy_list):

    print(f"Evaluating pi_1 and pi_2 with {seed=} over {n_eval} episodes")

    # The evaluation seed is offset from the training seed; both policies
    # receive the same evaluation seed.
    seed_eval = seed+32
    _, pi1_rewards = evaluate_policy(pi_1, env, n_episodes=n_eval, seed=seed_eval)
    _, pi2_rewards = evaluate_policy(pi_2, env, n_episodes=n_eval, seed=seed_eval)

    # f-string so the seed is actually interpolated (a plain string printed the
    # literal "{seed}"); report the seed that was really passed to evaluate_policy.
    print(f"Evaluations over {n_eval} episodes done for both policies, using the seed {seed_eval}")
    print(f"pi_1({seed}) \t mean reward: \t {np.mean(pi1_rewards):.2f} ± {np.std(pi1_rewards):.2f}")
    print(f"pi_2({seed}) \t mean reward: \t {np.mean(pi2_rewards):.2f} ± {np.std(pi2_rewards):.2f}")

    if plot_scores:
        plot_Scores(
            pi1_rewards,
            pi2_rewards,
        )
    print("------------------------------------------------------")


print("=================================")


  logger.warn(
  deprecation(
  deprecation(


Loading $\pi_{ref}$ 1|2 with seed 35
Loading $\pi_{ref}$ 1|2 with seed 42
Loading $\pi_{ref}$ 1|2 with seed 100
Evaluating pi_1 and pi_2 with seed=35 over 100 episodes
Evaluations over 100 episodes done for both policies, using the seed {seed}
pi_1(35) 	 mean reward: 	 186.93 ± 24.03
pi_2(35) 	 mean reward: 	 99.91 ± 53.53
------------------------------------------------------
Evaluating pi_1 and pi_2 with seed=42 over 100 episodes
Evaluations over 100 episodes done for both policies, using the seed {seed}
pi_1(42) 	 mean reward: 	 183.51 ± 40.43
pi_2(42) 	 mean reward: 	 127.82 ± 50.95
------------------------------------------------------
Evaluating pi_1 and pi_2 with seed=100 over 100 episodes
Evaluations over 100 episodes done for both policies, using the seed {seed}
pi_1(100) 	 mean reward: 	 187.64 ± 28.03
pi_2(100) 	 mean reward: 	 115.26 ± 49.30
------------------------------------------------------


### The creation/loading of $\pi_{ref}^{1|2}$ is now done
***
## Next, we implement the RLHF part (PPO)

In [None]:
# ── RLHF hyperparameters ──
lr_RLHF = 0.009   # learning rate of the Adam optimiser for the RLHF policy
K = 200           # number of preference pairs; also part of the checkpoint name
beta = 0.5        # forwarded as `beta` to train_policy_from_rollouts_n_updates

# ── RLHF run switches ──
load_RM = True               # True: read the reward-model weights from disk
load_rlhf_policy = True      # True: load the saved RLHF policy instead of training
pi_ref_rlhf = 1              # 1 -> pi_1 is the reference policy, otherwise pi_2
load_pi2_start_rlhf = True   # True: initialise the RLHF policy from the pi_2 checkpoint
plot_scores = False          # True: call plot_scores_RLHF after each evaluation
n_eval = 100                 # number of evaluation episodes per policy

# One RLHF policy per seed, in the same order as `seeds_list` / `Policy_list`.
Policy_rlhf_list = []

for seed, (pi_1, pi_2) in zip(seeds_list, Policy_list):
    # Re-seed per run, exactly as the reference-policy cell does, so that
    # preference sampling and RLHF training are repeatable for a given seed.
    set_seed(seed, env)

    # Single source of truth for the checkpoint name (used to load and to save).
    rlhf_ckpt = f"pi_RLHF_{env_name}_seed_{seed}_beta{beta}_K{K}.pth"

    if load_rlhf_policy:
        policy_RLHF = load_policy(rlhf_ckpt, obs_dim, action_dim, device)
        print(f"Loading policy trained with RLHF for {seed=}")
    else:
        # Creating the preference pairs.
        # NOTE(review): `prefs` is only needed to train a reward model; it is
        # not consumed when the reward model is loaded from disk.
        prefs = sample_preference_pairs(pi_1, pi_2, env, K=K)
        print(f"Collected {K} preference pairs for {seed=}.")

        # Create the reward model
        reward_model = RewardModel(state_dim=obs_dim, action_dim=action_dim).to(device)

        if load_RM:
            print("Loading reward model (MERCI Youssef)")
            # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
            reward_model.load_state_dict(
                torch.load(r"reward_model_youss.pth", map_location=device)
            )
        else:
            # No reward-model training code exists in this cell. Continuing would
            # optimise the policy against randomly initialised rewards and then
            # overwrite the saved RLHF checkpoint, so fail loudly instead.
            raise NotImplementedError(
                "Reward-model training is not implemented in this cell: "
                "set load_RM = True or train `reward_model` on `prefs` here."
            )

        # Policy to be trained with RLHF, optionally warm-started from pi_2.
        policy_RLHF = Policy(state_size=obs_dim, action_size=action_dim).to(device)
        if load_pi2_start_rlhf:
            policy_RLHF.load_state_dict(
                torch.load(f"pi2_ref_{env_name}_seed_{seed}.pth", map_location=device)
            )
        opt_RLHF = torch.optim.Adam(policy_RLHF.parameters(), lr=lr_RLHF)

        # pi_ref_rlhf == 1 selects pi_1 as reference; any other value selects pi_2.
        policy_ref = pi_1 if pi_ref_rlhf == 1 else pi_2

        print(f"Training policy with RLHF using pi_{pi_ref_rlhf} as reference policy")
        train_policy_from_rollouts_n_updates(
            policy_RLHF, policy_ref, reward_model, env, opt_RLHF,
            N=20, K=K, max_steps=500, beta=beta,
        )
        torch.save(policy_RLHF.state_dict(), rlhf_ckpt)
        print(f"Saved final policy as {rlhf_ckpt}")

    Policy_rlhf_list.append(policy_RLHF)
print("=================================")



# Score pi_1, pi_2 and the RLHF policy of each seed on the shared `env`.
for seed, (pi_1, pi_2), pi_rlhf in zip(seeds_list, Policy_list, Policy_rlhf_list):

    # The evaluation seed is offset from the training seed; all three policies
    # receive the same evaluation seed.
    seed_eval = seed+32
    _, pi1_rewards      = evaluate_policy(pi_1,     env, n_episodes=n_eval, seed=seed_eval)
    _, pi2_rewards      = evaluate_policy(pi_2,     env, n_episodes=n_eval, seed=seed_eval)
    _, pi_rlhf_rewards  = evaluate_policy(pi_rlhf,  env, n_episodes=n_eval, seed=seed_eval)

    # f-string so the seed is actually interpolated (a plain string printed the
    # literal "{seed}"); report the seed that was really passed to evaluate_policy.
    print(f"Evaluations over {n_eval} episodes done for the 3 policies, using the seed {seed_eval}")
    print(f"pi_1({seed}) \t mean reward: \t {np.mean(pi1_rewards):.2f} ± {np.std(pi1_rewards):.2f}")
    print(f"pi_2({seed}) \t mean reward: \t {np.mean(pi2_rewards):.2f} ± {np.std(pi2_rewards):.2f}")
    print(f"pi_RLHF({seed}) \t mean reward: \t {np.mean(pi_rlhf_rewards):.2f} ± {np.std(pi_rlhf_rewards):.2f}")
    if plot_scores:
        # Argument order is (pi_2, pi_1, pi_RLHF).
        # NOTE(review): confirm this matches the parameter order of plot_scores_RLHF.
        plot_scores_RLHF(pi2_rewards, pi1_rewards, pi_rlhf_rewards, algo="RLHF")

    print("------------------------------------------------------")




Loading policy trained with RLHF
Loading policy trained with RLHF
Loading policy trained with RLHF
Evaluations over 100 episodes done for the 3 policies, using the seed {seed}
pi_1(35) 	 mean reward: 	 188.64 ± 23.17
pi_2(35) 	 mean reward: 	 110.04 ± 53.85
pi_{RLHF}(35) 	 mean reward: 	 186.77 ± 22.00
------------------------------------------------------
Evaluations over 100 episodes done for the 3 policies, using the seed {seed}
pi_1(42) 	 mean reward: 	 187.78 ± 31.87
pi_2(42) 	 mean reward: 	 118.25 ± 59.47
pi_{RLHF}(42) 	 mean reward: 	 172.92 ± 44.14
------------------------------------------------------
Evaluations over 100 episodes done for the 3 policies, using the seed {seed}
pi_1(100) 	 mean reward: 	 190.67 ± 22.10
pi_2(100) 	 mean reward: 	 115.44 ± 48.00
pi_{RLHF}(100) 	 mean reward: 	 187.74 ± 28.16
------------------------------------------------------


***
## Good luck — I have not touched what follows…

Averaging over the different seeds

In [5]:
# NOTE(review): every statement in this cell is commented out, so running it
# does nothing. The snippets below sketch how `evaluate_all_policies` results
# were fed to `plot_suboptimality_three_policies`; they reference result keys
# and checkpoint names that are not produced anywhere in the cells above.
# file_pi1=[f"pi1_ref_{env_name}_seed_{seed}.pth"]
# file_pi2=[f"pi2_ref_{env_name}_seed_{seed}.pth"]
# file_pi_DPO=[f"pi_DPO_oppo_{env_name}_seed_{seed}.pth",f"pi_DPO_oppo_{env_name}_seed_{seed}_K200.pth",f"pi_DPO_oppo_{env_name}_seed_{seed}_beta0.005.pth",f"pi_DPO_oppo_{env_name}_seed_{seed}_beta1_invverted.pth"]

# results=evaluate_all_policies(env, seeds_list, env_name, num_episodes=50, device=device)

# # Example labels: 'pi_DPO_', 'pi1', 'pi2'
# print(results.keys())
# reward_hist_init = results["pi2"]["graph"]
# reward_hist_ref = results["pi1"]["graph"]
# reward_hist_RLHF = results["pi_RLHF_pi_RLHF_CartPole-v0_seed_35_beta0.5_K200"]["graph"]

# plot_suboptimality_three_policies(reward_hist_RLHF, reward_hist_init, reward_hist_ref, max_reward=200,algo="RLHF")


#================================================
#------------------------------------
# reward_hist_dpo = results["pi_DPO_pi_DPO_CartPole-v0_seed_35_K200"]["graph"]

# plot_suboptimality_three_policies(reward_hist_dpo, reward_hist_init, reward_hist_ref, max_reward=200)
# reward_hist_dpo = results["pi_DPO_pi_DPO_CartPole-v0_seed_35_beta0.005"]["graph"]

# plot_suboptimality_three_policies(reward_hist_dpo, reward_hist_init, reward_hist_ref, max_reward=200)
# reward_hist_dpo = results["pi_DPO_pi_DPO_CartPole-v0_seed_35_beta0.5_K10"]["graph"]

# plot_suboptimality_three_policies(reward_hist_dpo, reward_hist_init, reward_hist_ref, max_reward=200)


# reward_hist_dpo = results["pi_DPO_pi_DPO_CartPole-v0_seed_35_beta1_inverted"]["graph"]
# reward_hist_init = results["pi1"]["graph"]
# reward_hist_ref = results["pi2"]["graph"]
# plot_suboptimality_three_policies(reward_hist_dpo, reward_hist_init, reward_hist_ref, max_reward=200)