In [1]:
import pandas as pd
import wandb

# Initialize wandb
wandb.init(project="your_project_name")

# Fetch runs from your project
api = wandb.Api()
runs = api.runs("multi_reward_feedback_final_lul", filters={"display_name": {"$regex": "^(ppo|sac).*"}})

# Create a list to store data from filtered runs
filtered_run_data = []

# Iterate through the runs
for run in runs:
    # Check if the run name starts with "ppo_"
    if run.name.startswith("ppo_") or run.name.startswith("sac_") and "noise" not in run.name:
        # Get the summary statistics (includes final values of metrics)
        summary = run.summary._json_dict

        # Get the history (includes all logged metrics)
        history = run.history()

        # Combine summary and history data
        run_data = {
            "run_id": run.id,
            "run_name": run.name,
            **summary,
            **{f"{k}_history": v.tolist() for k, v in history.items()}
        }

        filtered_run_data.append(run_data)

# Create a DataFrame from filtered run data
df = pd.DataFrame(filtered_run_data)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mymetz[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Function to extract environment and feedback type from run name
def extract_info(run_name):
    parts = run_name.split('_')
    return parts[1], parts[-2]

# Function to interpolate NaN values in a series
def interpolate_nans(series):
    return pd.Series(series).interpolate().ffill().bfill().values

# Group runs by feedback type and environment
grouped_runs = defaultdict(lambda: defaultdict(list))
for _, row in df.iterrows():
    env, feedback = extract_info(row['run_name'])
    # Convert string "nan" to np.nan and other values to float
    if isinstance(row['val_loss_history'], float):
        continue
    row['val_loss_history'] = [np.nan if x == "nan" else x for x in row['val_loss_history']]
    # Interpolate NaN values in val_loss_history
    row['val_loss_history'] = interpolate_nans(row['val_loss_history'])
    grouped_runs[feedback][env].append(row)

In [None]:
# Plotting function
def plot_feedback_type(feedback, env_runs):
    plt.figure(figsize=(15, 10))
    
    # Increase font size for all text elements
    plt.rcParams.update({'font.size': 16})  # Adjust this value as needed

    colors = plt.cm.tab20(np.linspace(0, 1, 12))  # adjust colormap as needed
    
    for color_idx, (env, runs) in enumerate(env_runs.items()):
        
        if "v5" in env:
            continue

        if env == "Humanoid-v5":
            continue
        # Find the maximum length of steps
        max_steps = max(len(run['_step_history']) for run in runs)
        
        # Initialize arrays for losses and steps
        all_losses = np.full((len(runs), max_steps), np.nan)
        all_steps = np.full((len(runs), max_steps), np.nan)
        
        # Fill the arrays with available data
        for i, run in enumerate(runs):
            length = len(run['_step_history'])
            all_losses[i, :length] = run['val_loss_history']
            all_steps[i, :length] = run['_step_history']
        
        # Calculate statistics
        mean_loss = np.nanmean(all_losses, axis=0)
        min_loss = np.nanmin(all_losses, axis=0)
        max_loss = np.nanmax(all_losses, axis=0)
        
        # Use the mean of steps for x-axis (ignoring NaNs)
        steps = np.nanmean(all_steps, axis=0)
        
        # Remove NaN entries
        valid = ~np.isnan(mean_loss)
        steps = steps[valid]
        mean_loss = mean_loss[valid]
        min_loss = min_loss[valid]
        max_loss = max_loss[valid]
        
        plt.plot(steps, mean_loss, label=f"{env}", color=colors[color_idx])
        plt.fill_between(steps, min_loss, max_loss, alpha=0.2, color=colors[color_idx])

    display_feedback = feedback
    print(display_feedback)
    if display_feedback == "preference":
        display_feedback = "descriptive Preference"
    plt.title(f"Reward Model: Validation Loss Curves for {display_feedback.capitalize()} Feedback", fontsize=20)
    plt.xlabel("Steps", fontsize=18)
    plt.ylabel("Validation Loss", fontsize=18)
    plt.legend(loc = "upper right")
    plt.grid(True)

    # Use log scale for y-axis if the range of values is large
    if display_feedback == "evaluative" or display_feedback == "descriptive" and np.nanmax(mean_loss) / np.nanmin(mean_loss[np.isfinite(mean_loss)]) > 100:
        plt.yscale('log')

    plt.tight_layout()
    plt.savefig(f"loss_curves_{feedback}_2.png")
    plt.close()

    print(f"Loss curves for {feedback} feedback have been saved to loss_curves_{feedback}.png")

# Create plots for each feedback type
for feedback, env_runs in grouped_runs.items():
    plot_feedback_type(feedback, env_runs)

descriptive
Loss curves for descriptive feedback have been saved to loss_curves_descriptive.png
evaluative
Loss curves for evaluative feedback have been saved to loss_curves_evaluative.png
comparative
Loss curves for comparative feedback have been saved to loss_curves_comparative.png
preference
Loss curves for preference feedback have been saved to loss_curves_preference.png
corrective


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


Loss curves for corrective feedback have been saved to loss_curves_corrective.png
noise
Loss curves for noise feedback have been saved to loss_curves_noise.png
demonstrative


In [3]:
import pickle as pkl

import numpy as np

from rlhf.networks import LightningNetwork

with open("feedback/ppo_HalfCheetah-v5_1337.pkl", "rb") as file:
    data = pkl.load(file)

all_obs = []
for idx, seg in enumerate(data["segments"]):
    obs = np.array([np.concatenate((s[0].squeeze(0),s[1])) for s in seg])
    all_obs.append(obs)
states = np.concatenate(all_obs, axis=0)

import os

import gymnasium as gym
import torch

env_name = "HalfCheetah-v5"
environment = gym.make(env_name)

reward_model_path = "reward_models/ppo_HalfCheetah-v5_12_evaluative_12.ckpt"
reward_net = LightningNetwork.load_from_checkpoint(
            reward_model_path,
            map_location="cuda:0"
        )

all_obs = []
for idx, seg in enumerate(data["segments"]):
    obs = np.array([s[0].squeeze(0) for s in seg])
    all_obs.append(obs)
    all_rews.append([s[2] for s in seg])
    
observation = np.concatenate(all_obs, axis=0)

if norm_env is not None:
    observation = norm_env.normalize_obs(observation)

observation = expert_model.policy.obs_to_tensor(observation)[0]
with torch.no_grad():
    pred_rews = reward_net(observation)

NameError: name 'all_rews' is not defined