In [1]:
import pandas as pd
import wandb

# Initialize wandb
wandb.init(project="your_project_name")

# Fetch runs from your project
api = wandb.Api()
runs = api.runs("multi_reward_feedback_final_lul", filters={"display_name": {"$regex": "^RL_.*"}})

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mymetz[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
# Create a list to store data from filtered runs
filtered_run_data = []

# Iterate through the runs
for run in runs:
    # Check if the run name starts with "ppo_"
    if run.name.startswith("RL_") and "noise" not in run.name and "ensemble" not in run.name:
        # Get the summary statistics (includes final values of metrics)
        summary = run.summary._json_dict

        # Get the history (includes all logged metrics)
        history = run.history(keys=["eval/mean_reward", "global_step"])
        #history = run.history(keys=["rollout/ep_rew_mean", "global_step"])

        # Combine summary and history data
        run_data = {
            "run_id": run.id,
            "run_name": run.name,
            **summary,
            **{f"{k}_history": v.tolist() for k, v in history.items()}
        }

        filtered_run_data.append(run_data)

# Create a DataFrame from filtered run data
orig_df = pd.DataFrame(filtered_run_data)

In [6]:
orig_df.columns

Index(['run_id', 'run_name', '_runtime', '_step', '_timestamp', '_wandb',
       'eval/mean_ep_length', 'eval/mean_reward', 'global_step',
       'rollout/ep_len_mean', 'rollout/ep_rew_mean', 'time/fps',
       'train/approx_kl', 'train/clip_fraction', 'train/clip_range',
       'train/entropy_loss', 'train/explained_variance', 'train/learning_rate',
       'train/loss', 'train/policy_gradient_loss', 'train/std',
       'train/value_loss', '_step_history', 'rollout/ep_rew_mean_history',
       'global_step_history', 'train/actor_loss', 'train/critic_loss',
       'train/ent_coef', 'train/ent_coef_loss', 'eval/success_rate'],
      dtype='object')

In [10]:
from bisect import bisect_left
from collections import OrderedDict, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import hex2color, rgb2hex
from scipy.ndimage import gaussian_filter1d


def generate_muted_colors(hex_color, num_colors=10, mute_factor=0.5):
    rgb = hex2color(hex_color)
    muted_colors = []
    for i in range(num_colors):
        muted_rgb = tuple(c + (1 - c) * mute_factor for c in rgb)
        muted_hex = rgb2hex(muted_rgb)
        muted_colors.append(muted_hex)
    return muted_colors

df = orig_df.copy()

#score_field = "rollout/ep_rew_mean_history"
score_field = "eval/mean_reward_history"

# Function to extract environment and feedback type from run name
def extract_info(run_name):
    parts = run_name.split('_')
    return parts[2], parts[-2]

# Function to interpolate NaN values in a series
def interpolate_nans(series):
    return pd.Series(series).interpolate().ffill().bfill().values

# Group runs by environment and feedback type
grouped_runs = defaultdict(lambda: defaultdict(list))
for _, row in df.iterrows():
    env, feedback = extract_info(row['run_name'])
    # Convert string "nan" to np.nan and other values to float
    if isinstance(row[score_field], float):
        continue
    row[score_field] = [np.nan if x == "nan" else x for x in row[score_field]]
    # Interpolate NaN values in val_loss_history
    row[score_field] = interpolate_nans(row[score_field])
    grouped_runs[env][feedback].append(row)

# Read the CSV file with evaluation scores
eval_df = pd.read_csv("../../main/gt_agents/collected_results.csv")

# Define a color scale for feedback types
color_scale = OrderedDict([
    ('evaluative', '#1f77b4'),     # blue
    ('comparative', '#ff7f0e'),    # orange
    ('demonstrative', '#2ca02c'),  # green
    ('corrective', '#d62728'),     # red
    ('descriptive', '#9467bd'),    # purple
    ('preference', '#8c564b'),  # brown
])
# Plotting function
def plot_environment(env, feedback_runs):
    plt.figure(figsize=(15, 10))
    
    # Increase font size for all text elements
    plt.rcParams.update({'font.size': 18})  # Adjust this value as needed
    
    for feedback in color_scale.keys():
        if feedback not in feedback_runs:
            continue  # Skip if this feedback type is not present for this environment
        
        runs = feedback_runs[feedback]
        color = color_scale.get(feedback, '#7f7f7f')  # Default to gray if feedback type not in scale
        muted_colors = generate_muted_colors(color)
        
        # Find the maximum length of steps
        max_steps = max(len(run['global_step_history']) for run in runs)
        
        # Initialize arrays for losses and steps
        all_losses = np.full((len(runs), max_steps), np.nan)
        all_steps = np.full((len(runs), max_steps), np.nan)
        
        # Fill the arrays with available data
        for i, run in enumerate(runs):
            length = len(run['global_step_history'])
            length = bisect_left(run['global_step_history'], int(1e6))
            all_losses[i, :length] = run[score_field][:length]
            all_steps[i, :length] = run['global_step_history'][:length]

            #smoothed_scores =  gaussian_filter1d(run[score_field], sigma=2)
            #plt.plot(run['global_step_history'], smoothed_scores, color=muted_colors[i], linewidth=1.5) 


        # Calculate statistics
        mean_loss = np.nanmean(all_losses, axis=0)
        min_loss = mean_loss - np.nanstd(all_losses, axis=0)
        max_loss = mean_loss + np.nanstd(all_losses, axis=0)
        
        # Use the mean of steps for x-axis (ignoring NaNs)
        steps = np.nanmean(all_steps, axis=0)
        
        # Remove NaN entries
        valid = ~np.isnan(mean_loss)
        steps = steps[valid]
        mean_loss = mean_loss[valid]
        min_loss = min_loss[valid]
        max_loss = max_loss[valid]

        display_feedback = feedback
        if display_feedback == "preference":
            display_feedback = "descriptive Preferences"
        plt.plot(steps, mean_loss, label=f"{display_feedback.capitalize()}", color=color, linewidth=3.0)
        plt.fill_between(steps, min_loss, max_loss, alpha=0.2, color=color)

    # Filter eval scores for the current environment
    env_eval_scores = eval_df[eval_df['env'] == env]
    
    # Sort and select the best four scores
    best_scores = env_eval_scores.nlargest(4, 'eval_score')
    
    # Calculate statistics for the best scores
    mean_score = best_scores['eval_score'].mean()
    min_score = best_scores['eval_score'].min()
    max_score = best_scores['eval_score'].max()
    
    # Plot evaluation scores as horizontal lines with updated styles
    plt.axhline(y=mean_score, color='grey', linewidth=3.5)
    plt.axhline(y=min_score, color='grey', linestyle='--', linewidth=2.5)
    plt.axhline(y=max_score, color='grey', linestyle='--', linewidth=2.5)
    
    #plt.title(f"Episode Reward for {env.capitalize()} Environment", fontsize=20)
    plt.xlabel("Env. Steps", fontsize=18)
    plt.ylabel("Episode Rew.", fontsize=18)
    plt.legend()
    #plt.grid(True)
    # Use log scale for y-axis if the range of values is large
    #if np.nanmax(mean_loss) / np.nanmin(mean_loss[np.isfinite(mean_loss)]) > 100:
    #    plt.yscale('log')
    plt.tight_layout()
    plt.savefig(f"rl_reward_curves_{env}.png")
    plt.close()
    print(f"Reward plot for {env} environment has been saved to rl_reward_curves_{env}.png")

# Create plots for each environment
for env, feedback_runs in grouped_runs.items():
    plot_environment(env, feedback_runs)

  mean_loss = np.nanmean(all_losses, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  steps = np.nanmean(all_steps, axis=0)


Reward plot for Swimmer-v5 environment has been saved to rl_reward_curves_Swimmer-v5.png


  mean_loss = np.nanmean(all_losses, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  steps = np.nanmean(all_steps, axis=0)


Reward plot for HalfCheetah-v5 environment has been saved to rl_reward_curves_HalfCheetah-v5.png


  mean_loss = np.nanmean(all_losses, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  steps = np.nanmean(all_steps, axis=0)


Reward plot for Ant-v5 environment has been saved to rl_reward_curves_Ant-v5.png


  mean_loss = np.nanmean(all_losses, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  steps = np.nanmean(all_steps, axis=0)


Reward plot for Humanoid-v5 environment has been saved to rl_reward_curves_Humanoid-v5.png


  mean_loss = np.nanmean(all_losses, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  steps = np.nanmean(all_steps, axis=0)


Reward plot for Hopper-v5 environment has been saved to rl_reward_curves_Hopper-v5.png


  mean_loss = np.nanmean(all_losses, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  steps = np.nanmean(all_steps, axis=0)


Reward plot for Walker2d-v5 environment has been saved to rl_reward_curves_Walker2d-v5.png


  mean_loss = np.nanmean(all_losses, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  steps = np.nanmean(all_steps, axis=0)


Reward plot for metaworld-button-press-v2 environment has been saved to rl_reward_curves_metaworld-button-press-v2.png


  mean_loss = np.nanmean(all_losses, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  steps = np.nanmean(all_steps, axis=0)


Reward plot for metaworld-sweep-into-v2 environment has been saved to rl_reward_curves_metaworld-sweep-into-v2.png


  mean_loss = np.nanmean(all_losses, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  steps = np.nanmean(all_steps, axis=0)


Reward plot for metaworld-pick-place-v2 environment has been saved to rl_reward_curves_metaworld-pick-place-v2.png
Reward plot for merge-v0 environment has been saved to rl_reward_curves_merge-v0.png
Reward plot for highway-fast-v0 environment has been saved to rl_reward_curves_highway-fast-v0.png
Reward plot for roundabout-v0 environment has been saved to rl_reward_curves_roundabout-v0.png
