# Notebook for experiment tracking with Weights & Biases

In [1]:
import matplotlib.pyplot as plt

# from gridworlds.grid_env import GridEnvironment
from src.Generalist.generalist_meta_env import Generalist_MetaEpisodeEnv
from src.Generalist.draw_gridworld import draw_policy

# import gymnasium as gym
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed

# wandb 
import wandb
from wandb.integration.sb3 import WandbCallback

#stablebaselines feature extractor
from src.Generalist.feature_extractor import Custom_Flatten

#For evaluation
from src.Generalist.evals_utils import average_evals

# Load Gridworlds

In [2]:
#Load the gridworlds
from classes import Object
import pickle

def _load_gridworlds(split):
    """Load one pickled gridworld set and switch off early stopping on each world.

    :param split: (str) split name; selects ``master_set_<split>.pkl``
    :return: the unpickled collection of gridworlds
    """
    # NOTE(security): pickle.load can execute arbitrary code while unpickling;
    # only load world files that come from a trusted source.
    with open(f'src/world_builder/worlds/master_set_{split}.pkl', 'rb') as f:
        gridworlds = pickle.load(f)
    print(f'{len(gridworlds)} {split} gridworlds loaded')
    for grid in gridworlds:
        grid.early_stopping = False
    return gridworlds


train_gridworlds = _load_gridworlds('train')
val_gridworlds = _load_gridworlds('val')
test_gridworlds = _load_gridworlds('test')

976 train gridworlds loaded
96 val gridworlds loaded
200 test gridworlds loaded


# Training Cell

In [3]:
# Original sweep configuration (commented out for reference)
# sweep_config = {
#     "method": "grid",
#     "metric": {"goal": "maximize", "name": "train_metrics/Usefulness"},
#     "parameters": {
#         "lambda_factor": {"value": 0.9},
#         "meta_ep_size": {"value": 32},
#         "hidden_layer_depth": {'value': 128},
#         "num_hidden_layers": {'value': 3},
#         "ent_coef": {'value': 0.015},
#         "learning_rate": {'values': [0.0007, 0.00001, 0.000001, 0.0000005]},
#         "total_timesteps": {'value': 2000},
#         "n_steps_a2c": {'value': 8192},
#         "vf_coef": {'value': 0.55},
#         "timesteps_per_run": {'value': 2000}
#     },
# }

# Single agent configuration
config = dict(
    # Environment settings, forwarded to Generalist_MetaEpisodeEnv in train()
    lambda_factor=0.9,
    meta_ep_size=32,
    # Network shape: `num_hidden_layers` hidden layers of `hidden_layer_depth` units,
    # used for both the policy (pi) and value (vf) networks
    hidden_layer_depth=128,
    num_hidden_layers=3,
    # A2C hyperparameters
    ent_coef=0.015,
    learning_rate=0.0007,  # one value chosen from the sweep's learning-rate grid
    # Loop bound for training: train() keeps learning until this budget is reached
    total_timesteps=4000,
    n_steps_a2c=8192,  # passed to A2C as `n_steps`
    vf_coef=0.55,
    # Budget handed to each model.learn() call between evaluations
    timesteps_per_run=2000,
)

In [4]:
# Original sweep initialization (commented out for reference)
# sweep_id = wandb.sweep(sweep_config, project="IPP-second-paper-generalist")
# print(sweep_id)

In [None]:
## BEFORE RUNNING, MAKE SURE TO MANUALLY CHANGE ENVIRONMENT LIST AND FEATURE EXTRACTOR ##

import torch

# Original sweep-based train function (commented out for reference)
# def train(config=None):
#     run = wandb.init(config=config)
#     config = wandb.config

# Modified train function for single agent
def train():
    """Train a single A2C agent on the gridworlds and log the results to Weights & Biases.

    Hyperparameters are read from the module-level ``config`` dict. Training runs in
    chunks of ``config["timesteps_per_run"]`` until ``config["total_timesteps"]`` is
    reached; after every chunk the agent is evaluated on the training worlds and the
    averages are logged. Finally the model is saved to ``models/<run id>`` and the
    train / held-out averages are written to the run summary.

    :return: None
    """
    # Using the run as a context manager guarantees it is finished (and marked as
    # failed if training raises) instead of staying open in the notebook kernel.
    with wandb.init(project="IPP-second-paper-generalist", config=config) as run:

        # device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
        device = 'cpu'

        ## PICK ENVIRONMENT
        train_env_list = train_gridworlds           #MANUALLY CHANGE
        test_env_list = val_gridworlds              #MANUALLY CHANGE

        # The same list of hidden-layer sizes is used for the policy and value networks
        net_arch_list = [config["hidden_layer_depth"]] * config["num_hidden_layers"]

        policy_kwargs = dict(features_extractor_class=Custom_Flatten, #MANUALLY CHANGE
                             features_extractor_kwargs=dict(features_dim=250),
                             net_arch=dict(pi=net_arch_list,
                                           vf=net_arch_list))

        #Number of vectorised environments
        num_cpu = 3

        #Set-up for vectorised environments
        def make_env(rank, seed=0):
            """
            Utility function for multiprocessed env.

            :param rank: (int) index of the subprocess
            :param seed: (int) the initial seed for RNG
            :return: (callable) zero-argument factory that builds one seeded env
            """

            def _init():
                env = Generalist_MetaEpisodeEnv(
                        train_env_list,
                        meta_ep_size=config["meta_ep_size"],
                        lambda_factor=config["lambda_factor"],
                    )
                # use a seed for reproducibility
                # Important: use a different seed for each environment
                # otherwise they would generate the same experiences
                env.reset(seed=seed + rank)
                return env

            set_random_seed(seed)
            return _init

        def vec_learning(eval_env_list, timesteps_per_run, total_timesteps):
            """Train A2C on ``vec_env`` in chunks, evaluating and logging after each chunk.

            :param eval_env_list: environments passed to ``average_evals`` after each chunk
            :param timesteps_per_run: (int) budget given to each ``model.learn`` call
            :param total_timesteps: (int) training stops once the summed budgets reach this
            :return: (model, usefulness, entropy) from the last evaluation; the two
                averages are None if no chunk was run
            """
            wandb.define_metric("custom_step")

            # Define which metrics to plot against that x-axis
            wandb.define_metric("train_metrics/Usefulness", step_metric='custom_step')
            wandb.define_metric("train_metrics/Neutrality", step_metric='custom_step')

            # Create the A2C model with the custom architecture
            model = A2C("MlpPolicy",                              #MANUALLY CHANGE with feature_extractor_class
                        vec_env,                                  #Change for vectorised Envs
                        device=device,
                        verbose=1,
                        ent_coef=config["ent_coef"],
                        learning_rate=config["learning_rate"],
                        n_steps=config["n_steps_a2c"],
                        vf_coef=config["vf_coef"],
                        policy_kwargs=policy_kwargs,              #MANUALLY CHANGE
                        tensorboard_log=f"runs/{run.id}")

            steps_count = 0
            # Defined up front so the return below is valid even when the loop never runs
            train_av_usefulness = train_av_entropy = None

            # NOTE(review): SB3 on-policy algorithms collect whole rollouts of
            # n_steps * n_envs transitions, so a learn() call may run for more steps
            # than `timesteps_per_run`; `steps_count` tracks the nominal budget only —
            # confirm this is the intended x-axis.
            while steps_count < total_timesteps:

                model.learn(total_timesteps=timesteps_per_run,
                            callback=WandbCallback(verbose=0))
                train_av_traj_ratio, train_av_usefulness, train_av_entropy = average_evals(eval_env_list, model)
                steps_count += timesteps_per_run
                print(f'Step count: {steps_count}')
                print(f'Average Usefulness: {train_av_usefulness}')
                print(f'Average NEUTRALITY: {train_av_entropy}')
                print(f'Average Trajectory Ratio: {train_av_traj_ratio}')

                wandb.log({
                    "custom_step": steps_count,
                    "train_metrics/Usefulness": train_av_usefulness,
                    "train_metrics/Neutrality": train_av_entropy,
                })

            return model, train_av_usefulness, train_av_entropy

        # The vectorised env is built once and shared by every learn() call: creating a
        # new SubprocVecEnv per chunk leaks its worker processes and re-applies the same
        # seeds, so every chunk would restart from the same RNG state.
        vec_env = SubprocVecEnv([make_env(i) for i in range(num_cpu)], start_method="fork")
        try:
            model, train_av_usefulness, train_av_entropy = vec_learning(
                train_env_list, config["timesteps_per_run"], config["total_timesteps"])

            model.save(f"models/{run.id}")

            print('Average evals for train data')
            print(f'Average USEFULNESS:{train_av_usefulness}')
            print(f'Average NEUTRALITY:{train_av_entropy}')
            print('\n')

            run.summary["train_av_usefulness"] = train_av_usefulness
            run.summary["train_av_NEUTRALITY"] = train_av_entropy

            test_av_traj, test_av_usefulness, test_av_entropy = average_evals(test_env_list, model)

            print('Average evals for test data')
            print(f'Average Trajectory Ratio:{test_av_traj:.3f}')
            print(f'Average USEFULNESS:{test_av_usefulness}')
            print(f'Average NEUTRALITY:{test_av_entropy}')
            print('\n')

            run.summary["test_av_usefulness"] = test_av_usefulness
            run.summary["test_av_NEUTRALITY"] = test_av_entropy
        finally:
            # Closed only after the final evaluation, in case average_evals relies on
            # the model's attached env (not visible from here).
            vec_env.close()

In [6]:
# Original sweep agent call (commented out for reference)
# wandb.agent(sweep_id, train)

# Direct training call for single agent.
# NOTE(review): the saved output of this cell reports "Using mps device" followed by
# an MPS/CPU RuntimeError, while train() as currently written pins device = 'cpu'.
# The output appears to predate that change — re-run the cell that defines train()
# and then this cell to refresh it.
train()

[34m[1mwandb[0m: Currently logged in as: [33mdr-alexroman[0m ([33malex-roman[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using mps device




Logging to runs/ygs737gj/A2C_1


RuntimeError: Tensor for argument input is on cpu but expected on mps