In [None]:
!pip install git+https://github.com/farm-gym/farm-gym
print("Successfully installed")

In [31]:
# Initialise the environment and add wrappers

from farmgym_games.game_builder.utils_sb3 import farmgym_to_gym_observations_flattened, wrapper
from farmgym_games.game_catalogue.farm0.farm import env as Farm0
from stable_baselines3.common.monitor import Monitor

env = Farm0()
orignal_obs, _  = env.reset()
print(f"Original observation : \n{orignal_obs}\n")

# Wrap to change observation and action spaces and the step function
env.farmgym_to_gym_observations = farmgym_to_gym_observations_flattened
env = wrapper(env)
obs, _ = env.reset()
print(f"Wrapped observation : \n{obs}\n")

Original observation : 
[{'Free': {'Field-0': {'Weather-0': {'day#int365': 1}}}}, {'Free': {'Field-0': {'Weather-0': {'air_temperature': {'max#°C': [3.8], 'mean#°C': [1.1], 'min#°C': [-1.9]}}}}}, {'Free': {'Field-0': {'Weather-0': {'consecutive_dry#day': [1]}}}}, {'Free': {'Field-0': {'Soil-0': {'available_Water#L': {'[(0, 0)]': [125.0]}}}}}, {'Free': {'Field-0': {'Soil-0': {'microlife_health_index#%': {'[(0, 0)]': [75.0]}}}}}, {'Free': {'Field-0': {'Plant-0': {'stage': {'[(0, 0)]': 1}}}}}, {'Free': {'Field-0': {'Plant-0': {'population#nb': {'[(0, 0)]': [1.0]}}}}}, {'Free': {'Field-0': {'Plant-0': {'size#cm': {'[(0, 0)]': [0]}}}}}, {'Free': {'Field-0': {'Plant-0': {'fruits_per_plant#nb': {'[(0, 0)]': [0]}}}}}, {'Free': {'Field-0': {'Plant-0': {'fruit_weight#g': {'[(0, 0)]': [0]}}}}}, {'Free': {'Field-0': {'Pollinators-0': {'occurrence#bin': {'[(0, 0)]': 1}}}}}]

Wrapped observation : 
[1, 3.8, 1.1, -1.9, 1, 125.0, 75.0, 1, 1.0, 0, 0, 0, 0]



In [32]:
# Initialise the expert agent

from stable_baselines3 import PPO

expert = PPO.load("Expert_Agent_1m")

In [33]:
# Evaluate the expert agent

from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(expert, env, n_eval_episodes=100)

print(f"Mean reward = {mean_reward} +/- {std_reward}")



Mean reward = 96.34551623334526 +/- 16.014804626530733


In [34]:
import numpy as np
import gym

# Function to generate offline data

def generate_offline_data(interactions):
    num_interactions = interactions

    if isinstance(env.action_space, gym.spaces.Box):
        expert_observations = np.empty((num_interactions,) + env.observation_space.shape)
        expert_actions = np.empty((num_interactions,) + (env.action_space.shape[0],))

    else:
        expert_observations = np.empty((num_interactions,) + env.observation_space.shape)
        expert_actions = np.empty((num_interactions,) + env.action_space.shape)

    obs, _ = env.reset()

    for i in range(num_interactions):
        action, _ = expert.predict(obs)
        expert_observations[i] = obs
        expert_actions[i] = action
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        if done:
            obs, _ = env.reset()

    np.savez_compressed(
        "expert_data_{}".format(interactions),
        expert_actions=expert_actions,
        expert_observations=expert_observations,
    )

    return expert_observations, expert_actions

In [36]:
# Expert dataset

from torch.utils.data.dataset import Dataset, random_split

class ExpertDataSet(Dataset):
    def __init__(self, expert_observations, expert_actions):
        self.observations = expert_observations
        self.actions = expert_actions

    def __getitem__(self, index):
        return (self.observations[index], self.actions[index])

    def __len__(self):
        return len(self.observations)

100 Interactions

In [35]:
expert_observations, expert_actions = generate_offline_data(100)

In [37]:
expert_dataset = ExpertDataSet(expert_observations, expert_actions)

train_size = int(0.8 * len(expert_dataset))

test_size = len(expert_dataset) - train_size

train_expert_dataset, test_expert_dataset = random_split(
    expert_dataset, [train_size, test_size]
)

print("test_expert_dataset: ", len(test_expert_dataset))
print("train_expert_dataset: ", len(train_expert_dataset))

test_expert_dataset:  20
train_expert_dataset:  80


In [38]:
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

def pretrain_agent(student, batch_size=64, epochs=20, scheduler_gamma=0.7, learning_rate=0.001, log_interval=100, no_cuda=True, seed=1, test_batch_size=64):
    
    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    if isinstance(env.action_space, gym.spaces.Box):
        criterion = nn.MSELoss()
    else:
        criterion = nn.CrossEntropyLoss()

    # Extract initial policy
    model = student.policy.to(device)

    def train(model, device, train_loader, optimizer):
        model.train()

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            if isinstance(env.action_space, gym.spaces.Box):
                # A2C/PPO policy outputs actions, values, log_prob
                # SAC/TD3 policy outputs actions only
                if isinstance(student, (PPO)):
                    action, _, _ = model(data)
                else:
                    # SAC/TD3:
                    action = model(data)
                action_prediction = action.double()
            else:
                # Retrieve the logits for A2C/PPO when using discrete actions
                dist = model.get_distribution(data)
                action_prediction = dist.distribution.logits
                target = target.long()
                print(type(dist))
                print(type(action_prediction))

            loss = criterion(action_prediction, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)

                if isinstance(env.action_space, gym.spaces.Box):
                    # A2C/PPO policy outputs actions, values, log_prob
                    # SAC/TD3 policy outputs actions only
                    if isinstance(student, (PPO)):
                        action, _, _ = model(data)
                    else:
                        # SAC/TD3:
                        action = model(data)
                    action_prediction = action.double()
                else:
                    # Retrieve the logits for A2C/PPO when using discrete actions
                    dist = model.get_distribution(data)
                    action_prediction = dist.distribution.logits
                    target = target.long()

                test_loss = criterion(action_prediction, target)
        test_loss /= len(test_loader.dataset)
        print(f"Test set: Average loss: {test_loss:.4f}")

    # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # and testing
    train_loader = th.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader = th.utils.data.DataLoader(
        dataset=test_expert_dataset,
        batch_size=test_batch_size,
        shuffle=True,
        **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=scheduler_gamma)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)
        scheduler.step()

    # Implant the trained policy network back into the RL student agent
    student.policy = model

In [39]:
# Create agent

ppo_student_100 = PPO("MlpPolicy", env, verbose=1, learning_rate=0.0001, n_epochs=15, tensorboard_log='pre_training')

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [40]:
# Pretrain on 100 experiences

pretrain_agent(ppo_student_100)
ppo_student_100.save("PPO_student_100")

<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
Test set: Average loss: 0.1037
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
Test set: Average loss: 0.1037
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
Test set: Average loss: 0.1037
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
Test set: Average loss: 0.1037
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stab

In [41]:
# Evaluate

mean_reward, std_reward = evaluate_policy(ppo_student_100, env, n_eval_episodes=100)

print(f"Mean reward = {mean_reward} +/- {std_reward}")

Mean reward = 12.395387148986337 +/- 27.463741587716346


1000 Interactions

In [42]:
expert_observations, expert_actions = generate_offline_data(1000)

expert_dataset = ExpertDataSet(expert_observations, expert_actions)

train_size = int(0.8 * len(expert_dataset))

test_size = len(expert_dataset) - train_size

train_expert_dataset, test_expert_dataset = random_split(
    expert_dataset, [train_size, test_size]
)

print("test_expert_dataset: ", len(test_expert_dataset))
print("train_expert_dataset: ", len(train_expert_dataset))

test_expert_dataset:  200
train_expert_dataset:  800


In [43]:
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

def pretrain_agent(student, batch_size=64, epochs=20, scheduler_gamma=0.7, learning_rate=0.001, log_interval=100, no_cuda=True, seed=1, test_batch_size=64):
    
    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    if isinstance(env.action_space, gym.spaces.Box):
        criterion = nn.MSELoss()
    else:
        criterion = nn.CrossEntropyLoss()

    # Extract initial policy
    model = student.policy.to(device)

    def train(model, device, train_loader, optimizer):
        model.train()

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            if isinstance(env.action_space, gym.spaces.Box):
                # A2C/PPO policy outputs actions, values, log_prob
                # SAC/TD3 policy outputs actions only
                if isinstance(student, (PPO)):
                    action, _, _ = model(data)
                else:
                    # SAC/TD3:
                    action = model(data)
                action_prediction = action.double()
            else:
                # Retrieve the logits for A2C/PPO when using discrete actions
                dist = model.get_distribution(data)
                action_prediction = dist.distribution.logits
                target = target.long()
                print(type(dist))
                print(type(action_prediction))

            loss = criterion(action_prediction, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)

                if isinstance(env.action_space, gym.spaces.Box):
                    # A2C/PPO policy outputs actions, values, log_prob
                    # SAC/TD3 policy outputs actions only
                    if isinstance(student, (PPO)):
                        action, _, _ = model(data)
                    else:
                        # SAC/TD3:
                        action = model(data)
                    action_prediction = action.double()
                else:
                    # Retrieve the logits for A2C/PPO when using discrete actions
                    dist = model.get_distribution(data)
                    action_prediction = dist.distribution.logits
                    target = target.long()

                test_loss = criterion(action_prediction, target)
        test_loss /= len(test_loader.dataset)
        print(f"Test set: Average loss: {test_loss:.4f}")

    # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # and testing
    train_loader = th.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader = th.utils.data.DataLoader(
        dataset=test_expert_dataset,
        batch_size=test_batch_size,
        shuffle=True,
        **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=scheduler_gamma)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)
        scheduler.step()

    # Implant the trained policy network back into the RL student agent
    student.policy = model

In [44]:
# Create agent

ppo_student_1000 = PPO("MlpPolicy", env, verbose=1, learning_rate=0.0001, n_epochs=15, tensorboard_log='pre_training')

# Pretrain on 1000 experiences

pretrain_agent(ppo_student_1000)
ppo_student_1000.save("PPO_student_1000")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distribut

In [46]:
# Evaluate

mean_reward, std_reward = evaluate_policy(ppo_student_1000, env, n_eval_episodes=100)

print(f"Mean reward = {mean_reward} +/- {std_reward}")

Mean reward = 88.96232194670826 +/- 13.234582627883219


10,000 Interactions

In [47]:
expert_observations, expert_actions = generate_offline_data(10000)

expert_dataset = ExpertDataSet(expert_observations, expert_actions)

train_size = int(0.8 * len(expert_dataset))

test_size = len(expert_dataset) - train_size

train_expert_dataset, test_expert_dataset = random_split(
    expert_dataset, [train_size, test_size]
)

print("test_expert_dataset: ", len(test_expert_dataset))
print("train_expert_dataset: ", len(train_expert_dataset))

test_expert_dataset:  2000
train_expert_dataset:  8000


In [48]:
# Create agent

ppo_student_10000 = PPO("MlpPolicy", env, verbose=1, learning_rate=0.0001, n_epochs=15, tensorboard_log='pre_training')

# Pretrain on 10000 experiences

pretrain_agent(ppo_student_10000)
ppo_student_10000.save("PPO_student_10000")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distribut

In [49]:
# Evaluate

mean_reward, std_reward = evaluate_policy(ppo_student_10000, env, n_eval_episodes=100)

print(f"Mean reward = {mean_reward} +/- {std_reward}")



Mean reward = 99.92296840608935 +/- 13.870551115389814


100,000 Interactions

In [50]:
expert_observations, expert_actions = generate_offline_data(100000)

expert_dataset = ExpertDataSet(expert_observations, expert_actions)

train_size = int(0.8 * len(expert_dataset))

test_size = len(expert_dataset) - train_size

train_expert_dataset, test_expert_dataset = random_split(
    expert_dataset, [train_size, test_size]
)

print("test_expert_dataset: ", len(test_expert_dataset))
print("train_expert_dataset: ", len(train_expert_dataset))

test_expert_dataset:  20000
train_expert_dataset:  80000


In [51]:
# Create agent

ppo_student_100000 = PPO("MlpPolicy", env, verbose=1, learning_rate=0.0001, n_epochs=15, tensorboard_log='pre_training')

# Pretrain on 10000 experiences

pretrain_agent(ppo_student_100000)
ppo_student_100000.save("PPO_student_100000")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distributions.CategoricalDistribution'>
<class 'torch.Tensor'>
<class 'stable_baselines3.common.distribut

In [52]:
# Evaluate

mean_reward, std_reward = evaluate_policy(ppo_student_100000, env, n_eval_episodes=100)

print(f"Mean reward = {mean_reward} +/- {std_reward}")

Mean reward = 94.32071288459585 +/- 14.994806901650959


Fine-tuning with RL

In [53]:
timesteps = 200000

In [56]:
ppo_student_100.learn(total_timesteps=timesteps)

Logging to pre_training\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 33.2     |
|    ep_rew_mean     | 16.7     |
| time/              |          |
|    fps             | 452      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 39.4        |
|    ep_rew_mean          | 21.8        |
| time/                   |             |
|    fps                  | 378         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011033402 |
|    clip_fraction        | 0.0351      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.93       |
|    explained_variance   | 0.823       |


<stable_baselines3.ppo.ppo.PPO at 0x20e197fe380>

In [57]:
ppo_student_1000.learn(total_timesteps=timesteps)

Logging to pre_training\PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 38.8     |
|    ep_rew_mean     | 17.6     |
| time/              |          |
|    fps             | 477      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 42.1        |
|    ep_rew_mean          | 18.8        |
| time/                   |             |
|    fps                  | 386         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014632122 |
|    clip_fraction        | 0.0715      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.07       |
|    explained_variance   | -0.0387     |


<stable_baselines3.ppo.ppo.PPO at 0x20e236b5570>

In [58]:
ppo_student_10000.learn(total_timesteps=timesteps)

Logging to pre_training\PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 43.7     |
|    ep_rew_mean     | 15.3     |
| time/              |          |
|    fps             | 465      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 32.7        |
|    ep_rew_mean          | 13.2        |
| time/                   |             |
|    fps                  | 374         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.021251507 |
|    clip_fraction        | 0.0992      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.07       |
|    explained_variance   | 0.104       |


<stable_baselines3.ppo.ppo.PPO at 0x20e235d5780>

In [59]:
ppo_student_100000.learn(total_timesteps=timesteps)

Logging to pre_training\PPO_4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 71.7     |
|    ep_rew_mean     | 56.9     |
| time/              |          |
|    fps             | 465      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 70.5        |
|    ep_rew_mean          | 67.1        |
| time/                   |             |
|    fps                  | 385         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013023134 |
|    clip_fraction        | 0.077       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.56       |
|    explained_variance   | 0.00299     |


<stable_baselines3.ppo.ppo.PPO at 0x20e23685e40>

In [60]:
ppo_no_pretrain = PPO("MlpPolicy", env, verbose=1, learning_rate=0.0001, n_epochs=15, tensorboard_log='pre_training')

ppo_no_pretrain.learn(total_timesteps=timesteps)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to pre_training\PPO_5
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 42       |
|    ep_rew_mean     | 17.5     |
| time/              |          |
|    fps             | 451      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 31.8        |
|    ep_rew_mean          | 13.2        |
| time/                   |             |
|    fps                  | 378         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011931511 |
|    clip_fraction        | 0.0787      |
|    clip_range           | 0.2 

<stable_baselines3.ppo.ppo.PPO at 0x20e22d79420>

Evaluation of Policies with Harvest Lists

In [61]:
def generate_harvest_list(model):
    
    obs, _ = env.reset()

    harvest_list = []

    for _ in range (100):
        done = False
        obs, _ = env.reset()
        #print(obs)

        while not done:
            action, _ = model.predict(obs)

            #print(obs)
            #print("Action: ", action)
            
            new_obs, reward, done, _, _ = env.step(action) 
            harvest = obs[11] * obs[10]
            
            obs = new_obs
        #print("End")
        #print(obs)
        if(obs[7]==11):
            harvest_list.append(harvest)
            #print("Final yield: ", harvest)
        else:
            harvest_list.append(0)
            #print("Plant died")

    return harvest_list

In [66]:
import statistics

list1 = generate_harvest_list(ppo_no_pretrain)
list2 = generate_harvest_list(ppo_student_100)
list3 = generate_harvest_list(ppo_student_1000)
list4 = generate_harvest_list(ppo_student_10000)
list5 = generate_harvest_list(ppo_student_100000)

print("No pretraining: {} +/- {}".format(statistics.mean(list1), statistics.stdev(list1)))
print("100 samples pretraining: {} +/- {}".format(statistics.mean(list2), statistics.stdev(list2)))
print("1000 samples pretraining: {} +/- {}".format(statistics.mean(list3), statistics.stdev(list3)))
print("10000 samples pretraining: {} +/- {}".format(statistics.mean(list4), statistics.stdev(list4)))
print("100000 samples pretraining: {} +/- {}".format(statistics.mean(list5), statistics.stdev(list5)))

No pretraining: 227.81556509231575 +/- 184.19569289639566
100 samples pretraining: 221.44974199517824 +/- 209.2242914237117
1000 samples pretraining: 159.628414281688 +/- 187.6729622588512
10000 samples pretraining: 330.7006841052369 +/- 281.7693631958459
100000 samples pretraining: 279.7214095653245 +/- 242.11590489648094


In [65]:
list_expert = generate_harvest_list(expert)

print("Expert: {} +/- {}".format(statistics.mean(list_expert), statistics.stdev(list_expert)))

Expert: 307.9328381867117 +/- 218.14567190541908


Additional things to do:

1. Generate harvest lists before and after RL training on the pretrained models
2. Plot graphs on Tensorboard