In [1]:
!pip install git+https://github.com/farm-gym/farm-gym
print("Successfully installed")

Collecting git+https://github.com/farm-gym/farm-gym
  Cloning https://github.com/farm-gym/farm-gym to c:\users\abdul\appdata\local\temp\pip-req-build-lbja58xu
  Resolved https://github.com/farm-gym/farm-gym to commit 9458f2944272b64850a43382f5e0c719f7bcef3f
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Successfully installed


  Running command git clone --filter=blob:none --quiet https://github.com/farm-gym/farm-gym 'C:\Users\abdul\AppData\Local\Temp\pip-req-build-lbja58xu'


In [2]:
# Initialise the environment and add wrappers

from farmgym_games.game_builder.utils_sb3 import farmgym_to_gym_observations_flattened, wrapper
from farmgym_games.game_catalogue.farm0.farm import env as Farm0
from stable_baselines3.common.monitor import Monitor

env = Farm0()
orignal_obs, _  = env.reset()
print(f"Original observation : \n{orignal_obs}\n")

# Wrap to change observation and action spaces and the step function
env.farmgym_to_gym_observations = farmgym_to_gym_observations_flattened
env = wrapper(env)
obs, _ = env.reset()
print(f"Wrapped observation : \n{obs}\n")

Original observation : 
[{'Free': {'Field-0': {'Weather-0': {'day#int365': 1}}}}, {'Free': {'Field-0': {'Weather-0': {'air_temperature': {'max#°C': [3.8], 'mean#°C': [1.1], 'min#°C': [-1.9]}}}}}, {'Free': {'Field-0': {'Weather-0': {'consecutive_dry#day': [1]}}}}, {'Free': {'Field-0': {'Soil-0': {'available_Water#L': {'[(0, 0)]': [125.0]}}}}}, {'Free': {'Field-0': {'Soil-0': {'microlife_health_index#%': {'[(0, 0)]': [75.0]}}}}}, {'Free': {'Field-0': {'Plant-0': {'stage': {'[(0, 0)]': 1}}}}}, {'Free': {'Field-0': {'Plant-0': {'population#nb': {'[(0, 0)]': [1.0]}}}}}, {'Free': {'Field-0': {'Plant-0': {'size#cm': {'[(0, 0)]': [0]}}}}}, {'Free': {'Field-0': {'Plant-0': {'fruits_per_plant#nb': {'[(0, 0)]': [0]}}}}}, {'Free': {'Field-0': {'Plant-0': {'fruit_weight#g': {'[(0, 0)]': [0]}}}}}, {'Free': {'Field-0': {'Pollinators-0': {'occurrence#bin': {'[(0, 0)]': 1}}}}}]

Wrapped observation : 
[1, 3.8, 1.1, -1.9, 1, 125.0, 75.0, 1, 1.0, 0, 0, 0, 0]



In [3]:
def expert_policy(obs):

    action = 0

    if obs[0] == 1:
        action = 6
    if obs[5] < 124:
        action = 1
    if obs[5] < 123:
        action = 2
    if obs[5] < 122:
        action = 3
    if obs[5] < 121:
        action = 4
    if obs[5] < 120:
        action = 5
    if obs[7] == 9:
        action = 7
    else:
        action = 6

    return action

In [22]:
def compute_returns(rewards, gamma=0.99):
    """
    Compute discounted cumulative rewards (returns) for an episode.
    """
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    return returns

In [23]:
# Expert dataset

from torch.utils.data.dataset import Dataset, random_split

class ExpertDataSetActor(Dataset):
    def __init__(self, expert_observations, expert_actions):
        self.observations = expert_observations
        self.actions = expert_actions

    def __getitem__(self, index):
        return (self.observations[index], self.actions[index])

    def __len__(self):
        return len(self.observations)
    
class ExpertDataSetCritic(Dataset):
    def __init__(self, expert_observations, expert_actions, expert_returns):
        self.observations = expert_observations
        self.actions = expert_actions
        self.returns = expert_returns

    def __getitem__(self, index):
        return (self.observations[index], self.actions[index], self.returns[index])

    def __len__(self):
        return len(self.observations)

In [24]:
import numpy as np
import gym

# Function to generate offline data

def generate_offline_data(num_interactions, gamma=0.99):
    expert_observations = []
    expert_actions = []
    expert_returns = []

    interaction_count = 0
    while interaction_count < num_interactions:
        obs, _ = env.reset()
        episode_rewards = []
        episode_observations = []
        episode_actions = []
        
        done = False
        while not done and interaction_count < num_interactions:
            action = expert_policy(obs)
            next_obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            
            episode_observations.append(obs)
            episode_actions.append(action)
            episode_rewards.append(reward)

            obs = next_obs
            interaction_count += 1

        # Compute returns for the episode
        episode_returns = compute_returns(episode_rewards, gamma)
        
        # Append episode data to expert data
        expert_observations.extend(episode_observations)
        expert_actions.extend(episode_actions)
        expert_returns.extend(episode_returns)

    # Convert lists to numpy arrays
    expert_observations = np.array(expert_observations)
    expert_actions = np.array(expert_actions)
    expert_returns = np.array(expert_returns)

    # Save data to compressed file
    np.savez_compressed(
        "WS_data_{}".format(num_interactions),
        expert_observations=expert_observations,
        expert_actions=expert_actions,
        expert_returns=expert_returns
    )

    return expert_observations, expert_actions, expert_returns

In [25]:
expert_observations, expert_actions, expert_returns = generate_offline_data(num_interactions=100000)

expert_dataset_actor = ExpertDataSetActor(expert_observations, expert_actions)
expert_dataset_critic = ExpertDataSetCritic(expert_observations, expert_actions, expert_returns)

train_size = int(0.8 * len(expert_dataset_actor))

test_size = len(expert_dataset_actor) - train_size

train_expert_dataset_actor, test_expert_dataset_actor = random_split(
    expert_dataset_actor, [train_size, test_size]
)
train_expert_dataset_critic, test_expert_dataset_critic = random_split(
    expert_dataset_critic, [train_size, test_size]
)

print("train_expert_dataset: ", len(train_expert_dataset_actor))
print("test_expert_dataset: ", len(test_expert_dataset_actor))

train_expert_dataset:  80000
test_expert_dataset:  20000


In [29]:
len(expert_returns)

100000

In [93]:
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

def pretrain_agent(student, batch_size=64, epochs=20, scheduler_gamma=0.7, learning_rate=0.001, log_interval=100, no_cuda=True, seed=1, test_batch_size=64):
    
    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    criterion1 = nn.CrossEntropyLoss()
    criterion2 = nn.CrossEntropyLoss()

    # Extract initial policy
    model = student.policy.to(device)

    def train_actor(model, device, train_loader, optimizer):
        model.train()

        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            # Retrieve the logits for A2C/PPO when using discrete actions
            dist = model.get_distribution(data)
            action_prediction = dist.distribution.logits
            target = target.long()

            loss = criterion1(action_prediction, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Actor Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def train_critic(model, device, train_loader, optimizer):
        model.train()

        for batch_idx, (obs, action, target) in enumerate(train_loader):
            obs, action, target = obs.to(device), action.to(device), target.to(device)
            optimizer.zero_grad()

            # Retrieve the logits for A2C/PPO when using discrete actions
            # model.evaluate_actions(obs, actions)
            value, _, _ = model.evaluate_actions(obs, action)
            target = target.float()
            value = value.squeeze()

            loss = criterion2(value, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Critic Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(obs),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def test_actor(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)

                # Retrieve the logits for A2C/PPO when using discrete actions
                dist = model.get_distribution(data)
                action_prediction = dist.distribution.logits
                target = target.long()

                test_loss = criterion1(action_prediction, target)
        test_loss /= len(test_loader.dataset)
        print(f"Test set actor: Average loss: {test_loss:.4f}")

    def test_critic(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for obs, action, target in test_loader:
                obs, action, target = obs.to(device), action.to(device), target.to(device)

                # Retrieve the logits for A2C/PPO when using discrete actions
                value, _, _ = model.evaluate_actions(obs, action)
                target = target.float()
                value = value.squeeze()

                test_loss = criterion2(value, target)
        test_loss /= len(test_loader.dataset)
        print(f"Test set critic: Average loss: {test_loss:.4f}")

    # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # and testing
    train_loader_actor = th.utils.data.DataLoader(
        dataset=train_expert_dataset_actor, batch_size=batch_size, shuffle=True, **kwargs
    )
    train_loader_critic = th.utils.data.DataLoader(
        dataset=train_expert_dataset_critic, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader_actor = th.utils.data.DataLoader(
        dataset=test_expert_dataset_actor,
        batch_size=test_batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader_critic = th.utils.data.DataLoader(
        dataset=test_expert_dataset_critic,
        batch_size=test_batch_size,
        shuffle=True,
        **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=scheduler_gamma)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train_actor(model, device, train_loader_actor, optimizer)
        train_critic(model, device, train_loader_critic, optimizer)
        test_actor(model, device, test_loader_actor)
        test_critic(model, device, test_loader_critic)
        scheduler.step()

    # Implant the trained policy network back into the RL student agent
    student.policy = model

In [94]:
# Create agent

from stable_baselines3 import PPO

ppo_student_ws = PPO("MlpPolicy", env, verbose=1, learning_rate=0.0001, n_epochs=15, tensorboard_log='ws')

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [95]:
pretrain_agent(ppo_student_ws, epochs=50)

Test set actor: Average loss: 0.0001
Test set critic: Average loss: 2.0752
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 2.4433
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 1.6752
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 1.7564
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 1.5854
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 1.9894
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 2.0265
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 1.8195
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 1.5331
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 2.1068
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 1.6316
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 1.6231
Test set actor: Average loss: 0.0000
Test set critic: Average loss: 2.0487
Test set actor: Average l

In [96]:
# Evaluate

from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(ppo_student_ws, env, n_eval_episodes=1000)

print(f"Mean reward = {mean_reward} +/- {std_reward}")



Mean reward = 98.45445863606326 +/- 15.295892413796253


In [97]:
ppo_student_ws.learn(total_timesteps=500000)

Logging to ws\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 157      |
|    ep_rew_mean     | 287      |
| time/              |          |
|    fps             | 283      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 164         |
|    ep_rew_mean          | 294         |
| time/                   |             |
|    fps                  | 290         |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.000982789 |
|    clip_fraction        | 0.0125      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.332      |
|    explained_variance   | 0.00473     |
|    learn

<stable_baselines3.ppo.ppo.PPO at 0x1ce7b4a9660>

In [1]:
# Evaluate

from stable_baselines3.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(ppo_student_ws, env, n_eval_episodes=100)

print(f"Mean reward = {mean_reward} +/- {std_reward}")

NameError: name 'ppo_student_ws' is not defined

In [None]:
obs, _ = env.reset()

list_dqn = []

for _ in range (10):
    done = False
    obs, _ = env.reset()
    #print(obs)

    while not done:
        action, _ = ppo_student_ws.predict(obs)

        #print(obs)
        #print("Action: ", action)
        
        new_obs, reward, done, _, _ = env.step(action) 
        harvest = obs[11] * obs[10]
        
        obs = new_obs
    print(reward)
    print(harvest)
    #print("End")
    #print(obs)
    if(obs[7]==11):
        list_dqn.append(harvest)
        #print("Final yield: ", harvest)
    else:
        list_dqn.append(0)
        #print("Plant died")

print("DQN: {} +/- {}".format(np.mean(list_dqn), np.std(list_dqn)))

: 