In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from mlagents_envs.environment import ActionTuple, UnityEnvironment
from mlagents_envs.exception import UnityEnvironmentException
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

In [None]:
# One seed for every source of randomness in this notebook: the Unity
# simulator, NumPy (used by np.random.choice when sampling actions) and
# PyTorch (used to initialise the network weights in the cells below).
# Previously only the simulator was seeded, so two runs of the notebook
# started from different weights and sampled different actions.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)

# The side channel is handed to the environment at construction time and is
# then used to push engine settings to the simulator.
channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name='./Wave', seed=SEED, side_channels=[channel])
# Training runs with time_scale=20; the playback cell near the end of the
# notebook uses 3 instead.
channel.set_configuration_parameters(time_scale=20)
print("WAVE environment created.")

In [None]:
# Policy (actor) network.
# l1 is the width of the state vector fed to the first layer, l2 and l3 are the
# hidden widths, and l4 is the number of outputs: one probability per discrete
# action (get_trajectories samples from the two actions 0 and 1).
l1, l2, l3, l4 = 64, 150, 150, 2
learning_rate = 1e-4

model_policy = torch.nn.Sequential(
    torch.nn.Linear(in_features=l1, out_features=l2),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=l2, out_features=l3),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=l3, out_features=l4),
    # Softmax over dim=1 means the network expects a 2-D (batch, features)
    # input and returns one probability row per batch entry.
    torch.nn.Softmax(dim=1),
)
optimizer_policy = torch.optim.Adam(params=model_policy.parameters(), lr=learning_rate)

In [None]:
# Value (critic) network.
# Same input width as the policy network, a single hidden layer, and one
# scalar output per state. Note that l1, l2, l3 and learning_rate are
# re-assigned here and overwrite the values set in the policy cell.
l1, l2, l3 = 64, 150, 1
learning_rate = 1e-4

model_value = torch.nn.Sequential(
    torch.nn.Linear(in_features=l1, out_features=l2),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=l2, out_features=l3),
)
optimizer_value = torch.optim.Adam(params=model_value.parameters(), lr=learning_rate)

In [None]:
def discount_rewards(rewards: np.ndarray, gamma):
    """Compute the discounted return for every step of one episode.

    For step ``t`` the result is ``G_t = r_t + gamma * G_{t+1}``, with the
    return after the last step taken as 0.

    Args:
        rewards: Per-step rewards in chronological order (array-like).
        gamma: Discount factor applied once per step into the future.

    Returns:
        A float64 array with the same shape as ``rewards`` holding ``G_t``.
        The input is not modified.
    """
    # Always accumulate in float64. The previous version wrote the running
    # value back into a copy that kept the input dtype, so integer rewards
    # were silently truncated at every step.
    rewards = np.asarray(rewards, dtype=np.float64)
    returns = np.zeros_like(rewards)

    running_return = 0.0
    # Walk backwards so each step can reuse the return of the step after it.
    for t in range(len(rewards) - 1, -1, -1):
        running_return = rewards[t] + gamma * running_return
        returns[t] = running_return
    return returns

def loss_fn(predictions, advantages):
    """Return the negated mean of ``advantages * log(predictions)``.

    Args:
        predictions: Tensor of probabilities; the logarithm is taken element-wise.
        advantages: Tensor of weights multiplied element-wise with the log-probabilities.

    Returns:
        A scalar tensor.
    """
    weighted_log_probs = advantages * torch.log(predictions)
    return -weighted_log_probs.mean()

def preprocess_input(inp):
    """Flatten the first two observation arrays of ``inp`` into one 1-D vector.

    ``inp.obs[1]`` is placed before ``inp.obs[0]``; the two are joined along
    axis 1 and the result is flattened.

    Args:
        inp: Object exposing an ``obs`` sequence with at least two arrays that
            can be joined along axis 1.

    Returns:
        A 1-D NumPy array.
    """
    first_obs, second_obs = inp.obs[0], inp.obs[1]
    joined = np.append(second_obs, first_obs, axis=1)
    return joined.reshape(-1)

def get_trajectories(model, max_iter=300):
    """Play one episode in the global ``env``, sampling actions from ``model``.

    The environment is reset, then on every iteration the current state is
    built with ``preprocess_input``, ``model`` is evaluated on it, and one of
    the actions 0 / 1 is drawn using the model output as probabilities. The
    loop stops when the environment reports a terminal step or after
    ``max_iter`` iterations.

    Args:
        model: Callable mapping a (1, n_features) float tensor to a tensor of
            two action probabilities.
        max_iter: Upper bound on the number of loop iterations.

    Returns:
        A 5-tuple ``(states, actions, rewards, action_sets, timestep)``:
        a float tensor of visited states, an array of sampled actions, an
        array of rewards, an array that is always empty (nothing is ever
        appended to it), and the number of loop iterations started. That
        count includes the iteration on which a terminal step was detected.
    """
    env.reset()
    behavior_name = list(env.behavior_specs)[0]

    visited_states = []
    chosen_actions = []
    collected_rewards = []
    action_sets = []  # never filled; kept so the return shape stays a 5-tuple
    candidate_actions = np.array([0, 1])

    timestep = 0
    while timestep < max_iter:
        timestep += 1
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        if len(terminal_steps) > 0:
            break

        state = preprocess_input(decision_steps)
        visited_states.append(state)

        # The model expects a batch, so wrap the single state in a batch of one.
        state_batch = torch.Tensor(np.array([state]))
        action_probs = model(state_batch).detach().numpy().flatten()
        action = np.random.choice(candidate_actions, p=action_probs)
        chosen_actions.append(action)

        command = ActionTuple()
        command.add_discrete(np.array([[action]]))
        env.set_actions(behavior_name, command)
        env.step()

        # The reward for this action is read from the terminal steps when the
        # episode just ended, otherwise from the regular decision steps.
        next_decision_steps, next_terminal_steps = env.get_steps(behavior_name)
        if len(next_terminal_steps) == 0:
            outcome = next_decision_steps
        else:
            outcome = next_terminal_steps
        collected_rewards.append(float(outcome.reward[0]))

    states_tensor = torch.from_numpy(np.array(visited_states)).float()
    return (
        states_tensor,
        np.array(chosen_actions),
        np.array(collected_rewards),
        np.array(action_sets),
        timestep,
    )

In [None]:
def get_advantages(values, masks, rewards, gamma=None):
    """Compute normalised one-step TD advantages for one episode.

    For every step ``i`` the raw advantage is
    ``rewards[i] + gamma * values[i + 1] * masks[i] - values[i]``, where the
    value after the last step is taken as 0. The raw advantages are then
    shifted to zero mean and scaled by their standard deviation.

    Args:
        values: Critic estimates, one per step. Any shape; flattened to 1-D.
        masks: Per-step multipliers for the bootstrap term (0 cuts it off).
            Any shape; flattened to 1-D, so an (N, 1) column works the same
            as an (N,) vector.
        rewards: Rewards, one per step. Any shape; flattened to 1-D.
        gamma: Discount factor. When omitted, the module-level ``GAMMA`` is
            looked up at call time, which matches the previous behaviour.

    Returns:
        A 1-D float64 array with one entry per element of ``values``. An
        empty input yields an empty array.
    """
    if gamma is None:
        gamma = GAMMA

    # Flatten everything up front. With an (N, 1) mask each delta used to be a
    # one-element array that NumPy had to coerce into a scalar slot, which is
    # deprecated behaviour.
    values = np.asarray(values, dtype=np.float64).reshape(-1)
    masks = np.asarray(masks, dtype=np.float64).reshape(-1)
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)

    n_steps = len(rewards)
    adv = np.zeros(len(values))
    for i in range(n_steps):
        # No bootstrap value exists beyond the final recorded step.
        next_value = values[i + 1] if i + 1 < n_steps else 0.0
        adv[i] = rewards[i] + gamma * next_value * masks[i] - values[i]

    if adv.size == 0:
        # mean/std of an empty array would only produce NaN warnings.
        return adv
    # The small constant keeps the division finite when all advantages are equal.
    return (adv - np.mean(adv)) / (np.std(adv) + 1e-10)

In [None]:
# Training configuration and per-epoch history used by the plots below.
EPOCH = 2000
GAMMA = 0.9
all_rewards = []
all_timesteps = []
all_actor_losses = []
all_critic_losses = []

# try/finally guarantees the Unity process is shut down even when training is
# interrupted or fails, so a later cell can start a fresh environment.
try:
    for i in range(EPOCH):
        states, actions, rewards, action_sets, timestep = get_trajectories(model_policy)
        current_reward = np.sum(rewards)
        all_rewards.append(current_reward)
        all_timesteps.append(timestep)
        print(f'EPOCH: {i}, total reward: {current_reward}, timestep: {timestep}')

        if len(rewards) == 0:
            # The episode ended before any action was taken; there is nothing
            # to learn from and the networks cannot process an empty batch.
            continue

        predictions = model_policy(states)
        discounted_rewards = torch.tensor(discount_rewards(rewards, GAMMA), dtype=torch.float32)

        # model_value ends in a Linear layer with one output, so it returns
        # shape (N, 1) while discounted_rewards is (N,). Without the squeeze
        # the subtraction broadcasts to an (N, N) matrix and the critic is
        # trained against every return of the episode instead of its own.
        values = model_value(states).squeeze(-1)
        critic_loss = 0.5 * torch.pow(values - discounted_rewards, 2).mean()

        detached_values = values.detach().numpy()
        masks = np.ones_like(detached_values)
        masks[-1] = 0  # no bootstrapping past the last recorded step
        advantages = torch.tensor(
            get_advantages(detached_values, masks, rewards), dtype=torch.float32
        )

        # Pick, for every step, the probability the policy gave to the action
        # that was actually taken.
        action_index = torch.tensor(actions.reshape(-1, 1)).long()
        prob_batch = predictions.gather(dim=1, index=action_index).squeeze(1)
        actor_loss = (advantages * -torch.log(prob_batch)).mean()

        all_actor_losses.append(actor_loss.item())
        all_critic_losses.append(critic_loss.item())

        optimizer_value.zero_grad()
        critic_loss.backward()
        optimizer_value.step()

        optimizer_policy.zero_grad()
        actor_loss.backward()
        optimizer_policy.step()
finally:
    env.close()

In [None]:
def avg_per_x_element(data, x=10):
    """Average ``data`` over consecutive, non-overlapping blocks of ``x`` items.

    Args:
        data: Iterable of numbers.
        x: Block size; must be at least 1.

    Returns:
        A list with one mean per block, in order. If the number of items is
        not a multiple of ``x``, the final shorter block is averaged over the
        items it actually contains.

    Raises:
        ValueError: If ``x`` is smaller than 1.
    """
    if x < 1:
        raise ValueError("x must be a positive integer")

    values = list(data)
    averages = []
    # Step through the data in strides of x. The earlier `i % x == 0` test
    # fired on the very first element, so the first bucket held one item
    # divided by x and every later bucket was shifted by one position.
    for start in range(0, len(values), x):
        block = values[start:start + x]
        averages.append(sum(block) / len(block))
    return averages

In [None]:
# Critic loss history, smoothed over blocks of 10 recorded values.
plt.plot(avg_per_x_element(all_critic_losses, x=10))
plt.title("Critic loss")
plt.xlabel("Block of 10 training iterations")
plt.ylabel("Mean critic loss")
plt.show()

In [None]:
# Actor loss history, smoothed over blocks of 10 recorded values.
plt.plot(avg_per_x_element(all_actor_losses, x=10))
plt.title("Actor loss")
plt.xlabel("Block of 10 training iterations")
plt.ylabel("Mean actor loss")
plt.show()

In [None]:
# Total reward per episode, smoothed over blocks of 10 episodes.
plt.plot(avg_per_x_element(all_rewards, x=10))
plt.title("Episode reward")
plt.xlabel("Block of 10 episodes")
plt.ylabel("Mean total reward")
plt.show()

In [None]:
# Episode length (loop iterations reported by get_trajectories), smoothed over
# blocks of 10 episodes.
plt.plot(avg_per_x_element(all_timesteps, x=10))
plt.title("Episode length")
plt.xlabel("Block of 10 episodes")
plt.ylabel("Mean timesteps")
plt.show()

In [None]:
# Playback: start a fresh environment and run one episode with the trained
# policy, always taking the most probable action.
channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name='./Wave', seed=1, side_channels=[channel])
# try/finally so the Unity process is released even if the (unbounded) loop
# below is interrupted by hand.
try:
    channel.set_configuration_parameters(time_scale=3)
    print("WAVE environment created.")

    env.reset()
    # Same lookup as get_trajectories: resolved once after the reset.
    behavior_name = list(env.behavior_specs)[0]

    i = 0
    while True:
        i += 1

        decision_steps, terminal_steps = env.get_steps(behavior_name)
        if len(terminal_steps) > 0:
            break

        # Build the batch-of-one input once and share it between both
        # networks. Going through np.array first avoids PyTorch's slow
        # list-of-ndarrays conversion and matches get_trajectories.
        state_batch = torch.Tensor(np.array([preprocess_input(decision_steps)]))
        preds = model_policy(state_batch).detach().numpy()
        preds_value = model_value(state_batch).detach().numpy()
        print(f'TIMESTEP {i}, policy {preds}, values {preds_value}')

        # Greedy action: index of the largest probability.
        action = np.argmax(preds)
        action_tuple = ActionTuple()
        action_tuple.add_discrete(np.array([[action]]))
        env.set_actions(behavior_name, action_tuple)
        env.step()
finally:
    env.close()

In [None]:
# Manual cleanup: run this after interrupting a cell that left the Unity
# process alive. UnityEnvironment.close() raises when no environment is
# loaded, which is the normal situation after a cell above has already
# closed it, so that case is reported instead of failing the notebook.
try:
    env.close()
except UnityEnvironmentException:
    print("Unity environment was already closed.")