In [None]:
import copy
import os
import random
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import torch
from mlagents_envs.environment import ActionTuple, UnityEnvironment, BaseEnv
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

In [None]:
# Side channel used to send engine configuration parameters to the environment.
channel = EngineConfigurationChannel()
# Create the environment for the './Wave' build with a fixed seed and the
# configuration channel attached. `env` is reused by the training cell below.
env = UnityEnvironment(file_name='./Wave', seed=1, side_channels=[channel])
# Request a time scale of 40 — presumably to run the simulation faster than
# real time during training; TODO confirm against the ML-Agents docs.
channel.set_configuration_parameters(time_scale = 40)
print("WAVE environment created.")

In [None]:
def copy_model(model):
    """Return an independent clone of ``model`` carrying the same weights.

    The clone is produced with ``copy.deepcopy`` and the source module's
    ``state_dict`` is then loaded into it, so later updates to ``model`` do
    not affect the returned module.

    Args:
        model: the ``torch.nn.Module`` to duplicate.

    Returns:
        A new module with the same structure and parameter values.
    """
    clone = copy.deepcopy(model)
    # Explicitly re-apply the source weights onto the clone.
    source_weights = model.state_dict()
    clone.load_state_dict(source_weights)
    return clone

In [None]:
# Layer widths from input to output: l1 is the input width of the first
# Linear layer, l2/l3 are the hidden widths, and l4 is the number of outputs
# (the training cell picks an action with argmax over these two values).
l1, l2, l3, l4 = 64, 512, 512, 2

# Online Q-network: a three-layer MLP with ReLU activations between layers.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=l1, out_features=l2),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=l2, out_features=l3),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=l3, out_features=l4),
)
# Separate copy of the network; the training cell evaluates next-state
# Q-values with it and refreshes it from `model` every SYNC_INTERVAL steps.
model_copy = copy_model(model)

# Adam optimizes only the online network's parameters.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
def loss_fn(pred, target):
    """Half mean-squared error between ``pred`` and ``target``.

    Computes ``mean(0.5 * (pred - target) ** 2)`` over all elements.

    Args:
        pred: tensor of predicted values.
        target: tensor of target values (broadcast against ``pred``).

    Returns:
        A scalar tensor holding the loss.
    """
    residual = pred - target
    half_squared = 0.5 * residual ** 2
    return torch.mean(half_squared)

def preprocess_input(inp):
    """Join the first two observation arrays of ``inp`` along axis 1.

    The columns of ``inp.obs[1]`` come first, followed by the columns of
    ``inp.obs[0]``.

    Args:
        inp: object exposing an ``obs`` sequence with at least two arrays
            that can be concatenated along axis 1 (assumes both are 2-D
            with matching first dimension — TODO confirm against the
            environment's observation specs).

    Returns:
        A single numpy array with the two observations side by side.
    """
    leading_obs = inp.obs[1]
    trailing_obs = inp.obs[0]
    return np.append(leading_obs, trailing_obs, axis=1)

In [None]:
# Checkpoint file used by this load step and by the save cell further down.
model_path = './models/qlearning-target-multi-obs.path'
try:
    model.load_state_dict(torch.load(model_path))
    print('Model loaded')
except FileNotFoundError:
    # No checkpoint on disk yet: keep the freshly initialized weights.
    print('No model available')
except Exception as load_error:
    # A checkpoint exists but could not be applied (corrupt file, layer-size
    # mismatch, ...). Stay best-effort, but say why instead of pretending the
    # file was missing. KeyboardInterrupt/SystemExit are no longer swallowed.
    print(f'Model could not be loaded from {model_path}: {load_error!r}')

# `model_copy` was cloned before the checkpoint was loaded, so bring it back in
# line with `model`; otherwise the first training steps would bootstrap from a
# randomly initialized target network.
model_copy.load_state_dict(model.state_dict())

In [None]:
# Deep Q-learning training loop with an experience-replay buffer and a
# periodically synced copy of the network (`model_copy`) for the bootstrap
# targets.

EPOCH = 2000          # number of training episodes
MEM_SIZE = 2500       # replay-buffer capacity (deque drops the oldest entries)
MIN_EPSILON = .05     # lower bound for the exploration rate
MAX_STEP = 500        # maximum environment steps per episode
BATCH_SIZE = 256      # replay samples per gradient update
GAMMA = 0.9           # discount factor applied to the next-state value
SYNC_INTERVAL = 1000  # environment steps between refreshes of `model_copy`

epsilon = 1

losses = []           # loss of every gradient update
all_rewards = []      # summed reward of every episode
all_timesteps = []    # number of environment steps of every episode
experiences = deque(maxlen=MEM_SIZE)

k = 0                 # environment steps taken across all episodes
try:
    for i in range(EPOCH):
        env.reset()
        behavior_name = list(env.behavior_specs)[0]

        j = 0         # environment steps taken in this episode
        epoch_rewards = []
        while j < MAX_STEP:
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            if len(terminal_steps) > 0:
                break

            # Count only iterations that really step the environment. Counting
            # before the terminal check inflated the reported timestep by one
            # and could skip a sync when `k` hit a multiple of SYNC_INTERVAL on
            # the iteration that merely detected the episode end.
            j += 1
            k += 1

            state = torch.Tensor(preprocess_input(decision_steps))

            if np.random.rand() > epsilon:
                # exploit: greedy action; no gradient is needed to pick it
                with torch.no_grad():
                    qval = model(state)
                action = np.argmax(qval.numpy())
            else:
                # explore: uniformly random action out of the two available
                action = np.random.randint(0, 2)

            action_tuple = ActionTuple()
            action_tuple.add_discrete(np.array([[action]]))
            env.set_actions(behavior_name, action_tuple)
            env.step()

            new_decision_steps, new_terminal_steps = env.get_steps(behavior_name)
            done = len(new_terminal_steps) > 0
            # When the episode ended, reward and observation come from the
            # terminal steps instead of the decision steps.
            current_step = new_terminal_steps if done else new_decision_steps
            reward = current_step.reward[0]

            epoch_rewards.append(reward)

            state2 = torch.Tensor(preprocess_input(current_step))

            current_exp = (state, action, reward, state2, done)
            experiences.append(current_exp)

            if len(experiences) >= BATCH_SIZE:
                batch = random.sample(experiences, BATCH_SIZE)

                states = torch.cat([s for (s, a, r, s2, d) in batch])
                actions = torch.Tensor([a for (s, a, r, s2, d) in batch])
                states2 = torch.cat([s2 for (s, a, r, s2, d) in batch])
                done_data = torch.Tensor([d for (s, a, r, s2, d) in batch])
                rewards = torch.Tensor([r for (s, a, r, s2, d) in batch])

                qvals = model(states)

                # Next-state values come from the synced copy, without gradients.
                with torch.no_grad():
                    qvals_2 = model_copy(states2)

                # Bootstrapped target; (1 - done) removes the next-state term
                # for transitions that ended the episode.
                target = rewards + GAMMA * ((1 - done_data) * torch.max(qvals_2, dim=1)[0])
                # Q-value the online network predicts for the action taken.
                action_qval_pred = qvals.gather(dim=1, index=actions.long().unsqueeze(dim=1)).squeeze()
                err = loss_fn(action_qval_pred, target.detach())
                losses.append(err.item())

                optimizer.zero_grad()
                err.backward()
                optimizer.step()

            if k % SYNC_INTERVAL == 0:
                model_copy = copy_model(model)

        current_reward = np.sum(epoch_rewards)
        all_rewards.append(current_reward)
        all_timesteps.append(j)
        print(f'EPOCH: {i}, total reward: {current_reward}, timestep: {j}, epsilon: {epsilon}')
        # Linear decay, clamped so float error cannot push epsilon below the floor.
        epsilon = max(MIN_EPSILON, epsilon - 1 / EPOCH)
finally:
    # Always release the Unity environment, including on errors or a manual
    # interrupt, so that a later cell can open a new one.
    env.close()

In [None]:
def avg_per_x_element(data, x=10):
    """Average ``data`` in consecutive, non-overlapping windows of ``x`` items.

    Each full window contributes the mean of its ``x`` elements. If the data
    length is not a multiple of ``x``, the remaining elements form one last,
    shorter window that is averaged over its actual length, so no data point
    is dropped.

    Args:
        data: iterable of numbers.
        x: window size; must be a positive integer.

    Returns:
        List of window averages, in order.

    Raises:
        ValueError: if ``x`` is smaller than 1.
    """
    if x < 1:
        raise ValueError('x must be a positive integer')

    avg = []
    window_total = 0
    window_len = 0
    for el in data:
        window_total += el
        window_len += 1
        # Close the window once it holds exactly x elements.
        if window_len == x:
            avg.append(window_total / x)
            window_total = 0
            window_len = 0
    # Trailing partial window: average over the elements actually seen.
    if window_len:
        avg.append(window_total / window_len)
    return avg

In [None]:
# Plot the training losses (one entry per gradient update) after passing them
# through avg_per_x_element with its default window size (x=10).
plt.plot(avg_per_x_element(losses))

In [None]:
# Plot the per-episode total rewards after passing them through
# avg_per_x_element with its default window size (x=10).
plt.plot(avg_per_x_element(all_rewards))

In [None]:
# Create the checkpoint directory first: torch.save does not create missing
# parent directories, and failing here would throw away the whole training run.
_model_dir = os.path.dirname(model_path)
if _model_dir:
    os.makedirs(_model_dir, exist_ok=True)
# Persist only the online network's weights (state_dict), not the module object.
torch.save(model.state_dict(), model_path)
print('Saved model')

In [None]:
def test_model(model, time_scale=3, debug=False):
    """Run one greedy episode of the './Wave' environment with ``model``.

    A fresh environment is created for the episode and is always closed again,
    even if an error occurs while the episode is running.

    Args:
        model: network mapping a preprocessed observation to one value per
            action; the action with the highest value is taken at every step.
        time_scale: time scale sent to the engine configuration channel.
        debug: when True, print the observation, predictions, chosen action,
            done flag, next-state value, bootstrapped target and reward for
            every step.

    Returns:
        Tuple ``(total_reward, steps)``: the summed reward of the episode and
        the number of environment steps that were taken.
    """
    channel = EngineConfigurationChannel()
    env = UnityEnvironment(file_name='./Wave', seed=1, side_channels=[channel])
    try:
        channel.set_configuration_parameters(time_scale = time_scale)
        print("WAVE environment created.")

        env.reset()
        behavior_name = list(env.behavior_specs)[0]

        i = 0
        # Discount used only for the debug target below.
        # NOTE(review): the training cell uses GAMMA = 0.9 — confirm whether
        # 0.95 is intentional here.
        GAMMA = 0.95
        total_reward = 0
        while True:
            decision_steps, terminal_steps = env.get_steps(behavior_name)
            if len(terminal_steps) > 0:
                break

            # Count only iterations that really step the environment, so the
            # returned step count is not inflated by the final terminal check.
            i += 1

            observation = preprocess_input(decision_steps)
            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                preds = model(torch.Tensor(observation)).numpy()

            action = np.argmax(preds)
            action_tuple = ActionTuple()
            action_tuple.add_discrete(np.array([[action]]))
            env.set_actions(behavior_name, action_tuple)
            env.step()

            new_decision_steps, new_terminal_steps = env.get_steps(behavior_name)
            done = len(new_terminal_steps) > 0
            # When the episode ended, the reward comes from the terminal steps.
            current_step = new_terminal_steps if done else new_decision_steps
            reward = current_step.reward[0]

            total_reward += reward

            if debug:
                # The bootstrapped target is only needed for the printout, so
                # the extra forward pass is skipped when debug is off.
                with torch.no_grad():
                    qvals_2 = model(torch.Tensor(preprocess_input(current_step)))
                next_value = (1 - int(done)) * torch.max(qvals_2, dim=1)[0]
                target = reward + GAMMA * next_value
                print(i, 'Decision Steps', observation, 'Preds', preds, 'Selected Preds', action, 'Done?', int(done), 'Next Preds', next_value, 'Target', target, 'Reward', reward, sep='\n')
                print('==================================')
    finally:
        # Release the Unity environment even if the episode raised.
        env.close()
    return total_reward, i

def test_model_avg(model, count=20, time_scale=10):
    sum_reward = 0
    sum_timesteps = 0
    for _ in range(count):
        reward, timesteps = test_model(model, time_scale)
        sum_timesteps += timesteps
        sum_reward += reward
    return sum / count

In [None]:
# Run a single greedy evaluation episode with the default settings
# (time_scale=3, debug=False); the returned value is shown as the cell output.
test_model(model)

In [None]:
# Evaluate the model with the defaults of test_model_avg (count=20 runs,
# time_scale=10); each run goes through test_model.
test_model_avg(model)