In [1]:
from game_env_copy import GameEnv
import torch
import torch.nn as nn
import tqdm
import numpy as np

### Control Environment

In [2]:
# Reference environment instance. Judging by the "# training env" and
# "# swapped start goal" labels on the GameEnv calls further down, the three
# tuples appear to be (grid size, start, goal) -- TODO confirm against game_env_copy.
# NOTE(review): none of the cells shown below reference `env`; `test` builds or
# receives its own environment.
env = GameEnv((20,20), (3,3), (18,18))

In [3]:
def layer_init(layer, bias_const=0.0):
    """Initialize a layer in place: Xavier-uniform weights, constant bias.

    Args:
        layer (nn.Module): Layer exposing a ``weight`` tensor and, optionally,
            a ``bias`` tensor (for example ``nn.Linear``).
        bias_const (float): Value written to every element of the bias.

    Returns:
        nn.Module: The same ``layer`` object, so the call can be used inline
        while building a ``nn.Sequential``.
    """
    torch.nn.init.xavier_uniform_(layer.weight)  # Xavier (Glorot) uniform
    # Layers created with ``bias=False`` carry ``bias = None``; there is
    # nothing to fill in that case, so skip instead of crashing.
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)  # Constant bias
    return layer

In [4]:
class Agent(nn.Module):
    """Actor-critic model made of two independent MLPs.

    Both networks share the same shape: three hidden ``Linear`` layers of
    widths 64, 128 and 64, each followed by ``LeakyReLU``, and a final
    ``Linear`` head with no activation. Every ``Linear`` is passed through
    ``layer_init``. The actor's outputs are used as logits of a categorical
    distribution over actions; the critic's outputs are returned as-is.
    """

    def __init__(self, actor_input_size, critic_input_size, actor_output_size, critic_output_size):
        super().__init__()
        hidden_widths = (64, 128, 64)

        def build_mlp(in_features, out_features):
            # Layout is Linear, LeakyReLU, Linear, LeakyReLU, Linear, LeakyReLU,
            # Linear -- the Sequential indices (0, 2, 4, 6 for the Linears)
            # determine the state_dict key names, so the order must not change.
            stack = []
            fan_in = in_features
            for width in hidden_widths:
                stack.append(layer_init(nn.Linear(fan_in, width)))
                stack.append(nn.LeakyReLU())
                fan_in = width
            stack.append(layer_init(nn.Linear(fan_in, out_features)))
            return nn.Sequential(*stack)

        # Actor first, then critic: keeps the order in which layers are created.
        self.actor = build_mlp(actor_input_size, actor_output_size)
        self.critic = build_mlp(critic_input_size, critic_output_size)

    def get_value(self, state):
        """Return the critic network's output for ``state``."""
        value = self.critic(state)
        return value

    def get_action_probs(self, state):
        """Return a ``Categorical`` distribution whose logits are the actor's output."""
        logits = self.actor(state)
        return torch.distributions.categorical.Categorical(logits=logits)

    def get_action(self, probs):
        """Sample an action from the distribution ``probs``."""
        sampled = probs.sample()
        return sampled

    def get_action_logprob(self, probs, action):
        """Return the log-probability of ``action`` under ``probs``."""
        log_probability = probs.log_prob(action)
        return log_probability

    def get_entropy(self, probs):
        """Return the entropy of the distribution ``probs``."""
        return probs.entropy()

    def get_action_logprob_entropy(self, state):
        """Sample an action for ``state`` and describe the sampling distribution.

        Returns:
            tuple: ``(action, log-probability of that action, entropy of the
            distribution)``.
        """
        dist = self.get_action_probs(state)
        sampled = self.get_action(dist)
        log_probability = self.get_action_logprob(dist, sampled)
        return sampled, log_probability, self.get_entropy(dist)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Constructor arguments shared by every saved policy loaded below.
AGENT_DIMS = dict(
    actor_input_size=404,
    critic_input_size=404,
    actor_output_size=4,
    critic_output_size=1,
)


def load_agent(checkpoint_path):
    """Build an ``Agent`` on ``device`` and restore its weights from a file.

    Args:
        checkpoint_path (str): Path of a state-dict file written by ``torch.save``.

    Returns:
        Agent: The model with the checkpoint's parameters loaded.
        ``load_state_dict`` is called with its default strict key matching,
        so a checkpoint whose keys do not match the model raises.
    """
    agent = Agent(**AGENT_DIMS).to(device)
    # NOTE(review): torch.load unpickles the file, so only point this at
    # checkpoints from a trusted source.
    agent.load_state_dict(torch.load(checkpoint_path, map_location=device))
    return agent


agent_final = load_agent('model_final.pth')
agent_adv = load_agent('model_adversary.pth')
agent_trunc = load_agent('model_trunc.pth')
agent_less = load_agent('model_less.pth')
agent_op = load_agent('model_op.pth')

<All keys matched successfully>

In [5]:
# Evaluation order. test_agents iterates this list in order, so entry/row k of
# every rewards list and goal array below belongs to agents[k]:
# 0 = agent_final, 1 = agent_adv, 2 = agent_trunc, 3 = agent_less, 4 = agent_op.
agents = [agent_final, agent_adv, agent_trunc, agent_less, agent_op]

In [15]:
def test(agent, env=None, num_eps=10, max_steps=5000):
    """Roll out ``agent`` for ``num_eps`` episodes and summarize the results.

    Args:
        agent: Policy exposing ``get_action_logprob_entropy(state_tensor)``;
            the first element of its return value is used as the action.
        env: Environment to evaluate in. ``None`` (the default) builds a fresh
            ``GameEnv((20,20), (3,3), (18,18))`` for this call. The environment
            is closed before returning, whether it was passed in or built here.
        num_eps (int): Number of episodes to run.
        max_steps (int): Maximum number of steps per episode.

    Returns:
        tuple: ``(mean_reward, goal_reached)`` where ``mean_reward`` is the mean
        of the per-episode reward sums and ``goal_reached`` is a float array of
        length ``num_eps`` holding 1 for every episode in which ``env.step``
        reported ``done`` before the step cap, 0 otherwise.
    """
    if env is None:
        # Built per call rather than in the signature: a default argument is
        # evaluated once at definition time, so every defaulted call would
        # share (and re-use after close()) the very same instance.
        env = GameEnv((20,20), (3,3), (18,18))

    rewards = []
    goal_reached = np.zeros(num_eps)

    try:
        for episode in tqdm.tqdm(range(num_eps)):
            total_reward = 0

            state, _ = env.reset()
            for _step in range(max_steps):
                # Convert the raw observation exactly once per step. Keeping
                # `state` as returned by the env avoids calling torch.tensor()
                # on an existing tensor, which emits a UserWarning.
                state_tensor = torch.tensor(state, dtype=torch.float32, device=device)

                with torch.no_grad():
                    action, _, _ = agent.get_action_logprob_entropy(state_tensor)

                # NOTE(review): the fourth and fifth values returned by step()
                # are ignored. If GameEnv follows the Gymnasium
                # (terminated, truncated) convention, a truncated episode keeps
                # being stepped until max_steps -- confirm against GameEnv.
                state, reward, done, _, _ = env.step(action.cpu().item())
                total_reward += reward

                if done:
                    goal_reached[episode] = 1
                    break

            rewards.append(total_reward)
    finally:
        # Release the environment even if a rollout raised.
        env.close()

    mean_reward = np.mean(rewards)
    print(f"\nMean Rewards: {mean_reward}{', never reach goal' if not goal_reached.any() else ''}\n")
    return mean_reward, goal_reached

In [23]:
# One evaluation of agent_op (num_eps left at its default) in an environment
# whose third argument is drawn when this line runs; np.random.randint(12, 21)
# yields integers 12..20 because the upper bound is exclusive.
# NOTE(review): the result names say "adv" but the agent evaluated is agent_op -- confirm intent.
adv_results, adv_goals = test(agent_op, env = GameEnv((20,20), (3,3), (np.random.randint(12, 21),np.random.randint(12,21))))

  state_tensor = torch.tensor(state,dtype=torch.float32).to(device)
100%|██████████| 10/10 [01:56<00:00, 11.68s/it]


Mean Rewards: -22508.1, never reach goal






In [25]:
def test_agents(agents, *args):
    rewards = []
    goal_reached = []
    for agent in agents:
        reward, goal_reach = test(agent, *args)
        rewards.append(reward)
        goal_reached.append(goal_reach)
    return rewards, goal_reached

In [None]:
# Earlier version of the evaluation sweep; results1-results4 are all assigned
# again in later cells. Each list comprehension builds a new GameEnv per
# iteration (coordinates re-drawn every time) and runs every agent for
# 1 episode in it; results1 is a single 10-episode run in one fixed GameEnv.
# np.random.randint excludes its upper bound: (1, 7) -> 1..6, (12, 21) -> 12..20,
# (1, 21) -> 1..20.
results3 = [test_agents(agents, GameEnv((20,20), (np.random.randint(1, 7),np.random.randint(1,7)), (18, 18)), 1) for i in range(10) ]
results4 = [test_agents(agents, GameEnv((20,20), (np.random.randint(1, 21),np.random.randint(1,21)), (np.random.randint(1, 21),np.random.randint(1,21))), 1) for i in range(10) ]
results2= [test_agents(agents, GameEnv((20,20), (3,3), (np.random.randint(12, 21),np.random.randint(12,21))), 1) for i in range(10)]
results1= test_agents(agents, GameEnv((20,20), (3,3), (18, 18)), 10)

In [None]:
# Rebinds results3 (already assigned in the previous cell): both the second and
# the third GameEnv argument are now drawn at random, from 1..6 and 12..20.
results3 = [test_agents(agents, GameEnv((20,20), (np.random.randint(1, 7),np.random.randint(1,7)), (np.random.randint(12, 21),np.random.randint(12,21))), 1) for i in range(10) ]
# Second and third GameEnv arguments swapped relative to (3,3), (18,18).
# No episode count is passed, so test() runs with its default num_eps.
results5 = test_agents(agents, GameEnv((20, 20), (18, 18), (3, 3)))

In [None]:
def get_reward_goal(result):
    """Aggregate a list of per-trial ``(rewards, goals)`` pairs.

    Args:
        result: Sequence of pairs; element 0 of each pair is the per-agent
            reward sequence of one trial, element 1 the per-agent goal arrays
            of that trial.

    Returns:
        tuple: ``(mean_rewards, goals)`` -- the rewards averaged over trials
        (axis 0), and the trials' goal arrays joined with ``np.hstack``.
    """
    reward_table = np.array([trial[0] for trial in result])
    mean_rewards = reward_table.mean(axis=0)
    goal_table = np.array([trial[1] for trial in result])
    return mean_rewards, np.hstack(goal_table)

(array([-75788. , -35158.4, -65468.2, -88416. , -33006.5]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 1., 0., 0.]]))

In [None]:
# Display results1 as arrays: per-agent mean rewards, then one row of goal
# flags per agent (rows follow the order of `agents`).
tuple(map(np.array, results1))

(array([-94686.8, -86408.8, -47670.1, -95161.7,   1553.3]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]))

In [None]:
# Per-agent mean reward over the trials in results2, plus one row of goal
# flags per agent (one column per trial).
get_reward_goal(results2)

(array([-93635.2, -21478.2, -64243.3, -96585.2,  -4737. ]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 1., 1., 1., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 1., 1., 1., 1., 1., 0., 1., 1.]]))

In [None]:
# Same aggregation for results3.
# NOTE(review): results3 is assigned in more than one cell above; which
# assignment produced the output shown here cannot be told from the notebook.
get_reward_goal(results3)

(array([-93186.3, -37477.5, -78592. , -95165.9,   1499.4]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]))

In [None]:
# Same aggregation for results4.
get_reward_goal(results4)

(array([-75788. , -35158.4, -65468.2, -88416. , -33006.5]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 1., 0., 0.]]))

In [None]:
def _randomized_runs(start_x, start_y, goal_x, goal_y, trials=10):
    """Run every agent for 1 episode in ``trials`` freshly randomized GameEnvs.

    Each ``*_x`` / ``*_y`` argument is a ``(low, high)`` pair handed to
    ``np.random.randint`` (``high`` is exclusive). The four coordinates are
    drawn in the order second-argument x, second-argument y, third-argument x,
    third-argument y for every trial.

    Returns:
        list: One ``test_agents`` result per trial.
    """
    runs = []
    for _ in range(trials):
        start = (np.random.randint(*start_x), np.random.randint(*start_y))
        goal = (np.random.randint(*goal_x), np.random.randint(*goal_y))
        runs.append(test_agents(agents, GameEnv((20,20), start, goal), 1))
    return runs


# Sampling ranges for np.random.randint (upper bound exclusive).
_LOW, _HIGH, _ANY = (1, 7), (12, 21), (1, 21)

# Fixed layouts: a single 10-episode run each.
results1 = test_agents(agents, GameEnv((20,20), (3,3), (18, 18)), 10) # training env
results2 = test_agents(agents, GameEnv((20, 20), (18, 18), (3, 3)), 10) # swapped start goal
# Randomized layouts: 10 trials of 1 episode each.
results3 = _randomized_runs(_LOW, _LOW, _HIGH, _HIGH) # up right
results4 = _randomized_runs(_HIGH, _HIGH, _LOW, _LOW) # down left
results5 = _randomized_runs(_LOW, _HIGH, _HIGH, _LOW) # down right
results6 = _randomized_runs(_HIGH, _LOW, _LOW, _HIGH) # up left
results7 = _randomized_runs(_ANY, _ANY, _ANY, _ANY) # random

In [65]:
# Raw (rewards, goal flags) pair for the run labelled "training env" where
# results1 is assigned; entries follow the order of `agents`.
results1

([-94354.7, -75600.0, -59692.4, -95139.0, 1552.2],
 [array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])])

In [66]:
# Raw (rewards, goal flags) pair for the run labelled "swapped start goal"
# where results2 is assigned; entries follow the order of `agents`.
results2

([-32440.4, -9206.1, -26816.8, -95151.1, 1464.4],
 [array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  array([0., 0., 0., 1., 1., 1., 1., 1., 1., 1.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])])

In [67]:
# Aggregate of the sweep labelled "up right" where results3 is assigned:
# per-agent mean reward, then one row of goal flags per agent.
get_reward_goal(results3)

(array([-78536.8, -27256.4, -67310.1, -81137.2, -22142. ]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 1., 0.]]))

In [68]:
# Aggregate of the sweep labelled "down left" where results4 is assigned.
get_reward_goal(results4)

(array([ -51251. ,  -54823.4,  -41434. , -100502.9,   -9650.8]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 1., 0., 1., 1., 0., 0., 1.]]))

In [69]:
# Aggregate of the sweep labelled "down right" where results5 is assigned.
get_reward_goal(results5)

(array([-145533.9,  -52801.8,  -30474.4, -157471.8,  -76985.1]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]))

In [70]:
# Aggregate of the sweep labelled "up left" where results6 is assigned.
get_reward_goal(results6)

(array([ -13841.7,  -61576.2, -145610. ,  -22170.5,  -75503.5]),
 array([[0., 0., 0., 0., 0., 1., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 1., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]))

In [71]:
# Aggregate of the sweep labelled "random" where results7 is assigned.
get_reward_goal(results7)

(array([-82185.1, -30136.7, -62310.3, -93336.8, -34681.1]),
 array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 1., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 1., 0., 1., 0., 0., 0., 0., 0.]]))