In [16]:
import os 
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch
from gym.wrappers import FlattenObservation

In [35]:
from kube_gym.kube_env import KubeGymEnv
env = KubeGymEnv()
env.reset()
env = FlattenObservation(env)

In [84]:
class PPOMemory:
    def __init__(self, batch_size):
        self.states = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.values = []
        self.logprobs = []
        self.batch_size = batch_size
    
    def generate_batches(self):
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_start]
        return np.array(self.states),\
               np.array(self.actions),\
               np.array(self.rewards),\
               np.array(self.dones),\
               np.array(self.values),\
               np.array(self.logprobs),\
               batches

    def store_memory(self, state, action, reward, done, probs, vals):
        self.states.append(state)
        self.actions.append(action)       
        self.rewards.append(reward)
        self.dones.append(done)        
        self.values.append(vals)
        self.logprobs.append(probs)

    def clear_memory(self):
        self.states = []
        self.probs = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.vals = []

In [92]:
class ActorNetwork(nn.Module):
    def __init__(self, input_dims, hidden_dims, n_actions):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dims, hidden_dims, dtype=torch.float32), 
            nn.ReLU(),
            nn.Linear(hidden_dims, hidden_dims, dtype=torch.float32),
            nn.ReLU(),
            nn.Linear(hidden_dims, n_actions, dtype=torch.float32),
            nn.Softmax(dim=-1)
        )
    def forward(self, x):
        # Return the probabilities directly instead of creating a distribution
        return self.layers(x)

In [93]:
class CriticNetwork(nn.Module):
    def __init__(self, input_dims, hidden_dims):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dims, hidden_dims, dtype=torch.float32),
            nn.ReLU(),
            nn.Linear(hidden_dims, hidden_dims, dtype=torch.float32),
            nn.ReLU(),
            nn.Linear(hidden_dims, 1, dtype=torch.float32)
        )
    def forward(self, x):
        value = self.layers(x)
        return value


In [131]:
class Agent:
    def __init__(self, 
                n_actions, 
                input_dims, 
                hidden_dims,
                gamma=0.99, 
                alpha=0.003, 
                policy_clip=0.2, 
                batch_size=32, 
                n_epochs=10,
                gae_lambda=0.95,  # Add this parameter
                chkpt_dir='models/'
    ):
        self.memory = PPOMemory(batch_size)
        self.gamma = gamma
        self.policy_clip = policy_clip
        self.n_epochs = n_epochs
        self.gae_lambda = gae_lambda  # Store the parameter
        self.actor = ActorNetwork(input_dims, hidden_dims, n_actions)
        self.critic = CriticNetwork(input_dims, hidden_dims)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=alpha)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=alpha)
        self.chkpt_dir = chkpt_dir

    def store_transition(self, state, action, probs, vals, reward, done):
        self.memory.store_memory(state, action, probs, vals, reward, done)

    def save_models(self):
        print('... saving models ...')
        self.actor.save(self.actor.state_dict(), self.chkpt_dir + 'actor')
        self.critic.save(self.critic.state_dict(), self.chkpt_dir + 'critic')

    def load_models(self):
        print('... loading models ...')
        self.actor = self.actor.load_state_dict(torch.load(self.chkpt_dir + 'actor'))
        self.critic = self.critic.load_state_dict(torch.load(self.chkpt_dir + 'critic'))

    def choose_action(self, observation):
        state = torch.tensor([observation], dtype=torch.float32)

        probs = self.actor(state)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        value = self.critic(state)

        # Detach tensors and convert action to integer
        action = int(action.detach().numpy()[0])  # Convert to integer
        value = value.detach().numpy()[0]
        log_prob = log_prob.detach().numpy()[0]

        return action, log_prob, value
    
    def learn(self):
        for _ in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr,\
                reward_arr, dones_arr, batches = \
                self.memory.generate_batches()

            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

            # Calculate advantages
            for t in range(len(reward_arr)):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr)):
                    # Calculate next value
                    if k < len(reward_arr) - 1:
                        next_val = values[k + 1]
                    else:
                        next_val = 0  # Terminal state has value 0
                    
                    delta = reward_arr[k] + self.gamma * next_val * (1 - int(dones_arr[k])) - values[k]
                    a_t += discount * delta
                    discount *= self.gamma * self.gae_lambda
                advantage[t] = float(a_t)  # Explicitly convert to float

            advantage = torch.tensor(advantage, dtype=torch.float32)
            values = torch.tensor(values, dtype=torch.float32)

            for batch in batches:
                states = torch.tensor(state_arr[batch], dtype=torch.float32)
                old_probs = torch.tensor(old_prob_arr[batch], dtype=torch.float32)
                actions = torch.tensor(action_arr[batch], dtype=torch.float32)

                # Actor loss
                probs = self.actor(states)
                dist = torch.distributions.Categorical(probs)
                new_probs = dist.log_prob(actions)
                
                prob_ratio = (new_probs - old_probs).exp()
                weighted_probs = advantage[batch] * prob_ratio
                clipped_probs = torch.clamp(prob_ratio, 1-self.policy_clip, 1+self.policy_clip)
                weighted_clipped_probs = clipped_probs * advantage[batch]
                actor_loss = -torch.min(weighted_probs, weighted_clipped_probs).mean()

                # Critic loss 
                critic_value = self.critic(states).squeeze()
                returns = advantage[batch] + values[batch]
                criterion = nn.MSELoss()  # Create MSELoss instance
                critic_loss = criterion(critic_value, returns)  # Use the instance

                # Take gradient steps
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()

        self.memory.clear_memory()


In [132]:
import gym
import numpy as np
import torch  # Using torch instead of tensorflow since project uses PyTorch
from datetime import datetime

def train_agent(batch_size, n_games, n_epochs, alpha, agent, N, n_steps):
    score_history = []
    learn_iters = 0  # Move this inside the function
    best_score = env.reward_range[0]  # Move this inside too
    
    for i in range(n_games):
        observation = env.reset()
        if isinstance(observation, tuple):
            observation = observation[0]

        done = False
        score = 0
        while not done:
            action, prob, val = agent.choose_action(np.expand_dims(observation, axis=0))
            print(f"action: {action}, prob: {prob}, val: {val}")

            observation, reward, done, _,  info = env.step(action)
            n_steps += 1
            score += reward
            agent.store_transition(observation, action,
                                    prob, val, reward, done)
            if n_steps % N == 0:
                agent.learn()
                learn_iters += 1
            observation = observation
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        
        if avg_score > best_score:
            best_score = avg_score
            agent.save_models()
        
        print('episode', i, 'score %.1f' % score, 'avg score %.1f' % avg_score,
                'time_steps', n_steps, 'learning_steps', learn_iters)
        env.render()



In [134]:
# Then call it with:
N = 20  # Steps before learning
batch_size = 5
n_games = 300
n_epochs = 10
alpha = 0.0003
agent = Agent(n_actions=3, 
              batch_size=batch_size,
              alpha=alpha, 
              n_epochs=n_epochs,
              input_dims=env.observation_space.shape[0], 
              hidden_dims=10,
              gae_lambda=0.95)

n_steps = 0
train_agent(batch_size, n_games, n_epochs, alpha, agent, N, n_steps)

action: 1, prob: [-0.4483865], val: [[-0.22839773]]
action: 0, prob: [-0.5292991], val: [[-0.15399393]]
action: 0, prob: [-0.9890362], val: [[-0.22839773]]
action: 0, prob: [-0.94832015], val: [[-0.30311644]]
action: 1, prob: [-0.74031675], val: [[-0.13404432]]
action: 1, prob: [-0.42432192], val: [[-0.22839773]]
action: 0, prob: [-0.9653784], val: [[-0.30266893]]
action: 1, prob: [-1.7424992], val: [[-0.26524082]]
action: 0, prob: [-1.2510605], val: [[-0.22839773]]
action: 0, prob: [-1.1628823], val: [[-0.22839773]]
action: 0, prob: [-0.56550276], val: [[-0.22106117]]
action: 1, prob: [-0.49120742], val: [[-0.17101063]]
action: 1, prob: [-0.39791977], val: [[-0.22839773]]
action: 1, prob: [-0.00229138], val: [[0.1409075]]
action: 0, prob: [-0.5137265], val: [[-0.23170257]]
action: 0, prob: [-0.99991786], val: [[-0.2660547]]
action: 1, prob: [-0.69412637], val: [[-0.25616357]]
action: 1, prob: [-0.5077441], val: [[-0.22839773]]
action: 1, prob: [-0.51320404], val: [[-0.22839773]]
actio

  action = int(action.detach().numpy()[0])  # Convert to integer
  advantage[t] = float(a_t)  # Explicitly convert to float
  return F.mse_loss(input, target, reduction=self.reduction)


IndexError: index 19 is out of bounds for axis 0 with size 19