In [22]:
import copy
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb

from env import Env

# Weights & Biases run setup: the run's display name and its id share the same
# "Reinforce<number>" label.
number = 1
NAME = ID = "Reinforce" + str(number)
run = wandb.init(
    project='actorcritic_MachineReplacement',
    name=NAME,
    id=ID,
)

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

0,1
Current_return,▁▁▅▄▄▆█▆▄▆▃▅▅▅
loss,▇▁▆▅▅▅▅▅▃▂▂▄▇█
n_episode,▁▁▂▂▃▃▄▅▅▆▆▇▇█

0,1
Current_return,-500.61152
loss,-176.38503
n_episode,12.0


In [23]:
class PolicyNetwork(nn.Module):
    """Two-headed network: a shared 16-unit hidden layer feeding an actor and a critic.

    Args:
        n_obs: Number of input features per state.
        n_act: Number of discrete actions (size of the actor output).
    """

    def __init__(self, n_obs, n_act):
        super().__init__()

        # Shared trunk followed by the two output heads.
        self.layer = nn.Linear(n_obs, 16)
        self.actor = nn.Linear(16, n_act)
        self.critic = nn.Linear(16, 1)

    def forward(self, state):
        """Map ``state`` to ``(action_prob, state_vals)``.

        ``action_prob`` is the softmax of the actor head over the last
        dimension; ``state_vals`` is the raw critic head output.
        """
        hidden = F.relu(self.layer(state))
        actor_logits = self.actor(hidden)
        return F.softmax(actor_logits, dim=-1), self.critic(hidden)

In [24]:
# Network dimensions: input features per state and number of discrete actions.
n_obs, n_act = 1, 2

# Build the policy network, move it to the selected device, and attach Adam.
model_policy = PolicyNetwork(n_obs=n_obs, n_act=n_act)
model_policy = model_policy.to(device)
optimizer = optim.Adam(model_policy.parameters(), lr=3e-2)

# Running count of select_action() calls.
steps_done = 0

def select_action(state):
    """Sample an action for ``state`` from the current policy without tracking gradients.

    Increments the global ``steps_done`` counter by one on every call.

    Args:
        state: Input passed unchanged to ``model_policy``.

    Returns:
        Tuple ``(action, logprob, state_val)``: the sampled action, its
        log-probability under the categorical distribution built from the
        actor output, and the critic output. All three are computed inside
        ``torch.no_grad()``, so they carry no autograd history and cannot be
        used to backpropagate a loss.
    """
    global steps_done
    steps_done = steps_done + 1

    with torch.no_grad():
        probs, state_val = model_policy(state)
        policy_dist = torch.distributions.Categorical(probs)
        sampled = policy_dist.sample()
        return sampled, policy_dist.log_prob(sampled), state_val

In [25]:
# Training hyperparameters.
max_steps_per_episode = 500
n_episodes = 2000
GAMMA = 0.99
R = 3.5  # Cost of replacement of a machine

# Record the hyperparameters with the wandb run. The optimizer name and learning
# rate are read from the optimizer object itself so the logged values always
# match what training actually uses, and the discount factor is logged too.
wandb.config.update({
    'max_timesteps_per_episode': max_steps_per_episode,
    'num_of_episodes': n_episodes,
    'R': R,
    'gamma': GAMMA,
    'optimizer': type(optimizer).__name__,
    'learning_rate': optimizer.param_groups[0]['lr'],
    'n_actions': n_act,
    'n_observations': n_obs,
})

# Environment constructed with the replacement cost R.
env = Env(R=R)

In [26]:
def discounted_returns(rewards, gamma):
    """Return the discounted return-to-go for every step of one episode.

    Computes ``G_t = sum_{k >= t} gamma**(k - t) * rewards[k]`` with a single
    backward pass using ``G_t = rewards[t] + gamma * G_{t+1}``.

    Args:
        rewards: Sequence of per-step rewards; each entry must be convertible
            with ``float()``.
        gamma: Discount factor.

    Returns:
        List of Python floats, one return per step (empty for empty input).
    """
    returns = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = float(rewards[t]) + gamma * running
        returns[t] = running
    return returns


# REINFORCE training: roll out one fixed-length episode, then take one policy
# gradient step on the mean of -log pi(a_t | s_t) * G_t.
all_rewards = []
for i in range(n_episodes):
    episode_rewards = []
    episode_log_probs = []
    state = env.reset()
    for _ in range(max_steps_per_episode):
        # unsqueeze(0) adds a leading batch dimension before the forward pass.
        state = torch.tensor([state], dtype=torch.float32, device=device).unsqueeze(0)
        action_probs, _state_val = model_policy(state)  # critic output is not used here
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        logprob = action_dist.log_prob(action)
        next_state, reward = env.step(action)
        episode_rewards.append(reward)
        episode_log_probs.append(logprob)
        state = next_state

    all_rewards.append(sum(episode_rewards))

    returns = discounted_returns(episode_rewards, GAMMA)
    total_cur_return = returns[0]  # discounted return from the first step

    # Build the returns on the same device as the log-probs so the product
    # below also works when running on CUDA.
    discounted_rewards = torch.tensor(returns, dtype=torch.float32, device=device)
    # Flatten to one log-prob per step. Each stored log-prob still carries its
    # batch dimension, and multiplying the stacked tensor by the 1-D returns
    # without flattening would broadcast to a T x T matrix instead of pairing
    # every log-prob with its own return.
    logprob = torch.stack(episode_log_probs).reshape(-1)
    loss = -(logprob * discounted_rewards).mean()

    # Log a plain float rather than a tensor attached to the autograd graph.
    loss_value = loss.item()
    wandb.log({'loss': loss_value, 'Current_return': total_cur_return, 'n_episode': i})
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"\rEpisode: {i}\tLoss: {loss_value}\tCurrent Discounted Return: {total_cur_return}", end="")

    if i%100 == 0:
        SAVE_PATH = './checkpoints/AC/AC_{}.pt'.format(i)
        # Make sure the target directory exists before saving the checkpoint.
        os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
        torch.save(model_policy.state_dict(), SAVE_PATH)

Episode: 1999	Loss: -0.00019332944066263735	Current Discounted Return: -949.8648891500291