In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from env import Env
import numpy as np
import wandb
import copy

# wandb setup
number = 1
NAME = "Reinforce2" + str(number)
ID = "Reinforce2" + str(number)
run = wandb.init(project='REINFORCE2_MachineReplacement', name = NAME, id = ID)

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Current_return,▃▃▄▇▄▄▃▅███▄▇▄▄▇▄█▆▇█▇▅▃▃▃▂▂▁▂▃▂▂▂▂▂▂▂▁▂
loss,▇▇▆▁▅▅▇▅▁▁▁▆▂▅▆▁▆▁▃▂▁▂▄▇▆▇▇▇█▇▇▇▇▇▇█▇██▇
n_episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
Current_return,-9266.39744
loss,3054.71851
n_episode,200.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670271116648414, max=1.0…

In [31]:
# Define the policy network
class PolicyNetwork(nn.Module):
    def __init__(self, n_observations, n_actions):
        super(PolicyNetwork, self).__init__()
        self.layer1 = nn.Linear(n_observations, 16)
        self.layer2 = nn.Linear(16, 16)
        self.layer3 = nn.Linear(16, n_actions)

    def forward(self, state):
        x = F.relu(self.layer1(state))
        x = F.relu(self.layer2(x))
        output = F.softmax(self.layer3(x), dim=1)
        return output

In [32]:
n_actions = 2
n_observations = 1

policy_net = PolicyNetwork(n_observations=n_observations, n_actions=n_actions).to(device)
optimizer = optim.Adam(policy_net.parameters())
steps_done = 0

def select_action(state):
    global steps_done
    steps_done += 1
    with torch.no_grad():
        action_probs = policy_net(state)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        log_prob = action_dist.log_prob(action)
        return action, log_prob

In [33]:
max_steps_per_episode = 500
n_episodes = 201
GAMMA = 1
R = 38 # Cost of replacement of a machine

wandb.config.update({
    'max_timesteps_per_episode': max_steps_per_episode,
    'num_of_episodes': n_episodes,
    'R': R,
    'optimizer': 'Adam',
    'learning_rate': 'default',
    'n_actions': n_actions,
    'n_observations': n_observations,
})

env = Env(R=R)

# REINFORCE
all_rewards = []
for i in range(n_episodes):
    episode_rewards = []
    episode_log_probs = []
    state = env.reset()
    for j in range(max_steps_per_episode):
        state = torch.tensor([state], dtype=torch.float32, device=device).unsqueeze(0)
        # action, log_prob = select_action(state)
        action_probs = policy_net(state)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        log_prob = action_dist.log_prob(action)
        next_state, reward = env.step(action)
        episode_rewards.append(reward)
        episode_log_probs.append(log_prob)
        state = next_state
    all_rewards.append(sum(episode_rewards))
    discounted_rewards = []
    total_cur_return = 0
    for t in range(len(episode_rewards)):
        Gt = sum([GAMMA**(k-t-1)*episode_rewards[k] for k in range(t, len(episode_rewards))])
        discounted_rewards.append(Gt)
        if t==0:
            total_cur_return = copy.deepcopy(Gt)
    discounted_rewards = torch.tensor(discounted_rewards).to(device)
    log_probs = torch.stack(episode_log_probs).to(device)
    loss = (log_probs*discounted_rewards).mean()

    # print(f'Log of probs: {log_probs[0:10]}')
    # print(f'Discounted returns: {discounted_rewards[0:10]}')

    wandb.log({'loss': loss, 'Current_return': total_cur_return, 'n_episode': i}) #, 'batch': t})
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"\rEpisode: {i}\tLoss: {loss}\tCurrent Discounted Return: {total_cur_return}", end="")

    if i %10 == 0:
        SAVE_PATH = './checkpoints/REINFORCE/REINFORCE_{}.pt'.format(i)
        torch.save(policy_net.state_dict(), SAVE_PATH)
    

Episode: 18	Loss: 4302.974609375	Current Discounted Return: -14265.20000000000474

KeyboardInterrupt: 

In [None]:
from test_policy import evaluate_policy, print_policy
from tqdm import tqdm
from copy import deepcopy
import torch

NUM_STEPS = 201

best_reward = -torch.inf
best_policy = PolicyNetwork(n_observations=1, n_actions=2)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i in tqdm(range(0, NUM_STEPS, 10), desc="Evaluating", leave=False):
    LOAD_PATH = f'./checkpoints/REINFORCE/REINFORCE_{i}.pt'
    policy_net = PolicyNetwork(1, 2).to(device)
    checkpoint = torch.load(LOAD_PATH, map_location='cpu')
    policy_net.load_state_dict(checkpoint)
    reward = evaluate_policy(env=env,  policy=policy_net)
    # writer.add_scalar("Reward (over policy)", reward, i)
    if reward > best_reward:
        best_reward = reward
        best_policy = deepcopy(policy_net)

print_policy(best_policy)

                                                           

1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 


