In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from env import Env
import numpy as np
import wandb
import copy

# wandb setup
number = 1
NAME = "Reinforce" + str(number)
ID = "Reinforce" + str(number)
run = wandb.init(project='REINFORCE_MachineReplacement', name = NAME, id = ID)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

0,1
Current_return,▇▅▇▆▆▅▆▅▇▅▆▄▆▆▄▄▅▆▄▄▄▅▄▂▁▃▆▃▆█▃▄▇▅▅▆▆▇▅▇
loss,▆▆▅▇█▆▅▅▆▆▅▅▆▃▃▂▃▃▂▂▁▂▂▂▂▂▄▃▃▆▄▄▅▄▅▅▇▆▅▅
n_episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
Current_return,-528.15024
loss,-303.4014
n_episode,2833.0


In [7]:
# Define the policy network
class PolicyNetwork(nn.Module):
    def __init__(self, n_observations, n_actions):
        super(PolicyNetwork, self).__init__()
        self.layer1 = nn.Linear(n_observations, 16)
        self.layer2 = nn.Linear(16, 16)
        self.layer3 = nn.Linear(16, n_actions)

    def forward(self, state):
        x = F.relu(self.layer1(state))
        x = F.relu(self.layer2(x))
        output = F.softmax(self.layer3(x))
        return output

In [8]:
n_actions = 2
n_observations = 1

policy_net = PolicyNetwork(n_observations=n_observations, n_actions=n_actions).to(device)
optimizer = optim.Adam(policy_net.parameters())
steps_done = 0

def select_action(state):
    global steps_done
    steps_done += 1
    with torch.no_grad():
        action_probs = policy_net(state)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        log_prob = action_dist.log_prob(action)
        return action, log_prob

In [9]:
max_steps_per_episode = 50
n_episodes = 200
GAMMA = 0.99
R = 3.5 # Cost of replacement of a machine

wandb.config.update({
    'max_timesteps_per_episode': max_steps_per_episode,
    'num_of_episodes': n_episodes,
    'R': R,
    'optimizer': 'Adam',
    'learning_rate': 'default',
    'n_actions': n_actions,
    'n_observations': n_observations,
})

env = Env(R=R)

# REINFORCE
all_rewards = []
for i in range(n_episodes):
    episode_rewards = []
    episode_log_probs = []
    state = env.reset()
    for j in range(max_steps_per_episode):
        state = torch.tensor([state], dtype=torch.float32, device=device).unsqueeze(0)
        # action, log_prob = select_action(state)
        action_probs = policy_net(state)
        action_dist = torch.distributions.Categorical(action_probs)
        action = action_dist.sample()
        log_prob = action_dist.log_prob(action)
        next_state, reward = env.step(action)
        episode_rewards.append(reward)
        episode_log_probs.append(log_prob)
        state = next_state
    all_rewards.append(sum(episode_rewards))
    discounted_rewards = []
    total_cur_return = 0
    for t in range(len(episode_rewards)):
        Gt = sum([GAMMA**(k-t-1)*episode_rewards[k] for k in range(t, len(episode_rewards))])
        discounted_rewards.append(Gt)
        if t==0:
            total_cur_return = copy.deepcopy(Gt)
    discounted_rewards = torch.tensor(discounted_rewards)
    log_probs = torch.stack(episode_log_probs)
    loss = -(log_probs*discounted_rewards).mean()

    wandb.log({'loss': loss, 'Current_return': total_cur_return, 'n_episode': i}) #, 'batch': t})
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"\rEpisode: {i}\tLoss: {loss}\tCurrent Discounted Return: {total_cur_return}", end="")

    if i%100 == 0:
        SAVE_PATH = './checkpoints/REINFORCE/REINFORCE_{}.pt'.format(i)
        torch.save(policy_net.state_dict(), SAVE_PATH)
    

  output = F.softmax(self.layer3(x))


Episode: 199	Loss: -80.01092529296875	Current Discounted Return: -209.93143693484947

In [11]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Define the policy network
class PolicyNetwork(nn.Module):
    def __init__(self, n_inputs, n_hidden, n_outputs, learning_rate):
        super(PolicyNetwork, self).__init__()
        self.hidden_layer = nn.Linear(n_inputs, n_hidden)
        self.output_layer = nn.Linear(n_hidden, n_outputs)
        self.learning_rate = learning_rate

    def forward(self, state):
        hidden = torch.tanh(self.hidden_layer(state))
        output = torch.softmax(self.output_layer(hidden), dim=-1)
        return output

# Define the REINFORCE algorithm
def reinforce(env, policy_net, gamma, n_episodes, max_steps):
    optimizer = optim.Adam(policy_net.parameters(), lr=policy_net.learning_rate)
    all_rewards = []
    for i in range(n_episodes):
        episode_rewards = []
        episode_log_probs = []
        state = env.reset()
        for t in range(max_steps):
            action_probs = policy_net(torch.from_numpy(state).float())
            action_dist = torch.distributions.Categorical(action_probs)
            action = action_dist.sample()
            log_prob = action_dist.log_prob(action)
            next_state, reward, done, _ = env.step(action.item())
            episode_rewards.append(reward)
            episode_log_probs.append(log_prob)
            state = next_state
            if done:
                break
        all_rewards.append(sum(episode_rewards))
        discounted_rewards = []
        for t in range(len(episode_rewards)):
            Gt = sum([gamma**(k-t-1)*episode_rewards[k] for k in range(t, len(episode_rewards))])
            discounted_rewards.append(Gt)
        discounted_rewards = torch.tensor(discounted_rewards)
        log_probs = torch.stack(episode_log_probs)
        loss = -(log_probs*discounted_rewards).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return all_rewards

# Define the main function to train and test the policy network
def main():
    env = gym.make('CartPole-v0')
    n_inputs = env.observation_space.shape[0]
    n_outputs = env.action_space.n
    policy_net = PolicyNetwork(n_inputs, 32, n_outputs, 0.001)
    gamma = 0.99
    n_episodes = 1000
    max_steps = 200
    rewards = reinforce(env, policy_net, gamma, n_episodes, max_steps)
    print("Average reward over {} episodes: {}".format(n_episodes, np.mean(rewards)))

if __name__ == '__main__':
    main()


TypeError: expected np.ndarray (got tuple)