## A. Environment

In [799]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class matching_pennies(gym.Env):
    """
    A matching penny game
    """
    def __init__(self):
        self.action_space = spaces.Discrete(2)
        self.opponent_action_space = spaces.Discrete(2)
        self.opponent_action = None
        
    def step(self,action):
        # reward is 1 if matched opponent's previous action and 0 otherwise
        reward = 1 if action == self.opponent_action else 0
        # observation is what the opponent is using next
        observation = self.opponent_step()
        # game never terminates
        terminated = False
        truncated = False
        info = None
        
        return observation, reward, terminated, truncated, info    
        
    def reset(self,seed=None):
        super().reset(seed=seed)
        return self.opponent_step(), None

    def opponent_step(self):
        self.opponent_action = self.opponent_action_space.sample()
        return self.opponent_action

In [800]:
env = matching_pennies()
observation, info = env.reset(seed=42)
for i in range(5):
    obs_prev = observation
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)    
    print("actions (agent opponent):",action,obs_prev,"reward:",reward,"New obs:",observation)

actions (agent opponent): 0 1 reward: 0 New obs: 0
actions (agent opponent): 0 0 reward: 1 New obs: 1
actions (agent opponent): 1 1 reward: 1 New obs: 0
actions (agent opponent): 0 0 reward: 1 New obs: 0
actions (agent opponent): 0 0 reward: 1 New obs: 0


## B. Policy Network

### B1. The Network

In [884]:
import torch
from torch import nn, cuda, optim

class PolicyNetwork(nn.Module):
    """A simple policy based on a fully-connected network."""
    def __init__(self, action_dim, state_dim, hidden_dim=1, state_onehot=False):
        super().__init__()

        self.state_dim = state_dim
        self.state_onehot = state_onehot

        # Use GPU if available, otherwise use CPU
        self.device = "cuda" if cuda.is_available() else "cpu"
        print(f"Using {self.device} device")

        self.logit = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax() # Output needs to be probabilities of each action 
        )
    
    def forward(self, state):
        if self.state_onehot:
            # Convert input to onehot encoding
            state = self.state_to_onehot(state)
        elif not isinstance(state, torch.Tensor):
            # Convert input to tensor
            state = torch.tensor(state, dtype=torch.float)
        return self.logit(state)    

    def get_action(self, state):
        # Input: a state
        # Output: a random action based on the policy's probability distribution
        probs = torch.flatten(self.forward([state]))
        action = probs.multinomial(num_samples=1,replacement=True)
        return action.detach().numpy()[0], probs[action]

    def state_to_onehot(self,state):
        # The neural network requires the state to be in one-hot encoding
        # This function converts a vector of integer states to one-hot encoding
        state = np.asarray(state).astype(int)
        onehot = np.zeros((len(state),self.state_dim),dtype=np.float32)
        onehot[np.arange(len(state)),state] = 1
        return torch.tensor(onehot)

Let us try using this policy to interact with the environment:

In [410]:
# Create env
env = matching_pennies()

# Create policy
policy = PolicyNetwork(env.action_space.n,
                       env.opponent_action_space.n,
                       state_onehot=True) #.to(device)

# Reset environment and obtain initial state
observation, info = env.reset(seed=42)

# Main Loop
for i in range(5):
    obs_prev = observation

    # Sample action
    action, prob = policy.get_action(obs_prev)

    # Interact with environment
    observation, reward, terminated, truncated, info = env.step(action)    
    print("actions:",action,obs_prev,"reward:",reward,"obs:",observation,"log_prob:",prob.detach().numpy())


Using cpu device
actions: 1 1 reward: 1 obs: 0 log_prob: [0.654229]
actions: 1 0 reward: 0 obs: 1 log_prob: [0.654229]
actions: 1 1 reward: 1 obs: 1 log_prob: [0.654229]
actions: 0 1 reward: 0 obs: 1 log_prob: [0.345771]
actions: 1 1 reward: 1 obs: 0 log_prob: [0.654229]


We can see that the policy's performance is quite poor. This is expected, as we have yet to train the policy.

### B2. Training the Policy Network with the REINFORCE Algorithm

https://link.springer.com/article/10.1007/BF00992696

In [411]:
def REINFORCE_loss(rewards, log_probs, gamma=0.9):
    """
    Loss function based on the REINFORCE algorithm.
    """

    # Compute discounted rewards
    discounted_rewards = []
    for t in range(len(rewards)):
        r = torch.flip(rewards[t:],[0]) # rewards in descending order
        gammas = torch.flip(torch.cumprod(torch.full([len(r)],gamma),dim=0),[0]) # discount factors in descending order
        r_npv = torch.sum(r * gammas) # Take dot product and sum
        discounted_rewards.append(r_npv)
    discounted_rewards = torch.tensor(discounted_rewards)

    # Standardize discounted rewards
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9) 

    # Compute loss
    loss = -1 * torch.dot(log_probs,discounted_rewards)

    return loss

In [412]:
# Training loop

# Settings
epochs = 2000
steps = 20
learning_rate=2e-3
gamma = 0.9

# Create environment, policy and optimizer
env = matching_pennies()
policy = PolicyNetwork(env.action_space.n,
                       env.opponent_action_space.n,
                       5,
                       state_onehot=True) #.to(device)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

policy.train()

observation, info = env.reset()
for i in range(epochs):
    log_probs = []
    rewards = []

    # REINFORCE needs multiple observations to update policy
    for s in range(steps):   
        obs_prev = observation
        action, prob = policy.get_action(obs_prev)
        observation, reward, terminated, truncated, info = env.step(action)

        log_probs.append(torch.log(prob))
        rewards.append(reward)

        if terminated:
            break
    
    # Convert reward and log_prob lists to tensors and move to device
    rewards_pt = torch.tensor(rewards) #.to(device)
    log_probs = torch.stack(log_probs).flatten() 
    loss = REINFORCE_loss(rewards_pt, log_probs, gamma=gamma)

    # Back propagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if i %100 == 0:
        # Print status every 100 epochs
        print("Epoch:",str(i).rjust(4),"Loss:",loss.detach().numpy(),"Mean reward:",round(np.mean(rewards),1))

    if terminated:
        # If the environment reached the termianate, reset it
        observation, info = env.reset()

Using cpu device
Epoch:    0 Loss: 0.28368735 Mean reward: 0.3
Epoch:  100 Loss: -0.54938984 Mean reward: 0.5
Epoch:  200 Loss: 1.3230834 Mean reward: 0.5
Epoch:  300 Loss: 1.3510773 Mean reward: 0.8
Epoch:  400 Loss: -4.4143143 Mean reward: 0.7
Epoch:  500 Loss: -2.1841493 Mean reward: 0.9
Epoch:  600 Loss: -2.6659837 Mean reward: 1.0
Epoch:  700 Loss: 1.6938107 Mean reward: 1.0
Epoch:  800 Loss: -5.585439 Mean reward: 0.9
Epoch:  900 Loss: -0.016009888 Mean reward: 1.0
Epoch: 1000 Loss: 0.06854717 Mean reward: 1.0
Epoch: 1100 Loss: -0.2960411 Mean reward: 1.0
Epoch: 1200 Loss: -2.1644385 Mean reward: 1.0
Epoch: 1300 Loss: 0.0043674503 Mean reward: 1.0
Epoch: 1400 Loss: -1.9839033e-05 Mean reward: 1.0
Epoch: 1500 Loss: -4.554878 Mean reward: 1.0
Epoch: 1600 Loss: -0.028746527 Mean reward: 1.0
Epoch: 1700 Loss: 0.0040667234 Mean reward: 1.0
Epoch: 1800 Loss: -0.002804216 Mean reward: 1.0
Epoch: 1900 Loss: -0.009262005 Mean reward: 1.0


As we can see, with reinforcement learning performance tends to fluctuate quite a bit even after extensive training,
so early stoppoing and model checkpointing is necessary to get good performance.

## C. Q Learning

Q Learning does not use the model to approximate the policy. Instead, it uses the model to 
estimate the net present value of the expected future stream of rewards for each action.
The action that maximizes the estimated NPV is then carried out.

- Deep Q Learning: https://arxiv.org/abs/1312.5602
- Double DQN: https://arxiv.org/abs/1509.06461


In [599]:
import torch
from torch import nn, cuda, optim

class DQN(nn.Module):
    """
    A simple Q value estimator based on a fully-connected network.
    """
    def __init__(self, action_dim, state_dim, hidden_dim=1, state_onehot=False):
        super().__init__()

        self.state_dim = state_dim
        self.state_onehot = state_onehot

        # Use GPU if available, otherwise use CPU
        self.device = "cuda" if cuda.is_available() else "cpu"
        print(f"Using {self.device} device")

        self.qvals = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim) # Output is the est. Q value of each action
        )
    
    def forward(self, state):
        if self.state_onehot:
            # Convert input to onehot encoding
            state = self.state_to_onehot(state)
        elif not isinstance(state, torch.Tensor):
            # Convert input to tensor
            state = torch.tensor(state, dtype=torch.float)
        return self.qvals(state)

    def get_action(self, state, epsilon=0,p=None):
        # Input: a state
        # Output: the value-maximizing action and the estimated value
        values = self.forward([state]).detach().numpy().flatten()
        action = np.argmax(values)

        # With probability epsilon select a random action
        if epsilon > 0 and np.random.binomial(1,epsilon) == 1:
            action = np.random.choice(np.arange(len(values)), p=p)
        
        return action, values[action]

    def state_to_onehot(self,state):
        # If the neural network requires the state to be in one-hot encoding,
        # this function converts a vector of integer states to one-hot encoding
        state = np.asarray(state).astype(int)
        onehot = np.zeros((len(state),self.state_dim),dtype=np.float32)
        onehot[np.arange(len(state)),state] = 1
        return torch.tensor(onehot)

    def copy_(self,source):
        # Copy parameters from source model
        for source_param, self_param in zip(source.parameters(), self.parameters()):
            self_param.data.copy_(source_param)

In [560]:
np.random.choice(np.arange(len([0.3,0.7])))

1

Let us try using this policy to interact with the environment:

In [562]:
# Create env
env = matching_pennies()

# Create the Q network
Q = DQN(env.action_space.n,
        env.opponent_action_space.n,
        state_onehot=True) #.to(device)

# Reset environment and obtain initial state
observation, info = env.reset(seed=42)

# Main Loop
for i in range(5):
    obs_prev = observation

    # Sample action
    action, value = Q.get_action(obs_prev,epsilon=0)

    # Interact with environment
    observation, reward, terminated, truncated, info = env.step(action)    
    print("actions:",action,obs_prev,"reward:",reward,"obs:",observation,"value:",value)


Using cpu device
actions: 0 0 reward: 1 obs: 1 value: -0.26208562
actions: 0 1 reward: 0 obs: 0 value: -0.4125787
actions: 0 0 reward: 1 obs: 1 value: -0.26208562
actions: 0 1 reward: 0 obs: 1 value: -0.4125787
actions: 0 1 reward: 0 obs: 0 value: -0.4125787


In [565]:
# Training loop

# Settings
epochs = 2000
batch_size = 32
replay_size = 200
learning_rate=2e-3
gamma = 0.9
doubleDQN = True

# Create environment, Q network and optimizer
env = matching_pennies()
Q = DQN(env.action_space.n,
        env.opponent_action_space.n,
        5,
        state_onehot=True) #.to(device)
Q_target = DQN(env.action_space.n,
               env.opponent_action_space.n,
               5,
               state_onehot=True) #.to(device)
optimizer = optim.Adam(Q.parameters(), lr=learning_rate)

Q.train() # Set the PyTorch model to training mode

# Initialize memory for experience reply
replay_memory = np.empty((replay_size,4),dtype=float)

observation, info = env.reset(seed=42)
for i in range(epochs):

    # Interact with the environment
    obs_prev = observation
    action, value = Q.get_action(obs_prev,epsilon=max(0.1,(epochs - i)/epochs))
    observation, reward, terminated, truncated, info = env.step(action)

    # Add new experience to replay memory. Note the index of each item in memory
    j = i % replay_size 
    terminated_dummy = 1 if terminated else 0
    replay_memory[j] = [obs_prev,action,reward,observation,terminated_dummy]

    # Start training after replay memory is filled
    if i > replay_size:
       
        # Sample replay memory for gradient descent
        replay_idx = np.random.choice(replay_size,batch_size)
        replay_samples = torch.tensor(replay_memory[replay_idx],dtype=torch.float) #.to(device)

        s_t = replay_samples[:,0]
        actions = replay_samples[:,1].int()
        rewards = replay_samples[:,2]
        s_t1 = replay_samples[:,3]
            
        # Compute temporal difference loss
        q = Q(s_t)[np.arange(batch_size),actions]
        if doubleDQN:
            qt_actions = torch.max(Q(s_t1),dim=1)[1]
            q_target = Q_target(s_t1)[np.arange(batch_size),qt_actions]
        else:
            q_target = torch.max(Q_target(s_t1),dim=1)[0]
        y = rewards + (1 - terminated_dummy) * gamma * q_target
        loss_fn = nn.MSELoss()
        loss = loss_fn(q, y)
            
        # Back propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if i % 32 == 0:
            # Periodically copy parameters to target network
            Q_target.copy_(Q)

        if i % 100 == 0:
            # Print status every 100 epochs
            print("Epoch:",str(i).rjust(4),
                  "Loss:",loss.detach().numpy(),
                  "Mean reward:",round(np.mean(rewards.detach().numpy()),1))

    if terminated:
        # If the environment reached the termianate, reset it
        observation, info = env.reset()

Using cpu device
Using cpu device
Epoch:  300 Loss: 0.2954119 Mean reward: 0.4
Epoch:  400 Loss: 0.37430918 Mean reward: 0.6
Epoch:  500 Loss: 0.2813556 Mean reward: 0.6
Epoch:  600 Loss: 0.22838762 Mean reward: 0.5
Epoch:  700 Loss: 0.3197466 Mean reward: 0.6
Epoch:  800 Loss: 0.17621846 Mean reward: 0.6
Epoch:  900 Loss: 0.16213207 Mean reward: 0.5
Epoch: 1000 Loss: 0.108626485 Mean reward: 0.7
Epoch: 1100 Loss: 0.03381476 Mean reward: 0.8
Epoch: 1200 Loss: 0.006733194 Mean reward: 0.9
Epoch: 1300 Loss: 0.0070919204 Mean reward: 1.0
Epoch: 1400 Loss: 0.003033378 Mean reward: 1.0
Epoch: 1500 Loss: 0.0008553036 Mean reward: 1.0
Epoch: 1600 Loss: 0.0043909918 Mean reward: 0.9
Epoch: 1700 Loss: 0.0047095013 Mean reward: 0.9
Epoch: 1800 Loss: 0.00028236024 Mean reward: 1.0
Epoch: 1900 Loss: 0.0013501876 Mean reward: 0.9


In [368]:
# Main Loop
observation, info = env.reset()
for i in range(5):
    obs_prev = observation

    # Sample action
    action, value = Q.get_action(obs_prev,epsilon=0)

    # Interact with environment
    observation, reward, terminated, truncated, info = env.step(action)    
    print("actions:",action,obs_prev,"reward:",reward,"obs:",observation,"value:",value)

actions: 0 0 reward: 1 obs: 0 value: 9.752915
actions: 0 0 reward: 1 obs: 1 value: 9.752915
actions: 1 1 reward: 1 obs: 1 value: 9.75479
actions: 1 1 reward: 1 obs: 1 value: 9.75479
actions: 1 1 reward: 1 obs: 1 value: 9.75479


In [49]:
class market:
    def __init__(self):
        self.action_space = [0,1]
    def step(self):
        # observation is price and quantity transacted
        observation = [p, q]
        # reward is profit
        reward = [pi]
        # terminated
        
        return observation, reward, terminated, truncated, info

## Quasi-Hyperbolic Discounting Pleasant Task

In [566]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class movies(gym.Env):
    """
    Quasi-hyperbolic discounting pleasant task example.
    """

    def __init__(self):
        self.action_space = spaces.Discrete(2) # Possible actions are watch now or not
        self.movie_quality = [1,1.5,2.25,3.375]
        self.t = 0
        self.terminated = False
                
    def step(self,action):

        if self.terminated or action == 0:
            reward = 0
        elif action == 1:
            reward = self.movie_quality[self.t]
            self.terminated = True
        
        if self.t == 3:
            self.terminated = True

        self.t += 1       
        observation = self.t
        truncated = False
        info = None
        
        return observation, reward, self.terminated, truncated, info    
        
    def reset(self,seed=None):
        super().reset(seed=seed)
        self.t = 0
        self.terminated = False
        return self.t, None

In [578]:
# Create env
env = movies()

# Create the Q network
Q = DQN(env.action_space.n,1) #.to(device)

# Reset environment and obtain initial state
observation, info = env.reset()

# Main Loop
for i in range(5):
    obs_prev = observation

    # Sample action
    action, value = Q.get_action(obs_prev,epsilon=1)

    # Interact with environment
    observation, reward, terminated, truncated, info = env.step(action)    
    print("actions:",action,"obs_prev:",obs_prev,"reward:",reward,"obs_next:",observation,"value:",value)

    if terminated:
        break


Using cpu device
actions: 0 obs_prev: 0 reward: 0 obs_next: 1 value: 0.4543814
actions: 1 obs_prev: 1 reward: 1.5 obs_next: 2 value: -0.038886547


In [732]:
# Training loop

# Settings
epochs = 2000
batch_size = 32
replay_size = 200
learning_rate=2e-2
gamma = 0.9
doubleDQN = True

# Create environment, Q network and optimizer
env = movies()
Q = DQN(env.action_space.n,1,10) #.to(device)
Q_target = DQN(env.action_space.n,1,10) #.to(device)
optimizer = optim.Adam(Q.parameters(),lr=learning_rate)

Q.train() # Set the PyTorch model to training mode

# Initialize memory for experience reply
replay_memory = np.empty((replay_size,5),dtype=float)

observation, info = env.reset(seed=42)
for i in range(epochs):

    # Interact with the environment
    obs_prev = observation
    action, value = Q.get_action(obs_prev,
                                 epsilon=max(0.1,(epochs - i)/epochs))
    observation, reward, terminated, truncated, info = env.step(action)

    # Add new experience to replay memory. Note the index of each item in memory
    j = i % replay_size 
    terminated_dummy = 1 if terminated else 0
    replay_memory[j] = [obs_prev,action,reward,observation,terminated_dummy]

    #if i == 200:
    #    print(replay_memory)

    # Start training after replay memory is filled
    if i > replay_size:

        # Sample replay memory for gradient descent
        replay_idx = np.random.choice(replay_size,batch_size)
        replay_samples = torch.tensor(replay_memory[replay_idx],dtype=torch.float) #.to(device)

        # State vector's shape is (batch_size,state_dim)
        # Action and reward vectors' shape is (batch_size)
        s_t = replay_samples[:,0].view(-1,1) 
        actions = replay_samples[:,1].int()  
        rewards = replay_samples[:,2]
        s_t1 = replay_samples[:,3].view(-1,1)
        terminated_dummy = replay_samples[:,4]
           
        # Compute temporal difference loss
        q = Q(s_t)[np.arange(batch_size),actions]
        if doubleDQN:
            qt_actions = torch.max(Q(s_t1),dim=1)[1]
            q_target = Q_target(s_t1)[np.arange(batch_size),qt_actions]
        else:
            q_target = torch.max(Q_target(s_t1),dim=1)[0]
        y = rewards + (1 - terminated_dummy) * gamma * q_target
        loss_fn = nn.MSELoss()
        loss = loss_fn(q,y)
            
        # Back propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        #if i == 300:
        #    print(actions.size())
        #    print(q.size())
        #    print(qt_actions.size())
        #    print(q_target.size())
        #    print(rewards.size())
        #    print(y.size())
        
        if i % 64 == 0:
            # Periodically copy parameters to target network
            Q_target.copy_(Q)

        if i % 100 == 0:
            # Print status every 100 epochs
            print("Epoch:",str(i).rjust(4),
                  "Loss:",loss.detach().numpy(),
                  "Mean state:",round(np.mean(s_t.detach().numpy()),1),
                  "Mean reward:",round(np.mean(rewards.detach().numpy()),1),
                  "Mean y:",round(np.mean(y.detach().numpy()),1))

    if terminated:
        # If the environment reached the termianate, reset it
        observation, info = env.reset()

Using cpu device
Using cpu device
Epoch:  300 Loss: 0.32236484 Mean state: 0.6 Mean reward: 0.9 Mean y: 1.2
Epoch:  400 Loss: 0.37459922 Mean state: 1.1 Mean reward: 1.0 Mean y: 1.7
Epoch:  500 Loss: 0.042553537 Mean state: 0.8 Mean reward: 0.6 Mean y: 1.8
Epoch:  600 Loss: 0.0049793064 Mean state: 0.9 Mean reward: 1.0 Mean y: 2.2
Epoch:  700 Loss: 0.00027488632 Mean state: 1.0 Mean reward: 0.8 Mean y: 2.2
Epoch:  800 Loss: 0.00013354213 Mean state: 1.1 Mean reward: 0.7 Mean y: 2.2
Epoch:  900 Loss: 1.7203974e-05 Mean state: 1.2 Mean reward: 0.8 Mean y: 2.6
Epoch: 1000 Loss: 8.674604e-06 Mean state: 1.1 Mean reward: 0.6 Mean y: 2.4
Epoch: 1100 Loss: 1.5109847e-06 Mean state: 1.3 Mean reward: 0.5 Mean y: 2.4
Epoch: 1200 Loss: 2.5104993e-07 Mean state: 1.1 Mean reward: 0.6 Mean y: 2.7
Epoch: 1300 Loss: 9.723606e-08 Mean state: 0.9 Mean reward: 0.7 Mean y: 2.4
Epoch: 1400 Loss: 1.9826613e-08 Mean state: 1.2 Mean reward: 0.5 Mean y: 2.5
Epoch: 1500 Loss: 4.9950737e-09 Mean state: 1.3 Mean 

In [733]:
# Reset environment and obtain initial state
observation, info = env.reset()

# Main Loop
for i in range(5):
    obs_prev = observation

    # Sample actionv 
    action, value = Q.get_action(obs_prev,epsilon=0)

    # Interact with environment
    observation, reward, terminated, truncated, info = env.step(action)    
    print("actions:",action,"obs_prev:",obs_prev,"reward:",reward,"obs_next:",observation,"value:",value)

    if terminated:
        break


actions: 0 obs_prev: 0 reward: 0 obs_next: 1 value: 2.4603784
actions: 0 obs_prev: 1 reward: 0 obs_next: 2 value: 2.7337518
actions: 0 obs_prev: 2 reward: 0 obs_next: 3 value: 3.0374982
actions: 1 obs_prev: 3 reward: 3.375 obs_next: 4 value: 3.3749976


In [735]:
for i in range(4):
    print(np.round(Q([i]).detach().numpy(),2))

[2.46 1.  ]
[2.73 1.5 ]
[3.04 2.25]
[0.   3.37]


## Quasi-Hyperbolic Discounting

In [795]:
# Training loop

# Settings
epochs = 4000
batch_size = 32
replay_size = 200
learning_rate=2e-2
gamma = 0.99
beta = 0.5
sophisticated = True

# Create environment, Q network and optimizer
env = movies()
Q = DQN(env.action_space.n,1,10) #.to(device)
Qd = DQN(env.action_space.n,1,10) #.to(device)
Qd_target = DQN(env.action_space.n,1,10) #.to(device)
optimizer = optim.Adam(Q.parameters(),lr=learning_rate)
optimizer_d = optim.Adam(Qd.parameters(),lr=learning_rate)

Q.train() # Set the PyTorch model to training mode
Qd.train()

# Initialize memory for experience reply
replay_memory = np.empty((replay_size,5),dtype=float)

observation, info = env.reset(seed=42)
for i in range(epochs):

    # Interact with the environment
    obs_prev = observation
    action, value = Qd.get_action(obs_prev,
                                 epsilon=max(0.1,(epochs - i)/epochs))
    observation, reward, terminated, truncated, info = env.step(action)

    # Add new experience to replay memory. Note the index of each item in memory
    j = i % replay_size 
    terminated_dummy = 1 if terminated else 0
    replay_memory[j] = [obs_prev,action,reward,observation,terminated_dummy]

    #if i == 200:
    #    print(replay_memory)

    # Start training after replay memory is filled
    if i > replay_size:

        # Sample replay memory for gradient descent
        replay_idx = np.random.choice(replay_size,batch_size)
        replay_samples = torch.tensor(replay_memory[replay_idx],dtype=torch.float) #.to(device)

        # State vector's shape is (batch_size,state_dim)
        # Action and reward vectors' shape is (batch_size)
        s_t = replay_samples[:,0].view(-1,1) 
        actions = replay_samples[:,1].int()  
        rewards = replay_samples[:,2]
        s_t1 = replay_samples[:,3].view(-1,1)
        terminated_dummy = replay_samples[:,4]

        # Compute temporal difference loss
        qd = Qd(s_t)[np.arange(batch_size),actions]
        
        # DoubleDQN
        qdt_actions = torch.max(Qd(s_t1),dim=1)[1]
        qd_target = Qd_target(s_t1)[np.arange(batch_size),qdt_actions]
        
        yd = rewards + (1 - terminated_dummy) * gamma * qd_target
        loss_fn_d = nn.MSELoss()
        loss_d = loss_fn_d(qd,yd)
            
        # Back propagation
        optimizer_d.zero_grad()
        loss_d.backward()
        optimizer_d.step()        

        if i % 64 == 0:
            # Periodically copy parameters to target network
            Qd_target.copy_(Qd)        
           
        # Compute temporal difference loss
        q = Q(s_t)[np.arange(batch_size),actions]
        with torch.no_grad():
            if sophisticated:
                qt_actions = torch.max(Q(s_t1),dim=1)[1]
                q_target = Qd_target(s_t1)[np.arange(batch_size),qt_actions]
            else:
                q_target = qd_target.detach()
        y = rewards + (1 - terminated_dummy) * beta * gamma * q_target.detach()
        loss_fn = nn.MSELoss()
        loss = loss_fn(q,y)
            
        # Back propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        #if i == 300:
        #    print(actions.size())
        #    print(q.size())
        #    print(qt_actions.size())
        #    print(q_target.size())
        #    print(rewards.size())
        #    print(y.size())

        if i % 100 == 0:
            # Print status every 100 epochs
            print("Epoch:",str(i).rjust(4),
                  "Loss:",loss.detach().numpy(),
                  "Mean state:",round(np.mean(s_t.detach().numpy()),1),
                  "Mean reward:",round(np.mean(rewards.detach().numpy()),1),
                  "Mean y:",round(np.mean(y.detach().numpy()),1))

    if terminated:
        # If the environment reached the termianate, reset it
        observation, info = env.reset()

Using cpu device
Using cpu device
Using cpu device
Epoch:  300 Loss: 0.034831237 Mean state: 0.8 Mean reward: 0.5 Mean y: 1.2
Epoch:  400 Loss: 0.005780112 Mean state: 1.0 Mean reward: 0.9 Mean y: 1.3
Epoch:  500 Loss: 0.0019741203 Mean state: 0.7 Mean reward: 0.6 Mean y: 1.1
Epoch:  600 Loss: 0.0012389993 Mean state: 0.9 Mean reward: 0.7 Mean y: 1.3
Epoch:  700 Loss: 5.5109354e-05 Mean state: 0.8 Mean reward: 0.7 Mean y: 1.2
Epoch:  800 Loss: 4.3531458e-05 Mean state: 0.7 Mean reward: 0.7 Mean y: 1.2
Epoch:  900 Loss: 6.5375214e-05 Mean state: 0.9 Mean reward: 0.6 Mean y: 1.3
Epoch: 1000 Loss: 1.0686587e-05 Mean state: 1.0 Mean reward: 0.5 Mean y: 1.2
Epoch: 1100 Loss: 1.6823014e-05 Mean state: 0.9 Mean reward: 0.6 Mean y: 1.2
Epoch: 1200 Loss: 2.4930569e-07 Mean state: 1.1 Mean reward: 0.9 Mean y: 1.5
Epoch: 1300 Loss: 2.9762127e-06 Mean state: 0.9 Mean reward: 0.6 Mean y: 1.3
Epoch: 1400 Loss: 1.632975e-06 Mean state: 1.1 Mean reward: 0.9 Mean y: 1.3
Epoch: 1500 Loss: 4.0544082e-06 

In [796]:
# Reset environment and obtain initial state
observation, info = env.reset()

# Main Loop
for i in range(5):
    obs_prev = observation

    # Sample actionv 
    action, value = Q.get_action(obs_prev,epsilon=0)

    # Interact with environment
    observation, reward, terminated, truncated, info = env.step(action)    
    print("actions:",action,"obs_prev:",obs_prev,"reward:",reward,"obs_next:",observation,"value:",value)

    if terminated:
        break


actions: 1 obs_prev: 0 reward: 1 obs_next: 1 value: 0.99475443


In [797]:
for i in range(4):
    print(np.round(Q([i]).detach().numpy(),2))

[0.74 0.99]
[1.11 1.5 ]
[1.66 2.25]
[-0.01  3.38]


In [798]:
for i in range(4):
    print(np.round(Qd([i]).detach().numpy(),2))

[3.27 1.  ]
[3.31 1.5 ]
[3.34 2.25]
[-0.    3.37]


# PPO

- GAE: https://arxiv.org/abs/1506.02438
- PPO: https://arxiv.org/abs/1707.06347

We add two things in comparison with REINFORCE:
1. Multiply the Generalized Advantage Estimate (GAE) instead of discounted sum of realized returns to the outputed probabilities from the policy network. This is a better choice because realized returns as a estimator for value has high variance.
2. Add the predicted future value from a value network whenever discounted sum of returns is used.

The objective function has an additional term that trains the value network by minimizing the MSE of the predicted future value with the NPV of actual returns.

In [879]:
def PPO_loss(probs, td_residuals, rewards, values, values_next,
             gamma=0.9, Lambda=0.9, epsilon=0.2):
    """
    PPO loss function
    """

    ### Part I. Policy Loss (L_CLIP in PPO paper) ###
    
    # Clipped probability ratio
    probs_old = probs.detach()
    probs_ratio = probs/probs_old
    clipped_ratio = torch.clamp(probs_ratio,1-epsilon,1+epsilon)
        
    # Generalized advantage estimate
    GAE_list = []
    for t in range(len(td_residuals)):
        a = torch.flip(td_residuals[t:],[0]) # td_residuals in descending order
        gl = torch.flip(torch.cumprod(torch.full([len(a)],gamma*Lambda),dim=0),[0]) # discount factors in descending order
        GAE = torch.sum(a * gl) # Take dot product and sum
        GAE_list.append(GAE)
    GAE_list = torch.tensor(GAE_list)
    
    # Policy loss
    policy_loss = -1 * torch.min(torch.dot(probs_ratio,GAE_list),
                                 torch.dot(clipped_ratio,GAE_list))

    ### Part II. Value Loss ###
    
    # Discounted rewards
    vals_next = values_next.detach().flatten()
    discounted_rewards = []
    for t in range(len(rewards)):
        r = torch.flip(rewards[t:],[0]) # rewards in descending order
        gammas = torch.flip(torch.cumprod(torch.full([len(r)],gamma),dim=0),[0]) # discount factors in descending order
        r_npv = torch.sum(r * gammas)  + gamma**len(rewards) * vals_next[-1] # Take dot product and sum
        discounted_rewards.append(r_npv)
    discounted_rewards = torch.tensor(discounted_rewards)    

    # Value loss
    value_mse = torch.nn.MSELoss()
    value_loss = value_mse(values,discounted_rewards)

    ### Part I + II: Total Loss (L_CLIP+VF in PPO paper) ###
    
    # If policy and value networks do not share parameters,
    # they are optimized separately so equal weight is fine.
    loss = policy_loss + value_loss

    return loss

In [880]:
# Training loop

# Settings
epochs = 2000
steps = 20
updates = 3
learning_rate=2e-3
gamma = 0.9
Lambda = 0.9

# Create environment, policy network, value network and optimizer
env = matching_pennies()
policy = PolicyNetwork(env.action_space.n,env.opponent_action_space.n,5) #.to(device)
value = DQN(1,env.opponent_action_space.n,5,state_onehot=True) 
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

policy.train()
value.train()

observation, info = env.reset()
for i in range(epochs):
    probs_list = []
    rewards_list = []
    obs_list = [] 
    obs_prev_list = []

    # Needs multiple observations to update policy
    for s in range(steps):   
        obs_prev = observation
        action, prob = policy.get_action(obs_prev)
        observation, reward, terminated, truncated, info = env.step(action)

        probs_list.append(prob)
        rewards_list.append(reward)
        obs_list.append(observation)
        obs_prev_list.append(obs_prev)

        if terminated:
            break

    # Compute value for observed states
    values = value(torch.tensor(obs_list)).flatten()
    values_prev = value(torch.tensor(obs_prev_list)).flatten()

    # Compute TD residual
    rewards = torch.tensor(rewards_list)
    td_residuals = rewards + gamma * values.detach() - values_prev.detach()
    
    # Compute PPO loss
    probs = torch.stack(probs_list).flatten() 
    loss = PPO_loss(  probs, 
                      td_residuals,
                      rewards_pt, 
                      values_prev,
                      values,
                      gamma=gamma,
                      Lambda=Lambda)

    # Back propagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if i %100 == 0:
        # Print status every 100 epochs
        print("Epoch:",str(i).rjust(4),"Loss:",loss.detach().numpy(),"Mean reward:",round(np.mean(rewards_list),1))

    if terminated:
        # If the environment reached the termianate, reset it
        observation, info = env.reset()


Using cpu device
Using cpu device
Epoch:    0 Loss: -6.9472485 Mean reward: 0.5
Epoch:  100 Loss: -25.05658 Mean reward: 0.7
Epoch:  200 Loss: 2.1252441 Mean reward: 0.4
Epoch:  300 Loss: -31.24781 Mean reward: 0.9
Epoch:  400 Loss: -29.000067 Mean reward: 0.8
Epoch:  500 Loss: -39.239563 Mean reward: 1.0
Epoch:  600 Loss: -38.19125 Mean reward: 1.0
Epoch:  700 Loss: -36.576694 Mean reward: 1.0
Epoch:  800 Loss: -36.789467 Mean reward: 1.0
Epoch:  900 Loss: -39.429123 Mean reward: 1.0
Epoch: 1000 Loss: -39.313236 Mean reward: 1.0
Epoch: 1100 Loss: -38.557446 Mean reward: 1.0
Epoch: 1200 Loss: -39.268036 Mean reward: 1.0
Epoch: 1300 Loss: -38.251858 Mean reward: 1.0
Epoch: 1400 Loss: -38.35975 Mean reward: 1.0
Epoch: 1500 Loss: -37.360153 Mean reward: 1.0
Epoch: 1600 Loss: -39.329643 Mean reward: 1.0
Epoch: 1700 Loss: -39.281906 Mean reward: 1.0
Epoch: 1800 Loss: -39.241333 Mean reward: 1.0
Epoch: 1900 Loss: -39.036697 Mean reward: 1.0


In [881]:
obs_list

[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1]

In [882]:
value(torch.tensor(obs_list)).flatten()

tensor([1.0046, 1.0046, 1.0046, 1.0046, 1.0046, 0.6212, 1.0046, 0.6212, 0.6212,
        1.0046, 1.0046, 0.6212, 1.0046, 0.6212, 0.6212, 1.0046, 1.0046, 0.6212,
        1.0046, 0.6212], grad_fn=<ViewBackward0>)

In [926]:
# Training loop

# Settings
epochs = 2000
steps = 20
updates = 3
learning_rate=2e-3
gamma = 0.9
Lambda = 0.9

# Create environment, policy network, value network and optimizer
env = movies()
policy = PolicyNetwork(env.action_space.n,1,10) #.to(device)
value = DQN(1,1,10) # Single output because we just need V(s_t), not V(a_t|s_t)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

policy.train()
value.train()

observation, info = env.reset()
for i in range(epochs):
    probs_list = []
    rewards_list = []
    obs_list = [] 
    obs_prev_list = []

    # Needs multiple observations to update policy
    for s in range(steps):   
        obs_prev = observation
        action, prob = policy.get_action(obs_prev)
        observation, reward, terminated, truncated, info = env.step(action)

        probs_list.append(prob)
        rewards_list.append(reward)
        obs_list.append(observation)
        obs_prev_list.append(obs_prev)

        if terminated:
            break

    # Compute value for observed states
    values = value(torch.tensor(obs_list,dtype=torch.float).view(-1,1)).flatten()
    values_prev = value(torch.tensor(obs_prev_list,dtype=torch.float).view(-1,1)).flatten()

    # Compute TD residual
    rewards = torch.tensor(rewards_list,dtype=torch.float)
    td_residuals = rewards + gamma * values.detach() - values_prev.detach()
    
    # Compute PPO loss
    probs = torch.stack(probs_list).flatten() 
    loss = PPO_loss(  probs, 
                      td_residuals,
                      rewards, 
                      values_prev,
                      values,
                      gamma=gamma,
                      Lambda=Lambda)

    # Back propagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if i %100 == 0:
        # Print status every 100 epochs
        print("Epoch:",str(i).rjust(4),"Loss:",loss.detach().numpy(),"Mean reward:",round(np.mean(rewards_list),1))

    if terminated:
        # If the environment reached the termianate, reset it
        observation, info = env.reset()


Using cpu device
Using cpu device
Epoch:    0 Loss: -0.010719001 Mean reward: 1.0
Epoch:  100 Loss: -0.010719001 Mean reward: 1.0
Epoch:  200 Loss: -0.60822475 Mean reward: 0.8
Epoch:  300 Loss: -0.010719001 Mean reward: 1.0
Epoch:  400 Loss: -0.010719001 Mean reward: 1.0
Epoch:  500 Loss: -0.60822475 Mean reward: 0.8
Epoch:  600 Loss: -0.60822475 Mean reward: 0.8
Epoch:  700 Loss: -1.1804235 Mean reward: 0.8
Epoch:  800 Loss: -0.010719001 Mean reward: 1.0
Epoch:  900 Loss: -0.010719001 Mean reward: 1.0
Epoch: 1000 Loss: -0.010719001 Mean reward: 1.0
Epoch: 1100 Loss: -1.1804235 Mean reward: 0.8
Epoch: 1200 Loss: -0.010719001 Mean reward: 1.0
Epoch: 1300 Loss: -0.60822475 Mean reward: 0.8
Epoch: 1400 Loss: -0.010719001 Mean reward: 1.0
Epoch: 1500 Loss: -1.1804235 Mean reward: 0.8
Epoch: 1600 Loss: -1.1804235 Mean reward: 0.8
Epoch: 1700 Loss: -0.010719001 Mean reward: 1.0
Epoch: 1800 Loss: -1.1804235 Mean reward: 0.8
Epoch: 1900 Loss: -0.60822475 Mean reward: 0.8


In [927]:
for i in range(4):
    print(np.round(value([i]).detach().numpy(),2))

[-0.12]
[-0.15]
[-0.28]
[-0.43]


In [928]:
for i in range(4):
    print(np.round(policy([i]).detach().numpy(),2))

[0.78 0.22]
[0.64 0.36]
[0.24 0.76]
[0.05 0.95]
