## A. Environment

In [8]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class matching_pennies(gym.Env):
    """
    A repeated matching-pennies game against a uniformly random opponent.

    Each observation is the action the opponent has just drawn. On the next
    call to ``step`` the agent earns a reward of 1 if its action equals that
    most recently observed opponent action and 0 otherwise, after which the
    opponent draws a new action. The game never terminates or truncates.
    """
    def __init__(self):
        self.action_space = spaces.Discrete(2)
        self.opponent_action_space = spaces.Discrete(2)
        # Observations are opponent actions, so they live in an identical space
        self.observation_space = spaces.Discrete(2)
        # Most recent opponent draw; stays None until reset() has been called
        self.opponent_action = None

    def step(self,action):
        """
        Score the agent's action against the last observed opponent action.

        Returns the usual Gymnasium 5-tuple
        ``(observation, reward, terminated, truncated, info)``.
        """
        # reward is 1 if matched opponent's previous action and 0 otherwise
        reward = 1 if action == self.opponent_action else 0
        # observation is the opponent's next draw
        observation = self._opponent_action()
        # game never terminates
        terminated = False
        truncated = False
        # Gymnasium expects a dict here, even when there is nothing to report
        info = {}

        return observation, reward, terminated, truncated, info

    def reset(self,seed=None,options=None):
        """
        Start a new game and return ``(first opponent action, info)``.

        ``seed`` re-seeds the environment's random generator, which drives the
        opponent, so passing the same seed reproduces the same opponent
        sequence. ``options`` is accepted for Gymnasium API compatibility and
        is ignored.
        """
        super().reset(seed=seed)
        return self._opponent_action(), {}

    def _opponent_action(self):
        """Draw, store and return the opponent's next action."""
        # Use the env's own generator (seeded by reset) rather than the
        # space's private RNG, which reset(seed=...) does not touch.
        self.opponent_action = self.np_random.integers(self.opponent_action_space.n)
        return self.opponent_action

    def to_onehot(self,action,dim):
        """Return a float32 one-hot vector of length ``dim`` with index ``action`` set."""
        a = np.zeros(dim,dtype=np.float32)
        a[action] = 1.0
        return a

In [9]:
# Sanity check: play five rounds with an agent that picks actions uniformly at random.
env = matching_pennies()
observation, info = env.reset(seed=42)

for _ in range(5):
    # The opponent action seen before this step is the one the agent must match
    obs_prev, action = observation, env.action_space.sample()
    step_result = env.step(action)
    observation, reward, terminated, truncated, info = step_result
    print("actions (agent opponent):",action,obs_prev,"reward:",reward,"New obs:",observation)

actions (agent opponent): 1 0 reward: 0 New obs: 1
actions (agent opponent): 0 1 reward: 0 New obs: 1
actions (agent opponent): 1 1 reward: 1 New obs: 0
actions (agent opponent): 0 0 reward: 1 New obs: 0
actions (agent opponent): 0 0 reward: 1 New obs: 1


## B. Policy Network

### B1. The Network

In [18]:
import torch
from torch import nn, cuda, optim

class PolicyNetwork(nn.Module):
    """
    A simple stochastic policy based on a fully-connected network.

    The network maps a (one-hot) state vector to a probability distribution
    over actions: Linear -> ReLU -> Linear -> Softmax over the last axis.

    Args:
        action_dim: number of discrete actions (size of the output).
        state_dim: length of the state vector (size of the input).
        hidden_dim: width of the single hidden layer.
    """
    def __init__(self, action_dim, state_dim, hidden_dim=1):
        super().__init__()

        # Use GPU if available, otherwise use CPU.
        # Note: this only records and reports the device; the module itself
        # is not moved here, callers must call .to(device) themselves.
        self.device = "cuda" if cuda.is_available() else "cpu"
        print(f"Using {self.device} device")

        # Despite the attribute name, the final layer already applies softmax,
        # so this sub-network outputs probabilities, not logits.
        self.logit = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            # Normalise explicitly over the action axis; the implicit-dim form
            # is deprecated and picks the wrong axis for some input ranks.
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Return action probabilities for state(s) ``x`` (last axis = actions)."""
        return self.logit(x)

    def get_action(self, state):
        """
        Sample an action from the policy.

        Args:
            state: a single state vector of shape (state_dim,) or a batch of
                shape (batch, state_dim).

        Returns:
            (action, prob): the sampled action index/indices and the
            probability the policy assigned to them, with shapes (1,) for a
            single state or (batch, 1) for a batch. ``prob`` stays attached
            to the autograd graph so it can be used in a policy-gradient loss.
        """
        probs = self.forward(state)
        action = probs.multinomial(num_samples=1,replacement=True)
        # gather picks each row's own sampled action, so batches work too
        return action, probs.gather(-1, action)

Let us try using this policy to interact with the environment:

In [19]:
# Create env
env = matching_pennies()

# Create policy (untrained, with the default single hidden unit)
policy = PolicyNetwork(env.action_space.n,env.opponent_action_space.n) #.to(device)

# Reset environment and obtain initial state
observation, info = env.reset(seed=42)

# Main Loop
for i in range(5):
    obs_prev = observation

    # The neural network requires the state to be in one-hot encoding
    state_onehot = torch.from_numpy(env.to_onehot(obs_prev,env.opponent_action_space.n))

    # Sample action; this is a pure rollout, so no autograd graph is needed
    with torch.no_grad():
        action, prob = policy.get_action(state_onehot)

    # Convert action from PyTorch tensor to numpy
    action = action.numpy()

    # Interact with environment
    observation, reward, terminated, truncated, info = env.step(action)
    # get_action returns the probability of the sampled action (not its log),
    # so label it accordingly
    print("actions:",action,obs_prev,"reward:",reward,"obs:",observation,"prob:",prob.numpy())


Using cpu device
actions: [0] 0 reward: 1 obs: 1 log_prob: [0.18378031]
actions: [0] 1 reward: 0 obs: 0 log_prob: [0.18378031]
actions: [1] 0 reward: 0 obs: 1 log_prob: [0.8162197]
actions: [1] 1 reward: 1 obs: 0 log_prob: [0.8162197]
actions: [1] 0 reward: 0 obs: 1 log_prob: [0.8162197]


We can see that the policy's performance is quite poor. This is expected, as we have yet to train the policy.

### B2. Training the Policy Network with the REINFORCE Algorithm

https://link.springer.com/article/10.1007/BF00992696

In [20]:
def REINFORCE_loss(policy, rewards, log_probs, gamma=0.9):
    """
    Loss function based on the REINFORCE algorithm.

    Args:
        policy: unused; kept so existing call sites keep working.
        rewards: 1-D sequence/tensor of per-step rewards r_0 .. r_{T-1}.
        log_probs: 1-D float tensor with the log-probability of the action
            taken at each step (same length as ``rewards``).
        gamma: discount factor.

    Returns:
        A scalar tensor: minus the dot product of ``log_probs`` with the
        standardized discounted returns. Gradients flow through
        ``log_probs`` only.
    """
    rewards = torch.as_tensor(rewards, dtype=log_probs.dtype, device=log_probs.device)

    # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated backwards in
    # a single O(T) pass.
    discounted_rewards = torch.zeros_like(rewards)
    running = 0.0
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t].item() + gamma * running
        discounted_rewards[t] = running

    # Standardize discounted rewards (acts as a simple baseline). The sample
    # std of a single value is NaN, so one-step trajectories are left as-is.
    if discounted_rewards.numel() > 1:
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)

    # Compute loss
    loss = -1 * torch.dot(log_probs,discounted_rewards)

    return loss

In [22]:
# Train the policy network with REINFORCE

# Hyperparameters
epochs, steps = 2000, 20
learning_rate = 2e-3
gamma = 0.9

# Environment, policy (5 hidden units) and optimizer
env = matching_pennies()
policy = PolicyNetwork(env.action_space.n,env.opponent_action_space.n,5) #.to(device)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

policy.train()

observation, info = env.reset()
for epoch in range(epochs):
    step_log_probs, rewards = [], []

    # Collect a short trajectory; REINFORCE updates from several steps at once
    for _ in range(steps):
        state_onehot = torch.from_numpy(env.to_onehot(observation,env.opponent_action_space.n))
        sampled_action, prob = policy.get_action(state_onehot)
        observation, reward, terminated, truncated, info = env.step(sampled_action.detach().numpy())

        step_log_probs.append(torch.log(prob))
        rewards.append(reward)

        if terminated:
            break

    # Pack the trajectory into tensors and evaluate the loss
    loss = REINFORCE_loss(
        policy,
        torch.tensor(rewards), #.to(device)
        torch.stack(step_log_probs).flatten(),
        gamma=gamma,
    )

    # Gradient step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report progress every 100 epochs
    if epoch % 100 == 0:
        print("Epoch:",str(epoch).rjust(4),"Loss:",loss.detach().numpy(),"Mean reward:",round(np.mean(rewards),1))

    # Start a fresh episode if the environment signalled termination
    if terminated:
        observation, info = env.reset()

Using cpu device
Epoch:    0 Loss: 0.11626309 Mean reward: 0.6
Epoch:  100 Loss: -0.8199182 Mean reward: 0.4
Epoch:  200 Loss: -1.1424112 Mean reward: 0.6
Epoch:  300 Loss: -0.25469792 Mean reward: 0.7
Epoch:  400 Loss: -2.2170753 Mean reward: 0.9
Epoch:  500 Loss: -2.707262 Mean reward: 0.8
Epoch:  600 Loss: -3.4863317 Mean reward: 0.8
Epoch:  700 Loss: -1.6021891 Mean reward: 0.9
Epoch:  800 Loss: 0.030924983 Mean reward: 1.0
Epoch:  900 Loss: -1.7986401 Mean reward: 1.0
Epoch: 1000 Loss: -5.0884314 Mean reward: 0.9
Epoch: 1100 Loss: 2.4223878 Mean reward: 1.0
Epoch: 1200 Loss: -0.010129217 Mean reward: 1.0
Epoch: 1300 Loss: -2.4018488 Mean reward: 1.0
Epoch: 1400 Loss: -0.010212412 Mean reward: 1.0
Epoch: 1500 Loss: 0.012451749 Mean reward: 1.0
Epoch: 1600 Loss: 0.015387452 Mean reward: 1.0
Epoch: 1700 Loss: 0.0010852888 Mean reward: 1.0
Epoch: 1800 Loss: 4.445414 Mean reward: 0.9
Epoch: 1900 Loss: 4.552398e-05 Mean reward: 1.0


As we can see, with reinforcement learning, performance tends to fluctuate quite a bit even after extensive training,
so early stopping and model checkpointing are necessary to get good performance.

## C. Q Learning

Q Learning does not use the model to approximate the policy. Instead, it uses the model to 
estimate the net present value of the expected future stream of rewards for each action.
The action that maximizes the estimated NPV is then carried out.

https://arxiv.org/abs/1312.5602


In [307]:
import torch
from torch import nn, cuda, optim

class DQN(nn.Module):
    """
    A simple Q value estimator based on a fully-connected network.

    States are integer indices in ``[0, state_dim)``; they are one-hot encoded
    internally and mapped through Linear -> ReLU -> Linear to one estimated
    Q value per action.

    Args:
        action_dim: number of discrete actions (size of the output).
        state_dim: number of distinct states (length of the one-hot input).
        hidden_dim: width of the single hidden layer.
    """
    def __init__(self, action_dim, state_dim, hidden_dim=1):
        super().__init__()

        self.state_dim = state_dim

        # Use GPU if available, otherwise use CPU.
        # Note: this only records and reports the device; the module itself
        # is not moved here, callers must call .to(device) themselves.
        self.device = "cuda" if cuda.is_available() else "cpu"
        print(f"Using {self.device} device")

        self.qvals = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim) # Output is the est. Q value of each action
        )

    def forward(self, state):
        """
        Return estimated Q values, shape (batch, action_dim).

        ``state`` may be a scalar index, a list/array of indices or a tensor
        of indices (float-valued indices are truncated to int).
        """
        state = self.state_to_onehot(state)
        return self.qvals(state)

    def get_action(self, state, epsilon=0):
        """
        Epsilon-greedy action selection for a single state.

        Input: a state index and the exploration probability ``epsilon``.
        Output: ``(action, value)`` where ``action`` is the value-maximizing
        action (or, with probability ``epsilon``, a uniformly random one) and
        ``value`` is the estimated Q value of the returned action.
        """
        # Action selection never needs gradients
        with torch.no_grad():
            values = self.forward([state]).cpu().numpy().flatten()
        action = np.argmax(values)

        # With probability epsilon select a random action
        if epsilon > 0 and np.random.binomial(1,epsilon) == 1:
            action = np.random.randint(len(values))

        return action, values[action]

    def state_to_onehot(self,state):
        """
        Convert integer state indices to a float32 one-hot tensor.

        The neural network requires the state to be in one-hot encoding.
        Accepts a scalar, a sequence/array or a tensor of indices and returns
        a tensor of shape (len(state), state_dim) on the same device as the
        network's parameters.
        """
        if isinstance(state, torch.Tensor):
            # Tensors may live on the GPU or carry grad; bring them to numpy safely
            state = state.detach().cpu().numpy()
        # atleast_1d lets a bare scalar state be treated as a batch of one
        state = np.atleast_1d(np.asarray(state)).astype(int)
        onehot = np.zeros((len(state),self.state_dim),dtype=np.float32)
        onehot[np.arange(len(state)),state] = 1
        return torch.tensor(onehot, device=next(self.parameters()).device)

Let us try using this Q network to interact with the environment:

In [335]:
# Set up a fresh environment and an untrained Q network
env = matching_pennies()
Q = DQN(env.action_space.n,env.opponent_action_space.n) #.to(device)

# Initial observation is the opponent's first draw
observation, info = env.reset(seed=42)

# Play five rounds, always taking the greedy action (no exploration)
for _ in range(5):
    obs_prev = observation
    action, value = Q.get_action(obs_prev,epsilon=0)

    step_result = env.step(action)
    observation, reward, terminated, truncated, info = step_result
    print("actions:",action,obs_prev,"reward:",reward,"obs:",observation,"value:",value)


Using cpu device
actions: 1 1 reward: 1 obs: 0 value: 0.5923747
actions: 1 0 reward: 0 obs: 0 value: 0.5923747
actions: 1 0 reward: 0 obs: 0 value: 0.5923747
actions: 1 0 reward: 0 obs: 0 value: 0.5923747
actions: 1 0 reward: 0 obs: 0 value: 0.5923747


In [363]:
# Training loop

# Settings
epochs = 2000
batch_size = 32
replay_size = 200
target_sync_every = 32  # environment steps between copies of Q into Q_target
learning_rate=2e-3
gamma = 0.9

# Create environment, Q network and optimizer
env = matching_pennies()
Q = DQN(env.action_space.n,env.opponent_action_space.n,5) #.to(device)
Q_target = DQN(env.action_space.n,env.opponent_action_space.n,5) #.to(device)
optimizer = optim.Adam(Q.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()  # built once instead of on every step

Q.train() # Set the PyTorch model to training mode

# Initialize memory for experience replay.
# Each row is (state, action, reward, next_state). np.empty leaves the rows
# uninitialized, so nothing is sampled until the buffer has been filled.
replay_memory = np.empty((replay_size,4),dtype=float)

observation, info = env.reset(seed=42)
for i in range(epochs):

    obs_prev = observation
    # (epochs - i)/epochs falls linearly from 1 to 0, so min(...) keeps
    # epsilon at 0.1 until the last 10% of steps and then decays it to 0.
    # NOTE(review): if annealing from 1.0 down to a 0.1 floor was intended,
    # this should be max(...) - confirm.
    action, value = Q.get_action(obs_prev,epsilon=min(0.1,(epochs - i)/epochs))
    observation, reward, terminated, truncated, info = env.step(action)

    # Add new experience to replay memory, overwriting the oldest entry
    j = i % replay_size
    replay_memory[j] = [obs_prev,action,reward,observation]

    if i % target_sync_every == 0:
        # Periodically copy parameters to target network
        Q_target.load_state_dict(Q.state_dict())

    if i > replay_size:

        # Sample replay memory (with replacement) for gradient descent
        replay_idx = np.random.choice(replay_size,batch_size)
        replay_samples = torch.tensor(replay_memory[replay_idx],dtype=torch.float) #.to(device)

        s_t = replay_samples[:,0]
        actions = replay_samples[:,1].long()
        rewards = replay_samples[:,2]
        s_t1 = replay_samples[:,3]

        # Q value of the action actually taken in each sampled transition
        q = Q(s_t).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Bootstrap target from the frozen network. It is a constant for this
        # update, so do not track gradients through Q_target.
        with torch.no_grad():
            q_target = torch.max(Q_target(s_t1),dim=1)[0]
            y = rewards + gamma * q_target

        # Temporal difference loss
        loss = loss_fn(q, y)

        # Back propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            # Print status every 100 epochs.
            # "Mean reward" is the mean over the sampled minibatch.
            print("Epoch:",str(i).rjust(4),
                  "Loss:",loss.detach().numpy(),
                  "Mean reward:",round(np.mean(rewards.detach().numpy()),1))

    if terminated:
        # If the environment has terminated, reset it
        observation, info = env.reset()

Using cpu device
Using cpu device
Epoch:  300 Loss: 0.20024265 Mean reward: 0.4
Epoch:  400 Loss: 0.21400134 Mean reward: 0.5
Epoch:  500 Loss: 0.34511584 Mean reward: 0.5
Epoch:  600 Loss: 0.316028 Mean reward: 0.4
Epoch:  700 Loss: 0.18209685 Mean reward: 0.5
Epoch:  800 Loss: 0.19084479 Mean reward: 0.5
Epoch:  900 Loss: 0.2627156 Mean reward: 0.3
Epoch: 1000 Loss: 0.22860463 Mean reward: 0.4
Epoch: 1100 Loss: 0.40016484 Mean reward: 0.6
Epoch: 1200 Loss: 0.20767276 Mean reward: 0.4
Epoch: 1300 Loss: 0.13677607 Mean reward: 0.6
Epoch: 1400 Loss: 0.0608093 Mean reward: 0.8
Epoch: 1500 Loss: 0.00490437 Mean reward: 1.0
Epoch: 1600 Loss: 0.03430771 Mean reward: 1.0
Epoch: 1700 Loss: 0.020101698 Mean reward: 0.9
Epoch: 1800 Loss: 0.0052849497 Mean reward: 1.0
Epoch: 1900 Loss: 0.00017615841 Mean reward: 1.0


In [354]:
# Evaluate the trained Q network: five greedy rounds from a fresh reset
observation, info = env.reset()
for _ in range(5):
    obs_prev = observation

    # Greedy choice (epsilon=0 disables exploration)
    action, value = Q.get_action(obs_prev,epsilon=0)

    # Advance the environment and report the round
    step_result = env.step(action)
    observation, reward, terminated, truncated, info = step_result
    print("actions:",action,obs_prev,"reward:",reward,"obs:",observation,"value:",value)

actions: 0 0 reward: 1 obs: 0 value: 9.970876
actions: 0 0 reward: 1 obs: 1 value: 9.970876
actions: 1 1 reward: 1 obs: 0 value: 9.970706
actions: 0 0 reward: 1 obs: 1 value: 9.970876
actions: 1 1 reward: 1 obs: 0 value: 9.970706


In [49]:
class market:
    """
    Unfinished sketch of a market environment.

    TODO: ``step`` is a placeholder. It takes no action argument and uses
    names (``p``, ``q``, ``pi``, ``terminated``, ``truncated``, ``info``) that
    are not defined inside the method, so it only runs if those names happen
    to exist as globals; otherwise it raises ``NameError``.
    """
    def __init__(self):
        # The two available actions, stored as a plain list
        self.action_space = [0,1]
    def step(self):
        """Placeholder step; intended to return (observation, reward, terminated, truncated, info)."""
        # observation is price and quantity transacted
        # NOTE(review): p and q are not computed anywhere in this method
        observation = [p, q]
        # reward is profit
        # NOTE(review): pi is not computed anywhere in this method
        reward = [pi]
        # terminated
        # TODO: terminated, truncated and info are never assigned here
        
        return observation, reward, terminated, truncated, info

In [None]:
class intertemporal_tradeoff:
