# Topic 4 Policy-Based Learning

## Policy Gradient for Cartpole Demo

In [None]:
import numpy as np
import gym
import matplotlib.pyplot as plt
from itertools import count

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import adam
from torch.distributions import Categorical

# Environment the agent is trained and evaluated on.
env = gym.make('CartPole-v0')

# Hyperparameters
learning_rate = 0.02  # step size handed to the Adam optimizer
gamma = 0.995         # discount factor applied to future rewards
episodes = 1000       # training-episode budget

# float32 machine epsilon as a Python float; added to the std-dev of the
# returns so the normalisation never divides by zero.
eps = np.finfo(np.float32).eps.item()

# Network input/output sizes are taken from the environment.
action_space = env.action_space.n
state_space = env.observation_space.shape[0]

### Create Neural Network Model

In [None]:
class Policy(nn.Module):
    """Fully connected policy network: state vector -> action probabilities.

    Layer sizes are ``state_space -> 20 -> 30 -> action_space`` with ReLU
    between the layers and a softmax on the output.  The instance also
    carries the discount factor and two per-episode buffers.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(state_space, 20)
        self.fc2 = nn.Linear(20, 30)
        self.fc3 = nn.Linear(30, action_space)

        # Discount factor, copied from the module-level hyperparameter.
        self.gamma = gamma
        # Per-episode buffers; both start empty.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return action probabilities for ``x``.

        ``x`` may be a batch of states ``(batch, state_space)`` or a single
        state ``(state_space,)``; the softmax is taken over the last
        dimension so both layouts work.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # dim=-1 equals dim=1 for batched input and also accepts 1-D input.
        return F.softmax(self.fc3(x), dim=-1)

# Build the policy network and give its parameters to an Adam optimizer.
policy = Policy()
optimizer = adam.Adam(params=policy.parameters(), lr=learning_rate)

### Select Action

In [None]:
def selct_action(state):
    """Sample an action for ``state`` from the current policy.

    The NumPy observation is turned into a float tensor with a leading batch
    dimension and passed through ``policy``; an action is then drawn from
    the categorical distribution defined by the returned probabilities.
    The log-probability of the drawn action is appended to
    ``policy.saved_log_probs``.

    Returns the sampled action as a Python scalar.
    """
    batch = torch.from_numpy(state).float().unsqueeze(0)
    distribution = Categorical(policy(batch))
    sampled = distribution.sample()
    policy.saved_log_probs.append(distribution.log_prob(sampled))
    return sampled.item()

### Update Policy Network
$$ R_{t} = \sum_{k=0}^{N} \gamma^{k}r_{t+k} \\
\Delta\theta_t = \alpha\nabla_\theta \, \log \pi_\theta (s_t,a_t)\,R_t  $$





In [None]:
def update_policy():
    """Run one policy-gradient update from the episode stored on ``policy``.

    Reads ``policy.rewards`` and ``policy.saved_log_probs``, computes the
    discounted return of every step, standardises the returns, and takes a
    single optimizer step on the loss ``sum_t -log_prob_t * return_t``.
    Both buffers are emptied before returning, including when there was
    nothing to learn from.
    """
    if not policy.rewards:
        # No steps recorded: torch.cat() on an empty list would raise.
        del policy.saved_log_probs[:]
        return

    # Discounted returns, accumulated backwards: R_t = r_t + gamma * R_{t+1}.
    # Append-then-reverse is linear; insert(0, ...) would be quadratic.
    returns = []
    running = 0
    for r in reversed(policy.rewards):
        running = r + policy.gamma * running
        returns.append(running)
    returns.reverse()

    # Standardise to zero mean / unit std-dev.  The sample std-dev of a
    # single value is NaN, which would turn every weight into NaN after the
    # optimizer step, so a one-step episode keeps its raw return.
    returns = torch.tensor(returns, dtype=torch.float32)
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    # Negative sign: the optimizer minimises, the objective is maximised.
    policy_loss = torch.cat(
        [-log_prob * ret
         for log_prob, ret in zip(policy.saved_log_probs, returns)]
    ).sum()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    # Start the next episode with empty buffers.
    del policy.rewards[:]
    del policy.saved_log_probs[:]

### Training

In [None]:
# Train for the number of episodes set in the hyperparameters.
for episode in range(episodes):
    obs = env.reset()
    done = False
    score = 0

    # Roll out one full episode, recording the reward of every step.
    while not done:
        action = selct_action(obs)
        obs, reward, done, _ = env.step(action)
        env.render()
        policy.rewards.append(reward)
        score += reward

    print(f'Episode:{episode} Score:{score}')

    update_policy()

    # Checkpoint after the update: the saved module then holds the updated
    # weights and empty episode buffers (update_policy() clears them).
    if episode % 50 == 0:
        torch.save(policy, 'policyNet.pkl')

# The periodic checkpoint can miss the last episodes; always keep the
# final weights.
torch.save(policy, 'policyNet.pkl')

### Testing

In [None]:
import gym
# NOTE: torch.load() unpickles the whole module object, so only load
# checkpoint files you created yourself.
policy = torch.load("policyNet.pkl")

episode = 1
try:
    # Runs until interrupted (e.g. a kernel interrupt).
    while True:
        obs = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            # Evaluation only: no autograd graph is needed.
            with torch.no_grad():
                action = selct_action(obs)
            # The new observation must replace `obs`; otherwise the agent
            # keeps acting on the reset state for the whole episode.
            obs, reward, done, _ = env.step(action)
            score += reward

        # selct_action() appends a log-prob on every call and nothing
        # consumes them here; drop them so memory stays bounded.
        del policy.saved_log_probs[:]

        print(f'Episode:{episode} Score:{score}')
        episode += 1
finally:
    # Release the render window even when the loop is interrupted.
    env.close()


## Activity: Policy Gradient

In [None]:
import numpy as np
import gym
import matplotlib.pyplot as plt
from itertools import count

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim import adam
from torch.distributions import Categorical

# --- Hyperparameters ---------------------------------------------------
learning_rate = 0.02
gamma = 0.995
episodes = 1000

# float32 machine epsilon as a plain Python float (guards a division).
eps = float(np.finfo(np.float32).eps)

# --- Environment and the sizes derived from it -------------------------
env = gym.make('CartPole-v0')
state_space = env.observation_space.shape[0]
action_space = env.action_space.n

In [None]:
class Policy(nn.Module):
    """Exercise template for the policy network.

    Every run of underscores below is a blank to be replaced with real code
    before this cell can run.
    """

    def __init__(self):
        super(Policy, self).__init__()
        # TODO (exercise): define the two layers of the network.
        self.fc1 = __________
        self.fc2 = _____________

        # Discount factor, copied from the module-level hyperparameter.
        self.gamma = gamma
        # Per-episode buffers; both start empty.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        # TODO (exercise): pass `x` through the layers defined above and
        # return the result.
        x = ____________
        x = _______________

        return x

# Build the policy network and give its parameters to an Adam optimizer.
policy = Policy()
optimizer = adam.Adam(params=policy.parameters(), lr=learning_rate)

In [None]:
def selct_action(state):
    """Draw an action for ``state`` from the policy's output distribution.

    Converts the NumPy observation to a float tensor with a batch dimension
    in front, asks ``policy`` for the action probabilities, samples from the
    matching categorical distribution, and appends the log-probability of
    the sample to ``policy.saved_log_probs``.

    Returns the sampled action as a Python scalar.
    """
    state_tensor = torch.from_numpy(state).float().unsqueeze(0)
    action_dist = Categorical(policy(state_tensor))
    choice = action_dist.sample()
    policy.saved_log_probs.append(action_dist.log_prob(choice))
    return choice.item()

In [None]:
def update_policy():
    """Run one policy-gradient update from the episode stored on ``policy``.

    Reads ``policy.rewards`` and ``policy.saved_log_probs``, computes the
    discounted return of every step, standardises the returns, and takes a
    single optimizer step on the loss ``sum_t -log_prob_t * return_t``.
    Both buffers are emptied before returning, including when there was
    nothing to learn from.
    """
    if not policy.rewards:
        # No steps recorded: torch.cat() on an empty list would raise.
        del policy.saved_log_probs[:]
        return

    # Discounted returns, accumulated backwards: R_t = r_t + gamma * R_{t+1}.
    # Append-then-reverse is linear; insert(0, ...) would be quadratic.
    returns = []
    running = 0
    for r in reversed(policy.rewards):
        running = r + policy.gamma * running
        returns.append(running)
    returns.reverse()

    # Standardise to zero mean / unit std-dev.  The sample std-dev of a
    # single value is NaN, which would turn every weight into NaN after the
    # optimizer step, so a one-step episode keeps its raw return.
    returns = torch.tensor(returns, dtype=torch.float32)
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    # Negative sign: the optimizer minimises, the objective is maximised.
    policy_loss = torch.cat(
        [-log_prob * ret
         for log_prob, ret in zip(policy.saved_log_probs, returns)]
    ).sum()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    # Start the next episode with empty buffers.
    del policy.rewards[:]
    del policy.saved_log_probs[:]

In [None]:
# Train for the number of episodes set in the hyperparameters.
for episode in range(episodes):
    obs = env.reset()
    done = False
    score = 0

    # Roll out one full episode, recording the reward of every step.
    while not done:
        action = selct_action(obs)
        obs, reward, done, _ = env.step(action)
        env.render()
        policy.rewards.append(reward)
        score += reward

    print(f'Episode:{episode} Score:{score}')

    update_policy()

    # Checkpoint after the update: the saved module then holds the updated
    # weights and empty episode buffers (update_policy() clears them).
    if episode % 50 == 0:
        torch.save(policy, 'policyNet2.pkl')

# The periodic checkpoint can miss the last episodes; always keep the
# final weights.
torch.save(policy, 'policyNet2.pkl')

In [None]:
import gym
# NOTE: torch.load() unpickles the whole module object, so only load
# checkpoint files you created yourself.
policy = torch.load("policyNet2.pkl")

episode = 1
try:
    # Runs until interrupted (e.g. a kernel interrupt).
    while True:
        obs = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            # Evaluation only: no autograd graph is needed.
            with torch.no_grad():
                action = selct_action(obs)
            # The new observation must replace `obs`; otherwise the agent
            # keeps acting on the reset state for the whole episode.
            obs, reward, done, _ = env.step(action)
            score += reward

        # selct_action() appends a log-prob on every call and nothing
        # consumes them here; drop them so memory stays bounded.
        del policy.saved_log_probs[:]

        print(f'Episode:{episode} Score:{score}')
        episode += 1
finally:
    # Release the render window even when the loop is interrupted.
    env.close()


### Solution

In [None]:
class Policy(nn.Module):
    """Fully connected policy network: state vector -> action probabilities.

    Layer sizes are ``state_space -> 20 -> action_space`` with a ReLU after
    the first layer and a softmax on the output.  The instance also carries
    the discount factor and two per-episode buffers.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(state_space, 20)
        self.fc2 = nn.Linear(20, action_space)

        # Discount factor, copied from the module-level hyperparameter.
        self.gamma = gamma
        # Per-episode buffers; both start empty.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return action probabilities for ``x``.

        ``x`` may be a batch of states ``(batch, state_space)`` or a single
        state ``(state_space,)``; the softmax is taken over the last
        dimension so both layouts work.
        """
        x = F.relu(self.fc1(x))
        # dim=-1 equals dim=1 for batched input and also accepts 1-D input.
        return F.softmax(self.fc2(x), dim=-1)

# Build the policy network and give its parameters to an Adam optimizer.
policy = Policy()
optimizer = adam.Adam(params=policy.parameters(), lr=learning_rate)