# Policy Gradient (REINFORCE)

## 1. Setup

In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from collections import deque 
from itertools import count
from itertools import accumulate
from PIL import Image
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# Build the CartPole environment and keep its `.unwrapped` form
# (presumably the raw env without gym's wrappers, e.g. the episode time limit — TODO confirm).
env = gym.make('CartPole-v0').unwrapped

# Reset once just to inspect what an initial observation looks like.
state = env.reset()
print("initial state: ", state)

# Draw one random action from the action space to inspect its form.
action = env.action_space.sample()
print("sample action: ", action)

# Problem dimensions; these size the input and output layers of the policy network.
n_action = env.action_space.n
n_state = env.observation_space.shape[0]
print("#state: ", n_state)
print("#action: ", n_action)

initial state:  [-0.02427268 -0.04352761  0.04125942 -0.00154008]
sample action:  1
#state:  4
#action:  2


## 2. Policy-Network

In [2]:
# Policy network: maps an observation with n_state features to a probability
# distribution over the n_action discrete actions (one hidden layer of 20 units).
policy_net = nn.Sequential(
        nn.Linear(n_state, 20),
        nn.ReLU(),
        nn.Linear(20, n_action),
        # Normalize over the last (action) dimension explicitly. Leaving `dim`
        # unset relies on deprecated implicit inference and emits a warning on
        # every forward pass; dim=-1 is correct for both a single state (1-D
        # input) and a batch of states (2-D input).
        nn.Softmax(dim=-1))

## 3. Hyperparameters


In [3]:
# Number of training episodes executed by train_loop().
EPISODES = 20000
# Upper bound on environment steps within one training episode (see run_episode()).
STEPS_EPISODE = 1000
# Discount factor handed to running_rewards(); 1 means future rewards are not discounted.
GAMMA = 1

# Plain SGD is the active optimizer; the Adam line is kept as a commented-out alternative.
#optimizer = optim.Adam(policy_net.parameters(), lr=0.001)
optimizer = optim.SGD(policy_net.parameters(), lr=0.001)

## 4. Explore a Path

In [4]:
# Module-level buffers for the most recent episode: run_episode() clears and
# refills them, and train_loop() passes them to loss().
states, actions, rewards = [], [], []

In [5]:
def sample_action(state):
    """Sample an action index from the policy's distribution for `state`.

    Args:
        state: Observation tensor fed to `policy_net` (run_episode passes a
            1-D float tensor built from the environment observation).

    Returns:
        An action index in [0, n_action) drawn with the probabilities that
        `policy_net` outputs for `state`.
    """
    # Sampling is not part of the differentiated graph, so skip autograd here.
    with torch.no_grad():
        probs = policy_net(state)
    probs = probs.numpy()
    # An integer first argument makes numpy sample from arange(n_action).
    return np.random.choice(n_action, p=probs)

def running_rewards(rewards, gamma):
    """Compute the discounted return-to-go for every step of an episode.

    Element t of the result is rewards[t] + gamma * rewards[t+1] +
    gamma**2 * rewards[t+2] + ... up to the end of the sequence.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied once per step into the future.

    Returns:
        A list with the same length as `rewards` holding the returns-to-go.
    """
    returns_backward = []
    # Walk from the final step to the first, carrying the return accumulated so far.
    for reward in reversed(rewards):
        if returns_backward:
            returns_backward.append(returns_backward[-1] * gamma + reward)
        else:
            # The last step's return is just its own reward.
            returns_backward.append(reward)
    returns_backward.reverse()
    return returns_backward

def run_episode():
    """Roll out one episode with the current stochastic policy.

    Empties the module-level `states`, `actions` and `rewards` buffers, then
    refills them with one entry per environment step. The rollout stops when
    the environment reports `done` or after STEPS_EPISODE steps, whichever
    comes first. Nothing is returned; results live in the three buffers.
    """
    # Start from empty buffers so they only ever describe a single episode.
    for buffer in (states, actions, rewards):
        buffer.clear()

    state = torch.tensor(env.reset(), dtype=torch.float)
    for _step in range(STEPS_EPISODE):
        action = sample_action(state)
        observation, reward, done, _info = env.step(action)

        # Record the transition: the state acted on, the action taken, the reward received.
        states.append(state)
        actions.append(torch.tensor([action]))
        rewards.append(reward)

        if done:
            break  # one episode

        state = torch.tensor(observation, dtype=torch.float)

## 5. Policy Gradient

In [6]:
def loss(states, actions, rewards):
    """Policy-gradient loss for one recorded episode.

    For every step t the negative log-probability of the action actually taken,
    -log pi(a_t | s_t), is weighted by the return-to-go from step t (computed by
    running_rewards with GAMMA); the result is averaged over the episode.

    Args:
        states: List of per-step state tensors (stacked into one batch).
        actions: List of per-step action tensors, each of shape (1,) holding the
            integer action index, as appended by run_episode.
        rewards: List of per-step rewards in chronological order.

    Returns:
        A scalar tensor that can be back-propagated through `policy_net`.
    """
    batch_states = torch.stack(states)
    actions_prob = policy_net(batch_states)
    batch_actions = torch.stack(actions)
    # Select pi(a_t | s_t) directly instead of multiplying log-probabilities by a
    # one-hot mask: if a non-taken action's probability underflows to 0, the
    # masked form computes (-inf * 0) = NaN and poisons the whole loss.
    chosen_prob = torch.gather(actions_prob, 1, batch_actions).squeeze(1)
    actions_cross_entropy = -torch.log(chosen_prob)
    actions_weight = torch.tensor(running_rewards(rewards, GAMMA))
    return torch.mean(actions_cross_entropy * actions_weight)

## 6. Train Loop

In [7]:
def evaluate(max_steps=None):
    """Render one greedy rollout of the current policy.

    Resets `env`, then repeatedly renders a frame and takes the action with the
    highest probability under `policy_net` (argmax rather than sampling) until
    the environment reports `done`.

    Args:
        max_steps: Optional cap on the number of environment steps. The default
            `None` keeps the original behavior of running until `done`.
            NOTE(review): `env` is the `.unwrapped` environment, which presumably
            has no built-in step limit, so a policy that never fails could loop
            indefinitely — pass a cap (e.g. STEPS_EPISODE) to bound the rollout.
    """
    state = torch.tensor(env.reset(), dtype=torch.float)
    steps = 0
    while max_steps is None or steps < max_steps:
        env.render()
        # Pure inference: no gradients are needed, so avoid building an autograd graph.
        with torch.no_grad():
            action = torch.argmax(policy_net(state)).item()
        next_state, _, done, _ = env.step(action)
        state = torch.tensor(next_state, dtype=torch.float)
        steps += 1
        if done:
            break # one episode

def train_loop():
    """Train the policy for EPISODES episodes, one gradient step per episode.

    Each iteration rolls out an episode with run_episode(), computes the
    policy-gradient loss on the recorded buffers and applies one optimizer step.
    Progress is printed every 10 episodes (the episode length is reported as
    the number of recorded rewards) and a rendered greedy rollout is shown
    every 100 episodes.
    """
    for num_episode in range(EPISODES):
        run_episode()

        # Single policy-gradient update from the episode just collected.
        optimizer.zero_grad()
        episode_loss = loss(states, actions, rewards)
        episode_loss.backward()
        optimizer.step()

        if not num_episode % 10:
            print("episode: ", num_episode, "reward: ", len(rewards))
        if not num_episode % 100:
            evaluate()

In [8]:
#train_loop()

## 7. Load Saved Model and Evaluate

In [None]:
# Restore previously trained weights from 'policy.pt' into the existing network.
# The save call is left commented out; uncomment it after training to (re)create the file.
#torch.save(policy_net.state_dict(), 'policy.pt')
# NOTE(review): torch.load presumably unpickles the file — only load checkpoints from a trusted source.
policy_net.load_state_dict(torch.load('policy.pt'))

# Render one greedy rollout with the restored policy.
evaluate()