# Policy Gradient

## 1. Setup

In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from collections import deque 
from itertools import count
from itertools import accumulate
from PIL import Image
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# Build the CartPole-v0 environment and keep its `.unwrapped` form.
# NOTE(review): presumably this drops gym's episode time-limit wrapper — confirm.
env = gym.make('CartPole-v0').unwrapped

# Reset once to look at an initial observation.
state = env.reset()
print("initial state: ", state)

# Draw one random action from the action space to see what an action looks like.
action = env.action_space.sample()
print("sample action: ", action)

# Sizes used for the policy network's input and output layers.
n_action = env.action_space.n
n_state = env.observation_space.shape[0]
print("#state: ", n_state)
print("#action: ", n_action)

initial state:  [ 0.04188057  0.0363354  -0.0351086   0.01860545]
sample action:  1
#state:  4
#action:  2


## 2. Policy-Network

In [2]:
# Policy network: maps an observation to one probability per action.
# The softmax dimension is given explicitly (the last one); relying on the
# implicit `dim` is deprecated in PyTorch and emits a warning on every call.
policy_net = nn.Sequential(
        nn.Linear(n_state, 20),
        nn.ReLU(),
        nn.Linear(20, n_action),
        nn.Softmax(dim=-1))

## 3. Hyperparameters


In [4]:
# presumably the number of training episodes — TODO confirm
EPISODES = 1000
# presumably the maximum number of steps per episode — TODO confirm
STEPS_EPISODES = 200
GAMMA = 0.99 # reward discount; 0.99 worked better than 0.90

# Plain SGD over the policy network's parameters (the Adam line is kept as an alternative).
#optimizer = optim.Adam(policy_net.parameters(), lr=0.001)
optimizer = optim.SGD(policy_net.parameters(), lr=0.001)

## 4. Sampling, Returns, and Loss

In [128]:
# Module-level trajectory buffers.
# NOTE(review): run_episode builds its own local lists with these same names,
# so nothing in the code shown here fills these — confirm whether they are needed.
states = []
actions = []
rewards = []

def sample_action(state):
    """Draw one action index at random according to the policy network.

    `state` is passed straight to `policy_net`; the values it returns are
    used as the probability vector for `np.random.choice` over the
    `n_action` action indices. No gradient is recorded for the forward pass.
    """
    candidates = np.arange(n_action)
    with torch.no_grad():
        probs = policy_net(state)
    probs = probs.detach().numpy()
    return np.random.choice(candidates, p=probs)

def running_rewards(rewards, gamma):
    """Return the discounted return-to-go for every step of an episode.

    The result has the same length as `rewards` and satisfies
    ``out[t] = rewards[t] + gamma * out[t + 1]``, with the last entry equal
    to the last reward. An empty input yields an empty list.
    """
    discounted = []
    # Walk backwards so each step can reuse the return of the step after it.
    for reward in reversed(rewards):
        if discounted:
            reward = discounted[-1] * gamma + reward
        discounted.append(reward)
    discounted.reverse()
    return discounted

def loss(state, action=None):
    """Return log pi(action | state) under the current policy network.

    Args:
        state: observation passed directly to `policy_net`. The indexing
            below assumes a single, unbatched observation, so that the
            network output is one-dimensional.
        action: index of the action to score. When omitted, an action is
            drawn with `sample_action(state)`, which is the previous
            behaviour. Pass the action actually taken during a rollout so
            the log-probability lines up with the reward that action earned.

    Returns:
        A zero-dimensional tensor holding the log-probability of `action`,
        attached to the autograd graph of `policy_net`.
    """
    action_prob = policy_net(state)
    if action is None:
        action = sample_action(state)
    # Pick the chosen entry directly rather than taking a dot product with a
    # one-hot vector: if the probability of some *other* action underflows
    # to 0, its log is -inf and -inf * 0 would turn the dot product into NaN.
    return torch.log(action_prob)[int(action)]

def run_episode():
    """Roll out one episode with the current policy and return its trajectory.

    Actions are drawn with `sample_action`. The rollout stops when the
    environment reports `done` or after `STEPS_EPISODES` steps, whichever
    comes first.

    Returns:
        (states, actions, rewards): three equal-length lists holding, for
        each step, the float tensor observation the action was chosen from,
        the action index, and the reward returned by `env.step`.
    """
    states = []
    actions = []
    rewards = []

    # Cast to float, as evaluate() does, before feeding the policy network.
    state = torch.tensor(env.reset(), dtype=torch.float)  # s
    # A bounded loop enforces the step cap; the previous counter was never
    # incremented, so the cap could not take effect.
    for _ in range(STEPS_EPISODES):
        action = sample_action(state)
        next_state, reward, done, _ = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        if done:
            break  # one episode

        # env.step hands back a raw observation; wrap it as a float tensor
        # before it is fed to the policy network on the next iteration.
        state = torch.tensor(next_state, dtype=torch.float)

    return states, actions, rewards



## 5. Optimize



In [5]:
# One optimisation step: compute the loss, backpropagate, update the weights.
# The loss has to be computed with gradient tracking enabled: under
# torch.no_grad() no autograd graph is recorded and loss.backward() raises.
# NOTE(review): loss_fn, policy_q_value and target_q_value are not defined in
# the code shown here — confirm where they come from before running this cell.
# NOTE(review): this assignment rebinds the module-level name `loss`.

# mean square loss
loss = loss_fn(policy_q_value, target_q_value)

# Optimize the model
optimizer.zero_grad()
loss.backward()
optimizer.step()

## 7. Train Loop

In [7]:
def evaluate():
    """Render one episode, always taking the network's most probable action.

    Runs until the environment reports `done`. The forward passes run under
    `torch.no_grad()` because only the argmax is needed, so no autograd
    graph is built during playback. Each state is rendered once.
    """
    state = torch.tensor(env.reset(), dtype=torch.float)
    with torch.no_grad():
        while True:
            env.render()
            action = torch.argmax(policy_net(state)).item()
            next_state, _, done, _ = env.step(action)
            if done:
                break  # one episode
            state = torch.tensor(next_state, dtype=torch.float)

def train_loop():
    """Run EPOCHS training episodes, reporting and rendering every 50th.

    Each episode takes at most EPOCH_STEPS steps through `explore_one_step`.
    Whenever `experience_pool` holds more than WARM_UP_SIZE entries, the step
    also calls `update_policy_net`, and every TARGET_UPDATE-th such update
    copies the policy network's weights into `target_net`.
    """
    n_updates = 0
    for epoch in range(EPOCHS):
        # Start a fresh episode.
        state = torch.tensor(env.reset(), dtype=torch.float)  # s
        episode_reward = 0
        steps_taken = 0
        while steps_taken < EPOCH_STEPS:
            steps_taken += 1

            # Gather one transition and move on to the state it returned.
            done, state, step_reward = explore_one_step(state, experience_pool)
            episode_reward += step_reward

            # Only optimise once the pool is past its warm-up size.
            warmed_up = len(experience_pool) > WARM_UP_SIZE
            if warmed_up:
                update_policy_net()
                n_updates += 1
                # Periodically copy the policy network's weights into the target network.
                if n_updates % TARGET_UPDATE == 0:
                    target_net.load_state_dict(policy_net.state_dict())

            if done:
                break  # one episode

        if epoch % 50 != 0:
            continue
        print("epoch: ", epoch, "reward: ", episode_reward)
        evaluate()

In [None]:
# Start training.
train_loop()

## 8. Load Saved Model

In [None]:
# Load weights from 'policy-1.pt' into the policy network (the commented line
# is the matching save call), then run evaluate() with them.
#torch.save(policy_net.state_dict(), 'policy-1.pt')
policy_net.load_state_dict(torch.load('policy-1.pt'))

evaluate()