## Cart and Pole Problem

### Visualization without Optimization (Random Actions)

In [9]:
import gym, time

# Roll out 150 randomly sampled actions in CartPole purely for visualization;
# nothing is learned here.
env = gym.make('CartPole-v0')
try:
    env.reset()

    for _ in range(150):
        env.render()
        step_result = env.step(env.action_space.sample())

        # Stepping an environment after its episode has ended is documented by
        # Gym as undefined behaviour, so start a fresh episode instead.
        # NOTE(review): assumes index 2 is the done/terminated flag and, for
        # the 5-tuple step API, index 3 is the truncated flag -- confirm
        # against the installed gym version.
        episode_over = step_result[2] or (
            len(step_result) == 5 and step_result[3]
        )
        if episode_over:
            env.reset()

        # Short pause between frames so the animation is watchable.
        time.sleep(0.02)
finally:
    # Always release the render window, even if rendering/stepping fails.
    env.close()

In [12]:
%matplotlib inline
import math, random, matplotlib, torch
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

### Setup Display

In [13]:
# True when matplotlib reports a backend whose name contains 'inline';
# IPython's display helper is only imported in that case.
backend_name = matplotlib.get_backend()
is_ipython = 'inline' in backend_name
if is_ipython:
    from IPython import display

### Deep Q-Network

In [14]:
class DQN(nn.Module):
    """Small fully connected network producing two output values per input.

    The input is flattened per sample and passed through two ReLU-activated
    hidden layers (24 and 32 units) followed by a linear output layer with
    2 units.

    Args:
        img_height: Height of the input image.
        img_width: Width of the input image.

    NOTE(review): the factor 3 in the first layer's input size presumably
    corresponds to RGB channels, and the 2 outputs to the two CartPole
    actions -- confirm against the code that builds the input tensors.
    """

    def __init__(self, img_height, img_width):
        # `super` must be *called* to get the parent proxy; the original
        # `super.__init__()` raised TypeError and left nn.Module
        # uninitialised, so no layer could be registered.
        super().__init__()

        self.fc1 = nn.Linear(in_features=img_height*img_width*3, out_features=24)
        self.fc2 = nn.Linear(in_features=24, out_features=32)
        self.out = nn.Linear(in_features=32, out_features=2)

    def forward(self, t):
        """Return the network output for a batch `t`.

        Every dimension after the first is flattened, so the product of
        those dimensions must equal `img_height * img_width * 3`.
        """
        t = t.flatten(start_dim=1)
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        t = self.out(t)
        return t

### Experience Class

In [15]:
# Record type for a single transition: the state, the action taken,
# the resulting next state, and the reward.
Experience = namedtuple(
    'Experience',
    ['state', 'action', 'next_state', 'reward'],
)

# Quick demonstration of the Experience record with placeholder values
e = Experience(state=9, action=3, next_state=4, reward=8)
print(e)

Experience(state=9, action=3, next_state=4, reward=8)


### Replay Memory

In [16]:
class ReplayMemory:
    """Fixed-capacity store of experiences with uniform random sampling.

    While there is room, experiences are appended. Once `capacity` items are
    held, each new experience overwrites the slot at
    `push_count % capacity`.
    """

    def __init__(self, capacity):
        """Create an empty memory holding at most `capacity` experiences."""
        self.capacity = capacity
        self.memory = []
        self.push_count = 0

    def push(self, experience):
        """Store `experience`, overwriting an existing slot when full."""
        buffer_full = len(self.memory) >= self.capacity
        if buffer_full:
            slot = self.push_count % self.capacity
            self.memory[slot] = experience
        else:
            self.memory.append(experience)
        self.push_count += 1

    def sample(self, batch_size):
        """Return `batch_size` distinct stored experiences chosen at random."""
        batch = random.sample(self.memory, batch_size)
        return batch

    def can_provide_sample(self, batch_size):
        """Return True if at least `batch_size` experiences are stored."""
        return batch_size <= len(self.memory)

### Epsilon Greedy Strategy

In [17]:
class EpsilonGreedyStrategy:
    """Exploration-rate schedule that decays exponentially with the step count.

    The rate equals `start` at step 0 and approaches `end` as the step count
    grows (for a positive `decay`).
    """

    def __init__(self, start, end, decay):
        """Store the initial rate, the asymptotic rate and the decay constant."""
        self.start = start
        self.end = end
        self.decay = decay

    def get_exploration_rate(self, current_step):
        """Return end + (start - end) * exp(-current_step * decay)."""
        span = self.start - self.end
        decay_factor = math.exp(-1 * current_step * self.decay)
        return self.end + span * decay_factor

### RL Agent

In [18]:
class Agent:
    """Chooses actions epsilon-greedily using a supplied exploration strategy.

    Keeps a running count of how many actions have been selected and feeds
    it to the strategy to obtain the current exploration rate.
    """

    def __init__(self, strategy, num_actions):
        """Remember the strategy and the number of available actions."""
        self.strategy = strategy
        self.num_actions = num_actions
        self.current_step = 0

    def select_action(self, state, policy_net):
        """Return an action index for `state`.

        With probability given by the current exploration rate a random
        action in `range(num_actions)` is returned; otherwise the index of
        the largest output of `policy_net(state)` along dim 1 is returned.
        """
        epsilon = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1

        # Explore: uniformly random action.
        if random.random() < epsilon:
            return random.randrange(self.num_actions)

        # Exploit: greedy action from the policy network, no gradient tracking.
        with torch.no_grad():
            q_values = policy_net(state)
            return q_values.argmax(dim=1).item()