In [1]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Hyperparameters
learning_rate = 0.0002
gamma         = 0.98
buffer_limit  = 50000
batch_size    = 32

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(4,256)
        self.fc2 = nn.Linear(256,2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
      
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random()
        if coin < epsilon:
            return random.randint(0,1)
        else : 
            return out.argmax().item()
            
def train(q, q_target, memory, optimizer):
    for i in range(10):
        s,a,r,s_prime,done_mask = memory.sample(batch_size)

        q_out = q(s)
        q_a = q_out.gather(1,a)
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + gamma * max_q_prime * done_mask
        loss = F.smooth_l1_loss(q_a, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
     
        
def main():
    env = gym.make('CartPole-v1')
    #env = gym.make('BeamRider-v0')
    
    q = Qnet()
    q_target = Qnet()
    q_target.load_state_dict(q.state_dict())
    memory = ReplayBuffer()

    print_interval = 20
    score = 0.0  
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)

    for n_epi in range(1000):
        epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
        s = env.reset()

        for t in range(600):
            a = q.sample_action(torch.from_numpy(s).float(), epsilon)      
            s_prime, r, done, info = env.step(a)
            done_mask = 0.0 if done else 1.0
            memory.put((s,a,r/100.0,s_prime, done_mask))
            s = s_prime

            score += r
            if done:
                break
            
        if memory.size()>2000:
            train(q, q_target, memory, optimizer)

        if n_epi%print_interval==0 and n_epi!=0:
            q_target.load_state_dict(q.state_dict())
            #env.render()
            #plt.imshow(env.render(mode='rgb_array'))
            print("# of episode :{}, avg score : {:.1f}, buffer size : {}, epsilon : {:.1f}%".format(n_epi, score/print_interval, memory.size(), epsilon*100))
            score = 0.0

    print("Learning Completed")
    return q
    
if __name__ == '__main__':
    q = main()

# of episode :20, avg score : 10.4, buffer size : 208, epsilon : 7.9%
# of episode :40, avg score : 9.5, buffer size : 398, epsilon : 7.8%
# of episode :60, avg score : 10.6, buffer size : 609, epsilon : 7.7%
# of episode :80, avg score : 9.4, buffer size : 797, epsilon : 7.6%
# of episode :100, avg score : 9.6, buffer size : 988, epsilon : 7.5%
# of episode :120, avg score : 9.7, buffer size : 1182, epsilon : 7.4%
# of episode :140, avg score : 10.1, buffer size : 1383, epsilon : 7.3%
# of episode :160, avg score : 10.3, buffer size : 1589, epsilon : 7.2%
# of episode :180, avg score : 9.9, buffer size : 1787, epsilon : 7.1%
# of episode :200, avg score : 9.8, buffer size : 1984, epsilon : 7.0%
# of episode :220, avg score : 9.2, buffer size : 2169, epsilon : 6.9%
# of episode :240, avg score : 9.8, buffer size : 2365, epsilon : 6.8%
# of episode :260, avg score : 9.6, buffer size : 2556, epsilon : 6.7%
# of episode :280, avg score : 10.3, buffer size : 2762, epsilon : 6.6%
# of episo

In [2]:
import gym
env = gym.make('CartPole-v1')

for i_episode in range(5):
    observation = env.reset()
    for t in range(200):
        env.render()
        #print(observation)
        #action = env.action_space.sample()
        #action = q.sample_action(torch.from_numpy(observation).float(), 0)
        action = q.sample_action(torch.from_numpy(observation).float(), max(0.01, 0.08 - 0.01*(t/200)))

        observation, reward, done, info = env.step(action)
        #if done:
        #  print("Episode finished after {} timesteps".format(t+1))
        # break
env.close()
print("end")


end


In [3]:
import gym
env = gym.make('CartPole-v1')
frames = []

for i_episode in range(5):
    observation = env.reset()
    for t in range(200):
        #frames.append(env.render(mode = 'rgb_array'))
        env.render()
        #print(observation)
        action = env.action_space.sample()

        observation, reward, done, info = env.step(action)
        #if done:
        #  print("Episode finished after {} timesteps".format(t+1))
        # break

env.close()
#display_frames_as_gif(frames)
print("end")



end


In [4]:
# The typical imports 
import gym 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 

# Imports specifically so we can render outputs in Jupyter. 
from JSAnimation.IPython_display import display_animation 
from matplotlib import animation 
from IPython.display import display 

def display_frames_as_gif(frames): 
    """ Displays a list of frames as a gif, with controls """ 
    #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72) 
    patch = plt.imshow(frames[0]) 
    plt.axis('off') 
    
    def animate(i): 
        patch.set_data(frames[i]) 
        
    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50) 
    display(display_animation(anim, default_mode='loop'))
