In [1]:
import gym
import swingUp
import numpy as np
import numpy.random as rnd
import torch as pt
import matplotlib.pyplot as plt
%matplotlib inline

Here is code from the previous assignment for convenience. 

In [2]:
class nnQ(pt.nn.Module):
    """
    Here is a basic neural network with for representing a policy 
    """
    
    def __init__(self,stateDim,numActions,numHiddenUnits,numLayers):
        super().__init__()
        
        InputLayer = [pt.nn.Linear(stateDim+numActions,numHiddenUnits),
                      pt.nn.ReLU()]
        
        HiddenLayers = []
        for _ in range(numLayers-1):
            HiddenLayers.append(pt.nn.Linear(numHiddenUnits,numHiddenUnits))
            HiddenLayers.append(pt.nn.ReLU())
            
        
        OutputLayer = [pt.nn.Linear(numHiddenUnits,1)]
        
        AllLayers = InputLayer + HiddenLayers + OutputLayer
        self.net = pt.nn.Sequential(*AllLayers)
        
        self.numActions = numActions
        
    def forward(self,x,a):
        x = pt.tensor(x,dtype=pt.float32)

        b = pt.nn.functional.one_hot(pt.tensor(a),self.numActions)
        
        c = b.float().detach()
        
        if len(x.shape) == 1:
            y = pt.cat([x,c])
        else:
            y = pt.cat([x.T,c.T]).T
        return self.net(y)
        
    
class deepQagent:
    def __init__(self,stateDim,numActions,numHiddenUnits,numLayers,epsilon=.1,gamma=.9,alpha=.1,
                c = 100,batch_size=20):
        self.Q = nnQ(stateDim,numActions,numHiddenUnits,numLayers)
        self.Q_target = nnQ(stateDim,numActions,numHiddenUnits,numLayers)
        
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.numActions = numActions
        
        self.S = []
        self.A = []
        self.S_next = []
        self.R = []
        self.Done = []
        self.batch_size = batch_size
        self.c = c
        self.optimizer = pt.optim.SGD(self.Q.parameters(),lr=alpha)
        self.counter = 0
    
    def action(self,x):
        # This is an epsilon greedy selection
        if rnd.rand() < self.epsilon:
            a = rnd.randint(numActions)
        else:
            qBest = -np.inf
            for aTest in range(self.numActions):
                qTest = self.Q(x,aTest).detach().numpy()[0]
                if qTest > qBest:
                    qBest = qTest
                    a = aTest
        return a
    
    def update(self,s,a,r,s_next,done):
        self.counter += 1
        
        self.S.append(s)
        self.A.append(a)
        self.R.append(r)
        self.S_next.append(s_next)
        self.Done.append(done)

        B_ind = np.array(rnd.choice(len(self.S),size=self.batch_size),dtype=int)

        S = np.array([self.S[j] for j in B_ind])
        A = np.array([self.A[j] for j in B_ind])
        R = pt.tensor(np.array([self.R[j] for j in B_ind]))
        S_next = np.array([self.S_next[j] for j in B_ind])
        Done = np.array([self.Done[j] for j in B_ind])
        
        Q_cur = self.Q(S,A).squeeze()

        #B_ind = [-1]

        AllActions = np.arange(self.numActions)
        Q_next = []
        for s_next_j,done_j in zip(S_next,Done):
            if done_j:
                Q_next.append(0.)
            else:
                Q_all = self.Q_target(np.tile(s_next_j,(self.numActions,1)),AllActions)
                Q_next.append(pt.max(Q_all).detach().numpy())
        Q_next = pt.tensor(np.array(Q_next),dtype=pt.float)
        loss = .5*pt.mean((R+self.gamma * Q_next-Q_cur)**2)
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
      
        if (self.counter % self.c) == 0:
            for p, p_target in zip(self.Q.parameters(),self.Q_target.parameters()):
                p_target.data = p.data.clone().detach()
                
            
class sarsaAgent:
    def __init__(self,stateDim,numActions,numHiddenUnits,numLayers,
                epsilon=.1,gamma=.9,alpha=.1):
        self.Q = nnQ(stateDim,numActions,numHiddenUnits,numLayers)
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        self.numActions = numActions
        self.s_last = None
        
    def action(self,x):
        # This is an epsilon greedy selection
        if rnd.rand() < self.epsilon:
            a = rnd.randint(numActions)
        else:
            qBest = -np.inf
            for aTest in range(self.numActions):
                qTest = self.Q(x,aTest).detach().numpy()[0]
                if qTest > qBest:
                    qBest = qTest
                    a = aTest
        return a
    
    def update(self,s,a,r,s_next,done):
        
        # Compute the TD error, if there is enough data
        update = True
        if done:
            Q_cur = self.Q(s,a).detach().numpy()[0]
            delta = r - Q_cur
            self.s_last = None
            Q_diff = self.Q(s,a)
        elif self.s_last is not None:
            Q_next = self.Q(s,a).detach().numpy()[0]
            Q_cur = self.Q(self.s_last,self.a_last).detach().numpy()[0]
            delta = self.r_last + self.gamma * Q_next - Q_cur
            Q_diff = self.Q(self.s_last,self.a_last)
        else:
            update = False
            
        # Update the parameter via the semi-gradient method
        if update:
            self.Q.zero_grad()
            Q_diff.backward()
            for p in self.Q.parameters():
                p.data.add_(self.alpha*delta,p.grad.data)
                
        if not done:
            self.s_last = np.copy(s)
            self.a_last = np.copy(a)
            self.r_last = np.copy(r)


The simulation is slightly modified from the previous homework. In particular, the episode lengths are restricted to be at most 500. 

In [3]:
# This is the environment
env = swingUp.SwingUpEnv()

# For simplicity, we only consider forces of -1 and 1
numActions = 2
Actions = np.linspace(-1,1,numActions)

# This is our learning agent
gamma = .95
agent = deepQagent(5,numActions,20,2,epsilon=1e-1,gamma=gamma,batch_size=50,
                   c= 100,alpha=1e-4)

#agent = sarsaAgent(5,numActions,20,2,epsilon=5e-2,gamma=gamma,alpha=1e-4)
maxSteps = 5e5

# This is a helper to deal with the fact that x[2] is actually an angle
x_to_y = lambda x : np.array([x[0],x[1],np.cos(x[2]),np.sin(x[2]),x[3]])

R = []
UpTime = []

step = 0
ep = 0
maxLen = 500
while step < maxSteps:
    ep += 1
    x = env.reset()
    C = 0.
    
    done = False
    t = 1
    while not done:
        t += 1
        step += 1
        y = x_to_y(x)
        a = agent.action(y)
        u = Actions[a:a+1]
        env.render()
        x_next,c,done,info = env.step(u)
        
        max_up_time = info['max_up_time']
        y_next = x_to_y(x_next)

        C += (1./t)*(c-C)
        agent.update(y,a,c,y_next,done)
        x = np.copy(x_next)
        if done:
            break
            
        if step >= maxSteps:
            break
            
        #if t > maxLen:
        #    agent.s_last = None
        #    break
            
        
        R.append(C)
    UpTime.append(max_up_time)
    #print('t:',ep+1,', R:',C,', L:',t-1,', G:',G,', Q:', Q_est, 'U:', max_up_time)
    print('Episode:',ep,'Total Steps:',step,', Ave. Reward:',C,', Episode Length:',t-1, 'Max Up-Time:', max_up_time)
env.close()

plt.plot(UpTime)

RuntimeError: one_hot is only applicable to index tensor.

# Question 

Implement deep Q-learning as described in the paper here:

https://daiwk.github.io/assets/dqn.pdf

In this paper, we have the states, and so there is no need to do the pre-processing described there.

In my tests on this problem, it works substantially better than the SARSA  implementation 
with the following design choices:
* Use the same Q-network architecture as used  in the SARSA algorithm
* Same step size, discount factor, and learning rate as above
* Mini-batch size of 20
* Update the target network every 100 steps

The deep Q-learning method can be implemented via a modification of the SARSA code above.

You could probably make it work even better with further tuning.


In [None]:
# Implement this code below and test it.