In [2]:
import gym
import swingUp
import numpy as np
import numpy.random as rnd
import torch as pt
import matplotlib.pyplot as plt
%matplotlib inline

Here is code from the previous assignment for convenience. 

In [3]:
class nnQ(pt.nn.Module):
    """
    Feed-forward approximation of the action-value function Q(s, a).

    The action index is one-hot encoded and appended to the state vector;
    the network maps this joint input to a single value.

    Parameters
    ----------
    stateDim : int
        Number of entries in a state vector.
    numActions : int
        Number of discrete actions (width of the one-hot encoding).
    numHiddenUnits : int
        Width of every hidden layer.
    numLayers : int
        Number of Linear+ReLU stages placed before the linear output layer.
    """
    
    def __init__(self,stateDim,numActions,numHiddenUnits,numLayers):
        super().__init__()
        
        # First stage takes the concatenated [state, one-hot action] vector.
        layers = [pt.nn.Linear(stateDim+numActions,numHiddenUnits),
                  pt.nn.ReLU()]
        
        for _ in range(numLayers-1):
            layers.append(pt.nn.Linear(numHiddenUnits,numHiddenUnits))
            layers.append(pt.nn.ReLU())
        
        # Single linear output: the Q-value estimate.
        layers.append(pt.nn.Linear(numHiddenUnits,1))
        
        self.net = pt.nn.Sequential(*layers)
        self.numActions = numActions
        
    def forward(self,x,a):
        """
        Evaluate Q(x, a).

        x may be a single state of shape (stateDim,) with a scalar action
        index a, or a batch of shape (B, stateDim) with a holding B action
        indices. The result has shape (1,) or (B, 1) respectively.
        """
        # as_tensor accepts arrays, lists and tensors without the
        # copy-construct warning pt.tensor emits for tensor inputs; detach
        # keeps the input out of the autograd graph, as before.
        x = pt.as_tensor(x,dtype=pt.float32).detach()
        a = pt.as_tensor(a,dtype=pt.int64)
        c = pt.nn.functional.one_hot(a,self.numActions).to(pt.float32)
        
        # Joining along the last axis handles both the single-sample and
        # the batched layout.
        y = pt.cat([x,c],dim=-1)
        
        return self.net(y)
            
class sarsaAgent:
    """
    Epsilon-greedy SARSA agent with a neural-network Q-function.

    The agent keeps the previous (state, action, reward) triple so that each
    call to `update` can apply the semi-gradient SARSA step for the previous
    transition, bootstrapping from the state/action pair passed in.

    Parameters
    ----------
    stateDim, numActions, numHiddenUnits, numLayers :
        Forwarded to the Q-network constructor.
    epsilon : float
        Probability of choosing a uniformly random action.
    gamma : float
        Discount factor.
    alpha : float
        Step size of the semi-gradient update.
    """
    def __init__(self,stateDim,numActions,numHiddenUnits,numLayers,
                epsilon=.1,gamma=.9,alpha=.1):
        self.Q = nnQ(stateDim,numActions,numHiddenUnits,numLayers)
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        self.numActions = numActions
        # Previous transition; s_last is None when nothing is pending.
        self.s_last = None
        self.a_last = None
        self.r_last = None
        
    def action(self,x):
        """Return an epsilon-greedy action index for state x."""
        if rnd.rand() < self.epsilon:
            # Explore. Uses the agent's own action count rather than a
            # notebook-level global.
            return rnd.randint(self.numActions)
        
        # Exploit: first action attaining the largest Q-value.
        with pt.no_grad():
            qValues = [self.Q(x,aTest).item() for aTest in range(self.numActions)]
        return int(np.argmax(qValues))
    
    def _semiGradientStep(self,s,a,target):
        """Move Q(s,a) towards target: w += alpha * (target - Q(s,a)) * grad Q(s,a)."""
        q = self.Q(s,a)
        delta = float(target) - q.item()
        
        self.Q.zero_grad()
        q.backward()
        stepSize = float(self.alpha*delta)
        for p in self.Q.parameters():
            if p.grad is not None:
                # Keyword form of add_; the (Number, Tensor) positional
                # overload is deprecated.
                p.data.add_(p.grad.data,alpha=stepSize)
    
    def update(self,s,a,r,s_next,done):
        """
        Process one step of experience.

        s, a, r are the current state, the action taken in it and the reward
        received; done flags a terminal step. s_next is accepted for
        interface compatibility but not used: SARSA bootstraps from the
        state/action pair supplied on the following call.
        """
        # Pending transition: its target bootstraps from the current pair.
        # This is applied on terminal steps as well, so the second-to-last
        # transition of an episode is not dropped.
        if self.s_last is not None:
            with pt.no_grad():
                qNext = self.Q(s,a).item()
            self._semiGradientStep(self.s_last,self.a_last,
                                   self.r_last + self.gamma*qNext)
        
        if done:
            # Nothing follows a terminal step, so the target is the reward.
            self._semiGradientStep(s,a,r)
            self.s_last = None
        else:
            self.s_last = np.copy(s)
            self.a_last = np.copy(a)
            self.r_last = np.copy(r)


The simulation is slightly modified from the previous homework. In particular, the episode lengths are restricted to be at most 500. 

In [None]:
# This is the environment
env = swingUp.SwingUpEnv()

# For simplicity, we only consider forces of -1 and 1
numActions = 2
Actions = np.linspace(-1,1,numActions)

# This is our learning agent
gamma = .95

agent = sarsaAgent(5,numActions,20,2,epsilon=5e-2,gamma=gamma,alpha=1e-4)
maxSteps = 2e5

# x[2] is an angle; the agent is given its cosine and sine instead
x_to_y = lambda x : np.array([x[0],x[1],np.cos(x[2]),np.sin(x[2]),x[3]])

R = []       # average reward of each episode
UpTime = []  # info['max_up_time'] at the end of each episode

step = 0     # environment steps taken over all episodes
ep = 0       # episode counter
maxLen = 500 # cap on the number of steps in one episode
while step < maxSteps:
    ep += 1
    x = env.reset()
    C = 0.   # running mean of the rewards seen in this episode
    
    done = False
    t = 0    # steps taken in this episode
    while not done:
        t += 1
        step += 1
        y = x_to_y(x)
        
        # step #1: choose the action and map it to a force
        a = agent.action(y)
        u = Actions[a:a+1]
        env.render()
        
        # step #2: advance the simulation
        x_next,c,done,info = env.step(u)
        
        max_up_time = info['max_up_time']
        y_next = x_to_y(x_next)
    
        # Incremental mean over the t rewards seen so far
        C += (c-C)/t
        
        agent.update(y,a,c,y_next,done)
        x = np.copy(x_next)
        
        if done or step >= maxSteps:
            break
            
        if t >= maxLen:
            # Episode cut short: drop the pending transition so it is not
            # linked to the first step of the next episode.
            agent.s_last = None
            break
            
    R.append(C)
    UpTime.append(max_up_time)
    print('Episode:',ep,'Total Steps:',step,', Ave. Reward:',C,', Episode Length:',t, 'Max Up-Time:', max_up_time)
env.close()

plt.plot(UpTime)
plt.xlabel('Episode')
plt.ylabel('Max up-time');

# Question 

Implement deep Q-learning as described in the paper here:

https://daiwk.github.io/assets/dqn.pdf

In this problem, we observe the states directly, so there is no need to do the pre-processing described in the paper.

In my tests on this problem, it works substantially better than the SARSA  implementation 
with the following design choices:
* Use the same Q-network architecture as used  in the SARSA algorithm
* Same step size, discount factor, and learning rate as above
* Mini-batch size of 20
* Update the target network every 100 steps

The deep Q-learning method can be implemented via a modification of the SARSA code above.

You could probably make it work even better with further tuning.


In [None]:
# Implement this code below and test it.

In [9]:
class DQN(pt.nn.Module):
    """
    Feed-forward Q-network: maps a state and an action index to Q(s, a).

    The action index is one-hot encoded and appended to the state vector;
    the network maps this joint input to a single value.

    Parameters
    ----------
    stateDim : int
        Number of entries in a state vector.
    numActions : int
        Number of discrete actions (width of the one-hot encoding).
    numHiddenUnits : int
        Width of every hidden layer.
    numLayers : int
        Number of Linear+ReLU stages placed before the linear output layer.
    """
    
    def __init__(self,stateDim,numActions,numHiddenUnits,numLayers):
        super().__init__()
        
        # First stage takes the concatenated [state, one-hot action] vector.
        layers = [pt.nn.Linear(stateDim+numActions,numHiddenUnits),
                  pt.nn.ReLU()]
        
        for _ in range(numLayers-1):
            layers.append(pt.nn.Linear(numHiddenUnits,numHiddenUnits))
            layers.append(pt.nn.ReLU())
        
        # Single linear output: the Q-value estimate.
        layers.append(pt.nn.Linear(numHiddenUnits,1))
        
        self.net = pt.nn.Sequential(*layers)
        self.numActions = numActions
        
    def forward(self,x,a):
        """
        Evaluate Q(x, a).

        x may be a single state of shape (stateDim,) with a scalar action
        index a, or a batch of shape (B, stateDim) with a holding B action
        indices. The result has shape (1,) or (B, 1) respectively.
        """
        # as_tensor accepts arrays, lists and tensors without the
        # copy-construct warning pt.tensor emits for tensor inputs; detach
        # keeps the input out of the autograd graph, as before.
        x = pt.as_tensor(x,dtype=pt.float32).detach()
        a = pt.as_tensor(a,dtype=pt.int64)
        c = pt.nn.functional.one_hot(a,self.numActions).to(pt.float32)
        
        # Joining along the last axis handles both the single-sample and
        # the batched layout.
        y = pt.cat([x,c],dim=-1)
        
        return self.net(y)
            
class DQNAgent:
    """
    Deep Q-learning agent with a separate target network.

    `Q` is the network being trained and drives action selection; `Q_c` is
    the target network used to build regression targets. `Q_c` starts as a
    copy of `Q` and is re-synchronised every `targetUpdate` calls to
    `gradUpdate`.

    Parameters
    ----------
    stateDim, numActions, numHiddenUnits, numLayers :
        Forwarded to the Q-network constructor.
    epsilon : float
        Probability of choosing a uniformly random action.
    gamma : float
        Discount factor.
    alpha : float
        Gradient-descent step size.
    targetUpdate : int
        Number of gradient updates between target-network synchronisations.
    """
    def __init__(self,stateDim,numActions,numHiddenUnits,numLayers,
                epsilon=.1,gamma=.9,alpha=.1,targetUpdate=100):
        self.Q = DQN(stateDim,numActions,numHiddenUnits,numLayers)
        self.Q_c = DQN(stateDim,numActions,numHiddenUnits,numLayers)
        # Both networks must start from the same weights.
        self.Q_c.load_state_dict(self.Q.state_dict())
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        self.numActions = numActions
        self.targetUpdate = targetUpdate
        self.numUpdates = 0   # gradient updates performed so far
        # Not used by this agent; kept so code that resets `s_last`
        # (as the SARSA loop does) keeps working.
        self.s_last = None
        
    def action(self,x):
        """Return an epsilon-greedy action index for state x."""
        if rnd.rand() < self.epsilon:
            # Explore. Uses the agent's own action count rather than a
            # notebook-level global.
            return rnd.randint(self.numActions)
        
        # Exploit: first action attaining the largest Q-value.
        with pt.no_grad():
            qValues = [self.Q(x,aTest).item() for aTest in range(self.numActions)]
        return int(np.argmax(qValues))
    
    def update(self,s,a,r,s_next,done):
        """
        Return the Q-learning target for the transition (s, a, r, s_next).

        The target is r for a terminal transition, and otherwise
        r + gamma * max_a' Q_c(s_next, a'). s and a are accepted so a stored
        transition can be passed through unchanged; they do not affect the
        target.
        """
        if done:
            return float(r)
        
        with pt.no_grad():
            qNext = max(self.Q_c(s_next,aTest).item()
                        for aTest in range(self.numActions))
        return float(r) + self.gamma*qNext
            
    def gradUpdate(self,y,s_j_batch,a_batch):
        """
        Take one gradient-descent step on the mean squared error between the
        targets y and Q(s_j, a_j) over the mini-batch.

        y, s_j_batch and a_batch are equally long sequences of targets,
        states and action indices. An empty batch is a no-op.

        Raises
        ------
        ValueError
            If the three sequences differ in length.
        """
        batchSize = len(y)
        if not (batchSize == len(s_j_batch) == len(a_batch)):
            raise ValueError('y, s_j_batch and a_batch must have the same length')
        if batchSize == 0:
            return
        
        # Targets are plain floats, so gradients flow through Q only.
        sqErrors = [(float(target) - self.Q(s,a))**2
                    for target,s,a in zip(y,s_j_batch,a_batch)]
        loss = pt.stack(sqErrors).mean()
        
        self.Q.zero_grad()
        loss.backward()
        for p in self.Q.parameters():
            if p.grad is not None:
                # Descend the loss: w <- w - alpha * dLoss/dw
                p.data.add_(p.grad.data,alpha=-self.alpha)
        
        # Periodically copy the trained weights into the target network.
        self.numUpdates += 1
        if self.numUpdates % self.targetUpdate == 0:
            self.Q_c.load_state_dict(self.Q.state_dict())

        


In [None]:
# This is the environment
env = swingUp.SwingUpEnv()

# For simplicity, we only consider forces of -1 and 1
numActions = 2
Actions = np.linspace(-1,1,numActions)

# This is our learning agent
gamma = .95
agent = DQNAgent(5,numActions,20,1,epsilon=5e-2,gamma=gamma,alpha=1e-4)

maxSteps = 2e3
minibatch_size = 20
# x[2] is an angle; the agent is given its cosine and sine instead
x_to_y = lambda x : np.array([x[0],x[1],np.cos(x[2]),np.sin(x[2]),x[3]])

R = []       # average reward of each episode
UpTime = []  # info['max_up_time'] at the end of each episode
D = []       # replay memory of (state, action, reward, next state, done)
step = 0     # environment steps taken over all episodes
ep = 0       # episode counter
maxLen = 500 # cap on the number of steps in one episode, as in the SARSA run
# Number of stored transitions required before learning starts. It must be
# at least minibatch_size and well below maxSteps, otherwise no update ever
# happens.
observe = minibatch_size
while step < maxSteps:
    ep += 1
    x = env.reset() # initialize s
    C = 0.   # running mean of the rewards seen in this episode
    
    done = False
    t = 0    # steps taken in this episode
    while not done:
        t += 1 
        step += 1
        y = x_to_y(x)
        
        # Step 1: Pick At via epsilon-greedy 
        a = agent.action(y)
        u = Actions[a:a+1]
        env.render()
        
        # Step 2: Observe S_t+1 and R_t+1
        x_next,c,done,info = env.step(u)
 
        max_up_time = info['max_up_time']
        y_next = x_to_y(x_next)

        # Incremental mean over the t rewards seen so far
        C += (c-C)/t
        
        # Step 3: Store the transition in the replay memory
        D.append((y, a, c, y_next,done))
        
        if len(D) >= max(observe, minibatch_size):
            # Step 4: Select a mini batch B in D (uniformly, with replacement)
            minibatch_idx = rnd.randint(len(D), size=minibatch_size)
            minibatch = [D[i] for i in minibatch_idx]
            
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            
            # Step 5: One regression target per sampled transition
            y_batch = [agent.update(s_j,a_j,r_j,s_j1,term_j)
                       for (s_j,a_j,r_j,s_j1,term_j) in minibatch]
            
            # Step 6: Gradient step on the mini-batch loss
            agent.gradUpdate(y_batch,s_j_batch,a_batch)
        
        x = x_next
        if done or step >= maxSteps:
            break
            
        if t >= maxLen:
            break
            
    R.append(C)
    UpTime.append(max_up_time)
    print('Episode:',ep,'Total Steps:',step,', Ave. Reward:',C,', Episode Length:',t, 'Max Up-Time:', max_up_time)
env.close()

plt.plot(UpTime)
plt.xlabel('Episode')
plt.ylabel('Max up-time');