In [12]:
import gym
import random
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from tensorflow.compat.v1.keras import Sequential
from collections import deque
from tensorflow.compat.v1.keras.layers import Dense
from tensorflow.compat.v1.keras.optimizers import Adam
import matplotlib.pyplot as plt
from tensorflow.compat.v1.keras.activations import relu, linear
import pickle

In [13]:
import numpy as np

# One seed for every random number generator the notebook draws from.
SEED = 0

env = gym.make('LunarLander-v2')
env.seed(SEED)
np.random.seed(SEED)
# The agent picks exploratory actions and samples replay minibatches with the
# stdlib `random` module, so it has to be seeded as well as NumPy.
random.seed(SEED)
# Graph-level TensorFlow seed; set before any model is built so that weight
# initialisation is drawn from a seeded generator.
tf.set_random_seed(SEED)

[2021-11-28 17:17:43,227] Making new env: LunarLander-v2


In [14]:
class DQN:
    """Deep Q-learning agent with experience replay and epsilon-greedy exploration.

    The Q-function is a Keras ``Sequential`` network (see ``build_model``).
    Transitions are stored in a bounded ``deque`` and sampled uniformly at
    random for training in ``replay``.

    Args:
        action_space: Number of discrete actions (size of the network output).
        state_space: Number of features in one observation (network input size).
        epsilon: Initial probability of choosing a random action.
        gamma: Discount factor applied to the bootstrapped next-state value.
        batch_size: Number of transitions sampled per ``replay`` call.
        epsilon_min: ``epsilon`` stops decaying once it is no longer above this.
        lr: Learning rate handed to the Adam optimizer.
        epsilon_decay: Factor ``epsilon`` is multiplied by after each training step.
        memory_size: Maximum number of transitions kept in the replay buffer.
    """

    def __init__(self, action_space, state_space, epsilon=1.0, gamma=.99,
                 batch_size=64, epsilon_min=.01, lr=0.001,
                 epsilon_decay=.996, memory_size=1000000):

        self.action_space = action_space
        self.state_space = state_space
        self.epsilon = epsilon
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon_min = epsilon_min
        self.lr = lr
        self.epsilon_decay = epsilon_decay
        # Oldest transitions are dropped automatically once the buffer is full.
        self.memory = deque(maxlen=memory_size)
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network.

        Two hidden ReLU layers (150 and 120 units) followed by a linear layer
        with one output per action, trained with MSE loss and Adam.

        Returns:
            The compiled Keras model.
        """
        model = Sequential()
        model.add(Dense(150, input_dim=self.state_space, activation=relu))
        model.add(Dense(120, activation=relu))
        model.add(Dense(self.action_space, activation=linear))
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition to the buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation in the batch form accepted by ``model.predict``;
                only the first row of the prediction is used.

        Returns:
            int: A uniformly random action with probability ``epsilon``,
            otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        act_values = self.model.predict(state)
        # Cast so both branches return a plain Python int.
        return int(np.argmax(act_values[0]))

    def replay(self):
        """Run one training step on a random minibatch from the replay buffer.

        Regression targets equal the network's current predictions, except for
        the action actually taken, whose target becomes
        ``reward + gamma * max_a Q(next_state, a) * (1 - done)``.
        After fitting, ``epsilon`` is decayed while it is above ``epsilon_min``.

        Returns:
            ``None`` if the buffer holds fewer than ``batch_size`` transitions;
            otherwise the mean squared difference between the pre-update
            Q-values of the taken actions and their targets.
        """
        if len(self.memory) < self.batch_size:
            return None

        minibatch = random.sample(self.memory, self.batch_size)
        batch = len(minibatch)

        # Flatten each stored observation to one row.  A bare np.squeeze would
        # also collapse the batch axis when batch_size == 1 (or the feature
        # axis when state_space == 1) and feed the network a mis-shaped array.
        states = np.array([t[0] for t in minibatch]).reshape(batch, -1)
        actions = np.array([t[1] for t in minibatch])
        rewards = np.array([t[2] for t in minibatch])
        next_states = np.array([t[3] for t in minibatch]).reshape(batch, -1)
        dones = np.array([t[4] for t in minibatch])

        next_q = self.model.predict_on_batch(next_states)
        # (1 - dones) zeroes the bootstrap term for terminal transitions.
        targets = rewards + self.gamma * np.amax(next_q, axis=1) * (1 - dones)

        # Copy so the in-place target assignment below always has a writable array.
        targets_full = np.array(self.model.predict_on_batch(states))
        rows = np.arange(batch)
        loss = (targets_full[rows, actions] - targets) ** 2
        targets_full[rows, actions] = targets

        self.model.fit(states, targets_full, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return np.mean(loss)


In [18]:
def train_dqn(episode, max_steps=3000, render=True):
    """Train a fresh ``DQN`` agent on the module-level ``env``.

    Each episode runs for at most ``max_steps`` steps: the agent acts, the
    transition is stored, and one ``agent.replay()`` training step is run.
    Training stops early once the mean score of the most recent (up to) 100
    episodes exceeds 200.

    Args:
        episode: Maximum number of episodes to run.
        max_steps: Step cap per episode.
        render: Whether to call ``env.render()`` on every step.

    Returns:
        tuple: ``(scores, mean_errors)``: the total reward of each episode and
        the mean of the values returned by ``agent.replay()`` during that
        episode (NaN for an episode in which no training step ran).
    """
    # Take the observation size from the environment instead of hard-coding it.
    state_dim = env.observation_space.shape[0]
    agent = DQN(env.action_space.n, state_dim)

    scores = []
    mean_errors = []
    for e in range(episode):
        state = np.reshape(env.reset(), (1, state_dim))
        score = 0
        step_errors = []
        for _ in range(max_steps):
            action = agent.act(state)
            if render:
                env.render()
            next_state, reward, done, _info = env.step(action)
            score += reward
            next_state = np.reshape(next_state, (1, state_dim))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            # replay() returns None until the buffer holds a full minibatch.
            error = agent.replay()
            if error is not None:
                step_errors.append(error)
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        scores.append(score)
        # np.mean([]) would emit a RuntimeWarning; record NaN explicitly instead.
        mean_errors.append(np.mean(step_errors) if step_errors else float('nan'))

        # Average score of last 100 episode
        is_solved = np.mean(scores[-100:])
        if is_solved > 200:
            print('\n Task Completed! \n')
            break
        print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
    return scores, mean_errors

In [19]:
if __name__ == '__main__':

    print(env.observation_space)
    print(env.action_space)
    episodes = 400
    # `loss` holds the per-episode scores returned by train_dqn,
    # `meanError` the per-episode mean training error.
    loss, meanError = train_dqn(episodes)

    # Context managers flush and close the files even if pickling fails.
    with open("runLosses.p", "wb") as scores_file:
        pickle.dump(loss, scores_file)
    with open("meanErrors.p", "wb") as errors_file:
        pickle.dump(meanError, errors_file)

    # Plot every second episode, numbered from 1.
    plt.plot(list(range(1, len(loss) + 1, 2)), loss[::2])
    plt.xlabel("Episode")
    plt.ylabel("Score")
    plt.show()

Box(8,)
Discrete(4)
episode: 0/400, score: -384.5824241681966
Average over last 100 episode: -384.58 

episode: 1/400, score: -193.93825171306588
Average over last 100 episode: -289.26 

episode: 2/400, score: -397.5105277991687
Average over last 100 episode: -325.34 

episode: 3/400, score: -192.3152222283839
Average over last 100 episode: -292.09 

episode: 4/400, score: -329.0991301417556
Average over last 100 episode: -299.49 

episode: 5/400, score: -319.0329241226091
Average over last 100 episode: -302.75 

episode: 6/400, score: -190.43118212083948
Average over last 100 episode: -286.70 

episode: 7/400, score: -238.99352374450507
Average over last 100 episode: -280.74 

episode: 8/400, score: -158.99164664009126
Average over last 100 episode: -267.21 

episode: 9/400, score: -132.03462188367507
Average over last 100 episode: -253.69 

episode: 10/400, score: -87.71467616297826
Average over last 100 episode: -238.60 

episode: 11/400, score: -181.68811832963706
Average over last

episode: 98/400, score: 142.2959258315939
Average over last 100 episode: -70.85 

episode: 99/400, score: -72.56441637593053
Average over last 100 episode: -70.87 

episode: 100/400, score: 199.7259124396487
Average over last 100 episode: -65.02 

episode: 101/400, score: 194.61895157884007
Average over last 100 episode: -61.14 

episode: 102/400, score: 223.39165788716014
Average over last 100 episode: -54.93 

episode: 103/400, score: 181.58780758177338
Average over last 100 episode: -51.19 

episode: 104/400, score: 222.28247140926328
Average over last 100 episode: -45.67 

episode: 105/400, score: 226.95551387156718
Average over last 100 episode: -40.21 

episode: 106/400, score: 170.13769067134243
Average over last 100 episode: -36.61 

episode: 107/400, score: 197.09395567883115
Average over last 100 episode: -32.25 

episode: 108/400, score: -42.899073919838436
Average over last 100 episode: -31.09 

episode: 109/400, score: 179.1893438166622
Average over last 100 episode: -27.9

episode: 197/400, score: 178.30403978190267
Average over last 100 episode: 143.45 

episode: 198/400, score: 198.2421142993595
Average over last 100 episode: 144.01 

episode: 199/400, score: 151.8804390640485
Average over last 100 episode: 146.26 

episode: 200/400, score: 214.14194025830733
Average over last 100 episode: 146.40 

episode: 201/400, score: 226.98416717995968
Average over last 100 episode: 146.72 

episode: 202/400, score: 190.85800781147773
Average over last 100 episode: 146.40 

episode: 203/400, score: 216.2660760460273
Average over last 100 episode: 146.75 

episode: 204/400, score: 200.81790953765264
Average over last 100 episode: 146.53 

episode: 205/400, score: 202.02191248101767
Average over last 100 episode: 146.28 

episode: 206/400, score: 226.6668851486878
Average over last 100 episode: 146.85 

episode: 207/400, score: 61.557473792282686
Average over last 100 episode: 145.49 

episode: 208/400, score: 182.43905054821064
Average over last 100 episode: 147.7

episode: 295/400, score: 226.39131298730064
Average over last 100 episode: 176.84 

episode: 296/400, score: 207.18836688766515
Average over last 100 episode: 177.22 

episode: 297/400, score: 218.6899078793574
Average over last 100 episode: 177.62 

episode: 298/400, score: 202.6030829754955
Average over last 100 episode: 177.66 

episode: 299/400, score: -105.14297788205371
Average over last 100 episode: 175.09 

episode: 300/400, score: 178.89302648515425
Average over last 100 episode: 174.74 

episode: 301/400, score: 206.62275043592913
Average over last 100 episode: 174.54 

episode: 302/400, score: 204.71766886794887
Average over last 100 episode: 174.68 

episode: 303/400, score: 214.38003859871446
Average over last 100 episode: 174.66 

episode: 304/400, score: 178.7837394713144
Average over last 100 episode: 174.44 

episode: 305/400, score: 239.04041850650412
Average over last 100 episode: 174.81 

episode: 306/400, score: 212.95098375074022
Average over last 100 episode: 174

episode: 393/400, score: 198.17108438597847
Average over last 100 episode: 168.27 

episode: 394/400, score: 216.6261913302091
Average over last 100 episode: 168.22 

episode: 395/400, score: 224.5970791900989
Average over last 100 episode: 168.21 

episode: 396/400, score: 216.82012173767964
Average over last 100 episode: 168.30 

episode: 397/400, score: 212.4230230469897
Average over last 100 episode: 168.24 

episode: 398/400, score: 227.10424639754348
Average over last 100 episode: 168.48 

episode: 399/400, score: -678.8787993763048
Average over last 100 episode: 162.75 



NameError: name 'pickle' is not defined

In [17]:
%debug

> [1;32md:\miniconda\envs\tf-gpu\lib\site-packages\numpy\core\_methods.py[0m(178)[0;36m_mean[1;34m()[0m
[1;32m    176 [1;33m            [0mis_float16_result[0m [1;33m=[0m [1;32mTrue[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    177 [1;33m[1;33m[0m[0m
[0m[1;32m--> 178 [1;33m    [0mret[0m [1;33m=[0m [0mumr_sum[0m[1;33m([0m[0marr[0m[1;33m,[0m [0maxis[0m[1;33m,[0m [0mdtype[0m[1;33m,[0m [0mout[0m[1;33m,[0m [0mkeepdims[0m[1;33m,[0m [0mwhere[0m[1;33m=[0m[0mwhere[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    179 [1;33m    [1;32mif[0m [0misinstance[0m[1;33m([0m[0mret[0m[1;33m,[0m [0mmu[0m[1;33m.[0m[0mndarray[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    180 [1;33m        ret = um.true_divide(
[0m
ipdb> u
> [1;32md:\miniconda\envs\tf-gpu\lib\site-packages\numpy\core\fromnumeric.py[0m(3419)[0;36mmean[1;34m()[0m
[1;32m   3417 [1;33m            [1;32mreturn[0m [0mmean[0m[1;33m([0m[0m