In [1]:
import random
import gym
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from collections import deque
from tensorflow.compat.v1.keras.models import Sequential
from tensorflow.compat.v1.keras.layers import Dense
from tensorflow.compat.v1.keras.optimizers import Adam
from scores.score_logger import ScoreLogger
import pickle

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
ENV_NAME = "LunarLander-v2"

GAMMA = 0.99
LEARNING_RATE = 0.001

MEMORY_SIZE = 1000000
BATCH_SIZE = 64

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.996

In [3]:
class DQNSolver:

    def __init__(self, observation_space, action_space):
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(150, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(120, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        if len(self.memory) < BATCH_SIZE:
            return
        
        batch = random.sample(self.memory, BATCH_SIZE)
        states = np.array([i[0] for i in batch])
        actions = np.array([i[1] for i in batch])
        rewards = np.array([i[2] for i in batch])
        next_states = np.array([i[3] for i in batch])
        dones = np.array([i[4] for i in batch])
        states = np.squeeze(states)
        next_states = np.squeeze(next_states)
        
        
        targets = rewards + GAMMA*(np.amax(self.model.predict_on_batch(next_states), axis=1))*(1-dones)
        targets_full = self.model.predict_on_batch(states)
        ind = np.array([i for i in range(BATCH_SIZE)])
        loss = (targets_full[[ind], [actions]] - targets)**2 
        targets_full[[ind], [actions]] = targets
        
        hist = self.model.fit(states, targets_full, epochs=1, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
        #raise
        return hist.history['loss']#np.mean(loss)


In [4]:
def lunarLander():
    env = gym.make(ENV_NAME)
    env.seed(0)
    np.random.seed(0)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    runLosses=[]
    scores =[]
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        score = 0
        maxStep = 3000
        meanLosses = []
        for i in range(maxStep):
            action = dqn_solver.act(state)
            env.render()
            state_next, reward, terminal, info = env.step(action)
            score = score+reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            meanLoss = dqn_solver.experience_replay()
            if meanLoss is not None:
                meanLosses.append(meanLoss)
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(score))
                break
        scores.append(score)
        runLosses.append(np.mean(meanLosses))
        pickle.dump(runLosses, open("runLossesOriginal.p", "wb" ) )  
        score_logger.add_score(score, run)    
    return runLosses

In [5]:
if __name__ == "__main__":
    runLosses = lunarLander()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Run: 1, exploration: 0.8796275632082743, score: -292.759921832735
Scores: (min: -292.759921832735, avg: -292.759921832735, max: -292.759921832735)

Run: 2, exploration: 0.6157153052379084, score: -456.98549550878226


  show_legend=True)


Scores: (min: -456.98549550878226, avg: -374.87270867075864, max: -292.759921832735)

Run: 3, exploration: 0.48996182025414065, score: -158.23723243334592
Scores: (min: -456.98549550878226, avg: -302.66088325828775, max: -158.23723243334592)

Run: 4, exploration: 0.15385479781989633, score: -88.40025647345298
Scores: (min: -456.98549550878226, avg: -249.09572656207905, max: -88.40025647345298)

Run: 5, exploration: 0.10429581231010568, score: -223.58506135106225
Scores: (min: -456.98549550878226, avg: -243.99359351987567, max: -88.40025647345298)

Run: 6, exploration: 0.023576613616952413, score: -383.1819513466203
Scores: (min: -456.98549550878226, avg: -267.19165315766645, max: -88.40025647345298)

Run: 7, exploration: 0.01, score: -110.3288211021972
Scores: (min: -456.98549550878226, avg: -244.78267714974228, max: -88.40025647345298)

Run: 8, exploration: 0.01, score: -198.12497895605406
Scores: (min: -456.98549550878226, avg: -238.95046487553125, max: -88.40025647345298)



KeyboardInterrupt: 

In [32]:
%debug

> [1;32mc:\users\orkun\appdata\local\temp\ipykernel_36220\2816908130.py[0m(47)[0;36mexperience_replay[1;34m()[0m

ipdb> hist.history['loss']
[0.9058142602443695]
ipdb> loss
array([[4.20656574, 4.52285822, 6.13318718, 7.23456013, 4.82113998,
        3.95630233, 1.55217302, 3.95989534, 4.05378362, 7.84526517,
        5.87739536, 0.67669191, 5.25335462, 1.32646247, 5.16987102,
        4.3693604 , 0.48815   , 3.49310592, 0.1750829 , 3.87635875,
        5.80637697, 0.23065196, 9.54079151, 1.8546273 , 3.14631195,
        0.70215007, 0.17903411, 2.7766446 , 5.38457289, 4.49160407,
        0.68234839, 0.19132463, 0.16791573, 2.34158108, 4.22180226,
        1.65105304, 4.57813014, 5.94218238, 4.91486319, 4.61877009,
        8.24936878, 5.03314463, 6.1347496 , 0.35750281, 3.64720983,
        2.25305383, 3.41674207, 8.49110111, 2.05003109, 2.9454179 ,
        7.32668949, 8.99285608, 3.98195419, 5.6555903 , 1.82523772,
        2.52895241, 2.80868734, 1.35615816, 4.23000466, 0.03295478,
      

In [1]:
import pickle
file = open("runLossesOriginal.p",'rb')
runLossesLoaded = pickle.load(file)
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
plt.plot(runLossesLoaded)
plt.xlabel("runs")
plt.ylabel("loss")
plt.title("Change in loss through runs")
plt.show()

  self.func()


In [6]:
%debug

> [1;32mc:\users\orkun\appdata\local\temp\ipykernel_3936\1402854254.py[0m(31)[0;36mlunarLander[1;34m()[0m

ipdb> meanLosses
[3.838341450335367, 3.6234550013577618, 3.5534881237640126, 3.4931691732048646, 3.73838579023756, 3.58266702655518, 3.5005787708407214, 4.349864538689298, 3.293609289912455, 157.4811770350078]
ipdb> np.mean(meanLosses)
19.0454736199905
ipdb> q


In [8]:
runLossesLoaded

[206.28685366234475, 97.41703394132968]