In [1]:
import gym, random
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np
from collections import deque

class DQLAgent:
    def __init__(self, env):
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9993
        self.gamma = 0.99
        self.learning_rate = 0.0001
        self.memory = deque(maxlen=4000)
        self.model = self.build_model()
        self.target_model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(64, input_dim=self.state_size, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, s):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_size)
        act_values = self.model.predict(s)
        return np.argmax(act_values[0])

    def replay(self,batch_size):
        "vectorized replay method"
        if len(agent.memory) < batch_size:
            return
        # Vectorized method for experience replay
        minibatch = random.sample(self.memory, batch_size)
        minibatch = np.array(minibatch)
        not_done_indices = np.where(minibatch[:, 4] == False)
        y = np.copy(minibatch[:, 2])

        # If minibatch contains any non-terminal states, use separate update rule for those states
        if len(not_done_indices[0]) > 0:
            predict_sprime = self.model.predict(np.vstack(minibatch[:, 3]))
            predict_sprime_target = self.target_model.predict(np.vstack(minibatch[:, 3]))
            
            # Non-terminal update rule
            y[not_done_indices] += np.multiply(self.gamma, predict_sprime_target[not_done_indices, np.argmax(predict_sprime[not_done_indices, :][0], axis=1)][0])

        actions = np.array(minibatch[:, 1], dtype=int)
        y_target = self.model.predict(np.vstack(minibatch[:, 0]))
        y_target[range(batch_size), actions] = y
        self.model.fit(np.vstack(minibatch[:, 0]), y_target, epochs=1, verbose=0)

    def adaptiveEGreedy(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    def targetModelUpdate(self):
        self.target_model.set_weights(self.model.get_weights())    
    
    def load(self, name):
        self.model = load_model(name)
            
    def save(self, name):
        self.model.save(name)

In [None]:
if __name__ == "__main__":
    env = gym.make('LunarLander-v2')

    agent = DQLAgent(env)
    state_number = env.observation_space.shape[0]
    
    batch_size = 32
    episodes = 10000
    for e in range(episodes):
        
        state = env.reset()
        state = np.reshape(state, [1, state_number])

        total_reward = 0
        for time in range(1000):
            
            env.render()

            # act
            action = agent.act(state)
            
            # step
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_number])

            # remember / storage
            agent.remember(state, action, reward, next_state, done)

            # update state
            state = next_state

            #Perform experience replay if memory length is greater than minibatch length
            agent.replay(batch_size)

            total_reward += reward
            
            if done:
                agent.targetModelUpdate()
                if total_reward == 500:
                    print("Saving trained model as lunarlander-dqn.h5")
                    agent.save("lunarlander-dqn.h5")
                break
                
        #adjust epsilon
        agent.adaptiveEGreedy()
            
        print("Episode: {}/{}, reward: {}".format(e+1, episodes, total_reward))
            
    env.close()

  minibatch = np.array(minibatch)


Episode: 1/10000, reward: -113.34363104912705
Episode: 2/10000, reward: -163.31231367972086
Episode: 3/10000, reward: -89.04481535106243
Episode: 4/10000, reward: -230.1926709446238
Episode: 5/10000, reward: -241.12729945980615
Episode: 6/10000, reward: -162.78202631661546
Episode: 7/10000, reward: -430.63715623669486
Episode: 8/10000, reward: -184.12009769282517
Episode: 9/10000, reward: -106.0218858688758
Episode: 10/10000, reward: -479.89382031642776
Episode: 11/10000, reward: -92.96834846633345
Episode: 12/10000, reward: -496.0315411608873
Episode: 13/10000, reward: -97.89034835011772
Episode: 14/10000, reward: -415.8421637609812
Episode: 15/10000, reward: -403.130426263975
Episode: 16/10000, reward: -153.32047097691347
Episode: 17/10000, reward: -162.3715467347577
Episode: 18/10000, reward: -167.0279743347399
Episode: 19/10000, reward: -62.03641245226136
Episode: 20/10000, reward: -103.11253969743237
Episode: 21/10000, reward: -225.29512432417528
Episode: 22/10000, reward: -287.79

Episode: 177/10000, reward: -91.13582338054977
Episode: 178/10000, reward: -114.05343571775136
Episode: 179/10000, reward: -285.86261307510176
Episode: 180/10000, reward: -115.20532963771771
Episode: 181/10000, reward: -122.83252760469841
Episode: 182/10000, reward: -114.84271507119973
Episode: 183/10000, reward: -292.94589714514234
Episode: 184/10000, reward: -181.5255905334368
Episode: 185/10000, reward: -117.55361218448806
Episode: 186/10000, reward: -72.15400044365819
Episode: 187/10000, reward: -265.09022231007054
Episode: 188/10000, reward: -275.4125687533789
Episode: 189/10000, reward: -255.07205161086284
Episode: 190/10000, reward: -298.35095777194306
Episode: 191/10000, reward: -177.48145032309216
Episode: 192/10000, reward: -88.5839464977957
Episode: 193/10000, reward: -80.45066146158598
Episode: 194/10000, reward: -140.33593346191344
Episode: 195/10000, reward: -61.688320125552075
Episode: 196/10000, reward: -169.06132828785184
Episode: 197/10000, reward: -128.80924762024733

Episode: 350/10000, reward: -93.57720843898184
Episode: 351/10000, reward: -165.66082667264374
Episode: 352/10000, reward: -90.92551123043074
Episode: 353/10000, reward: -70.05809523878712
Episode: 354/10000, reward: -9.302003654246732
Episode: 355/10000, reward: -85.60023659727278
Episode: 356/10000, reward: -18.83682782134015
Episode: 357/10000, reward: -162.8445662155479
Episode: 358/10000, reward: -56.58735491628347
Episode: 359/10000, reward: -117.30493185406914
Episode: 360/10000, reward: -185.28826019559293
Episode: 361/10000, reward: -113.07237402615252
Episode: 362/10000, reward: -104.04131588804434
Episode: 363/10000, reward: -66.81364941890423
Episode: 364/10000, reward: -86.03940397544417
Episode: 365/10000, reward: -51.847727061738254
Episode: 366/10000, reward: -83.9798653810213
Episode: 367/10000, reward: -120.44804416911474
Episode: 368/10000, reward: -97.5205120629171
Episode: 369/10000, reward: -88.39001497115554
Episode: 370/10000, reward: -115.90145304555953
Episode

Episode: 524/10000, reward: -60.5333321318223
Episode: 525/10000, reward: -253.0599719286048
Episode: 526/10000, reward: -161.6076515703898
Episode: 527/10000, reward: -181.58346285237826
Episode: 528/10000, reward: -68.74510280847228
Episode: 529/10000, reward: -93.08003076857003
Episode: 530/10000, reward: -82.96503157463374
Episode: 531/10000, reward: -99.6215926713993
Episode: 532/10000, reward: -63.88549564317385
Episode: 533/10000, reward: -76.51452065594938
Episode: 534/10000, reward: -65.35001295229547
Episode: 535/10000, reward: -130.60061998238353
Episode: 536/10000, reward: -106.35185258895989
Episode: 537/10000, reward: -98.10473520897546
Episode: 538/10000, reward: -4.17379416520555
Episode: 539/10000, reward: -199.9687976656062
Episode: 540/10000, reward: -198.16680341627352
Episode: 541/10000, reward: -8.58450822446072
Episode: 542/10000, reward: -109.65111068998979
Episode: 543/10000, reward: -228.62402942055803
Episode: 544/10000, reward: -114.38168054541339
Episode: 5

Episode: 698/10000, reward: -94.151379297491
Episode: 699/10000, reward: -94.11559761481182
Episode: 700/10000, reward: -33.611159553839826
Episode: 701/10000, reward: -52.717042439283446
Episode: 702/10000, reward: 5.9246112574175385
Episode: 703/10000, reward: -92.9001057288618
Episode: 704/10000, reward: -58.23791619654001
Episode: 705/10000, reward: -19.73030576223877
Episode: 706/10000, reward: 8.500293470678713
Episode: 707/10000, reward: 2.580494183037999
Episode: 708/10000, reward: 15.677756174733645
Episode: 709/10000, reward: 13.98528208372909
Episode: 710/10000, reward: -68.94432730713977
Episode: 711/10000, reward: 3.044824977206588
Episode: 712/10000, reward: -35.2612922916755
Episode: 713/10000, reward: -71.87955664283832
Episode: 714/10000, reward: -11.007778036296656
Episode: 715/10000, reward: 27.17814859549445
Episode: 716/10000, reward: -136.4538073086719
Episode: 717/10000, reward: -3.9465474675763232
Episode: 718/10000, reward: -62.14420818611997
Episode: 719/10000

Episode: 872/10000, reward: -83.63237729700572
Episode: 873/10000, reward: -45.13438835815278
Episode: 874/10000, reward: -13.311082799714256
Episode: 875/10000, reward: -34.06501245572274
Episode: 876/10000, reward: -1.1373406290261698
Episode: 877/10000, reward: -31.06676797059623
Episode: 878/10000, reward: -152.56627316784056
Episode: 879/10000, reward: -329.4812310460298
Episode: 880/10000, reward: -71.49575901618968
Episode: 881/10000, reward: -49.66362564660763
Episode: 882/10000, reward: -41.472617985605176
Episode: 883/10000, reward: -88.21218184852455
Episode: 884/10000, reward: 33.39074108480892
Episode: 885/10000, reward: -126.22923638595998
Episode: 886/10000, reward: -77.46306516286741
Episode: 887/10000, reward: -15.867773219992301
Episode: 888/10000, reward: -75.2977857886293
Episode: 889/10000, reward: -47.80833009584531
Episode: 890/10000, reward: -42.97829073014293
Episode: 891/10000, reward: -110.31345960559273
Episode: 892/10000, reward: 5.739416049744705
Episode: 

Episode: 1046/10000, reward: -28.436984580239155
Episode: 1047/10000, reward: 66.00497609441406
Episode: 1048/10000, reward: 28.757034281541166
Episode: 1049/10000, reward: -16.23256888918833
Episode: 1050/10000, reward: -53.586160868040785
Episode: 1051/10000, reward: 74.59282448793537
Episode: 1052/10000, reward: 84.40468380851031
Episode: 1053/10000, reward: -105.03563143647317
Episode: 1054/10000, reward: -0.26823440878457916
Episode: 1055/10000, reward: -8.406657962298624
Episode: 1056/10000, reward: -4.790737726491854
Episode: 1057/10000, reward: -44.24631900520377
Episode: 1058/10000, reward: -175.04776628867495
Episode: 1059/10000, reward: 92.196814499454
Episode: 1060/10000, reward: -35.02123087379552
Episode: 1061/10000, reward: -45.389280403342
Episode: 1062/10000, reward: -4.36708988699489
Episode: 1063/10000, reward: 16.27577289213582
Episode: 1064/10000, reward: 79.88517897265186
Episode: 1065/10000, reward: 58.146030012299946
Episode: 1066/10000, reward: -44.879050640726

In [None]:
import time
trained_model = agent
state = env.reset()
state = np.reshape(state, [1,env.observation_space.shape[0]])
time_t = 0
while True:
    env.render()
    action = trained_model.act(state)
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1,env.observation_space.shape[0]])
    state = next_state
    time_t += 1
    print(time_t)
    #time.sleep(0.4)
    if done:
        break
print("Done")