I lernt from the following websites to finish the implementations.
Reference:
1. Hung-yi Lee. DRL Lecture 3: Q-learning (Basic Idea)
https://youtu.be/o_g9JUMw1Oc
2. Hung-yi Lee. DRL Lecture 4: Q-learning (Advanced Tips)
https://www.youtube.com/watch?v=2-zGCx4iv_k&t=980s
3. Tensorflow
https://www.tensorflow.org/api_docs/python/tf/keras/Model
4. Tom. Let's build a DQN: simple implementation
https://tomroth.com.au/dqn-simple/
5. Reinforcement Learning tutorial
https://pylessons.com/CartPole-DDDQN/

In [1]:
import gym
import slimevolleygym
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import tensorflow as tf
from random import sample

# Setting up the environment 
env = gym.make("SlimeVolley-v0")

In [2]:
def create_model(name):
    input_layer = tf.keras.Input(shape = (12,))
    H1 = tf.keras.layers.Dense(128, activation='relu')(input_layer)
    H2 = tf.keras.layers.Dense(128, activation='relu')(H1)
    H3 = tf.keras.layers.Dense(32, activation='relu')(H2)
    # the following two lines derived from the website.
    #https://stackoverflow.com/questions/62336594/dueling-dqn-with-keras
    V_s = tf.keras.layers.Dense(1)(H3) #https://stackoverflow.com/questions/62336594/dueling-dqn-with-keras
    advantage = tf.keras.layers.Dense(8)(H3) #https://stackoverflow.com/questions/62336594/dueling-dqn-with-keras
    output_layer = (V_s + (advantage - tf.math.reduce_mean(advantage, axis=1, keepdims=True)))
    model = tf.keras.Model(inputs = input_layer, outputs = output_layer,name = name)
    model.compile(optimizer='adam',
                  loss="mse",
                  metrics=['accuracy'])
    model.summary()
    return model
model = create_model(name = "QNN")
# Target network
model2 = create_model(name = "TagretNetwork")


Model: "QNN"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 12)]         0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 128)          1664        input_1[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          16512       dense[0][0]                      
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 32)           4128        dense_1[0][0]                    
________________________________________________________________________________________________

In [3]:
# input_layer = tf.keras.Input(shape = (12,))
# H1 = tf.keras.layers.Dense(128, activation='relu')(input_layer)
# H2 = tf.keras.layers.Dense(128, activation='relu')(H1)
# H3 = tf.keras.layers.Dense(32, activation='relu')(H2)
# V_s = tf.keras.layers.Dense(1)(H3) #https://stackoverflow.com/questions/62336594/dueling-dqn-with-keras
# advantage = tf.keras.layers.Dense(8)(H3) #https://stackoverflow.com/questions/62336594/dueling-dqn-with-keras
# advantage =  tf.keras.layers.Lambda(lambda a: a[:, :] - tf.math.reduce_mean(a[:, :], keepdims=True), output_shape=(8,))(advantage)
# output_layer = tf.keras.layers.Add()([V_s, advantage])
# model = tf.keras.Model(inputs = input_layer, outputs = output_layer)
# model.compile(optimizer='adam',
#               loss="mse",
#               metrics=['accuracy'])
# model.summary()

In [4]:
def action_inverse(num):
    if num == 0:
        return np.array([0,0,0])
    elif num == 1:
        return np.array([1,0,0])
    elif num == 2:
        return np.array([0,1,0])
    elif num == 3:
        return np.array([0,0,1])
    elif num == 4:
        return np.array([1,1,0])
    elif num == 5:
        return np.array([1,0,1])
    elif num == 6:
        return np.array([0,1,1])
    elif num == 7:
        return np.array([1,1,1])

def replay(buffer, minibatch_size,timestep,epsoide_num):
    if timestep <= 5000:
        return model
    else:
        minibatch = np.random.choice(buffer, minibatch_size, replace=False)
        s_l =      np.array(list(map(lambda x: x['s'], minibatch)))
        a_l =      np.array(list(map(lambda x: x['a'], minibatch)))
        r_l =      np.array(list(map(lambda x: x['r'], minibatch)))
        sprime_l = np.array(list(map(lambda x: x['sprime'], minibatch)))
        done_l   = np.array(list(map(lambda x: x['done'], minibatch)))
        current_qs = model.predict(s_l)
        Q_qnn = model.predict(sprime_l)
        Q_tgt = model2.predict(sprime_l)

        if epsoide_num % 10 == 0 and epsoide_num != 0:
            model2.set_weights(model.get_weights())   
        # q-update

        for i,(s,a,r,done,sprimeQnn,sprimeTgt) in enumerate(zip(s_l,a_l,r_l,done_l,Q_qnn,Q_tgt)):
            Q_DDQN = sprimeTgt[np.argmax(sprimeQnn)]
            if not done:  target = r + Q_DDQN
            else:         target = r
            # Update Q value for given state
            current_qs[i][a] = target
            # And append to our training data
        model.fit(s_l,current_qs, batch_size= minibatch_size, epochs=1,verbose=0)
        return model

In [5]:
from random import randrange

# hyper parameter
n_episodes = 3500
epsilon = 0.9
minibatch_size = 40
r_sums = []  # stores rewards of each epsiode 
buffer = [] # replay memory holds s, a, r, s'
win_weight_eps_tstep = []
mem_max_size = 30000
init_seed=123
timestep = 0
eps = 0
policy0 = slimevolleygym.BaselinePolicy()
np.random.seed(123)


for n in range(n_episodes): 
    env.seed(seed=init_seed+n)
    s = env.reset()
    s_opponent = s
    done=False
    r_sum = 0
    while not done: 
        # Uncomment this to see the agent learning
        # env.render()
        # Feedforward pass for current state to get predicted q-values for all actions 
        qvals_s = model.predict(s.reshape(1,12))
        # Choose action to be epsilon-greedy
        if np.random.random() < epsilon:  
            a = randrange(8)
        else:                             
            a = np.argmax(qvals_s)
        action1 = policy0.predict(s_opponent) #opponent policy
        a_transfor = action_inverse(a)  
        # Take step, store results 
        sprime, r, done, info = env.step(a_transfor, action1)
        r_sum += r 
        # add to memory, respecting memory buffer limit 
        if len(buffer) > mem_max_size:
            buffer.pop(0)
        buffer.append({"s":s,"a":a,"r":r,"sprime":sprime,"done":done})
        # Update state
        s=sprime
        s_opponent = info['otherObs']
        # Train the nnet that approximates q(s,a), using the replay memory
        model=replay(buffer, minibatch_size = minibatch_size, timestep = timestep, epsoide_num = n)
        # Decrease epsilon until we hit a target threshold 
        timestep += 1
        if timestep % 5000 == 0 and timestep != 0:
            print("timestep",timestep)
    if r_sum > 0:
        win_weight_eps_tstep.append((model.get_weights(),n,timestep))    
    epsilon -= 0.0005
    if epsilon <= 0.0005:
        eps += 0.001
        epsilon = 0.001
        if eps == 0.001:
            print("In the timestep of ", timestep, ", epsilon is fixed at",epsilon)     
    #print("Total reward:", r_sum)
    r_sums.append(r_sum)
    if n % 100 == 0: print(n)
    if n % 10 == 0: 
        print("r_score", np.round(np.mean(np.array(r_sums)), 3), "±", 2*np.round(np.std(np.array(r_sums)), 3), "vs", "Baseline", "over", n+1, "trials.")
        print("rsums_lastest10",r_sums[-10:])

0
r_score -5.0 ± 0.0 vs Baseline over 1 trials.
rsums_lastest10 [-5]
timestep 5000
r_score -4.636 ± 1.762 vs Baseline over 11 trials.
rsums_lastest10 [-5, -5, -2, -4, -5, -5, -5, -5, -5, -5]
timestep 10000
r_score -4.762 ± 1.366 vs Baseline over 21 trials.
rsums_lastest10 [-5, -5, -5, -5, -4, -5, -5, -5, -5, -5]
timestep 15000
r_score -4.806 ± 1.182 vs Baseline over 31 trials.
rsums_lastest10 [-5, -5, -5, -5, -4, -5, -5, -5, -5, -5]
timestep 20000
r_score -4.854 ± 1.042 vs Baseline over 41 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 25000
r_score -4.882 ± 0.942 vs Baseline over 51 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 30000
timestep 35000
r_score -4.869 ± 0.922 vs Baseline over 61 trials.
rsums_lastest10 [-5, -5, -5, -5, -4, -5, -5, -5, -4, -5]
timestep 40000
r_score -4.859 ± 0.966 vs Baseline over 71 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -3, -5]
timestep 45000
r_score -4.864 ± 0.93 vs Baseline over 81

timestep 375000
r_score -4.843 ± 0.814 vs Baseline over 651 trials.
rsums_lastest10 [-5, -5, -5, -3, -5, -5, -5, -5, -4, -5]
timestep 380000
r_score -4.846 ± 0.81 vs Baseline over 661 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 385000
r_score -4.848 ± 0.804 vs Baseline over 671 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 390000
r_score -4.849 ± 0.802 vs Baseline over 681 trials.
rsums_lastest10 [-5, -5, -5, -5, -4, -5, -5, -5, -5, -5]
timestep 395000
r_score -4.851 ± 0.796 vs Baseline over 691 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 400000
700
r_score -4.847 ± 0.802 vs Baseline over 701 trials.
rsums_lastest10 [-5, -5, -5, -4, -4, -5, -5, -4, -4, -5]
timestep 405000
r_score -4.845 ± 0.804 vs Baseline over 711 trials.
rsums_lastest10 [-5, -5, -5, -4, -5, -4, -5, -5, -5, -4]
timestep 410000
timestep 415000
r_score -4.845 ± 0.804 vs Baseline over 721 trials.
rsums_lastest10 [-5, -4, -5, -5, -5, -

r_score -4.861 ± 0.764 vs Baseline over 1291 trials.
rsums_lastest10 [-5, -4, -5, -5, -5, -3, -5, -4, -5, -4]
timestep 745000
1300
r_score -4.862 ± 0.762 vs Baseline over 1301 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 750000
r_score -4.863 ± 0.758 vs Baseline over 1311 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 755000
r_score -4.863 ± 0.758 vs Baseline over 1321 trials.
rsums_lastest10 [-5, -4, -5, -5, -5, -5, -5, -4, -5, -5]
timestep 760000
r_score -4.863 ± 0.758 vs Baseline over 1331 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -4]
timestep 765000
r_score -4.861 ± 0.766 vs Baseline over 1341 trials.
rsums_lastest10 [-5, -3, -5, -5, -5, -5, -5, -5, -4, -4]
timestep 770000
r_score -4.862 ± 0.762 vs Baseline over 1351 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 775000
r_score -4.863 ± 0.762 vs Baseline over 1361 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -4]
tim

timestep 1100000
r_score -4.864 ± 0.748 vs Baseline over 1931 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -4, -5, -5, -5, -5]
timestep 1105000
r_score -4.864 ± 0.748 vs Baseline over 1941 trials.
rsums_lastest10 [-5, -5, -4, -5, -4, -5, -5, -5, -5, -5]
timestep 1110000
r_score -4.864 ± 0.75 vs Baseline over 1951 trials.
rsums_lastest10 [-5, -4, -5, -5, -4, -5, -5, -5, -5, -5]
timestep 1115000
r_score -4.864 ± 0.748 vs Baseline over 1961 trials.
rsums_lastest10 [-5, -4, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1120000
r_score -4.865 ± 0.746 vs Baseline over 1971 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1125000
r_score -4.864 ± 0.75 vs Baseline over 1981 trials.
rsums_lastest10 [-5, -5, -5, -5, -4, -5, -5, -5, -5, -3]
timestep 1130000
timestep 1135000
r_score -4.863 ± 0.752 vs Baseline over 1991 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -4, -5, -4, -5]
timestep 1140000
2000
r_score -4.864 ± 0.75 vs Baseline over 2001 trials.
rsums_lastest10 [-5, -

timestep 1460000
timestep 1465000
r_score -4.867 ± 0.74 vs Baseline over 2571 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1470000
r_score -4.867 ± 0.74 vs Baseline over 2581 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1475000
r_score -4.868 ± 0.738 vs Baseline over 2591 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1480000
2600
r_score -4.869 ± 0.736 vs Baseline over 2601 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1485000
r_score -4.869 ± 0.736 vs Baseline over 2611 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1490000
r_score -4.869 ± 0.736 vs Baseline over 2621 trials.
rsums_lastest10 [-5, -5, -5, -4, -5, -5, -5, -5, -5, -4]
timestep 1495000
r_score -4.869 ± 0.734 vs Baseline over 2631 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1500000
r_score -4.869 ± 0.734 vs Baseline over 2641 trials.
rsums_lastest10 [-5, 

timestep 1835000
timestep 1840000
3200
r_score -4.865 ± 0.748 vs Baseline over 3201 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -4, -5, -5, -5, -4]
timestep 1845000
r_score -4.865 ± 0.748 vs Baseline over 3211 trials.
rsums_lastest10 [-5, -5, -5, -4, -5, -5, -5, -5, -5, -5]
timestep 1850000
r_score -4.865 ± 0.748 vs Baseline over 3221 trials.
rsums_lastest10 [-5, -5, -5, -4, -5, -5, -5, -5, -5, -5]
timestep 1855000
r_score -4.865 ± 0.746 vs Baseline over 3231 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1860000
r_score -4.865 ± 0.746 vs Baseline over 3241 trials.
rsums_lastest10 [-4, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1865000
r_score -4.866 ± 0.746 vs Baseline over 3251 trials.
rsums_lastest10 [-5, -4, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1870000
r_score -4.866 ± 0.744 vs Baseline over 3261 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 1875000
timestep 1880000
r_score -4.866 ± 0.744 vs Baseline over 3271 trials.
rs

# there have been 3500 eps above..
We run 1000 more episodes and tune the epsilon to give more explorations.
Other hyperparameters remain the same.

In [14]:
add_episodes = n_episodes + 1000
epsilon = 0.5
np.random.seed(123)

for n in range(n_episodes,add_episodes): 
    env.seed(seed=init_seed+n)
    s = env.reset()
    s_opponent = s
    done=False
    r_sum = 0
    while not done: 
        # Uncomment this to see the agent learning
        # env.render()
        # Feedforward pass for current state to get predicted q-values for all actions 
        qvals_s = model.predict(s.reshape(1,12))
        # Choose action to be epsilon-greedy
        if np.random.random() < epsilon:  
            a = randrange(8)
        else:                             
            a = np.argmax(qvals_s)
        action1 = policy0.predict(s_opponent) #opponent policy
        a_transfor = action_inverse(a)  
        # Take step, store results 
        sprime, r, done, info = env.step(a_transfor, action1)
        r_sum += r 
        # add to memory, respecting memory buffer limit 
        if len(buffer) > mem_max_size:
            buffer.pop(0)
        buffer.append({"s":s,"a":a,"r":r,"sprime":sprime,"done":done})
        # Update state
        s=sprime
        s_opponent = info['otherObs']
        # Train the nnet that approximates q(s,a), using the replay memory
        model=replay(buffer, minibatch_size = minibatch_size, timestep = timestep, epsoide_num = n)
        # Decrease epsilon until we hit a target threshold 
        timestep += 1
        if timestep % 5000 == 0 and timestep != 0:
            print("timestep",timestep)
    if r_sum > 0:
        win_weight_eps_tstep.append((model.get_weights(),n,timestep))    
    epsilon -= 0.0005
    if epsilon <= 0.0005:
        eps += 0.001
        epsilon = 0.001
        if eps == 0.001:
            print("In the timestep of ", timestep, ", epsilon is fixed at",epsilon)     
    #print("Total reward:", r_sum)
    r_sums.append(r_sum)
    if n % 100 == 0: print(n)
    if n % 10 == 0: 
        print("r_score", np.round(np.mean(np.array(r_sums)), 3), "±", 2*np.round(np.std(np.array(r_sums)), 3), "vs", "Baseline", "over", n+1, "trials.")
        print("rsums_lastest10",r_sums[-10:])

3500
r_score -4.867 ± 0.74 vs Baseline over 3501 trials.
rsums_lastest10 [-5, -4, -4, -5, -5, -5, -5, -4, -5, -5]
timestep 2020000
r_score -4.867 ± 0.742 vs Baseline over 3511 trials.
rsums_lastest10 [-5, -4, -5, -5, -5, -4, -5, -5, -5, -5]
timestep 2025000
r_score -4.867 ± 0.74 vs Baseline over 3521 trials.
rsums_lastest10 [-5, -5, -4, -5, -5, -5, -5, -5, -5, -5]
timestep 2030000
r_score -4.868 ± 0.74 vs Baseline over 3531 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 2035000
timestep 2040000
r_score -4.868 ± 0.738 vs Baseline over 3541 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 2045000
r_score -4.868 ± 0.74 vs Baseline over 3551 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -3, -5]
timestep 2050000
r_score -4.868 ± 0.74 vs Baseline over 3561 trials.
rsums_lastest10 [-4, -5, -5, -4, -5, -5, -5, -5, -5, -5]
timestep 2055000
timestep 2060000
r_score -4.868 ± 0.74 vs Baseline over 3571 trials.
rsums_lastest10 [-5, -5, 

timestep 2400000
timestep 2405000
r_score -4.865 ± 0.752 vs Baseline over 4131 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -4, -5]
timestep 2410000
r_score -4.865 ± 0.752 vs Baseline over 4141 trials.
rsums_lastest10 [-4, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 2415000
r_score -4.865 ± 0.752 vs Baseline over 4151 trials.
rsums_lastest10 [-5, -5, -4, -5, -5, -5, -4, -5, -5, -4]
timestep 2420000
r_score -4.865 ± 0.752 vs Baseline over 4161 trials.
rsums_lastest10 [-5, -5, -4, -5, -5, -5, -5, -5, -5, -4]
timestep 2425000
r_score -4.864 ± 0.754 vs Baseline over 4171 trials.
rsums_lastest10 [-5, -5, -4, -4, -5, -5, -5, -3, -5, -4]
timestep 2430000
r_score -4.864 ± 0.754 vs Baseline over 4181 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -4, -5]
timestep 2435000
timestep 2440000
r_score -4.864 ± 0.754 vs Baseline over 4191 trials.
rsums_lastest10 [-4, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 2445000
4200
r_score -4.865 ± 0.754 vs Baseline over 4201 trials.
rs

In [20]:
n_episodes = add_episodes
add_episodes = add_episodes + 1000
epsilon = 0.5
eps = 0
np.random.seed(123)

for n in range(n_episodes,add_episodes): 
    env.seed(seed=init_seed+n)
    s = env.reset()
    s_opponent = s
    done=False
    r_sum = 0
    while not done: 
        # Uncomment this to see the agent learning
        # env.render()
        # Feedforward pass for current state to get predicted q-values for all actions 
        qvals_s = model.predict(s.reshape(1,12))
        # Choose action to be epsilon-greedy
        if np.random.random() < epsilon:  
            a = randrange(8)
        else:                             
            a = np.argmax(qvals_s)
        action1 = policy0.predict(s_opponent) #opponent policy
        a_transfor = action_inverse(a)  
        # Take step, store results 
        sprime, r, done, info = env.step(a_transfor, action1)
        r_sum += r 
        # add to memory, respecting memory buffer limit 
        if len(buffer) > mem_max_size:
            buffer.pop(0)
        buffer.append({"s":s,"a":a,"r":r,"sprime":sprime,"done":done})
        # Update state
        s=sprime
        s_opponent = info['otherObs']
        # Train the nnet that approximates q(s,a), using the replay memory
        model=replay(buffer, minibatch_size = minibatch_size, timestep = timestep, epsoide_num = n)
        # Decrease epsilon until we hit a target threshold 
        timestep += 1
        if timestep % 5000 == 0 and timestep != 0:
            print("timestep",timestep)
    if r_sum > 0:
        win_weight_eps_tstep.append((model.get_weights(),n,timestep))    
    epsilon -= 0.0005
    if epsilon <= 0.0005:
        eps += 0.001
        epsilon = 0.001
        if eps == 0.001:
            print("In the timestep of ", timestep, ", epsilon is fixed at",epsilon)     
    #print("Total reward:", r_sum)
    r_sums.append(r_sum)
    if n % 100 == 0: print(n)
    if n % 10 == 0: 
        print("r_score", np.round(np.mean(np.array(r_sums)), 3), "±", 2*np.round(np.std(np.array(r_sums)), 3), "vs", "Baseline", "over", n+1, "trials.")
        print("rsums_lastest10",r_sums[-10:])

4500
r_score -4.865 ± 0.754 vs Baseline over 4501 trials.
rsums_lastest10 [-5, -5, -4, -5, -5, -5, -5, -5, -5, -5]
timestep 2630000
r_score -4.865 ± 0.754 vs Baseline over 4511 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 2635000
r_score -4.866 ± 0.752 vs Baseline over 4521 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 2640000
r_score -4.866 ± 0.752 vs Baseline over 4531 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 2645000
r_score -4.866 ± 0.752 vs Baseline over 4541 trials.
rsums_lastest10 [-5, -5, -5, -5, -4, -5, -5, -4, -5, -5]
timestep 2650000
timestep 2655000
r_score -4.866 ± 0.752 vs Baseline over 4551 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -4]
timestep 2660000
r_score -4.866 ± 0.752 vs Baseline over 4561 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 2665000
r_score -4.866 ± 0.752 vs Baseline over 4571 trials.
rsums_lastest10 [-4, -5, -4, -5, -5,

timestep 3000000
r_score -4.862 ± 0.764 vs Baseline over 5131 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 3005000
r_score -4.862 ± 0.764 vs Baseline over 5141 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 3010000
timestep 3015000
r_score -4.862 ± 0.764 vs Baseline over 5151 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -3, -5]
timestep 3020000
r_score -4.862 ± 0.766 vs Baseline over 5161 trials.
rsums_lastest10 [-3, -5, -5, -5, -4, -5, -5, -5, -5, -5]
timestep 3025000
r_score -4.862 ± 0.766 vs Baseline over 5171 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 3030000
r_score -4.862 ± 0.764 vs Baseline over 5181 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 3035000
r_score -4.862 ± 0.764 vs Baseline over 5191 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -4, -5, -5, -5]
timestep 3040000
timestep 3045000
5200
r_score -4.863 ± 0.764 vs Baseline over 5201 trials.
rs

In [28]:
n_episodes = add_episodes
add_episodes = add_episodes + 1000
epsilon = 0.5
eps = 0
np.random.seed(123)

for n in range(n_episodes,add_episodes): 
    env.seed(seed=init_seed+n)
    s = env.reset()
    s_opponent = s
    done=False
    r_sum = 0
    while not done: 
        # Uncomment this to see the agent learning
        # env.render()
        # Feedforward pass for current state to get predicted q-values for all actions 
        qvals_s = model.predict(s.reshape(1,12))
        # Choose action to be epsilon-greedy
        if np.random.random() < epsilon:  
            a = randrange(8)
        else:                             
            a = np.argmax(qvals_s)
        action1 = policy0.predict(s_opponent) #opponent policy
        a_transfor = action_inverse(a)  
        # Take step, store results 
        sprime, r, done, info = env.step(a_transfor, action1)
        r_sum += r 
        # add to memory, respecting memory buffer limit 
        if len(buffer) > mem_max_size:
            buffer.pop(0)
        buffer.append({"s":s,"a":a,"r":r,"sprime":sprime,"done":done})
        # Update state
        s=sprime
        s_opponent = info['otherObs']
        # Train the nnet that approximates q(s,a), using the replay memory
        model=replay(buffer, minibatch_size = minibatch_size, timestep = timestep, epsoide_num = n)
        # Decrease epsilon until we hit a target threshold 
        timestep += 1
        if timestep % 5000 == 0 and timestep != 0:
            print("timestep",timestep)
    if r_sum > 0:
        win_weight_eps_tstep.append((model.get_weights(),n,timestep))    
    epsilon -= 0.0005
    if epsilon <= 0.0005:
        eps += 0.001
        epsilon = 0.001
        if eps == 0.001:
            print("In the timestep of ", timestep, ", epsilon is fixed at",epsilon)     
    #print("Total reward:", r_sum)
    r_sums.append(r_sum)
    if n % 100 == 0: print(n)
    if n % 10 == 0: 
        print("r_score", np.round(np.mean(np.array(r_sums)), 3), "±", 2*np.round(np.std(np.array(r_sums)), 3), "vs", "Baseline", "over", n+1, "trials.")
        print("rsums_lastest10",r_sums[-10:])

5500
r_score -4.862 ± 0.764 vs Baseline over 5501 trials.
rsums_lastest10 [-5, -3, -5, -4, -5, -5, -3, -5, -5, -5]
timestep 3220000
timestep 3225000
r_score -4.862 ± 0.764 vs Baseline over 5511 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -4, -5]
timestep 3230000
r_score -4.862 ± 0.764 vs Baseline over 5521 trials.
rsums_lastest10 [-5, -3, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 3235000
r_score -4.862 ± 0.764 vs Baseline over 5531 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -5]
timestep 3240000
r_score -4.862 ± 0.764 vs Baseline over 5541 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -4, -5, -5, -3]
timestep 3245000
timestep 3250000
r_score -4.862 ± 0.766 vs Baseline over 5551 trials.
rsums_lastest10 [-5, -5, -4, -5, -4, -5, -5, -5, -5, -4]
timestep 3255000
r_score -4.862 ± 0.766 vs Baseline over 5561 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -4, -5, -5, -5, -5]
timestep 3260000
r_score -4.862 ± 0.764 vs Baseline over 5571 trials.
rsums_lastest10 [-4

timestep 3590000
timestep 3595000
r_score -4.865 ± 0.758 vs Baseline over 6131 trials.
rsums_lastest10 [-5, -5, -5, -4, -5, -5, -5, -5, -5, -5]
timestep 3600000
r_score -4.865 ± 0.756 vs Baseline over 6141 trials.
rsums_lastest10 [-5, -5, -5, -5, -4, -5, -5, -5, -5, -5]
timestep 3605000
r_score -4.865 ± 0.758 vs Baseline over 6151 trials.
rsums_lastest10 [-5, -3, -5, -5, -5, -5, -5, -5, -4, -5]
timestep 3610000
r_score -4.865 ± 0.758 vs Baseline over 6161 trials.
rsums_lastest10 [-5, -5, -5, -5, -4, -5, -5, -4, -5, -5]
timestep 3615000
timestep 3620000
r_score -4.865 ± 0.76 vs Baseline over 6171 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -3, -5, -4, -5, -5]
timestep 3625000
r_score -4.865 ± 0.76 vs Baseline over 6181 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -5, -5, -5, -4]
timestep 3630000
r_score -4.865 ± 0.76 vs Baseline over 6191 trials.
rsums_lastest10 [-5, -5, -5, -5, -5, -5, -4, -5, -5, -4]
timestep 3635000
6200
r_score -4.865 ± 0.758 vs Baseline over 6201 trials.
rsums

In [31]:
import pickle
# Store data (serialize)
with open('NEW_DuelingDDQN_batch_size.pickle', 'wb') as handle:
    pickle.dump(r_sums +[timestep,], handle)