## Solving Pendulum Environment with Gaussian Noise and Actor-Critic
### This noise technique is commonly used instead of epsilon-greedy exploration for continuous action-spaces.
### The exploration movements (actions with added noise) are saved in the agent's replay memory so that they can be processed later during the memory-replay training steps.

In [1]:
# Imports the libraries:

import random
import numpy as np
from tensorflow import keras
import tensorflow as tf
import gym
# Create the Pendulum-v0 environment; the training loop further down resets it,
# steps it with the (noised) actor output and finally closes it.
env = gym.make('Pendulum-v0')

In [2]:
# Shared observation input: both the actor and the critic read the same
# 3-component state vector.
stateInput        = keras.Input(shape = (3,))

# Creating the actor: maps a state to a single action squashed into [-1, 1] by tanh.

actorDense1       = keras.layers.Dense(units = 128, activation = "relu")   (stateInput)
actorDense2       = keras.layers.Dense(units = 128, activation = "relu")   (actorDense1)
actorDense3       = keras.layers.Dense(units = 32, activation = "relu")    (actorDense2)
actorOutput       = keras.layers.Dense(units = 1, activation = "tanh")     (actorDense3)

actorModel        = keras.Model (inputs = stateInput, outputs = actorOutput)
# The actor is updated through actorModel.optimizer with hand-computed
# gradients, so compile() is used here to attach the Adam optimizer.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases reject.
actorModel.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss ="mse")

#print (actorModel.summary())

# Creating the critic: estimates Q(state, action) as a single unbounded scalar.

stateDense1       = keras.layers.Dense(units = 128, activation="relu")      (stateInput)
stateDense2       = keras.layers.Dense(units = 64, activation="relu")       (stateDense1)
actionInput       = keras.Input(shape = (1,), name="actionInput")
actionDense1      = keras.layers.Dense(units = 64, activation="relu")       (actionInput)

# The state branch and the action branch are merged before the value head.
criticConcatenate = keras.layers.Concatenate()                              ([stateDense2, actionDense1])
criticDense1      = keras.layers.Dense(units = 128, activation = "relu")    (criticConcatenate)
criticDense2      = keras.layers.Dense(units = 128, activation = "relu")    (criticDense1)
criticOutput      = keras.layers.Dense(units = 1)                           (criticDense2)

criticModel = keras.Model(inputs = [stateInput, actionInput] , outputs = criticOutput )
criticModel.compile(optimizer = keras.optimizers.Adam(learning_rate=0.001), loss = "mse")


# Creating the target networks for both the actor and the critic:
# clone_model() only copies the architecture and re-initialises the weights,
# so the online weights are copied explicitly; otherwise the targets would
# start from unrelated random parameters.

staticActorModel  = keras.models.clone_model( actorModel, input_tensors=None, clone_function=None)
staticActorModel.set_weights(actorModel.get_weights())

staticCriticModel = keras.models.clone_model( criticModel, input_tensors=None, clone_function=None)
staticCriticModel.set_weights(criticModel.get_weights())



#print (criticModel.summary())

In [3]:
def updateStaticWeights(model, staticModel,tau):
    """Softly move the target (static) model towards the online model.

    Every weight tensor of ``staticModel`` is replaced by the convex blend
    ``(1 - tau) * static + tau * online``, so ``tau = 1`` copies the online
    weights and ``tau = 0`` leaves the target untouched.

    Args:
        model: online network; only read through ``get_weights()``.
        staticModel: target network; updated through ``set_weights()``.
        tau: blending factor applied to the online weights.

    Returns:
        The tuple ``(model, staticModel)``, i.e. the same two objects passed in.
    """
    onlineWeights = model.get_weights()

    # Element-wise blend, one entry per weight tensor of the target network.
    blendedWeights = [
        (1 - tau) * targetTensor + tau * onlineWeights[position]
        for position, targetTensor in enumerate(staticModel.get_weights())
    ]

    staticModel.set_weights(blendedWeights)
    return model, staticModel

In [4]:
# Memory Replay is carried out in this function.
# A random sample of previous experiences is "remembered" and processed in each step;
# this way we can prevent overfitting to the most recent states.

def executeMemoryReplayStep(memoryPreviousStates, memoryStates, memoryActions, memoryRD, actorModel, staticActorModel, criticModel, staticCriticModel, itemsInMemory):
    """Run one experience-replay update: first the critic, then the actor.

    Up to ``numberOfLearningFromReplaySteps`` (module-level setting) distinct
    transitions are drawn at random from the first ``itemsInMemory`` rows of
    the replay arrays. The critic is fitted on TD targets built with the two
    target ("static") networks and the module-level ``discount``; the actor
    is then moved along the gradient that increases the critic's value of
    its own actions.

    Args:
        memoryPreviousStates: array of states before each stored step.
        memoryStates: array of states after each stored step.
        memoryActions: array of the actions that were executed.
        memoryRD: two-column array holding ``[reward, done]`` per row.
        actorModel: online actor, updated through ``actorModel.optimizer``.
        staticActorModel: target actor used to pick the next-state action.
        criticModel: online critic, trained with ``fit``.
        staticCriticModel: target critic used to value the next state.
        itemsInMemory: number of valid rows; rows beyond it are still zeros.

    Returns:
        None. The online actor and critic are modified in place.
    """
    # Sample without replacement among the rows that have actually been written.
    batchIndices = np.random.permutation(itemsInMemory)[:numberOfLearningFromReplaySteps]

    previousState = memoryPreviousStates[batchIndices]
    state = memoryStates[batchIndices]
    actions = memoryActions[batchIndices]
    reward, done = np.split(memoryRD[batchIndices], 2, axis=1)

    # Critic Model Training:
    # TD target = r + discount * Q_target(s', actor_target(s')); the bootstrap
    # term is dropped for rows whose done flag is 1.
    futureActions = staticActorModel.predict(state)
    valueState = staticCriticModel.predict([state, futureActions])
    valuePreviousState = reward + discount * valueState * (1 - done)

    criticModel.fit(x=[previousState, actions], y=valuePreviousState, epochs=1, verbose=0, batch_size=32)

    # Actor Model Training:
    # Differentiate the critic's value of the actor's own actions with respect
    # to the actor weights (trainable variables are watched automatically).
    actorVariables = actorModel.trainable_weights
    with tf.GradientTape() as tape:
        predictedActions = actorModel(previousState, training=False)
        criticPreds = criticModel([previousState, predictedActions], training=False)

    fromCriticToActorGrads = tape.gradient(criticPreds, actorVariables)

    # The optimizer minimises, so each gradient is negated to perform gradient
    # ascent on the critic value. The negation is done tensor by tensor: the
    # gradients have different shapes and cannot be packed into one NumPy
    # array (recent NumPy raises ValueError for such ragged input).
    ascentGrads = [-gradient for gradient in fromCriticToActorGrads]

    actorModel.optimizer.apply_gradients(zip(ascentGrads, actorVariables))

In [None]:
# Training script: runs the noisy actor in the environment, stores every
# transition in a circular replay buffer and, once enough transitions exist,
# performs one replay update per environment step.

numberEpisodes = 1000
discount = 0.99   # read as a global by executeMemoryReplayStep
counter = 0       # write position inside the circular replay buffer

rewards = []      # total reward of every finished episode
tau = 0.5         # blend factor passed to updateStaticWeights on each target sync
memorySize = 100000
# Pre-allocated replay buffer; rows at index >= itemsInMemory are still zeros.
memoryPreviousStates = np.zeros((memorySize,3))
memoryStates = np.zeros((memorySize,3))
memoryActions = np.zeros((memorySize,1))
memoryRD = np.zeros((memorySize,2))   # columns: reward, done flag (0.0 / 1.0)
numberOfLearningFromReplaySteps = 256  # sample size, read as a global by executeMemoryReplayStep
itemsInMemory = 0
initialiseReplayAfter = 1000   # replay updates start once this many transitions are stored
scale = 1.0                    # standard deviation of the Gaussian exploration noise

for episode in range (numberEpisodes):

    
    done = False
    prevObs = env.reset()
    episodeReward = 0

    
    for step in range (2000):
        if done:
            break
        #env.render()
        

    
        # predict() expects a batch, so a leading axis is added and removed again.
        actions = actorModel.predict(np.expand_dims(prevObs, axis = 0))[0] # Shape (1,): the actor has a single output unit
        if (scale>0.0): # loc is the mean and scale is the standard deviation of the noise.
            noisedActions = np.clip(actions + np.random.normal(loc=[0], scale=scale), -1, 1)
            scale -= 1/60000 # Reaches 0 after 60000 noisy steps (about 300 episodes at the 200 steps per episode implied by the logged Sigma values)
        else:
            noisedActions = actions
        
        obs, reward, done, _ = env.step (noisedActions)


        # Store the transition at the current write position.
        # NOTE(review): `done` is stored as a terminal flag; if the environment
        # only ends episodes through a step limit this also cuts the bootstrap
        # term in the critic target - confirm this is intended.
        memoryPreviousStates[counter] = prevObs
        memoryStates[counter] = obs
        memoryActions[counter] = noisedActions
        memoryRD[counter] = [reward, float (done)]

        
        # Every 3000 buffer positions (including the very first step, where
        # counter is 0) checkpoint the actor and sync both target networks.
        if (counter % 3000 == 0):
            actorModel.save_weights("actorCriticPendulum.h5")
            actorModel, staticActorModel   = updateStaticWeights(actorModel, staticActorModel, tau)
            criticModel, staticCriticModel = updateStaticWeights(criticModel, staticCriticModel, tau)
            print ("STATIC WEIGHTS UPDATED")
        
        # itemsInMemory follows counter + 1 until the buffer is full, then
        # stays at memorySize while counter wraps around.
        if (itemsInMemory < (counter+1)):
            itemsInMemory = min ((itemsInMemory+1), memorySize)
        if (itemsInMemory >= initialiseReplayAfter):
            executeMemoryReplayStep(memoryPreviousStates, memoryStates, memoryActions, memoryRD, actorModel, staticActorModel, criticModel, staticCriticModel, itemsInMemory)

        prevObs = obs
        episodeReward += reward
        counter = (counter + 1) % memorySize   # wrap around: oldest entries get overwritten
    print (episode, " Episode Reward: ", episodeReward, ".Sigma:", scale)
    rewards.append(episodeReward)

env.close()
# Final checkpoint of the trained actor.
actorModel.save_weights("actorCriticPendulum.h5")

STATIC WEIGHTS UPDATED
0  Episode Reward:  -1385.4453361701587 .Sigma: 0.996666666666667
1  Episode Reward:  -1387.2541210256888 .Sigma: 0.9933333333333341
2  Episode Reward:  -1248.1811118897745 .Sigma: 0.9900000000000011
3  Episode Reward:  -1169.9658895598948 .Sigma: 0.9866666666666681
4  Episode Reward:  -1196.440277397148 .Sigma: 0.9833333333333352
5  Episode Reward:  -1184.5153221340636 .Sigma: 0.9800000000000022
6  Episode Reward:  -1101.816664499706 .Sigma: 0.9766666666666692
7  Episode Reward:  -1669.745458472823 .Sigma: 0.9733333333333363
8  Episode Reward:  -1375.2015396450988 .Sigma: 0.9700000000000033
9  Episode Reward:  -1392.7651946302487 .Sigma: 0.9666666666666703
10  Episode Reward:  -1206.2331595851938 .Sigma: 0.9633333333333374
11  Episode Reward:  -1085.8981299445477 .Sigma: 0.9600000000000044
12  Episode Reward:  -1159.59624709566 .Sigma: 0.9566666666666714
13  Episode Reward:  -1215.5835793314427 .Sigma: 0.9533333333333385
14  Episode Reward:  -1062.4197744666355 