# Bipedal Walker with a Genetic Algorithm
## The reward function has been changed so it promotes individuals who run farther instead of agents which fall really quickly (don't get penalised by applying force) or that stay static on the floor in order not to fall and receive -100 points

In [1]:
# Imports the libraries:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import tensorflow as tf
from tensorflow import keras
import random
import gym

In [2]:
def createRandomIndividuals (numberOfIndividuals, numberOfParameters, low=-1, high=1):
    parameters = np.random.uniform (low=low, high = high, size = (numberOfParameters, numberOfIndividuals))
    return parameters

In [3]:
def elitismSelection (parameters, numberToBeChosen, shapes, paramsPerShape):
    scores  = [getBipedalWalkerFitness(parameters[:,i], shapes, paramsPerShape) for i in range(parameters.shape[1])]
    indices = np.argsort (scores)[::-1] # Really important for descending order!
    parameters      = parameters[:,indices[:numberToBeChosen]]
    print ("Best individual reward: %d" % scores[indices[0]])
    return parameters

In [4]:
def createPairs (numberOfSurvivors, numberOfPairs, replace=True): # It returns a list of indices that can be splitted every 2 for forming pairs
    pairs = []
    chosenIndices = np.random.choice (range(numberOfSurvivors), size=(numberOfPairs*2), replace = replace)
    return chosenIndices

In [5]:
def SBX (parameters1, parameters2, eta=10): # Simulated Binary Crossover. The bigger is eta, the more close the children are to parents

    r             = np.random.random(size=len(parameters1))
    gamma         = np.empty(len(parameters1))
    gamma[r<=0.5] = (2*r[r<=0.5])**(1/(eta+1))
    gamma[r>0.5 ] = (1/(2*(1-r[r>0.5])))**(1/(eta+1))
    child1        = 0.5 * ((1+gamma)*parameters1 + (1-gamma)*parameters2)
    child2        = 0.5 * ((1-gamma)*parameters1 + (1+gamma)*parameters2)
    return child1, child2

In [6]:
def SPBX (parameters1, parameters2): # Single Point Binary Crossover.
    crossoverPoint = np.random.randint(1, len(parameters1)) # We want at least length 1 at either side of the crossover
    child1         = np.append(parameters1[:crossoverPoint], parameters2[crossoverPoint:])
    child2         = np.append(parameters2[:crossoverPoint], parameters1[crossoverPoint:])
    return child1, child2

In [7]:
def applyGaussianMutation (parameters, mutation_rate=0.05, mu=0, sigma=1): # Pass it a numpy array of floats! (mu and sigma can be arrays too)
    mutationOrNot              = (np.random.random (size=len(parameters))) < mutation_rate
    mutationValues             = np.random.normal(loc=mu, scale = sigma, size = len(parameters))
    parameters[mutationOrNot] += mutationValues[mutationOrNot]
    return parameters

In [8]:
stateInput = keras.Input(shape = (24))

dense1 = keras.layers.Dense(units = 128, activation = "relu") (stateInput)
dense2 = keras.layers.Dense(units = 128, activation = "relu") (dense1)
dense3 = keras.layers.Dense(units = 128, activation = "relu") (dense2)

outputLayer = keras.layers.Dense (units = 4, activation = "tanh") (dense3)

model = keras.Model (inputs = stateInput, outputs = outputLayer)
model.compile(optimizer=keras.optimizers.Adam(lr=0.001),loss ="mse")

In [9]:
def getLengthOfWeights (model):
    setsOfWeights = np.array(model.get_weights())
    totalWeights = 0
    shapes = []
    paramsPerShape = []
    for i in setsOfWeights:
        shapes.append(i.shape)
        i = np.reshape (i, (-1))
        totalWeights += i.shape[0]
        paramsPerShape.append(i.shape[0])
    return totalWeights, shapes, paramsPerShape

In [10]:
def setModelWeights(parameters, shapes, paramsPerShape):
    weights = []
    paramsUsedSoFar = 0
    for index, shape in enumerate (shapes):
        paramsInThisLayer = paramsPerShape[index]
        params = np.reshape (parameters[paramsUsedSoFar:paramsUsedSoFar+paramsInThisLayer], shape)
        weights.append(params)
        paramsUsedSoFar += paramsInThisLayer
    model.set_weights(weights)

In [11]:
def getBipedalWalkerFitness (parameters, shapes, paramsPerShape):
    # The reward function has been changed so it promotes individuals who run farther instead of agents which fall really quickly (don't get penalised by applying force) or that stay static on the floor in order not to fall and receive -100 points
    setModelWeights (parameters, shapes, paramsPerShape)

    env = gym.make('BipedalWalker-v3')

    done = False
    bipedalWalkerReward = 0
    prevObs = np.expand_dims(env.reset(),axis=0)

    for step in range(2002):
        if (done):
            bipedalWalkerReward += (env.hull.position[0]-4.5)*10
            break  
        #env.render()
        action = model.predict(prevObs)[0]
        
        state, reward, done,_ = env.step(action)
        state = np.expand_dims(state,axis=0)
        
        if (np.abs(np.sum(state-prevObs)) < 5E-7 ):
            reward -= 70
            done    = True
      
        prevObs = state
       
        bipedalWalkerReward += reward
    
    env.close()

    return bipedalWalkerReward

In [None]:
numberOfGenerations = 100
indivPerGen         = 100
fittestNumber       = 50

childrenFromParents = 90
parentsThatRemain   = 5
numberNewIndividuals= 5


mu = 0
sigma = 0.7


totalWeights, shapes, paramsPerShape = getLengthOfWeights (model)

primitiveGeneration = createRandomIndividuals (numberOfIndividuals=indivPerGen, numberOfParameters = totalWeights, low=-1, high=1)
# Shape (NumberOfWeights, indivPerGen).

pastGeneration   = np.copy(primitiveGeneration)

for gen in range (numberOfGenerations):
    print ("Generation %d starting" % (gen+1))
    # Choose one selection type:
    mutation_rate = 0.05/np.cbrt(gen+1)
    survivors    = elitismSelection (pastGeneration, numberToBeChosen=fittestNumber, shapes=shapes, paramsPerShape = paramsPerShape)
    # Shape (NumberOfWeights, fittestNumber)
    
    pairs        = createPairs (numberOfSurvivors=fittestNumber, numberOfPairs = childrenFromParents//2, replace=True)
    # Shape (childrenFromParents,). Contains indices for survivors Array ready to be splitted every two.

    newChildren  = []

    for i in range (childrenFromParents//2):

        parents  = survivors[:,pairs[2*i:2*(i+1)]]
        # Shape (NumberOfWeights, numberOfParents)
        
        if (np.random.random()<0.8):
            child1, child2 = SBX (parents[:,0], parents[:,1]) 
        else:
            child1, child2 = SPBX (parents[:,0], parents[:,1])
            
        # Choose one mutation type:
        child1, child2 = applyGaussianMutation (child1, mutation_rate=mutation_rate,mu=mu, sigma=sigma), applyGaussianMutation (child2,mutation_rate=mutation_rate, mu=mu, sigma=sigma)


        newChildren.append(child1)
        newChildren.append(child2)
    
    newChildren    = np.swapaxes(newChildren, axis1=1, axis2=0)
    # Shape (NumberOfWeights, ChildrenFromParents).

    newIndividuals = createRandomIndividuals (numberNewIndividuals, numberOfParameters = totalWeights, low=-1, high=1)
    # Shape (NumberOfWeights, numberNewIndividuals)
    
    newGeneration  = np.concatenate ((survivors[:,:parentsThatRemain],newChildren, newIndividuals), axis = 1)
    # Shape (NumberOfWeights, indivPerGen)
    
    pastGeneration = np.copy(newGeneration)
    if ((gen+1) %5 == 0):
        np.save("bipedalGeneration"+str(gen)+".npy",newGeneration)
getBipedalWalkerFitness(survivors[:,0], shapes, paramsPerShape)

Generation 1 starting




Best individual reward: -65
Generation 2 starting
Best individual reward: -34
Generation 3 starting
Best individual reward: 12
Generation 4 starting
Best individual reward: 102
Generation 5 starting
Best individual reward: 81
Generation 6 starting
Best individual reward: 49
Generation 7 starting
Best individual reward: 97
Generation 8 starting
Best individual reward: 120
Generation 9 starting
Best individual reward: 106
Generation 10 starting
Best individual reward: 231
Generation 11 starting
Best individual reward: 210
Generation 12 starting
Best individual reward: 233
Generation 13 starting
Best individual reward: 213
Generation 14 starting
Best individual reward: 265
Generation 15 starting
Best individual reward: 258
Generation 16 starting
Best individual reward: 244
Generation 17 starting
Best individual reward: 279
Generation 18 starting
Best individual reward: 273
Generation 19 starting
Best individual reward: 220
Generation 20 starting
Best individual reward: 383
Generation 21 s

## The genetic algorithm was manually stopped here because of its expensive computational cost but ideally it would be run for many more generations.
### Furthermore, once we have an agent that can move through the environment, we could change back the fitness function to the official one and the agent should not go back to those local optima in which it remains static on the floor.