# Trying a genetic algorithm in the Pong (RAM) environment.
### To obtain an optimal agent we would have to run many more generations, because the input is bigger than in other environments; this would be even more so if we wanted to learn from pixels instead of from the game RAM.

In [1]:
# Imports the libraries:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import tensorflow as tf
from tensorflow import keras
import random
import gym


In [2]:
def createRandomIndividuals (numberOfIndividuals, numberOfParameters, low=-1, high=1):
    """Draw a random population with one individual per column.

    Every gene is sampled independently from a uniform distribution over
    [low, high).

    Returns:
        Array of shape (numberOfParameters, numberOfIndividuals).
    """
    populationShape = (numberOfParameters, numberOfIndividuals)
    return np.random.uniform (low, high, populationShape)

In [3]:
def elitismSelection (parameters, numberToBeChosen, shapes, paramsPerShape, fitnessFunction=None):
    """Keep the best-scoring individuals of a population (elitism).

    Args:
        parameters: 2-D array with one individual per column.
        numberToBeChosen: how many top-scoring columns to keep.
        shapes, paramsPerShape: forwarded unchanged to the fitness function.
        fitnessFunction: callable ``f(individual, shapes, paramsPerShape)``
            returning a score where higher is better. Defaults to
            ``getPongFitness``, resolved at call time.

    Returns:
        The selected columns of ``parameters``, ordered from best to worst.
    """
    if fitnessFunction is None:
        fitnessFunction = getPongFitness

    scores  = [fitnessFunction(parameters[:,column], shapes, paramsPerShape) for column in range(parameters.shape[1])]
    # argsort is ascending, so reverse it to rank the highest score first.
    ranking = np.argsort (scores)[::-1]
    print ("Best individual reward: %.2f" % scores[ranking[0]])
    return parameters[:,ranking[:numberToBeChosen]]

In [4]:
def createPairs (numberOfSurvivors, numberOfPairs, replace=True):
    """Sample the parent indices used for crossover.

    Returns a flat array of ``2 * numberOfPairs`` indices drawn from
    ``[0, numberOfSurvivors)``. Consecutive entries (2k, 2k+1) form one pair.

    With ``replace=False`` every index appears at most once, and
    ``np.random.choice`` raises ValueError when more indices are requested
    than there are survivors.
    """
    # An int population is sampled as if it were np.arange(numberOfSurvivors).
    return np.random.choice (numberOfSurvivors, size=numberOfPairs*2, replace=replace)

In [5]:
def SBX (parameters1, parameters2, eta=100):
    """Simulated Binary Crossover of two parents.

    One spread factor is drawn per gene. The larger ``eta`` is, the closer
    the children stay to their parents.

    Returns:
        (child1, child2), each with the same length as the parents.
    """
    geneCount = len(parameters1)
    exponent  = 1/(eta+1)

    u           = np.random.random(size=geneCount)
    contracting = u <= 0.5
    expanding   = ~contracting

    spread              = np.empty(geneCount)
    spread[contracting] = (2*u[contracting])**exponent
    spread[expanding]   = (1/(2*(1-u[expanding])))**exponent

    towardFirst  = 1+spread
    towardSecond = 1-spread
    firstChild   = 0.5 * (towardFirst*parameters1 + towardSecond*parameters2)
    secondChild  = 0.5 * (towardSecond*parameters1 + towardFirst*parameters2)
    return firstChild, secondChild

In [6]:
def applyGaussianMutation (parameters, mutation_rate=0.05, mu=0, sigma=1):
    """Add Gaussian noise to a random subset of genes, in place.

    Each gene is selected independently with probability ``mutation_rate``
    and gets a sample from N(mu, sigma) added to it. ``mu`` and ``sigma``
    may be scalars or arrays.

    ``parameters`` must be a NumPy array of floats. It is modified in place
    and the same array object is returned.
    """
    geneCount  = len(parameters)
    selected   = np.random.random (size=geneCount) < mutation_rate
    # Noise is drawn for every gene and only applied to the selected ones.
    noise      = np.random.normal(loc=mu, scale=sigma, size=geneCount)
    hitIndices = np.flatnonzero(selected)
    parameters[hitIndices] += noise[hitIndices]
    return parameters

In [7]:
# Policy network: 128 input values -> 64 -> 16 -> 2 action scores.
# `shape` must be a tuple; `(128)` without the trailing comma is just the int 128.
stateInput = keras.Input(shape = (128,))

dense1 = keras.layers.Dense(units = 64, activation = "relu") (stateInput)
dense2 = keras.layers.Dense(units = 16, activation = "relu") (dense1)

# No activation on the output: only the argmax of the two scores is used to pick an action.
outputLayer = keras.layers.Dense (units = 2) (dense2)

model = keras.Model (inputs = stateInput, outputs = outputLayer)
# `learning_rate` replaces the deprecated `lr` keyword.
# NOTE(review): no call to model.fit is visible in this notebook; the weights are set by the genetic algorithm.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),loss ="mse")

In [8]:
def getLengthOfWeights (model):
    """Describe the layout of a model's weights as one flat parameter vector.

    Args:
        model: object whose ``get_weights()`` returns a list of arrays.

    Returns:
        (totalWeights, shapes, paramsPerShape), where ``totalWeights`` is the
        total number of scalars, ``shapes`` lists each array's shape and
        ``paramsPerShape`` lists each array's element count, in the order
        returned by ``get_weights()``.
    """
    totalWeights = 0
    shapes = []
    paramsPerShape = []
    # Iterate the list directly: wrapping arrays of different shapes in
    # np.array() builds a ragged array, which recent NumPy versions reject.
    for layerWeights in model.get_weights():
        layerSize = int(np.size(layerWeights))
        shapes.append(np.shape(layerWeights))
        paramsPerShape.append(layerSize)
        totalWeights += layerSize
    return totalWeights, shapes, paramsPerShape

In [9]:
def setModelWeights(parameters, shapes, paramsPerShape):
    """Load a flat parameter vector into the global ``model``.

    The vector is cut into consecutive chunks of ``paramsPerShape[k]``
    elements. Chunk ``k`` is reshaped to ``shapes[k]`` and the resulting list
    is handed to ``model.set_weights``.
    """
    layerWeights = []
    start = 0
    for layerIndex, layerShape in enumerate (shapes):
        stop = start + paramsPerShape[layerIndex]
        layerWeights.append(np.reshape (parameters[start:stop], layerShape))
        start = stop
    model.set_weights(layerWeights)

In [10]:
def getPongFitness (parameters, shapes, paramsPerShape, maxSteps=400, render=True):
    """Play one truncated Pong episode with the given weights and score it.

    Args:
        parameters: flat weight vector for the global ``model``.
        shapes, paramsPerShape: weight layout, forwarded to ``setModelWeights``.
        maxSteps: maximum number of environment steps to play. The game is
            longer, but a short episode should be representative of the
            agent's ability.
        render: whether to call ``env.render()`` on every step.

    Returns:
        ``100 * (sum of rewards) + step / 10``. The step term is a tie-break
        for agents that cannot score a single point: agents which lasted
        longer before losing have an evolutionary advantage.
    """
    setModelWeights (parameters, shapes, paramsPerShape)

    env = gym.make('Pong-ram-v0')

    pongReward = 0
    step = 0  # keeps the tie-break defined even if maxSteps is 0
    try:
        done = False
        prevObs = np.expand_dims(env.reset(),axis=0)

        for step in range(maxSteps):

            if done:
                break

            if render:
                env.render()

            # Since the game actions are 2 and 3 instead of 0 and 1
            action = np.argmax(model.predict(prevObs))+2

            state, reward, done,_ = env.step(action)

            prevObs = np.expand_dims(state,axis=0)

            pongReward += reward
    finally:
        # Release the environment even if the episode raises or is interrupted.
        env.close()

    return pongReward*100 + (step/10)

In [11]:
# Ideally the number of generations would be much higher

numberOfGenerations = 25
indivPerGen         = 200
fittestNumber       = 50

# Composition of each new generation; the three counts add up to indivPerGen.
childrenFromParents = 170
parentsThatRemain   = 20
numberNewIndividuals= 10

# Mean and standard deviation of the Gaussian mutation noise.
mu = 0
sigma = 0.2


totalWeights, shapes, paramsPerShape = getLengthOfWeights (model)

primitiveGeneration = createRandomIndividuals (numberOfIndividuals=indivPerGen, numberOfParameters = totalWeights, low=-1, high=1)
# Shape (NumberOfWeights, indivPerGen).

pastGeneration   = np.copy(primitiveGeneration)

for i in range (numberOfGenerations):
    print ("Generation %d starting" % (i+1))
    # The mutation rate decays with the cube root of the generation number.
    mutation_rate = 0.05/np.cbrt(i+1)
    survivors    = elitismSelection (pastGeneration, numberToBeChosen=fittestNumber, shapes=shapes, paramsPerShape = paramsPerShape)
    # Shape (NumberOfWeights, fittestNumber)

    pairs        = createPairs (numberOfSurvivors=fittestNumber, numberOfPairs = childrenFromParents//2, replace=True)
    # Shape (childrenFromParents,). Contains indices into the survivors array, to be split every two.

    newChildren  = []

    # The inner index has its own name so it does not shadow the generation counter i.
    for pairIndex in range (childrenFromParents//2):

        parents  = survivors[:,pairs[2*pairIndex:2*(pairIndex+1)]]
        # Shape (NumberOfWeights, numberOfParents)

        child1, child2 = SBX (parents[:,0], parents[:,1])

        child1 = applyGaussianMutation (child1, mutation_rate=mutation_rate, mu=mu, sigma=sigma)
        child2 = applyGaussianMutation (child2, mutation_rate=mutation_rate, mu=mu, sigma=sigma)

        newChildren.append(child1)
        newChildren.append(child2)

    newChildren    = np.swapaxes(newChildren, axis1=1, axis2=0)
    # Shape (NumberOfWeights, ChildrenFromParents).

    newIndividuals = createRandomIndividuals (numberNewIndividuals, numberOfParameters = totalWeights, low=-1, high=1)
    # Shape (NumberOfWeights, numberNewIndividuals)

    newGeneration  = np.concatenate ((survivors[:,:parentsThatRemain],newChildren, newIndividuals), axis = 1)
    # Shape (NumberOfWeights, indivPerGen)

    pastGeneration = np.copy(newGeneration)
# Replay the best survivor of the last evaluated generation.
getPongFitness(survivors[:,0], shapes, paramsPerShape)

Generation 1 starting
Best individual reward: -260.10
Generation 2 starting
Best individual reward: -160.10
Generation 3 starting
Best individual reward: 39.90
Generation 4 starting
Best individual reward: -60.10
Generation 5 starting
Best individual reward: 39.90
Generation 6 starting
Best individual reward: 139.90
Generation 7 starting
Best individual reward: 39.90
Generation 8 starting
Best individual reward: 39.90
Generation 9 starting


KeyboardInterrupt: 

## Now that we have agents that can tie against the computer in a short game, let's make each episode longer (1000 steps).

In [12]:
def getPongLongFitness (parameters, shapes, paramsPerShape, maxSteps=1000, render=True):
    """Play one longer Pong episode with the given weights and score it.

    Args:
        parameters: flat weight vector for the global ``model``.
        shapes, paramsPerShape: weight layout, forwarded to ``setModelWeights``.
        maxSteps: maximum number of environment steps to play.
        render: whether to call ``env.render()`` on every step.

    Returns:
        ``100 * (sum of rewards) + step / 10``. The step term is a tie-break
        for agents that cannot score a single point: agents which lasted
        longer before losing have an evolutionary advantage.
    """
    setModelWeights (parameters, shapes, paramsPerShape)

    env = gym.make('Pong-ram-v0')

    pongReward = 0
    step = 0  # keeps the tie-break defined even if maxSteps is 0
    try:
        done = False
        prevObs = np.expand_dims(env.reset(),axis=0)

        for step in range(maxSteps): # Stop early if the environment reports the episode is done

            if done:
                break

            if render:
                env.render()

            # Since the game actions are 2 and 3 instead of 0 and 1
            action = np.argmax(model.predict(prevObs))+2

            state, reward, done,_ = env.step(action)

            prevObs = np.expand_dims(state,axis=0)

            pongReward += reward
    finally:
        # Release the environment even if the episode raises or is interrupted.
        env.close()

    return pongReward*100 + (step/10)

In [13]:
def elitismSelection (parameters, numberToBeChosen, shapes, paramsPerShape, fitnessFunction=None):
    """Keep the best-scoring individuals of a population (elitism).

    Args:
        parameters: 2-D array with one individual per column.
        numberToBeChosen: how many top-scoring columns to keep.
        shapes, paramsPerShape: forwarded unchanged to the fitness function.
        fitnessFunction: callable ``f(individual, shapes, paramsPerShape)``
            returning a score where higher is better. Defaults to
            ``getPongLongFitness`` (the longer episode), resolved at call time.

    Returns:
        The selected columns of ``parameters``, ordered from best to worst.
    """
    if fitnessFunction is None:
        fitnessFunction = getPongLongFitness

    scores  = [fitnessFunction(parameters[:,column], shapes, paramsPerShape) for column in range(parameters.shape[1])]
    # argsort is ascending, so reverse it to rank the highest score first.
    ranking = np.argsort (scores)[::-1]
    print ("Best individual reward: %.2f" % scores[ranking[0]])
    return parameters[:,ranking[:numberToBeChosen]]

In [16]:
# Continue evolving pastGeneration; the counter starts at 9 so the printed numbering goes on from "Generation 10".
for i in range (9,19):
    print ("Generation %d starting" % (i+1))
    # The mutation rate decays with the cube root of the generation number.
    mutation_rate = 0.05/np.cbrt(i+1)
    survivors    = elitismSelection (pastGeneration, numberToBeChosen=fittestNumber, shapes=shapes, paramsPerShape = paramsPerShape)
    # Shape (NumberOfWeights, fittestNumber)

    pairs        = createPairs (numberOfSurvivors=fittestNumber, numberOfPairs = childrenFromParents//2, replace=True)
    # Shape (childrenFromParents,). Contains indices into the survivors array, to be split every two.

    newChildren  = []

    # The inner index has its own name so it does not shadow the generation counter i.
    for pairIndex in range (childrenFromParents//2):

        parents  = survivors[:,pairs[2*pairIndex:2*(pairIndex+1)]]
        # Shape (NumberOfWeights, numberOfParents)

        child1, child2 = SBX (parents[:,0], parents[:,1])

        child1 = applyGaussianMutation (child1, mutation_rate=mutation_rate, mu=mu, sigma=sigma)
        child2 = applyGaussianMutation (child2, mutation_rate=mutation_rate, mu=mu, sigma=sigma)

        newChildren.append(child1)
        newChildren.append(child2)

    newChildren    = np.swapaxes(newChildren, axis1=1, axis2=0)
    # Shape (NumberOfWeights, ChildrenFromParents).

    newIndividuals = createRandomIndividuals (numberNewIndividuals, numberOfParameters = totalWeights, low=-1, high=1)
    # Shape (NumberOfWeights, numberNewIndividuals)

    newGeneration  = np.concatenate ((survivors[:,:parentsThatRemain],newChildren, newIndividuals), axis = 1)
    # Shape (NumberOfWeights, indivPerGen)

    pastGeneration = np.copy(newGeneration)
# Replay the best survivor of the last evaluated generation on the long episode.
getPongLongFitness(survivors[:,0], shapes, paramsPerShape)

Generation 10 starting
Best individual reward: -200.10
Generation 11 starting
Best individual reward: -200.10
Generation 12 starting
Best individual reward: -0.10
Generation 13 starting
Best individual reward: 199.90
Generation 14 starting
Best individual reward: -100.10
Generation 15 starting
Best individual reward: -100.10
Generation 16 starting
Best individual reward: 99.90
Generation 17 starting
Best individual reward: -100.10
Generation 18 starting
Best individual reward: -0.10
Generation 19 starting
Best individual reward: -100.10


-700.1

## The best agent of each generation is losing by 1 or 2 points after 1000 steps. Obtaining an optimal Pong agent would take many more generations, given the number of inputs and parameters in the network.