In [1]:
import gym
import numpy as np
import random
env = gym.make("CartPole-v1")
shape_of_net = (4,
                30,
                20,
                2)
num_layers = len(shape_of_net) - 1

In [2]:
def softmax(x):

    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


class Agent:
    def __init__(self):
        self.weights = []
        self.biases = []
        self.cum_reward = 0
        for i in range(num_layers):
            self.weights.append(np.random.uniform(-3,3,size=(shape_of_net[i],shape_of_net[i+1])))
            self.biases.append(np.random.uniform(-3,3,size=(shape_of_net[i+1])))
    
    def act(self,state):
        l = state
        for i in range(num_layers):
            l = np.matmul(l,self.weights[i]) + self.biases[i]
            if i != num_layers-1:
                #ReLU
                np.maximum(l,0,l)
        
        l = softmax(l)
        
        return np.argmax(l[0])


In [3]:
def mating(parents,offsprings):
    for offspring in offsprings:
        parent1,parent2 = random.sample(parents,2)
        #Crossover the weights and biases of the parents in the offspring
        for i in range(num_layers):
            offspring.weights[i] = np.concatenate([parent1.weights[i][:,:int(shape_of_net[i+1]/2)],
                                                       parent2.weights[i][:,int(shape_of_net[i+1]/2):]],axis=1)
            offspring.biases[i] = np.concatenate([parent1.biases[i][:int(shape_of_net[i+1]/2)],parent2.biases[i][int(shape_of_net[i+1]/2):]])
                
def mutation(agents):
    for agent in agents:
        for i in range(num_layers):
            #Mutate 
            mutation = np.random.normal(scale=3,size=2)
            agent.weights[i][np.random.randint(0,shape_of_net[i]),np.random.randint(0,shape_of_net[i+1])] += mutation[0]
            agent.biases[i][np.random.randint(0,shape_of_net[i+1])] += mutation[1]


In [4]:
num_agents = 200
top_perc = 0.1
agents = []

for i in range(num_agents):
    agents.append(Agent())

cutoff = int(top_perc*num_agents)

In [5]:
#Score won't go above 500 since environment ends after that
num_episodes = 30
for i in range(num_episodes):
    for agent in agents:
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state.reshape((1,-1)))
            state,r,done,_ = env.step(action)
            agent.cum_reward += r
    
    #Choose the fittest agents in each generation
    agent_reward_list = sorted(agents,key= lambda a:a.cum_reward,reverse=True)
    #Populate the rest of the population with the offspring of the fittest agents 
    mating(agent_reward_list[:cutoff],agent_reward_list[cutoff:])
    mutation(agents)
    print(f"Episode {i} best:{agent_reward_list[0].cum_reward} av:{sum([a.cum_reward for a in agents])/num_agents}")
    for agent in agents:
        agent.cum_reward = 0
    

Episode 0 best:325.0 av:18.625
Episode 1 best:500.0 av:24.46
Episode 2 best:500.0 av:46.62
Episode 3 best:500.0 av:56.77
Episode 4 best:500.0 av:90.825
Episode 5 best:500.0 av:116.75
Episode 6 best:500.0 av:156.575
Episode 7 best:500.0 av:184.265
Episode 8 best:500.0 av:202.965
Episode 9 best:500.0 av:196.38
Episode 10 best:500.0 av:220.71
Episode 11 best:500.0 av:318.595
Episode 12 best:500.0 av:336.525
Episode 13 best:500.0 av:322.87
Episode 14 best:500.0 av:322.95
Episode 15 best:500.0 av:334.26
Episode 16 best:500.0 av:357.78
Episode 17 best:500.0 av:336.63
Episode 18 best:500.0 av:357.95
Episode 19 best:500.0 av:388.505
Episode 20 best:500.0 av:370.76
Episode 21 best:500.0 av:384.41
Episode 22 best:500.0 av:358.775
Episode 23 best:500.0 av:347.655
Episode 24 best:500.0 av:377.665
Episode 25 best:500.0 av:364.255
Episode 26 best:500.0 av:401.71
Episode 27 best:500.0 av:368.885
Episode 28 best:500.0 av:332.635
Episode 29 best:500.0 av:368.34


In [6]:
#Rendering an exmaple
for i in range(10):
    done = False
    state = env.reset()
    while not done:
        env.render()
        action = agent_reward_list[0].act(state.reshape((1,-1)))
        state,r,done,_ = env.step(action)