In [1]:
import gym

import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


In [2]:
# Seed Python's, NumPy's and PyTorch's global RNGs in addition to the
# environment: the networks are initialised with torch, crossover masks come
# from torch.randn and parent selection uses the `random` module, so seeding
# the environment alone leaves those sources of randomness unseeded.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = gym.make('LunarLanderContinuous-v2')
env.seed(SEED)
# Initial observation returned by the freshly reset environment.
state = env.reset()

In [3]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): no cell visible here moves a model or tensor onto `device` — confirm it is used elsewhere.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: displays the selected device as the cell output.
device

device(type='cuda')

In [100]:
class BasicNet(nn.Module):
    """Small fully connected policy network used as one individual of the population.

    The network takes an 8-element observation and produces a 2-element
    action; the final ``tanh`` keeps every action component inside ``[-1, 1]``.
    Besides the forward pass it offers the two genetic operators used by the
    evolutionary loop: point mutation (`mutate`) and uniform crossover
    (`make_kid`).
    """

    def __init__(self):
        super(BasicNet, self).__init__()
        self.fc1 = nn.Linear(8, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, 2)

    def forward(self, x):
        """Compute the action for an observation.

        Args:
            x: observation with 8 features in its last dimension. NumPy
                arrays, tensors and nested sequences are accepted; the data is
                converted to the dtype and device of the network weights, so
                float64 observations work as well.

        Returns:
            numpy.ndarray with 2 features in its last dimension, each value in
            ``[-1, 1]``.
        """
        weight = self.fc1.weight
        x = torch.as_tensor(x, dtype=weight.dtype, device=weight.device)
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        # detach() makes the conversion valid even when autograd is enabled;
        # cpu() makes it valid when the network lives on a GPU.
        return x.detach().cpu().numpy()

    def mutate(self, number_of_mutations):
        """Perturb randomly chosen individual parameters in place.

        Each mutation picks one parameter tensor uniformly at random, then one
        entry of that tensor uniformly at random, and adds either ``+0.01`` or
        ``-0.01`` to it.

        Args:
            number_of_mutations: how many single-entry perturbations to apply.
        """
        # state_dict() tensors share storage with the parameters, so in-place
        # edits on them change the network itself.
        state = self.state_dict()
        keys = list(state.keys())
        with torch.no_grad():
            for _ in range(number_of_mutations):
                key = random.choice(keys)
                values = state[key]
                loc = tuple(random.randrange(maximum) for maximum in values.shape)
                values[loc] += random.choice((0.01, -0.01))

    def make_kid(self, other):
        """Create a child network by uniform crossover of ``self`` and ``other``.

        Every individual weight/bias entry of the child is copied from
        ``self`` or from ``other`` with equal probability.

        Args:
            other: second parent, a network with the same parameter layout.

        Returns:
            A new `BasicNet`; neither parent is modified.
        """
        kid = BasicNet()
        kid_state = kid.state_dict()
        other_state = other.state_dict()
        with torch.no_grad():
            for key, self_values in self.state_dict().items():
                other_values = other_state[key]
                # True -> take the entry from self, False -> take it from other.
                selector = torch.randn(self_values.shape, device=self_values.device) > 0
                kid_state[key].copy_(torch.where(selector, self_values, other_values))
        return kid


net = BasicNet()

In [101]:
class evolution():
    """Crossover-based evolutionary search over a population of policies.

    Each generation every individual plays one episode, the highest scoring
    individuals are selected as parents and a new population of the same size
    is bred from them with ``make_kid``. No mutation step is applied here.
    """

    def __init__(self, env, pool_size=50):
        """Store the environment and create the initial random population.

        Args:
            env: environment exposing ``reset()`` and a ``step(action)`` that
                returns a 4-tuple ``(state, reward, done, info)``.
            pool_size: number of individuals in the population.
        """
        self.env = env
        self.pool = [BasicNet() for _ in range(pool_size)]

    def play_game(self, model) -> float:
        """Run one full episode with ``model`` and return the summed reward.

        Args:
            model: callable mapping the current state to an action.

        Returns:
            Sum of the rewards collected until the environment reports the
            episode as finished.
        """
        total_score = 0
        # Gradients are never needed: the policies are optimised by evolution.
        with torch.no_grad():
            state = self.env.reset()
            over = False
            while not over:
                action = model(state)
                state, reward, over, _ = self.env.step(action)
                total_score += reward
        return total_score

    def evolve(self, generations=100, n_best=20):
        """Evolve the population, printing the best score of each generation.

        Args:
            generations: number of generations to run.
            n_best: number of top-scoring individuals kept as parents.
        """
        pool_size = len(self.pool)
        for generation in range(generations):
            # Keep score and individual together so the ranking can be mapped
            # back to the networks.
            scored_pool = [(self.play_game(instance), instance) for instance in self.pool]
            scored_pool.sort(reverse=True, key=lambda pair: pair[0])
            print(f'best score = {scored_pool[0][0]}')

            best_instances = [instance for _, instance in scored_pool[:n_best]]
            # Materialise a list (not a generator) and refill the population to
            # its original size; otherwise it would shrink to n_best and every
            # individual would count as "best" in later generations. The first
            # parent cycles through the elite, the second is drawn at random.
            self.pool = [
                best_instances[index % len(best_instances)].make_kid(random.choice(best_instances))
                for index in range(pool_size)
            ]

In [102]:
# Build the evolutionary trainer around a newly created LunarLanderContinuous-v2
# environment; this instance is separate from the `env` created earlier in the
# notebook and is not seeded in this cell.
ev = evolution(env = gym.make('LunarLanderContinuous-v2'))