# Install and import needed packages

In [None]:
# #%%capture
# #!sudo apt-get update
# !apt install ffmpeg
# !apt install xvfb
# !pip3 install pyvirtualdisplay
# !pip install gym[box2d]
# !pip install stable-baselines3[extra]
# !pip install huggingface_sb3
# !pip install pyglet
# !pip install ale-py==0.7.4 # To overcome an issue with gym (https://github.com/DLR-RM/stable-baselines3/issues/875)
# !pip install pygad
# !pip install pickle5
# !pip install cma

In [None]:
# Import libraries
import gym
import time
import gym
import numpy as np
import torch
import torch.nn as nn
import math
import pyglet
import torchvision
from PIL import Image
import torch.nn.functional as F
import cma
import time
import random
import pickle
from tqdm import tqdm

# Seeds for reproducibility

In [None]:
# Set seeds
# Seed every random number generator used in this notebook (NumPy, Python's
# `random`, PyTorch CPU and all CUDA devices) with the same value so that
# runs started from a fresh kernel draw the same random numbers.
np.random.seed(12345)
random.seed(12345)
torch.manual_seed(12345)
torch.cuda.manual_seed_all(12345)
# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True  # Note that this Deterministic mode can have a performance impact
torch.backends.cudnn.benchmark = False

  and should_run_async(code)


# Utils functions

In [None]:
# Define utils to count the parameters in a model 
def get_n_params(model):
    """Return the total number of scalar parameters held by ``model``.

    Args:
        model: object exposing ``parameters()`` that yields tensors
            (for example an ``nn.Module``).

    Returns:
        int: sum of the element counts of every parameter tensor
        (0 when the model has no parameters).
    """
    # `numel()` replaces the hand-rolled product over `p.size()`, which used a
    # local variable named `nn` that shadowed the `torch.nn` import.
    return sum(p.numel() for p in model.parameters())

In [None]:
# Define utils to store the model state dict. Needed during CMA
def store_state_dict(model):
    """Describe the layout of ``model.state_dict()``.

    The layout is what ``load_state_dict`` (the helper defined in this
    notebook) needs to cut a flat parameter vector back into tensors.

    Args:
        model: object exposing ``state_dict()`` (for example an ``nn.Module``).

    Returns:
        dict: maps every state-dict key, in state-dict order, to
        ``{"size": <int element count>, "shape": <tuple of ints>}``.
    """
    layout = {}
    # Build the state dict once; the original rebuilt it for every entry.
    for name, tensor in model.state_dict().items():
        # numel() is a plain int even for 0-d tensors, whereas np.prod([])
        # is the float 1.0, which cannot be used as a slice bound later on.
        layout[name] = {"size": tensor.numel(), "shape": tuple(tensor.shape)}
    return layout

In [None]:
# Define utils to load model state dict. Needed during CMA
def load_state_dict(model, params, old_dict):
    """Load a flat parameter vector into ``model``.

    Args:
        model: module whose ``load_state_dict`` method receives the rebuilt
            tensors.
        params: flat, sliceable sequence (list or 1-D array) holding every
            value, concatenated in the iteration order of ``old_dict``.
        old_dict: layout mapping ``name -> {"size": ..., "shape": ...}`` as
            produced by ``store_state_dict``.

    Returns:
        The same ``model`` object, with its state overwritten.

    Raises:
        ValueError: if ``len(params)`` differs from the total size described
            by ``old_dict``. The original silently ignored surplus values and
            failed with an opaque reshape error on missing ones.
    """
    expected = sum(int(entry["size"]) for entry in old_dict.values())
    if len(params) != expected:
        raise ValueError(
            "Expected {} parameter values but received {}.".format(expected, len(params))
        )

    new_dict = {}
    offset   = 0
    for name, entry in old_dict.items():
        # int() keeps slicing valid even if the stored size is a NumPy scalar.
        size           = int(entry["size"])
        chunk          = torch.tensor(params[offset:offset + size])
        new_dict[name] = torch.reshape(chunk, entry["shape"])
        offset        += size

    model.load_state_dict(new_dict)
    return model

# Training Utils functions

In [None]:
# Define preprocessing util to grayscale and normalize pixel values
def state_processer(image):
    """Turn a raw frame into a single-channel float tensor scaled to [0, 1].

    The frame is cast to ``uint8``, converted to 8-bit grayscale through PIL
    (mode ``'L'``) and returned with a trailing channel axis of size 1.

    Args:
        image: array-like frame accepted by ``np.uint8`` / ``Image.fromarray``.

    Returns:
        torch.Tensor: float tensor with one extra last axis, values divided
        by 255.
    """
    gray_frame = Image.fromarray(np.uint8(image)).convert('L')
    gray_array = np.array(gray_frame)
    return torch.tensor(gray_array, dtype=torch.float).unsqueeze(-1) / 255

In [None]:
# Define util to stack temporal observations
def cycle_observation(observations,new_observation):
    """Push a new frame into a 3-frame stack, dropping the oldest one.

    ``observations`` is modified in place: channel 1 moves to channel 0,
    channel 2 moves to channel 1, and channel 0 of ``new_observation``
    becomes channel 2.

    Args:
        observations: tensor indexed as ``[:, :, channel]`` with 3 channels.
        new_observation: tensor whose ``[:, :, 0]`` slice is the new frame.

    Returns:
        The same ``observations`` object after the in-place update.
    """
    newest = 2
    # Shift front-to-back so each source slot is read before being overwritten.
    for slot in range(newest):
        observations[:, :, slot] = observations[:, :, slot + 1]
    observations[:, :, newest] = new_observation[:, :, 0]
    return observations

In [None]:
def fitness(gamma):
    """Play one episode with the current global agent and return its cost.

    Reads the module-level ``env``, ``model`` and ``autoencoder``. The reward
    obtained at step ``t`` (counted from 1) is weighted by ``gamma**t``. The
    episode ends when the environment reports ``done`` or the discounted sum
    is no longer below 1000.

    Args:
        gamma: base of the per-step discount factor.

    Returns:
        The negated discounted reward sum, so that a lower value means a
        better episode.
    """
    global model, autoencoder

    first_frame       = state_processer(env.reset())
    # The initial stack repeats the first frame three times along the last axis.
    frame_stack       = torch.cat((first_frame, first_frame, first_frame), dim=-1)

    discounted_return = 0
    finished          = False
    step              = 0

    while (not finished) and (discounted_return < 1000):
        step                           += 1
        discount                        = gamma**(step)
        action, _                       = model(frame_stack, autoencoder)
        next_frame, reward, finished, _ = env.step(action)
        frame_stack                     = cycle_observation(frame_stack, state_processer(next_frame))
        discounted_return              += discount*reward

    return -discounted_return

In [None]:
def train(es,env,model,max_generations,autoencoder,old_dict,gamma):
    """Run a CMA-ES ask / evaluate / tell loop and keep the best candidate.

    Each candidate returned by ``es.ask()`` is loaded into ``model`` and
    scored with ``fitness(gamma)`` (lower is better).

    NOTE: ``fitness`` takes only ``gamma`` and reads ``env``, ``model`` and
    ``autoencoder`` from module globals. The ``env`` and ``autoencoder``
    arguments are therefore not used inside this function, and ``model``
    must be the same object as the global ``model`` for the loaded weights
    to be the ones evaluated.

    Args:
        es: evolution strategy exposing ``stop()``, ``ask()`` and ``tell()``.
        env: environment (unused here, see note above).
        model: agent whose weights are overwritten for every candidate.
        max_generations: maximum number of generations to run.
        autoencoder: optional autoencoder (unused here, see note above).
        old_dict: state-dict layout produced by ``store_state_dict(model)``.
        gamma: discount base forwarded to ``fitness``.

    Returns:
        tuple: ``(fitness_values, best_model)`` where ``fitness_values`` is a
        list with one list of fitness values per generation and
        ``best_model`` is the parameter vector with the lowest fitness seen,
        or ``None`` if no candidate was evaluated.
    """
    fitness_values    = []
    # Start from "no candidate yet": the original left `best_model` unbound
    # (UnboundLocalError at return) when no generation ran or when no
    # fitness was below its hard-coded 10000 threshold.
    best_model        = None
    absolute_best_fit = float("inf")
    gen               = 1
    while not es.stop():
        if gen > max_generations:
            break
        start_time       = time.time()
        solutions        = es.ask()
        current_gen_fit  = []

        for params in solutions:
            model            = load_state_dict(model, params, old_dict)
            value            = fitness(gamma)
            current_gen_fit.append(value)

            if value < absolute_best_fit:
                best_model        = params
                absolute_best_fit = value

        fitness_values.append(current_gen_fit)
        end_time   = time.time()
        best_fit   = np.min(current_gen_fit)
        print("Generation {} executed in {} s. Best fitness value: {}".format(gen,round(end_time-start_time,2),round(best_fit,4)))
        es.tell(solutions, current_gen_fit)
        gen+=1

    return fitness_values, best_model

# Models

In [None]:
# Define simple autoencoder class
class Autoencoder(nn.Module):
    """Small convolutional autoencoder over 3-channel patches.

    The encoder stacks three 3x3 convolutions and one 2x2 convolution
    (3 -> 32 -> 64 -> 128 -> 256 channels, no padding). The decoder mirrors
    it with transposed convolutions and ends with a sigmoid.
    """

    def __init__(self):
        super().__init__()

        # Layers are created in the same order as they are applied, encoder
        # first, so random initialisation draws happen in a fixed order.
        encoder_layers = [
            nn.Conv2d(3, 32, 3, stride=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, 2),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.ConvTranspose2d(256, 128, 2),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 3, stride=1),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 3, stride=1),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 3, 3, stride=1),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode and reconstruct a channels-last batch.

        Args:
            x: 4-D tensor whose last axis holds the channels; it is permuted
                to channels-first before the convolutions.

        Returns:
            tuple: ``(code, reconstruction)``, both channels-first.
        """
        channels_first = x.permute(0, 3, 1, 2)
        latent         = self.encoder(channels_first)
        return latent, self.decoder(latent)

In [None]:

class Attention_module(nn.Module):
    """Query/key self-attention that returns only the attention matrix.

    Two independent linear layers project the input to queries and keys.
    No value projection is applied: the module outputs the softmax-normalised
    score matrix itself.
    """

    def __init__(self,in_dimension,out_dimension):
        """
        Args:
            in_dimension: size of each input row (also used for the scaling).
            out_dimension: size of the query and key projections.
        """
        super(Attention_module, self).__init__()

        self.out_dimension  = out_dimension
        self.in_dimension   = in_dimension
        self.attention1     = nn.Linear(self.in_dimension,self.out_dimension)
        self.attention2     = nn.Linear(self.in_dimension,self.out_dimension)

    def forward(self, x):
        """Compute the attention matrix for a 2-D input.

        Args:
            x: tensor whose last axis has size ``in_dimension``; the use of
                ``k.T`` and ``dim=1`` below assumes it is 2-D (one row per item).

        Returns:
            torch.Tensor: square matrix of scores; each row sums to 1.
        """
        # Compute query projection
        q                 = self.attention1(x)
        # Compute key projection
        k                 = self.attention2(x)
        # Scaled dot product, divided by sqrt(in_dimension). Computed once;
        # the original evaluated this identical expression twice.
        scaled_mm         = (q @ k.T)/math.sqrt(self.in_dimension)
        # Compute attention scores (normalised over each row)
        attention_scores  = torch.nn.functional.softmax(scaled_mm, dim=1)

        return attention_scores

class Patcher_module(nn.Module):
    """Cut a channels-last image into square sliding-window patches."""

    def __init__(self,stride,window,channels):
        """
        Args:
            stride: step between consecutive windows along both image axes.
            window: side length of each square patch.
            channels: number of channels on the last axis of the input.
        """
        super().__init__()
        self.stride, self.window, self.channels = stride, window, channels

    def forward(self, x):
        """Extract every patch, scanning axis 0 first and axis 1 second.

        Args:
            x: tensor indexed as ``[row, column, channel]``.

        Returns:
            tuple: ``(patches, flattened_patches)`` where ``patches`` has
            shape ``(n_patches, window, window, channels)`` and
            ``flattened_patches`` has shape ``(n_patches, -1)``.
        """
        # Slide the window over axis 0, then over axis 1; each unfold
        # appends a new trailing axis of size `window`.
        row_windows  = x.unfold(0, self.window, self.stride)
        grid_windows = row_windows.unfold(1, self.window, self.stride)

        # Collapse the two grid axes into one patch axis, then move the
        # channel axis last.
        stacked = grid_windows.reshape((-1, self.channels, self.window, self.window))
        patches = stacked.permute(0, 2, 3, 1)

        flattened_patches = patches.reshape((patches.shape[0], -1))
        return patches, flattened_patches

class Slicer_module(nn.Module):
    """Keep the K entries with the highest summed attention."""

    def __init__(self,K,):
        """
        Args:
            K: number of top entries to keep.
        """
        super().__init__()
        self.K = K

    def forward(self, x):
        """Rank the columns of ``x`` by their sum and return the best K.

        Args:
            x: attention matrix; its entries are summed over axis 0, giving
                one total per column.

        Returns:
            tuple: ``(sliced_scores, sliced_indexes)`` holding the K largest
            column totals in descending order and their column indexes.
        """
        column_totals                 = x.sum(dim=0)
        ranked_scores, ranked_indexes = column_totals.sort(descending=True)
        return ranked_scores[:self.K], ranked_indexes[:self.K]

class Feature_extractor_module(nn.Module):
    """Turn the selected top patches into a flat feature vector.

    Two modes are implemented in ``forward``:

    * ``"LOC"``  - features are the patch centre coordinates derived from
      the patch indexes.
    * ``"CONV"`` - features are produced by a small convolutional stack
      applied to the patch pixels.

    Any other mode value can be constructed but cannot be used with
    ``forward``.
    """

    def __init__(self,FEAT_EXTRACT,stride,window):
        """
        Args:
            FEAT_EXTRACT: mode string (``"LOC"`` or ``"CONV"`` for ``forward``).
            stride: patch stride, used to convert indexes to coordinates.
            window: patch side length, used to offset to the patch centre.
        """
        super(Feature_extractor_module,self).__init__()

        self.stride          = stride
        self.window          = window
        # Use the constructor argument. The original read the module-level
        # global `FEAT_EXTRACTOR`, so the argument was silently ignored.
        self.feat_extract    = FEAT_EXTRACT

        if self.feat_extract=="CONV":
            self.convolutions = nn.Sequential(
                nn.Conv2d(in_channels=3, out_channels=10, kernel_size=3, stride=1 ),
                nn.ReLU(),
                nn.Conv2d(in_channels=10, out_channels=10, kernel_size=3, stride=1 ),
                nn.ReLU(),
                nn.Conv2d(in_channels=10, out_channels=10, kernel_size=3, stride=1 ),
                nn.ReLU(),
                nn.Conv2d(in_channels=10, out_channels=2, kernel_size=2, stride=1)
            )

    def forward(self,top_patches,top_idxs,n,top_scores):
        """Compute the features of the selected patches.

        Args:
            top_patches: channels-last patch tensor (read in ``"CONV"`` mode).
            top_idxs: 1-D tensor of patch indexes (read in ``"LOC"`` mode).
            n: number of patches per grid row, used to split an index into
                a row and a column.
            top_scores: not used by this method.

        Returns:
            torch.Tensor: features flattened to shape ``(1, n_features)``.

        Raises:
            ValueError: if the configured mode is neither ``"LOC"`` nor
                ``"CONV"`` (the original failed with an unbound local).
        """
        if self.feat_extract=="LOC":
            # Compute top-patches coordinates given their indexes:
            # (row, column) in the patch grid, scaled by the stride and
            # shifted by half a window.
            x = torch.tensor([[idx//n*self.stride+self.window/2,(idx-(idx//n)*n)*self.stride+self.window/2] for idx in top_idxs.tolist()])

        elif self.feat_extract=="CONV":
            # Compute top-patches features via convolutions
            x = self.convolutions(top_patches.permute(0,3,1,2))

        else:
            raise ValueError(
                "Unsupported feature extractor mode: {!r}".format(self.feat_extract)
            )

        feats = x.flatten().unsqueeze(0)

        return feats
            
class Controller_module(nn.Module):
    """Linear read-out that turns features into an environment action."""

    def __init__(self,in_dim,out_dim,CONTINUOUS):
        """
        Args:
            in_dim: size of each feature row.
            out_dim: number of logits (action dimensions or action classes).
            CONTINUOUS: if true, emit a continuous action vector; otherwise
                emit a discrete action index.
        """
        super(Controller_module, self).__init__()
        self.linear=nn.Linear(in_dim,out_dim)
        self.continuous=CONTINUOUS

    def forward(self, x):
        """Map features to an action.

        Args:
            x: 2-D feature tensor, one row per feature vector.

        Returns:
            numpy.ndarray: in continuous mode the squeezed action vector;
            in discrete mode a 0-d array holding the chosen action index.
        """
        logits                    = self.linear(x)
        # If the played game is continuous, the first logit is squashed with
        # tanh into [-1, 1] (steering) and the remaining logits with a
        # sigmoid into [0, 1].
        # NOTE: Gym documents the CarRacing continuous action as
        # [steer, gas, brake]; the original comment listed brake before gas.
        if self.continuous:
            steer                     = logits[:,:1]
            gas_brake                 = logits[:,1:]
            tuple_of_activated_parts  = (
                                      torch.tanh(steer),
                                      torch.sigmoid(gas_brake)
                                      )
            # detach() so the conversion also works while autograd is
            # enabled; `.numpy()` raises on tensors that require grad.
            action                    = torch.cat(tuple_of_activated_parts, dim=1).squeeze().detach().numpy()

        # If the played game is discrete, action is the classical argmax of logits
        else:
            if logits.shape[0]==1:
                action=np.array(torch.argmax(logits).item())
            else:
                # Several rows: each row votes with its argmax and the most
                # frequent vote wins.
                action=np.array(torch.mode(torch.argmax(logits,dim=1))[0].item())

        return action

class Agent(nn.Module):
    """Attention-based agent: patches -> attention -> top-K -> features -> action.

    ``forward`` splits the observation into patches, scores them with
    self-attention, keeps the K best patches, extracts features either with
    an external autoencoder or with the internal feature extractor followed
    by an LSTM, and lets the controller choose an action. With probability
    ``eps`` the chosen action is replaced by a random one.
    """

    def __init__(self,obs_space,stride,window,channels,
                 attention_out_dim,K,lstm_hidden_dim,
                 eps,CONTINUOUS,FEAT_EXTRACTOR):
        """
        Args:
            obs_space: stored as ``self.obs_space``; not read elsewhere in
                this class.
            stride: stride of the patch extractor.
            window: side length of each patch.
            channels: number of channels in the stacked observation.
            attention_out_dim: size of the query/key projections.
            K: number of top patches kept.
            lstm_hidden_dim: LSTM hidden size and controller input size.
            eps: probability of taking a random action.
            CONTINUOUS: whether the action space is continuous.
            FEAT_EXTRACTOR: feature-extraction mode; ``"AE"`` skips the LSTM.
        """
        super().__init__()

        # 3 action components in continuous mode, 5 action classes otherwise.
        self.action_class         = 3 if CONTINUOUS else 5
        self.obs_space            = obs_space
        self.stride               = stride
        self.window               = window
        self.channels             = channels
        self.attention_out_dim    = attention_out_dim
        # Each patch is flattened before being fed to the attention layers.
        self.attention_in_dim     = window*window*channels
        self.K                    = K
        self.lstm_hidden_dim      = lstm_hidden_dim
        self.continuous           = CONTINUOUS
        self.eps                  = eps
        self.grayscaler           = torchvision.transforms.Grayscale(num_output_channels=1)

        # Sub-modules are registered in a fixed order: the flat parameter
        # vectors used elsewhere in the notebook follow state-dict order.
        self.attention            = Attention_module(self.attention_in_dim,self.attention_out_dim)
        self.patch_extractor      = Patcher_module(self.stride,self.window,self.channels)
        self.slicer               = Slicer_module(self.K)
        self.feat_extractor       = Feature_extractor_module(FEAT_EXTRACTOR, stride, window)
        self.controller           = Controller_module(self.lstm_hidden_dim,self.action_class,CONTINUOUS)
        if FEAT_EXTRACTOR!="AE":
            # The LSTM input holds two feature values per selected patch.
            self.rnn              = nn.LSTM(self.K*2, self.lstm_hidden_dim,batch_first=True)

    def forward(self, x,autoencoder=None):
        """Choose an action for one stacked observation.

        Args:
            x: stacked observation passed to the patch extractor.
            autoencoder: optional module; when truthy its code output is
                used as features instead of the feature extractor and LSTM.

        Returns:
            tuple: ``(action, top_patches)``.
        """
        patches,flat_patches = self.patch_extractor(x)
        # Extract sqrt value of the number of extracted patches (for location extraction purposes)
        n                    = math.sqrt(flat_patches.shape[0])
        top_scores,top_idxs  = self.slicer(self.attention(flat_patches))
        top_patches          = torch.index_select(patches, 0, top_idxs)

        if autoencoder:
            feats,reconstruction = autoencoder(top_patches)
            feats                = feats.squeeze()
        else:
            feats                = self.feat_extractor(top_patches,top_idxs,n,top_scores)
            # The LSTM is called without a carried state on every step.
            feats,_              = self.rnn(feats)

        out = self.controller(feats)

        # Balance among experience and discovery:
        # the action will be random with probability eps
        if torch.rand(1).item()<self.eps:
            if self.continuous:
                # Uniform in [0, 1) for every component, then the first
                # component is rescaled to [-1, 1).
                out    = np.random.rand(self.action_class,)
                out[0] = (out[0]-0.5)*2
            else:
                out    = torch.randint(self.action_class,(1,)).item()
        return out, top_patches
        

  and should_run_async(code)


# Training

## Top-patch location feature extractor

In [None]:
# Evolutionary training needs no gradients: switch autograd off globally.
torch.set_grad_enabled(False)

# Hyper-parameters shared by the agents built in this notebook.
observation_space_size = 96     # forwarded to Agent as `obs_space`
stride                 = 4      # step between consecutive patches
window                 = 8      # patch side length
channels               = 3      # number of stacked frames fed to the agent
attention_out_dim      = 4      # size of the query/key projections
K                      = 10     # number of top patches kept
lstm_hidden_dim        = 16     # LSTM hidden size (= controller input size)
eps                    = 0.05   # probability of a random action
CONTINUOUS             = False  # discrete action space
                       ###### Choose between "LOC" and "CONV"
FEAT_EXTRACTOR         = "LOC"

# Build the agent and display its structure as the cell output.
model=Agent(observation_space_size,stride,window,
            channels,attention_out_dim,K,lstm_hidden_dim,
            eps,CONTINUOUS,FEAT_EXTRACTOR)
model

Agent(
  (grayscaler): Grayscale(num_output_channels=1)
  (attention): Attention_module(
    (attention1): Linear(in_features=192, out_features=4, bias=True)
    (attention2): Linear(in_features=192, out_features=4, bias=True)
  )
  (patch_extractor): Patcher_module()
  (slicer): Slicer_module()
  (feat_extractor): Feature_extractor_module()
  (controller): Controller_module(
    (linear): Linear(in_features=16, out_features=5, bias=True)
  )
  (rnn): LSTM(20, 16, batch_first=True)
)

In [None]:
# Record the name, size and shape of every state-dict entry; this layout is
# what load_state_dict uses to unpack a flat CMA-ES parameter vector.
print("Model's state_dict:")
old_dict=store_state_dict(model)
old_dict

Model's state_dict:


{'attention.attention1.weight': {'size': 768, 'shape': (4, 192)},
 'attention.attention1.bias': {'size': 4, 'shape': (4,)},
 'attention.attention2.weight': {'size': 768, 'shape': (4, 192)},
 'attention.attention2.bias': {'size': 4, 'shape': (4,)},
 'controller.linear.weight': {'size': 80, 'shape': (5, 16)},
 'controller.linear.bias': {'size': 5, 'shape': (5,)},
 'rnn.weight_ih_l0': {'size': 1280, 'shape': (64, 20)},
 'rnn.weight_hh_l0': {'size': 1024, 'shape': (64, 16)},
 'rnn.bias_ih_l0': {'size': 64, 'shape': (64,)},
 'rnn.bias_hh_l0': {'size': 64, 'shape': (64,)}}

In [None]:
# Number of trainable scalars = dimension of the CMA-ES search space.
n=get_n_params(model)
print("Model contains {} parameters.".format(n))

Model contains 4061 parameters.


In [None]:
# CMA-ES starting from the all-zero vector of dimension n, with initial step
# size 0.01 and a fixed seed.
es                = cma.CMAEvolutionStrategy(n * [0], 0.01,{"seed":12345})
env               = gym.make("CarRacing-v2",continuous=CONTINUOUS)
max_generation    = 600
gamma             = 0.999
# No autoencoder in "LOC" mode: the agent uses its own feature extractor.
autoencoder       = None
# autoencoder       = Autoencoder()

# NOTE: fitness() reads `env`, `model` and `autoencoder` as globals, so the
# names assigned in this cell are the ones actually used during evaluation.
fitness_values_LOC, best_model_LOC  = train(es,env,model,max_generation,autoencoder,old_dict,gamma)

(14_w,28)-aCMA-ES (mu_w=8.1,w_1=21%) in dimension 4061 (seed=12345, Thu Sep  8 12:53:39 2022)
Generation 1 executed in 278.38 s. Best fitness value: 14.8152
Generation 2 executed in 244.65 s. Best fitness value: -40.3449
Generation 3 executed in 208.16 s. Best fitness value: -63.5775
Generation 4 executed in 214.13 s. Best fitness value: -54.0353
Generation 5 executed in 219.37 s. Best fitness value: -42.6086
Generation 6 executed in 205.9 s. Best fitness value: -24.676
Generation 7 executed in 197.65 s. Best fitness value: -52.5358
Generation 8 executed in 213.34 s. Best fitness value: -120.0463
Generation 9 executed in 191.18 s. Best fitness value: -50.4142
Generation 10 executed in 189.32 s. Best fitness value: -48.6149
Generation 11 executed in 160.51 s. Best fitness value: -46.8494
Generation 12 executed in 245.84 s. Best fitness value: -38.0001
Generation 13 executed in 202.89 s. Best fitness value: -14.2416
Generation 14 executed in 182.47 s. Best fitness value: -53.2481
Generat

In [None]:

# Saving is commented out below, so this cell only reloads a previously saved
# result; uncomment the dump blocks after a fresh training run.
# with open('best_model_LOC.pickle', 'wb') as handle:
#     pickle.dump(best_model_LOC, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('fitness_values_LOC.pickle', 'wb') as handle:
#     pickle.dump(fitness_values_LOC, handle, protocol=pickle.HIGHEST_PROTOCOL)

# NOTE(review): this overwrites the `best_model_LOC` returned by train() with
# the content of the file, and requires 'best_model_LOC.pickle' to exist.
# pickle.load can execute arbitrary code; only load files you created.
with open('best_model_LOC.pickle', 'rb') as handle:
    best_model_LOC = pickle.load(handle)


## Top-patches convolutional feature extractor

In [None]:
# Switch to the convolutional feature extractor; all other hyper-parameters
# are the ones defined in the "LOC" setup cell.
FEAT_EXTRACTOR         = "CONV"
lstm_hidden_dim        = 16

# Rebuild the agent (this replaces the global `model`) and display it.
model=Agent(observation_space_size,stride,window,channels,
            attention_out_dim,K,lstm_hidden_dim,eps,
            CONTINUOUS,FEAT_EXTRACTOR)
model

Agent(
  (grayscaler): Grayscale(num_output_channels=1)
  (attention): Attention_module(
    (attention1): Linear(in_features=192, out_features=4, bias=True)
    (attention2): Linear(in_features=192, out_features=4, bias=True)
  )
  (patch_extractor): Patcher_module()
  (slicer): Slicer_module()
  (feat_extractor): Feature_extractor_module(
    (convolutions): Sequential(
      (0): Conv2d(3, 10, kernel_size=(3, 3), stride=(1, 1))
      (1): ReLU()
      (2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1))
      (3): ReLU()
      (4): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1))
      (5): ReLU()
      (6): Conv2d(10, 2, kernel_size=(2, 2), stride=(1, 1))
    )
  )
  (controller): Controller_module(
    (linear): Linear(in_features=16, out_features=5, bias=True)
  )
  (rnn): LSTM(20, 16, batch_first=True)
)

In [None]:
# Record the state-dict layout of the "CONV" agent (replaces `old_dict`).
print("Model's state_dict:")
old_dict=store_state_dict(model)
old_dict

Model's state_dict:


{'attention.attention1.weight': {'size': 768, 'shape': (4, 192)},
 'attention.attention1.bias': {'size': 4, 'shape': (4,)},
 'attention.attention2.weight': {'size': 768, 'shape': (4, 192)},
 'attention.attention2.bias': {'size': 4, 'shape': (4,)},
 'feat_extractor.convolutions.0.weight': {'size': 270, 'shape': (10, 3, 3, 3)},
 'feat_extractor.convolutions.0.bias': {'size': 10, 'shape': (10,)},
 'feat_extractor.convolutions.2.weight': {'size': 900,
  'shape': (10, 10, 3, 3)},
 'feat_extractor.convolutions.2.bias': {'size': 10, 'shape': (10,)},
 'feat_extractor.convolutions.4.weight': {'size': 900,
  'shape': (10, 10, 3, 3)},
 'feat_extractor.convolutions.4.bias': {'size': 10, 'shape': (10,)},
 'feat_extractor.convolutions.6.weight': {'size': 80, 'shape': (2, 10, 2, 2)},
 'feat_extractor.convolutions.6.bias': {'size': 2, 'shape': (2,)},
 'controller.linear.weight': {'size': 80, 'shape': (5, 16)},
 'controller.linear.bias': {'size': 5, 'shape': (5,)},
 'rnn.weight_ih_l0': {'size': 1280, '

In [None]:
# Dimension of the CMA-ES search space for the "CONV" agent.
n=get_n_params(model)
print("Model contains {} parameters.".format(n))

Model contains 6243 parameters.


In [None]:
# Same CMA-ES configuration as for the "LOC" agent (zero start, step size
# 0.01, fixed seed), but with a budget of 150 generations.
es                = cma.CMAEvolutionStrategy(n * [0], 0.01,{"seed":12345})
env               = gym.make("CarRacing-v2",continuous=CONTINUOUS)
max_generation    = 150
gamma             = 0.999
autoencoder       = None

# NOTE: fitness() reads `env`, `model` and `autoencoder` as globals.
fitness_values_CONV, best_model_CONV  = train(es,env,model,max_generation,autoencoder,old_dict,gamma)

(15_w,30)-aCMA-ES (mu_w=8.6,w_1=20%) in dimension 6243 (seed=12345, Tue Sep 13 11:06:24 2022)
Generation 1 executed in 281.65 s. Best fitness value: 27.2262
Generation 2 executed in 245.27 s. Best fitness value: -46.498
Generation 3 executed in 154.21 s. Best fitness value: -43.3922
Generation 4 executed in 176.45 s. Best fitness value: -56.0729
Generation 5 executed in 143.06 s. Best fitness value: -59.2813
Generation 6 executed in 183.93 s. Best fitness value: -56.5282
Generation 7 executed in 169.21 s. Best fitness value: -66.0006
Generation 8 executed in 156.68 s. Best fitness value: -54.956
Generation 9 executed in 135.73 s. Best fitness value: -57.9008
Generation 10 executed in 165.72 s. Best fitness value: -50.3694
Generation 11 executed in 129.21 s. Best fitness value: -48.8621
Generation 12 executed in 134.58 s. Best fitness value: -57.078
Generation 13 executed in 181.37 s. Best fitness value: -36.8055
Generation 14 executed in 174.27 s. Best fitness value: -83.6315
Generatio

In [None]:

# Persist the best "CONV" parameter vector and the per-generation fitness
# values to the working directory.
with open('best_model_CONV.pickle', 'wb') as handle:
    pickle.dump(best_model_CONV, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('fitness_values_CONV.pickle', 'wb') as handle:
    pickle.dump(fitness_values_CONV, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Uncomment to reload a saved result instead of retraining.
# pickle.load can execute arbitrary code; only load files you created.
# with open('best_model_CONV.pickle', 'rb') as handle:
#     best_model_CONV = pickle.load(handle)


## Autoencoder training

In [None]:
# Train the autoencoder on the top patches selected by the trained "LOC"
# agent while it drives.
env               = gym.make("CarRacing-v2",continuous=CONTINUOUS)
FEAT_EXTRACTOR    = "LOC"

model             = Agent(observation_space_size,stride,window,channels,
                          attention_out_dim,K,lstm_hidden_dim,eps,
                          CONTINUOUS,FEAT_EXTRACTOR)
old_dict          = store_state_dict(model)
# The driving policy is fixed during autoencoder training, so its weights
# are loaded once instead of at the start of every episode.
model             = load_state_dict(model, best_model_LOC, old_dict)

autoencoder       = Autoencoder()

ae_losses       = {"train":[],"val":[]}
n_episodes      = 30
learning_rate   = 1e-3
n_train_patches = 8  # patches per step used for training; the rest validate
criterion       = nn.MSELoss() # mean square error loss
optimizer       = torch.optim.Adam(autoencoder.parameters(),
                                   lr=learning_rate,
                                   weight_decay=1e-5)

for i in range(n_episodes):

    observation = state_processer(env.reset())
    ob_tensor   = torch.cat((observation,observation,observation),dim=-1)

    done        = False
    train_loss  = []
    val_loss    = []
    step        = 0
    sum_reward  = 0
    while (not done) and (sum_reward < 1000):
        # The agent only collects data here; no gradients are needed.
        with torch.no_grad():
            action, top_patches              = model(ob_tensor)
        observation_next, reward, done, info = env.step(action)
        observation_next                     = state_processer(observation_next)
        ob_tensor                            = cycle_observation(ob_tensor,observation_next)
        sum_reward                          += reward
        step                                += 1

        # Shuffle the selected patches, then split them into train/validation.
        top_patches                   = top_patches[torch.randperm(top_patches.size()[0])]
        top_patch_train,top_patch_val = top_patches[:n_train_patches],top_patches[n_train_patches:]

        # One optimisation step. Autograd is enabled only locally so the
        # global grad mode is left untouched for the following cells, and
        # `retain_graph` is not needed because each graph is used once.
        autoencoder.train()
        with torch.enable_grad():
            optimizer.zero_grad()
            code,reconstructions      = autoencoder(top_patch_train)
            loss                      = criterion(reconstructions, top_patch_train.permute(0,3,1,2))
            loss.backward()
            optimizer.step()
        train_loss.append(loss.item())

        # Validation on the held-out patches, without building a graph.
        autoencoder.eval()
        with torch.no_grad():
            code,reconstructions      = autoencoder(top_patch_val)
            loss                      = criterion(reconstructions, top_patch_val.permute(0,3,1,2))
        val_loss.append(loss.item())

    ae_losses["train"].append(train_loss)
    ae_losses["val"].append(val_loss)
    print("Episode {} lasted {} frames. ### Sum reward : {} ### Train loss : {} Validation loss : {}".\
        format(i,step,round(sum_reward,2),round(np.mean(train_loss),5),round(np.mean(val_loss),5)))
    

Episode 0 lasted 1000 frames. ### Sum reward : 684.51 ### Train loss : 0.01281 Validation loss : 0.01283
Episode 1 lasted 1000 frames. ### Sum reward : 766.67 ### Train loss : 0.00755 Validation loss : 0.00732
Episode 2 lasted 1000 frames. ### Sum reward : 842.57 ### Train loss : 0.00703 Validation loss : 0.00684
Episode 3 lasted 1000 frames. ### Sum reward : 623.03 ### Train loss : 0.00563 Validation loss : 0.00567
Episode 4 lasted 1000 frames. ### Sum reward : 808.42 ### Train loss : 0.00519 Validation loss : 0.00487
Episode 5 lasted 390 frames. ### Sum reward : 52.28 ### Train loss : 0.00561 Validation loss : 0.00509
Episode 6 lasted 1000 frames. ### Sum reward : 263.35 ### Train loss : 0.00514 Validation loss : 0.00513
Episode 7 lasted 1000 frames. ### Sum reward : 877.78 ### Train loss : 0.00412 Validation loss : 0.00414
Episode 8 lasted 1000 frames. ### Sum reward : 719.55 ### Train loss : 0.00457 Validation loss : 0.00441
Episode 9 lasted 879 frames. ### Sum reward : 912.1 ### T

In [None]:
# Persist the trained autoencoder weights; the evaluation section reloads
# them from this file.
torch.save(autoencoder.state_dict(), "autoencoder.pth")

## AE top-patches feature extractor

In [None]:
# CMA-ES is gradient-free: make sure autograd is off for the whole run,
# whatever grad mode earlier cells left behind.
torch.set_grad_enabled(False)

# In "AE" mode the agent has no LSTM and the controller reads the
# autoencoder code directly, so `lstm_hidden_dim` must equal the number of
# code channels produced by the encoder (256).
lstm_hidden_dim   = 256
FEAT_EXTRACTOR    = "AE"

model             = Agent(observation_space_size,stride,window,channels,
                          attention_out_dim,K,lstm_hidden_dim,eps,
                          CONTINUOUS,FEAT_EXTRACTOR)
n                 = get_n_params(model)
old_dict          = store_state_dict(model)

es                = cma.CMAEvolutionStrategy(n * [0], 0.01,{"seed":12345})
env               = gym.make("CarRacing-v2",continuous=CONTINUOUS)
max_generation    = 150
gamma             = 0.999
# `autoencoder` is the module trained in the autoencoder-training cell; it
# is only used for inference here and its weights are not part of the search.
fitness_values_AE, best_model_AE  = train(es,env,model,max_generation,autoencoder,old_dict,gamma)

(13_w,27)-aCMA-ES (mu_w=7.8,w_1=22%) in dimension 2829 (seed=12345, Mon Sep 12 19:14:08 2022)
Generation 1 executed in 353.03 s. Best fitness value: -49.706
Generation 2 executed in 300.54 s. Best fitness value: -35.0635
Generation 3 executed in 264.3 s. Best fitness value: -28.3324
Generation 4 executed in 290.62 s. Best fitness value: -42.2366
Generation 5 executed in 237.92 s. Best fitness value: -73.5846
Generation 6 executed in 238.12 s. Best fitness value: -62.3611
Generation 7 executed in 220.57 s. Best fitness value: -58.3146
Generation 8 executed in 186.91 s. Best fitness value: -65.8829
Generation 9 executed in 204.52 s. Best fitness value: -65.008
Generation 10 executed in 181.12 s. Best fitness value: -66.9857
Generation 11 executed in 192.92 s. Best fitness value: -86.2072
Generation 12 executed in 217.66 s. Best fitness value: -51.8156
Generation 13 executed in 184.81 s. Best fitness value: -54.7157
Generation 14 executed in 195.24 s. Best fitness value: -58.9726
Generati

In [None]:

# with open('best_model_AE.pickle', 'wb') as handle:
#     pickle.dump(best_model_AE, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('fitness_values_AE.pickle', 'wb') as handle:
#     pickle.dump(fitness_values_AE, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('best_model_AE.pickle', 'rb') as handle:
#     best_model_AE = pickle.load(handle)


# Test in environment

In [None]:
def game_evaluation(model,pretrained_weights,autoencoder,n_episodes):
    """Play ``n_episodes`` episodes and summarise the undiscounted rewards.

    A new "CarRacing-v2" environment is created using the module-level
    ``CONTINUOUS`` flag and is closed before returning.

    Args:
        model: agent to evaluate; its weights are overwritten.
        pretrained_weights: flat parameter vector matching the model's
            state-dict layout.
        autoencoder: optional autoencoder forwarded to the agent.
        n_episodes: number of episodes to play.

    Returns:
        tuple: ``(mean, std)`` of the per-episode reward sums.
    """
    env               = gym.make("CarRacing-v2",continuous=CONTINUOUS)
    try:
        # The weights do not change between episodes: load them once.
        old_dict      = store_state_dict(model)
        model         = load_state_dict(model, pretrained_weights, old_dict)
        sum_rewards   = np.zeros(n_episodes)

        for i in tqdm(range(n_episodes)):
            observation = state_processer(env.reset())
            ob_tensor   = torch.cat((observation,observation,observation),dim=-1)

            done        = False
            sum_reward  = 0
            # Stop on episode end or once the reward sum reaches 1000.
            while (not done) and (sum_reward < 1000):
                action, top_patches                  = model(ob_tensor,autoencoder)
                observation_next, reward, done, info = env.step(action)
                observation_next                     = state_processer(observation_next)
                ob_tensor                            = cycle_observation(ob_tensor,observation_next)
                sum_reward                          += reward
            sum_rewards[i]=sum_reward
    finally:
        # Release the environment even if an episode fails.
        env.close()
    return np.mean(sum_rewards), np.std(sum_rewards)

In [None]:
# Evaluation needs no gradients.
torch.set_grad_enabled(False)
# Rebuild the "LOC" agent with the configuration it was trained with.
lstm_hidden_dim    = 16
FEAT_EXTRACTOR     = "LOC"

model              = Agent(observation_space_size,stride,window,
                           channels,attention_out_dim,K,lstm_hidden_dim,
                           eps,CONTINUOUS,FEAT_EXTRACTOR)
autoencoder        = None
n_episodes         = 30

# Mean and standard deviation of the episode reward over n_episodes runs.
reward_LOC,std_LOC = game_evaluation(model,best_model_LOC,autoencoder,n_episodes)
print("LOC feature extractor.\n Mean reward : {} +- {}".format(round(reward_LOC,5),round(std_LOC,5)))

100%|██████████| 30/30 [06:30<00:00, 13.02s/it]

LOC feature extractor.
 Mean reward : 712.67063 +- 192.44935





In [None]:
# Rebuild the "CONV" agent. NOTE: `lstm_hidden_dim` is not set in this cell
# and keeps whatever value the previously executed cell assigned.
FEAT_EXTRACTOR       = "CONV"

model                = Agent(observation_space_size,stride,window,
                             channels,attention_out_dim,K,lstm_hidden_dim,
                             eps,CONTINUOUS,FEAT_EXTRACTOR)

autoencoder          = None
n_episodes           = 30

# Mean and standard deviation of the episode reward over n_episodes runs.
reward_CONV,std_CONV = game_evaluation(model,best_model_CONV,autoencoder,n_episodes)
print("CONV feature extractor.\n Mean reward : {} +- {}".format(round(reward_CONV,5),round(std_CONV,5)))

100%|██████████| 30/30 [02:22<00:00,  4.75s/it]

CONV feature extractor.
 Mean reward : -52.05754 +- 33.37444





In [None]:
# Rebuild the "AE" agent: no LSTM, controller input size 256 to match the
# autoencoder code.
FEAT_EXTRACTOR  = "AE"
lstm_hidden_dim = 256

model           = Agent(observation_space_size,stride,window,
                        channels,attention_out_dim,K,lstm_hidden_dim,
                        eps,CONTINUOUS,FEAT_EXTRACTOR)

# Restore the autoencoder weights saved after autoencoder training.
autoencoder      = Autoencoder()
autoencoder.load_state_dict(torch.load("autoencoder.pth"))
n_episodes       = 30

# Mean and standard deviation of the episode reward over n_episodes runs.
reward_AE,std_AE = game_evaluation(model,best_model_AE,autoencoder,n_episodes)
print("AE feature extractor.\n Mean reward : {} +- {}".format(round(reward_AE,5),round(std_AE,5)))

100%|██████████| 30/30 [08:51<00:00, 17.71s/it]

AE feature extractor.
 Mean reward : 71.72277 +- 79.42828



