## Description

Experiment with using an evolutionary strategy (CMA-ES) to train a small CNN on MNIST.

**Results**

*   CMA-ES much slower at optimization than SGD (as expected)
*   CMA-ES doesn't scale to >1000 params



## Install

In [1]:
!pip3 install torch torchvision numpy



## Imports

In [0]:
from matplotlib import pyplot as plt
import numpy as np

import torch as th
from torch import nn
import torchvision
from torchvision import transforms

%load_ext autoreload
%autoreload 2

## Config

In [3]:
num_classes = 10
num_epochs = 5
batch_size = 100
learning_rate = 0.001

device = th.device('cuda' if th.cuda.is_available() else 'cpu')
print(f'Using {device}')

Using cuda


## MNIST Dataset

In [0]:
# Download and construct MNIST dataset.
train_dataset = torchvision.datasets.MNIST(root='~/code/data/mnist/',
                                           train=True,
                                           transform=transforms.ToTensor(),
                                           download=True)
test_dataset = torchvision.datasets.MNIST(root='~/code/data/mnist/',
                                          train=False,
                                          transform=transforms.ToTensor(),
                                          download=True)

# Data loader (input pipeline)
train_loader = th.utils.data.DataLoader(dataset=train_dataset,
                                        batch_size=batch_size,
                                        shuffle=True)
test_loader = th.utils.data.DataLoader(dataset=test_dataset,
                                       batch_size=batch_size,
                                       shuffle=False)

## Model

In [9]:
# Convolutional neural network (with 2 convolutional layers).
class ConvNet(nn.Module):
  def __init__(self, num_classes=10):
    super().__init__()
    self.layer1 = nn.Sequential(
        nn.Conv2d(1, 4, kernel_size=5, stride=2),
        #nn.BatchNorm2d(16),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
    )
    self.layer2 = nn.Sequential(
        nn.Conv2d(4, 8, kernel_size=5, stride=2),
        #nn.BatchNorm2d(32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
    )
    self.fc = nn.Linear(8, num_classes)
    
  def forward(self, x):
    out = self.layer1(x)
    out = self.layer2(out)
    out = out.reshape(out.size(0), -1)
    out = self.fc(out)
    return out

model = ConvNet(num_classes).to(device)

# Loss and optimizer.
loss_fn = nn.CrossEntropyLoss()
optimizer = th.optim.Adam(model.parameters(), lr=learning_rate)
print(model)

from rl.core.algs import util

x0 = util.serialize_params(model)

ConvNet(
  (layer1): Sequential(
    (0): Conv2d(1, 4, kernel_size=(5, 5), stride=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(4, 8, kernel_size=(5, 5), stride=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=8, out_features=10, bias=True)
)


## Train (Evolution)

In [10]:

print(x0.shape)
print(util.num_params(model))
print([p.numel() for p in model.parameters()])
print([p.numel() for p in model.state_dict().values()])
print([(n, p.numel()) for n, p in model.state_dict().items()])

def evaluate_objective(model, model_params, images, labels):
  util.deserialize_params(model, model_params)
  
  with th.no_grad():
    outputs = model(images)
    loss = loss_fn(outputs, labels)
  return loss.item()

(1002,)
1002
[100, 4, 800, 8, 80, 10]
[100, 4, 800, 8, 80, 10]
[('layer1.0.weight', 100), ('layer1.0.bias', 4), ('layer2.0.weight', 800), ('layer2.0.bias', 8), ('fc.weight', 80), ('fc.bias', 10)]


In [19]:
import cma
es = cma.CMAEvolutionStrategy(x0, 0.05,)

num_steps = len(train_loader)
while not es.stop():
  for epoch in range(num_epochs):
    for step, (images, labels) in enumerate(train_loader):
      images = images.to(device)
      labels = labels.to(device)

      solutions = es.ask()
      scores = [evaluate_objective(model, p, images, labels) for p in solutions]
      es.tell(solutions, scores)
      es.disp()
    

(12_w,24)-aCMA-ES (mu_w=7.0,w_1=24%) in dimension 1002 (seed=891635, Mon Jan 28 23:33:49 2019)
Iterat #Fevals   function value  axis ratio  sigma  min&max std  t[m:s]
    1     24 2.304030895233154e+00 1.0e+00 4.96e-02  5e-02  5e-02 0:00.1
    2     48 2.325848340988159e+00 1.0e+00 4.93e-02  5e-02  5e-02 0:00.1
    3     72 2.308803319931030e+00 1.0e+00 4.89e-02  5e-02  5e-02 0:00.2
   41    984 2.181833267211914e+00 1.0e+00 4.21e-02  4e-02  4e-02 0:03.3
   92   2208 1.473973512649536e+00 1.0e+00 4.04e-02  4e-02  4e-02 0:07.4
  100   2400 1.409002542495728e+00 1.0e+00 4.05e-02  4e-02  4e-02 0:08.0
  176   4224 1.179951429367065e+00 1.1e+00 4.13e-02  4e-02  4e-02 0:14.0
  200   4800 1.175792098045349e+00 1.1e+00 4.15e-02  4e-02  4e-02 0:15.9
  299   7176 1.051799178123474e+00 1.1e+00 4.16e-02  4e-02  4e-02 0:23.9
  300   7200 8.483834266662598e-01 1.1e+00 4.16e-02  4e-02  4e-02 0:24.0
  400   9600 9.836115837097168e-01 1.2e+00 4.05e-02  4e-02  4e-02 0:32.0
  495  11880 9.153674840927124

KeyboardInterrupt: ignored

## Test (Evolution)

In [20]:
with th.no_grad():
  correct, total = 0, 0
  for images, labels in test_loader:
    images = images.to(device)
    labels = labels.to(device)
    outputs = model(images)
    _, predicted = th.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
  accuracy = correct / total
  print(f'Accuracy of model on 10000 test images: {100 * accuracy:0.2f}%')

Accuracy of model on 10000 test images: 87.48%


## Train (SGD)

In [21]:
num_steps = len(train_loader)
for epoch in range(num_epochs):
  for step, (images, labels) in enumerate(train_loader):
    images = images.to(device)
    labels = labels.to(device)

    # Forward
    outputs = model(images)
    loss = loss_fn(outputs, labels)
    
    # Backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (step + 1) % 100 == 0:
      print(f'Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{num_steps}], '
            f'Loss: {loss.item():.4}')

Epoch [1/5], Step [100/600], Loss: 0.3084
Epoch [1/5], Step [200/600], Loss: 0.3294
Epoch [1/5], Step [300/600], Loss: 0.65
Epoch [1/5], Step [400/600], Loss: 0.5091
Epoch [1/5], Step [500/600], Loss: 0.3481
Epoch [1/5], Step [600/600], Loss: 0.393
Epoch [2/5], Step [100/600], Loss: 0.3388
Epoch [2/5], Step [200/600], Loss: 0.3316
Epoch [2/5], Step [300/600], Loss: 0.297
Epoch [2/5], Step [400/600], Loss: 0.5856
Epoch [2/5], Step [500/600], Loss: 0.3649
Epoch [2/5], Step [600/600], Loss: 0.3571
Epoch [3/5], Step [100/600], Loss: 0.2357
Epoch [3/5], Step [200/600], Loss: 0.4149
Epoch [3/5], Step [300/600], Loss: 0.2443
Epoch [3/5], Step [400/600], Loss: 0.4721
Epoch [3/5], Step [500/600], Loss: 0.2874
Epoch [3/5], Step [600/600], Loss: 0.1369
Epoch [4/5], Step [100/600], Loss: 0.2735
Epoch [4/5], Step [200/600], Loss: 0.4356
Epoch [4/5], Step [300/600], Loss: 0.2972
Epoch [4/5], Step [400/600], Loss: 0.3975
Epoch [4/5], Step [500/600], Loss: 0.3174
Epoch [4/5], Step [600/600], Loss: 0.2

## Test (SGD)

In [22]:
with th.no_grad():
  correct, total = 0, 0
  for images, labels in test_loader:
    images = images.to(device)
    labels = labels.to(device)
    outputs = model(images)
    _, predicted = th.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
  accuracy = correct / total
  print(f'Accuracy of model on 10000 test images: {100 * accuracy:0.2f}%')

Accuracy of model on 10000 test images: 92.47%


## Save model

In [0]:
th.save(model.state_dict(), '/tmp/model.ckpt')