# Define

## import

In [1]:
import torch #基本モジュール
from torch.autograd import Variable #自動微分用
import torch.nn as nn #ネットワーク構築用
import torch.optim as optim #最適化関数
import torch.nn.functional as F #ネットワーク用の様々な関数
import torch.utils.data #データセット読み込み関連
import torchvision #画像関連
from torch import Tensor
from torchvision import datasets, models, transforms #画像用データセット諸々

import numpy as np
import argparse
import json
from logging.config import dictConfig
from logging import getLogger
import os
import time
from google.colab import files
import itertools

## Components

In [2]:
class Zero(nn.Module):
  def __init__(self, stride):
    super(Zero, self).__init__()
    self.stride = stride

  def forward(self, x):
    if self.stride == 1:
      return x.mul(0.)
    return x[:,:,::self.stride,::self.stride].mul(0.)

In [3]:
class FactorizedReduce(nn.Module):
  def __init__(self, C_in, C_out, affine=True):
    super(FactorizedReduce, self).__init__()
    assert C_out % 2 == 0
    self.relu = nn.ReLU(inplace=False)
    self.conv_1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False)
    self.conv_2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) 
    self.bn = nn.BatchNorm2d(C_out, affine=affine)

  def forward(self, x):
    x = self.relu(x)
    # strideの偶奇による情報ロスを防ぐ
    out = torch.cat([self.conv_1(x), self.conv_2(x[:,:,1:,1:])], dim=1)
    out = self.bn(out)
    return out

In [4]:
class ReLUConvBN(nn.Module):

  def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
    super(ReLUConvBN, self).__init__()
    self.op = nn.Sequential(
      nn.ReLU(inplace=False),
      nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=padding, bias=False),
      nn.BatchNorm2d(C_out, affine=affine)
    )

  def forward(self, x):
    return self.op(x)

In [5]:
class DilConv(nn.Module):
    
  def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True):
    super(DilConv, self).__init__()
    self.op = nn.Sequential(
      nn.ReLU(inplace=False),
      nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=C_in, bias=False),
      nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
      nn.BatchNorm2d(C_out, affine=affine),
      )

  def forward(self, x):
    return self.op(x)

In [6]:
class SepConv(nn.Module):
    
  def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True):
    super(SepConv, self).__init__()
    self.op = nn.Sequential(
      nn.ReLU(inplace=False),
      nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, groups=C_in, bias=False),
      nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False),
      nn.BatchNorm2d(C_in, affine=affine),
      nn.ReLU(inplace=False),
      nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1, padding=padding, groups=C_in, bias=False),
      nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False),
      nn.BatchNorm2d(C_out, affine=affine),
      )

  def forward(self, x):
    return self.op(x)

In [7]:
class Edge(nn.Module):
  def __init__(self, operators, theta=None):
    super(Edge, self).__init__()
    self.operators = operators

  def forward(self, input: Tensor, theta: Tensor) -> Tensor:
    return sum(t * op(input) for t, op in zip(theta, self.operators))

In [8]:
class Graph():
  def __init__(self, node_num : int, input : int=1, output : int=1):
    self.node_num = node_num
    self.input = input
    self.output = output
    self.middle = node_num - input - output
    self._graph = self._make_graph(node_num)
    self._order = self._ordered_edges(1)
  
  def edges(self):
    return self._graph

  def _ordered_edges(self, agg : int):
    g = self._graph
    r = [i for i in range(self.node_num)][self.input:-self.output]
    return [[idx for idx, t in enumerate(g) if t[agg] == i] for i in r]

  # edge indices to each intermediate node
  def ordered_edges(self):
    return self._order

  def size(self):
    return len(self._graph)

  def _make_graph(self, num : int):
    l = [i for i in range(num - self.output)]
    return [(s, e) for (s, e) in itertools.combinations(l, 2) if e >= self.input]

In [9]:
import sys
from graphviz import Digraph

def plot(graph, theta, multi, filename):
  g = Digraph(
      format='png', #pdf
      edge_attr=dict(fontsize='20', fontname="times"),
      node_attr=dict(style='filled', shape='rect', align='center', fontsize='20', height='0.5', width='0.5', penwidth='2', fontname="times"),
      engine='dot')
  g.body.extend(['rankdir=LR'])

  for i in range(graph.node_num):
    color = 'lightblue'
    color = 'darkseagreen2' if graph.input > i else color
    color = 'palegoldenrod' if graph.node_num - graph.output <= i else color
    g.node(str(i), fillcolor=color)
  
  for idx, (s, e) in enumerate(graph.edges()):
    op = [CANDIDATE[i] for i, p in enumerate(theta[idx]) if p >= 1.0]
    if len(op) == 0: continue
    g.edge(str(s), str(e), label=op[0], fillcolor="gray")

  for i in range(graph.node_num - 1 - multi, graph.node_num - 1):
    g.edge(str(i), str(graph.node_num - graph.output), fillcolor="gray")

  g.render(filename, view=True)
  return g

## Operator

In [10]:
OPS = {
  'none' : lambda C, stride, affine: Zero(stride),
  'skip_connect' : lambda C, stride, affine: nn.Identity() if stride == 1 else FactorizedReduce(C, C, affine=affine),
  'avg_pool_3x3' : lambda C, stride, affine: nn.Sequential(nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False), nn.BatchNorm2d(C, affine=False)),
  'max_pool_3x3' : lambda C, stride, affine: nn.Sequential(nn.MaxPool2d(3, stride=stride, padding=1), nn.BatchNorm2d(C, affine=False)),
  'conv_3x3' : lambda C, stride, affine: nn.Conv2d(C, C, 3, stride=stride, padding=1),
  'conv_5x5' : lambda C, stride, affine: nn.Conv2d(C, C, 5, stride=stride, padding=2),
  'ReLUConvBN' : lambda C, stride, affine: ReLUConvBN(C, C, 3, stride=stride, padding=1),
  'sep_conv_3x3' : lambda C, stride, affine: SepConv(C, C, 3, stride, 1, affine=affine),
  'sep_conv_5x5' : lambda C, stride, affine: SepConv(C, C, 5, stride, 2, affine=affine),
  'sep_conv_7x7' : lambda C, stride, affine: SepConv(C, C, 7, stride, 3, affine=affine),
  'dil_conv_3x3' : lambda C, stride, affine: DilConv(C, C, 3, stride, 2, 2, affine=affine),
  'dil_conv_5x5' : lambda C, stride, affine: DilConv(C, C, 5, stride, 4, 2, affine=affine),
  'conv_7x1_1x7' : lambda C, stride, affine: nn.Sequential(
    nn.ReLU(inplace=False),
    nn.Conv2d(C, C, (1,7), stride=(1, stride), padding=(0, 3), bias=False),
    nn.Conv2d(C, C, (7,1), stride=(stride, 1), padding=(3, 0), bias=False),
    nn.BatchNorm2d(C, affine=affine)
    ),
}

In [11]:
CANDIDATE = [
  'conv_3x3',
  'conv_5x5',
  # 'ReLUConvBN',
  'avg_pool_3x3',
  'max_pool_3x3',
  'skip_connect',
  'none',
]
CANDIDATE = [
  'none',
  'skip_connect',
  # 'avg_pool_3x3',
  'max_pool_3x3',
  'sep_conv_3x3',
  'sep_conv_5x5',
  # 'sep_conv_7x7',
  'dil_conv_3x3',
  'dil_conv_5x5',
  # 'conv_7x1_1x7',
]

## module

In [12]:
from torch.autograd import Variable


def _concat(xs):
  return torch.cat([x.view(-1) for x in xs])


class Architect(object):

  def __init__(self, model, criterion, optimizer, args):
    self.network_momentum = args.momentum
    self.network_weight_decay = args.weight_decay
    self.model = model
    self._criterion = criterion
    self.optimizer = optimizer

  def _loss(self, input, target, model=None):
    model = model if model else self.model
    logits = model(input)
    return self._criterion(logits, target)

  def _compute_unrolled_model(self, input, target, eta, network_optimizer):
    loss = self._loss(input, target)
    theta = _concat(self.model.parameters()).data
    try:
      moment = _concat(network_optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()).mul_(self.network_momentum)
    except:
      moment = torch.zeros_like(theta)
    dtheta = _concat(torch.autograd.grad(loss, self.model.parameters())).data + self.network_weight_decay*theta
    unrolled_model = self._construct_model_from_theta(theta.sub(eta, moment+dtheta))
    return unrolled_model

  def step(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer, unrolled):
    self.optimizer.zero_grad()
    if unrolled:
        self._backward_step_unrolled(input_train, target_train, input_valid, target_valid, eta, network_optimizer)
    else:
        self._backward_step(input_valid, target_valid)
    self.optimizer.step()

  def _backward_step(self, input_valid, target_valid):
    loss = self._loss(input_valid, target_valid)
    loss.backward()

  def _backward_step_unrolled(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer):
    unrolled_model = self._compute_unrolled_model(input_train, target_train, eta, network_optimizer)
    unrolled_loss = self._loss(input_valid, target_valid, model=unrolled_model)

    unrolled_loss.backward()
    dalpha = [v.grad for v in unrolled_model.arch_parameters()]
    vector = [v.grad.data for v in unrolled_model.parameters()]
    implicit_grads = self._hessian_vector_product(vector, input_train, target_train)

    for g, ig in zip(dalpha, implicit_grads):
      g.data.sub_(eta, ig.data)

    for v, g in zip(self.model.arch_parameters(), dalpha):
      if v.grad is None:
        v.grad = Variable(g.data)
      else:
        v.grad.data.copy_(g.data)

  def _construct_model_from_theta(self, theta):
    model_new = self.model.new()
    model_dict = self.model.state_dict()

    params, offset = {}, 0
    for k, v in self.model.named_parameters():
      v_length = np.prod(v.size())
      params[k] = theta[offset: offset+v_length].view(v.size())
      offset += v_length

    assert offset == len(theta)
    model_dict.update(params)
    model_new.load_state_dict(model_dict)
    return model_new.to(self.model.device())

  def _hessian_vector_product(self, vector, input, target, r=1e-2):
    print(len(vector), vector[0].shape, len(input), input.shape, target.shape)
    print(len(vector), vector[0].device, len(input), input.device, target.device)
    R = r / _concat(vector).norm()
    for p, v in zip(self.model.parameters(), vector):
      p.data.add_(R, v)
    loss = self._loss(input, target)
    print(loss)
    grads_p = torch.autograd.grad(loss, self.model.arch_parameters())

    for p, v in zip(self.model.parameters(), vector):
      p.data.sub_(2*R, v)
    loss = self._loss(input, target)
    grads_n = torch.autograd.grad(loss, self.model.arch_parameters())

    for p, v in zip(self.model.parameters(), vector):
      p.data.add_(R, v)

    return [(x-y).div_(2*R) for x, y in zip(grads_p, grads_n)]

In [13]:
from typing import List
class Cell(nn.Module):
  def __init__(self, names, graph, cs, multi, reduces=[False], search=True):
    super(Cell, self).__init__()
    assert len(cs) == len(reduces) + 1
    c = cs[-1]
    self.reduce = reduces[-1]
    self.pres = nn.ModuleList([
      FactorizedReduce(c_prev, c, affine=False) if reduce else
      ReLUConvBN(c_prev, c, 1, 1, 0, affine=False)
      for c_prev, reduce in zip(cs[:-1], reduces[:-1] + [False])
    ])

    self.search = search
    self.graph = graph
    self.multi = multi
    self.edges = nn.ModuleList(
      [Edge(self._mix_operators(names, c, r)) for r in self.graph.edges()]
    )

  def _mix_operators(self, names, c, r):
    modules = []
    stride = 2 if self.reduce and r[0] < self.graph.input else 1
    affine = not self.search
    for name in names:
      modules += [OPS[name](c, stride, affine)]
    return nn.ModuleList(modules)

  def forward(self, inputs: List[Tensor], theta: Tensor) -> Tensor:
    nodes = [p(i) for p, i in zip(self.pres, inputs)]

    refs = self.graph.edges()
    for idc in self.graph.ordered_edges():
      output = sum(self.edges[idx](nodes[refs[idx][0]], theta[idx]) for idx in idc)
      nodes += [output]
    
    return torch.cat(nodes[-self.multi:], dim=1)

In [14]:

# TODO : check point (epoch)
# TODO : show layer shape
# TODO : replace to networkx
class CellNetwork(nn.Module):
  def __init__(self, depth = 4, node_num = 4, input = 1, init_channel = 16, class_num = 10, multi = -1, search = True):
    super(CellNetwork, self).__init__()
    self.depth = depth
    self.search = search
    self.onehot = False
    self.graph = Graph(node_num, input)
    self.multi = self.graph.middle if multi <= 0 else multi
    self.args = {"depth":depth, "node_num":node_num, "input":input, "init_channel":init_channel, "class_num":class_num, "multi":multi, "search":search}
    self.init_modules(3, self.multi, class_num, init_channel)
    self.init_theta()
    
    print(self.graph.edges())
    print(CANDIDATE)

  def is_reduce(self, idx):
    return self.depth//3 == idx or 2*self.depth//3 == idx

  def init_theta(self, zero=False, delta=1e-3):
    args = {"requires_grad":True}
    if zero:
      normal_theta = torch.zeros(self.graph.size(), len(CANDIDATE))
      reduce_theta = torch.zeros(self.graph.size(), len(CANDIDATE))
    else:
      normal_theta = delta * torch.randn(self.graph.size(), len(CANDIDATE))
      reduce_theta = delta * torch.randn(self.graph.size(), len(CANDIDATE))
    self.thetas = [Variable(normal_theta, **args), Variable(reduce_theta, **args)]

  def init_modules(self, c, multi, class_num, ini_c):
    # stem
    # c_n = c * multi
    c_n = ini_c
    self.stem = nn.Sequential(
      nn.Conv2d(c, c_n, 3, padding=1, bias=False),
      nn.BatchNorm2d(c_n)
    )
    c = ini_c

    # cells
    input = self.graph.input
    reduces = [False for _ in range(input)]
    cs = [c_n for _ in range(input)]
    self.cells = nn.ModuleList()
    for i in range(self.depth):
      reduces += [self.is_reduce(i)]
      if self.is_reduce(i):
        c *= 2
      
      cell = Cell(CANDIDATE, self.graph, cs[-input:] + [c], multi, reduces=reduces[-input:], search=self.search)
      self.cells += [cell]
      cs += [multi * c]
    
    # classify
    self.pooling = nn.AdaptiveAvgPool2d(1)
    self.linear = nn.Linear(cs[-1], class_num)

  def forward(self, input) -> Tensor:
    s = self.stem(input)
    input_num = self.graph.input
    stas = [s] * input_num

    for idx, cell in enumerate(self.cells):
      theta = self.thetas[1] if cell.reduce else self.thetas[0]
      weights = theta if self.onehot else F.softmax(theta, dim=-1)
      s = cell(stas[-input_num:], weights)
      stas += [s]

    out = self.pooling(s)
    return self.linear(out.view(out.size(0), -1))
  
  def to(self, *args, **kwargs):
    with torch.no_grad():
      for idx, theta in enumerate(self.thetas):
        self.thetas[idx] = theta.to(*args, **kwargs)

    return super(CellNetwork, self).to(*args, **kwargs)

  def learn_theta(self, is_learn: bool):
    for theta in self.thetas:
      theta.requires_grad = is_learn

  def new(self):
    model_new = CellNetwork(**self.args).to(self.device())
    for x, y in zip(model_new.arch_parameters(), self.arch_parameters()):
        x.data.copy_(y.data)
    return model_new

  def device(self):
    return next(self.parameters()).device

  def arch_parameters(self):
    return self.thetas

  def sampling(self, inplace=True):
    def _sampling(theta, degree=2, graph=self.graph, ignore=CANDIDATE.index('none')):
      with torch.no_grad():
        for t in theta:
          t[ignore] = t.min(0).values
          max = t.max(0)
          t[:] = 0.0
          t[max.indices] = max.values

        for edges in graph.ordered_edges():
          values = [(e, theta[e].argmax(), theta[e].max(0).values) for e in edges]
          select = min(len(values), degree)
          edges = sorted(values, key=lambda x: -x[2])[:select]
          
          for (e, o, v) in values:
            theta[e][o] = 0.0
          for (e, o, v) in edges:
            theta[e][o] = 1.0
            
    if inplace:
      for theta in self.thetas:
        _sampling(theta)
        
      self.learn_theta(False)
      self.onehot = True
    else:
      thetas = [theta.detach().clone() for theta in self.thetas]
      for theta in thetas:
        _sampling(theta)
      return thetas
  
  def plot(self, prefix):
    thetas = self.sampling(inplace=False)
    return [plot(self.graph, theta, self.multi, prefix + name)
      for theta, name in zip(thetas, ["normal", "reduce"])]

  def log(self):
    print("Network")
    for theta in self.thetas:
      print(F.softmax(theta, dim=-1))

# Unit Test

In [15]:
import unittest

def tensor_equal(x, y):
  return (torch.sum(x == y) == x.view(-1).shape[0]).item()

class TestEdge(unittest.TestCase):
  def test_id(self):
    input = torch.randn(1, 3, 32, 32)
    operators = [nn.Identity(), None]
    model = Edge(operators, theta=torch.tensor([1.0, 0.0]))
    output = model(input)
    self.assertEqual(tensor_equal(input, output), True)

if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

E
ERROR: test_id (__main__.TestEdge)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-15-7c4fb3f2b357>", line 11, in test_id
    output = model(input)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)
TypeError: forward() missing 1 required positional argument: 'theta'

----------------------------------------------------------------------
Ran 1 test in 0.002s

FAILED (errors=1)


# Training

## training

In [16]:
from tqdm.notebook import tqdm
def train(model, device, train_loader, valid_loader, optimizer, optimizerB, criterion, class_array, lr, architect):
    model.train()
    class_array = torch.LongTensor(class_array).to(device)
    for batch_idx, (data, target) in enumerate(tqdm(train_loader)):
        data, target = data.to(device), target.to(device)

        if optimizerB:
          data_v, target_v = next(iter(valid_loader))
          data_v, target_v = data_v.to(device), target_v.to(device)

          architect.step(data, target, data_v, target_v, lr, optimizer, unrolled=True)

          # # theta update
          # optimizerB.zero_grad()
          # output = model(data_v)
          # reg = torch.zeros(output.shape[0], 10).to(device)
          # reg.index_add_(1, class_array, output)
          # loss = criterion(reg, target_v)
          # loss.backward()
          # optimizerB.step()

        optimizer.zero_grad()
        output = model(data)
        reg = torch.zeros(output.shape[0], 10).to(device)
        reg.index_add_(1, class_array, output)
        loss = criterion(reg, target)
        loss.backward(retain_graph=True)
        # clip
        optimizer.step()
        
    return loss.item()

In [17]:
def test(model, device, test_loader, criterion, class_array):
    model.eval()
    test_loss = []
    correct = 0
    class_array = torch.tensor(class_array).to(device)
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss.append(criterion(output, target).item())
            pred = class_array[output.argmax(dim=1, keepdim=True)]
            correct += pred.eq(target.view_as(pred)).sum().item()
            
    test_loss = np.mean(np.array(test_loss))
    accuracy = 100. * correct / len(test_loader.dataset)
    
    return (test_loss, accuracy)

## Utils

In [18]:
from argparse import Namespace
def dictspace(f):
  def inner(**kwds):
    return f(Namespace(**kwds))
  return inner

In [19]:
class Store():
  def __init__(self, dir="result", name="log"):
    self.dict = {}
    self.dir = dir
  
  def add(self, name, value):
    if not name in self.dict:
      self.dict[name] = []
    self.dict[name].append(value)
  
  def save(self, name="log"):
    path = os.path.join(self.dir, name + ".txt")
    with open(path, mode='w') as f:
      f.write("%s" % self.dict)

  def save_fig(self, metrix="acc", xlabel="epochs", ylabel="accuracy[%]"):
    import matplotlib.pyplot as plt
    
    times = len(self.dict[metrix])
    fig = plt.figure()
    plt.plot(np.arange(times), self.dict[metrix])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()
    fig.savefig(os.path.join(self.dir, "%s_%d.png" % (metrix, times)))

  def __repr__(self):
    return "store in %s" % self.dict

In [20]:
def SaveModel(name, model, dir="result"):
  path = os.path.join(dir, name + ".pt")
  if not os.path.exists(dir):
    os.mkdir(dir)

  torch.save(model.state_dict(), path)
  # print(os.path.join("/content", result_path))
  # files.download(os.path.join("/content", result_path))

In [21]:
def load_dataset(train=2000, test=500, valid=0):
  #画像の変形処理
  transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
  ])

  transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
  ])

  #CIFAR-10のtrain, testsetのロード
  trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                          download=True, transform=transform)
  testset = torchvision.datasets.CIFAR10(root='./data', train=False, 
                                          download=True, transform=transform_test)
  
  trainset, validset, _ = torch.utils.data.random_split(trainset, [train, valid, 50000-train-valid])
  testset, _ = torch.utils.data.random_split(testset, [test, 10000-test])
  return trainset, testset, validset

In [22]:
class EarlyStopping:
  def __init__(self, dir="min", patent=5):
    self.list = []
    self.best = 0
    self.patent = patent
    self.count = 0
    self.order = dir == "max"

  def step(self, item):
    def _score(item):
      return item * (1 if self.order else -1)

    if len(self.list) == 0:
      self.best = _score(item)

    self.list.append(item)
    item = _score(item)
    if self.best < item:
      self.best = item
      self.count = 0
    else:
      self.count += 1

  def is_stop(self):
    return self.patent <= self.count

In [23]:
# def setup_drive():
#   # Install a Drive FUSE wrapper.
#   # https://github.com/astrada/google-drive-ocamlfuse
#   !apt-get install -y -qq software-properties-common python-software-properties module-init-tools
#   !add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
#   !apt-get update -qq 2>&1 > /dev/null
#   !apt-get -y install -qq google-drive-ocamlfuse fuse

#   # Generate auth tokens for Colab
#   from google.colab import auth
#   auth.authenticate_user()

#   # Generate creds for the Drive FUSE library.
#   from oauth2client.client import GoogleCredentials
#   creds = GoogleCredentials.get_application_default()
#   import getpass
#   !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
#   vcode = getpass.getpass()
#   !echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [24]:
def save_dir(dir : str, drivepath = './drive/My Drive/ml'):
  # if not "getpass" in sys.modules:
  #   setup_drive()

  if dir:
    import subprocess
    res = subprocess.run(["cp", "-r", "./" + dir, drivepath], stdout=subprocess.PIPE)
    sys.stdout.write(res.stdout)

In [25]:
def get_time():
  import datetime
  import pytz
  dt_now = datetime.datetime.now(pytz.timezone('Asia/Tokyo'))
  return dt_now.strftime('%Y_%m_%d %H_%M %S')

In [26]:
def save_heatmap(data, path):
  import seaborn as sns
  import matplotlib.pyplot as plt
  
  plt.figure()
  sns.heatmap(data, annot=True, fmt="1.2f")
  plt.savefig(path)
  plt.close('all')

In [27]:
def count_parameters_in_MB(model):
  return np.sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary" not in name)/1e6

## main

In [28]:
def main(description, **kwarg):

  save_dir("")

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  print("device is %s" % device)

  # trainset, testset = load_dataset()
  kwargs = {'num_workers': 2, 'pin_memory': True} if use_cuda else {}

  class_array = [i for i in range(10)]

  @dictspace
  def learning(args):

    store = Store(dir=args.dir)
    store.add("kwargs", args)
    theta_log = Store(dir=args.dir)

    # instantiate
    networkarg = {"depth" : args.depth, "node_num" : args.node,
                  "input" : args.input, "multi" : args.multi}
    model = CellNetwork(**networkarg)
    model.to(device)
    print('param size %sMB' % count_parameters_in_MB(model))

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=3e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs), eta_min=0.001)
    optimizer_theta = optim.Adam(model.thetas, lr=args.lr_theta, betas=(0.5, 0.999), weight_decay=1e-3)
    criterion = nn.CrossEntropyLoss()
    architect = Architect(model, criterion, optimizer_theta, args)

    trainset, testset, validset = load_dataset(train=args.train_size, valid=args.train_size)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, **kwargs)
    validloader = torch.utils.data.DataLoader(validset, batch_size=args.batch_size, **kwargs)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, **kwargs)
    
    # architecture search
    time_sta = time.time()
    accuracy, loss = 0, 1e10
    model.learn_theta(False)
    for epoch in range(args.epochs):
      
      if epoch == args.switch:
        model.learn_theta(True)
        
      model.log()
      
      lr = scheduler.get_last_lr()[0]
      loss_train = train(model, device, trainloader, validloader, optimizer, optimizer_theta, criterion, class_array, lr, architect)
      loss_test, acc = test(model, device, testloader, criterion, class_array)
      scheduler.step()

      print('epoch %d, lr %s, acc %s' % (epoch, lr, acc))
      if epoch >= args.switch and epoch % 5 == 0:
        model.plot(os.path.join(args.dir, "epoch%d_" % epoch))
        if args.save_dir: save_dir(args.dir)
      store.add("loss", loss_test)
      store.add("acc", acc)
      theta_log.add("theta_n", model.thetas[0].detach().cpu().clone().numpy())
      theta_log.add("theta_r", model.thetas[1].detach().cpu().clone().numpy())
      save_heatmap(model.thetas[0].detach().cpu().clone().numpy(), os.path.join(args.dir, "theta_%s.png" % epoch))

      accuracy, loss = acc, loss_test
      if time.time() - time_sta >= 60 * args.minutes:
        break 

    print("\naccuracy ", accuracy, end=", ")
    print("loss ", loss)
    model.plot(os.path.join(args.dir, "complete_"))
    SaveModel("cell", model, dir=args.dir)

    # relearning
    stop = EarlyStopping(patent=15)
    accuracy, loss = 0, 1e10
    remodel = CellNetwork(search=False, **networkarg).to(device)
    remodel.thetas = model.sampling(inplace=False)
    remodel.sampling()
    model.cpu()
    del model
    model = remodel
    optimizer = optim.SGD(model.parameters(), lr=args.relr, momentum=args.momentum, weight_decay=3e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs), eta_min=0.001)
    for epoch in range(args.epochs):
      
      (_, loss_train) = train(model, device, trainloader, None, optimizer, None, criterion, None, class_array, 0, None)
      (_, (loss_test, acc)) = test(model, device, testloader, criterion, None, class_array)
      scheduler.step()

      print('epoch %d, acc %s' % (epoch, acc))
      store.add("loss", loss_test)
      store.add("acc", acc)

      accuracy, loss = acc, loss_test
      stop.step(loss_test)
      if stop.is_stop():
        break

    print("\naccuracy ", accuracy, end=", ")
    print("loss ", loss)
    SaveModel("cell", model, dir=args.dir)
    print(store)
    store.save()
    theta_log.save(name="theta")
    store.save_fig()
    if args.save_dir: save_dir(args.dir)

    model.cpu()
    del model

    return loss

  kwarg['dir'] = get_time()
  if not os.path.exists(kwarg['dir']):
    os.mkdir(kwarg['dir'])
  learning(**kwarg)

In [29]:
!cat /proc/uptime | awk '{print $1 /60 /60 /24 "days (" $1 "sec)"}'

0.0240323days (2076.39sec)


In [30]:
if __name__ == '__main__':
  # # paper
  # main("", lr=0.0250, lr_theta=0.0003, relr=0.0010, batch_size=64, train_size=20000, momentum=0.9, 
  #          epochs=50, switch=0, minutes=180, depth=8, node=7, multi=4, input=2, save_dir=True)

  # main("", lr=0.0100, lr_theta=0.0002, batch_size=64, train_size=20000, momentum=0.9, 
  #          epochs=90, switch=30, minutes=180, depth=5, node=7, multi=2)
  # main("", lr=0.0100, lr_theta=0.0002, batch_size=64, train_size=20000, momentum=0.9, 
  #          epochs=90, switch=10, minutes=180, depth=5, node=7, multi=2)
  # main("", lr=0.0050, lr_theta=0.0005, batch_size=64, train_size=4000, momentum=0.9, 
  #          epochs=15, switch=5, minutes=180, depth=4, node=4, multi=2)
  # annealing cosine scheduler no restart
  # main("", lr=0.0250, lr_theta=0.0002, relr=0.0050, batch_size=64, train_size=1000, momentum=0.9, 
  #          epochs=10, switch=0, minutes=180, depth=5, node=7, multi=4, input=2, save_dir=True)
  main("", lr=0.0200, lr_theta=0.0002, relr=0.0030, batch_size=32, train_size=10000, momentum=0.9, 
           epochs=50, switch=10, minutes=180, depth=5, node=7, multi=2, input=2, save_dir=True,
       weight_decay=3e-4)

  # main("", lr=0.0100, lr_theta=0.0002, relr=0.0010, batch_size=64, train_size=20000, momentum=0.9, 
  #          epochs=50, switch=10, minutes=180, depth=5, node=7, multi=4, input=2, save_dir=True)
  # main("", lr=0.0080, lr_theta=0.0001, relr=0.0010, batch_size=64, train_size=4000, momentum=0.9, 
  #          epochs=30, switch=10, minutes=180, depth=5, node=7, multi=4, input=2, save_dir=True)
  # main("", lr=0.0080, lr_theta=0.0001, relr=0.0010, batch_size=64, train_size=400, momentum=0.9, 
  #          epochs=1, switch=0, minutes=180, depth=3, node=7, multi=4, input=2, save_dir=True)

device is cuda
[(0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), (1, 5), (2, 3), (2, 4), (2, 5), (3, 4), (3, 5), (4, 5)]
['none', 'skip_connect', 'max_pool_3x3', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5']


  


param size 1.247002MB
Files already downloaded and verified
Files already downloaded and verified
Network
tensor([[0.1431, 0.1429, 0.1427, 0.1428, 0.1428, 0.1428, 0.1429],
        [0.1431, 0.1429, 0.1427, 0.1427, 0.1429, 0.1428, 0.1429],
        [0.1426, 0.1429, 0.1428, 0.1429, 0.1428, 0.1428, 0.1431],
        [0.1427, 0.1428, 0.1431, 0.1427, 0.1429, 0.1428, 0.1430],
        [0.1430, 0.1428, 0.1429, 0.1429, 0.1427, 0.1428, 0.1429],
        [0.1425, 0.1428, 0.1430, 0.1429, 0.1430, 0.1429, 0.1428],
        [0.1429, 0.1428, 0.1428, 0.1430, 0.1430, 0.1429, 0.1427],
        [0.1427, 0.1430, 0.1429, 0.1429, 0.1425, 0.1428, 0.1431],
        [0.1429, 0.1429, 0.1429, 0.1430, 0.1426, 0.1427, 0.1430],
        [0.1429, 0.1428, 0.1428, 0.1429, 0.1429, 0.1428, 0.1429],
        [0.1429, 0.1427, 0.1429, 0.1428, 0.1430, 0.1426, 0.1431],
        [0.1429, 0.1429, 0.1428, 0.1430, 0.1429, 0.1427, 0.1428],
        [0.1430, 0.1429, 0.1426, 0.1428, 0.1429, 0.1428, 0.1429],
        [0.1430, 0.1426, 0.1428, 0.1

HBox(children=(FloatProgress(value=0.0, max=313.0), HTML(value='')))

	sub(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	sub(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)


[(0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), (1, 5), (2, 3), (2, 4), (2, 5), (3, 4), (3, 5), (4, 5)]
['none', 'skip_connect', 'max_pool_3x3', 'sep_conv_3x3', 'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5']
889 torch.Size([16, 3, 3, 3]) 32 torch.Size([32, 3, 32, 32]) torch.Size([32])
889 cuda:0 32 cuda:0 cuda:0
tensor(2.3996, device='cuda:0', grad_fn=<NllLossBackward>)


RuntimeError: ignored

In [None]:
def _data_transforms_cifar10(args):
  CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
  CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

  train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
  ])
  if args.cutout:
    train_transform.transforms.append(Cutout(args.cutout_length))

  valid_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
  return train_transform, valid_transform

## viz

In [None]:
# # cause a crash to increase RAM
# [_ for _ in range(10000000000)]

In [None]:
# def main(description, **kwarg):

#   save_dir("")

#   use_cuda = torch.cuda.is_available()
#   device = torch.device("cuda" if use_cuda else "cpu")
#   print("device is %s" % device)

#   trainset, testset = load_dataset()
#   kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

#   class_array = [i for i in range(10)]

#   @dictspace
#   def learning(args):

#     store = Store(dir=args.dir)
#     store.add("kwargs", args)
#     theta_log = Store(dir=args.dir)

#     # instantiate
#     networkarg = {"depth" : args.depth, "node_num" : args.node,
#                   "input" : args.input, "multi" : args.multi}
#     model = CellNetwork(**networkarg)
#     model.to(device)

#     optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=3e-4)
#     scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs), eta_min=0.001)
#     optimizer_theta = optim.Adam(model.thetas, lr=args.lr_theta, betas=(0.5, 0.999), weight_decay=1e-3)
#     criterion = nn.CrossEntropyLoss()

#     trainset, testset = load_dataset(train=args.train_size)
#     trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=False, **kwargs)
#     testloader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=False, **kwargs)
    
#     # relearning
#     stop = EarlyStopping(patent=15)
#     accuracy, loss = 0, 1e10
#     remodel = CellNetwork(search=False, **networkarg).to(device)
#     remodel.thetas = [torch.tensor([
#         [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
#         [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
#        device='cuda:0'), torch.tensor([
#         [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
#         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
#        device='cuda:0')]
#     remodel.sampling()
#     model = remodel
#     model.plot(os.path.join(args.dir, "complete_"))

#     optimizer = optim.SGD(model.parameters(), lr=args.relr, momentum=args.momentum, weight_decay=3e-4)
#     scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs), eta_min=0.001)
#     for epoch in range(args.epochs):
      
#       (_, loss_train) = train(model, device, trainloader, optimizer, None, criterion, None, class_array)
#       (_, (loss_test, acc)) = test(model, device, testloader, criterion, None, class_array)
#       scheduler.step()

#       print('epoch %d, acc %s' % (epoch, acc))
#       store.add("loss", loss_test)
#       store.add("acc", acc)

#       accuracy, loss = acc, loss_test
#       stop.step(loss_test)
#       # if stop.is_stop():
#         # break

#     print("\naccuracy ", accuracy, end=", ")
#     print("loss ", loss)
#     SaveModel("cell", model, dir=args.dir)
#     print(store)
#     store.save()
#     theta_log.save(name="theta")
#     store.save_fig()
#     if args.save_dir: save_dir(args.dir)

#     del model

#     return loss

#   kwarg['dir'] = get_time()
#   if not os.path.exists(kwarg['dir']):
#     os.mkdir(kwarg['dir'])
#   learning(**kwarg)