diff --git a/.gitignore b/.gitignore
index 2918bfb..145eec2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .idea
+.git
 __pycache__
 run_all.sh
 get_time_all.sh
diff --git a/README.md b/README.md
index 6a50aa0..300f483 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,12 @@
 # Multinomial Distribution Learning for Effective Neural Architecture Search

-Here we propose a method to extremely accelerate NAS, **without reinforcement learning or gradient**, just by sampling architectures from a distribution and comparing these architectures, Iteratively updating parameters of distribution while training
+Here we propose a method to **dramatically accelerate** NAS, **without reinforcement learning or gradients**: we sample architectures from a distribution, compare them to estimate their **relative performance** rather than their absolute performance, and iteratively update the parameters of the distribution while training.

 ![](figs/1.png)

-Here we provide our test codes and pretrained model, our code is based on [DARTS]() and [ProxylessNAS](), pretrained models can be downloaded [here](https://drive.google.com/open?id=1W0UqwAnm37uibTuPDrH5Mt8PKNvFdD3v)
+Here we provide our test code and pretrained models. Our code is based on [DARTS]() and [ProxylessNAS](); the pretrained models can be downloaded [here](https://drive.google.com/open?id=1W0UqwAnm37uibTuPDrH5Mt8PKNvFdD3v).
+
+**Search code** will be released by [Sherwood](https://github.com/zhengxiawu) later!

 ## Requirements

@@ -13,7 +15,7 @@

 ## Evaluate

-You need to modified your path to dataset in ``` data_providers/cifar10.py``` and ```data_providers/imagenet.py```
+You need to modify the dataset paths in ```data_providers/cifar10.py``` and ```data_providers/imagenet.py```. ```config.sh``` is used to prepare your environment (here we use it to set up the dataset and packages); you should **write this file yourself**.

 To evaluate the model in **DARTS setting**, just run
diff --git a/models/darts_nets_cifar/augment_cells.py b/models/darts_nets_cifar/augment_cells.py
index 8c9bde8..bb7f832 100644
--- a/models/darts_nets_cifar/augment_cells.py
+++ b/models/darts_nets_cifar/augment_cells.py
@@ -1,8 +1,8 @@
 """ CNN cell for network augmentation """
 import torch
 import torch.nn as nn
-from models.darts_nets_cifar import ops
 from utils import *
+from models.darts_nets_cifar import ops

 def to_dag(C_in, gene, reduction):
     """ generate discrete ops from gene """
diff --git a/models/darts_nets_cifar/search_cells.py b/models/darts_nets_cifar/search_cells.py
deleted file mode 100644
index ed4dc11..0000000
--- a/models/darts_nets_cifar/search_cells.py
+++ /dev/null
@@ -1,54 +0,0 @@
-""" CNN cell for architecture search """
-import torch
-import torch.nn as nn
-from models.darts_nets_cifar import ops
-
-
-class SearchCell(nn.Module):
-    """ Cell for search
-    Each edge is mixed and continuous relaxed.
-    """
-
-    def __init__(self, n_nodes, C_pp, C_p, C, reduction_p, reduction):
-        """
-        Args:
-            n_nodes: # of intermediate n_nodes
-            C_pp: C_out[k-2]
-            C_p : C_out[k-1]
-            C   : C_in[k] (current)
-            reduction_p: flag for whether the previous cell is reduction cell or not
-            reduction: flag for whether the current cell is reduction cell or not
-        """
-        super().__init__()
-        self.reduction = reduction
-        self.n_nodes = n_nodes
-
-        # If previous cell is reduction cell, current input size does not match with
-        # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing.
-        if reduction_p:
-            self.preproc0 = ops.FactorizedReduce(C_pp, C, affine=False)
-        else:
-            self.preproc0 = ops.StdConv(C_pp, C, 1, 1, 0, affine=False)
-        self.preproc1 = ops.StdConv(C_p, C, 1, 1, 0, affine=False)
-
-        # generate dag
-        self.dag = nn.ModuleList()
-        for i in range(self.n_nodes):
-            self.dag.append(nn.ModuleList())
-            for j in range(2 + i):  # include 2 input nodes
-                # reduction should be used only for input node
-                stride = 2 if reduction and j < 2 else 1
-                op = ops.MixedOp(C, stride)  # during search, every edge is a mixed op
-                self.dag[i].append(op)
-
-    def forward(self, s0, s1, w_dag):
-        s0 = self.preproc0(s0)
-        s1 = self.preproc1(s1)
-
-        states = [s0, s1]
-        for edges, w_list in zip(self.dag, w_dag):  # iterate over the intermediate nodes
-            s_cur = sum(edges[i](s, w) for i, (s, w) in enumerate(zip(states, w_list)))  # a node's input is the sum of the outputs of all edges feeding it
-            states.append(s_cur)
-
-        s_out = torch.cat(states[2:], dim=1)
-        return s_out
diff --git a/models/darts_nets_cifar/search_cnn.py b/models/darts_nets_cifar/search_cnn.py
deleted file mode 100644
index 7909dd8..0000000
--- a/models/darts_nets_cifar/search_cnn.py
+++ /dev/null
@@ -1,131 +0,0 @@
-""" CNN for architecture search """
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from models.darts_nets_cifar.search_cells import SearchCell
-import utils.genotypes as gt
-
-
-class SearchCNN(nn.Module):
-    """ Search CNN model """
-
-    def __init__(self, C_in, C, n_classes, n_layers, criterion, n_nodes=4, stem_multiplier=3):
-        """
-        Args:
-            C_in: # of input channels
-            C: # of starting model channels
-            n_classes: # of classes
-            n_layers: # of layers
-            n_nodes: # of intermediate nodes in Cell
-            stem_multiplier
-        """
-        super().__init__()
-        self.C_in = C_in
-        self.C = C
-        self.n_classes = n_classes
-        self.n_layers = n_layers
-        self.n_nodes = n_nodes
-        self.criterion = criterion
-
-        C_cur = stem_multiplier * C
-        self.stem = nn.Sequential(
-            nn.Conv2d(C_in, C_cur, 3, 1, 1, bias=False),
-            nn.BatchNorm2d(C_cur)
-        )
-
-        # for the first cell, stem is used for both s0 and s1
-        # [!] C_pp and C_p is output channel size, but C_cur is input channel size.
-        C_pp, C_p, C_cur = C_cur, C_cur, C
-
-        self.cells = nn.ModuleList()
-        reduction_p = False
-        for i in range(n_layers):
-            # Reduce featuremap size and double channels in 1/3 and 2/3 layer.
-            if i in [n_layers // 3, 2 * n_layers // 3]:
-                C_cur *= 2
-                reduction = True
-            else:
-                reduction = False
-
-            cell = SearchCell(n_nodes, C_pp, C_p, C_cur, reduction_p, reduction)
-            reduction_p = reduction
-            self.cells.append(cell)
-            C_cur_out = C_cur * n_nodes
-            C_pp, C_p = C_p, C_cur_out
-
-        self.gap = nn.AdaptiveAvgPool2d(1)
-        self.linear = nn.Linear(C_p, n_classes)
-
-        # initialize architect parameters: alphas
-        self._init_alphas()
-
-    def _init_alphas(self):
-        """
-        initialize architect parameters: alphas
-        """
-        n_ops = len(gt.PRIMITIVES)
-
-        self.alpha_normal = nn.ParameterList()
-        self.alpha_reduce = nn.ParameterList()
-
-        for i in range(self.n_nodes):
-            self.alpha_normal.append(nn.Parameter(1e-3 * torch.randn(i + 2, n_ops)))
-            self.alpha_reduce.append(nn.Parameter(1e-3 * torch.randn(i + 2, n_ops)))
-
-    def forward(self, x):
-        s0 = s1 = self.stem(x)
-
-        weights_normal = [F.softmax(alpha, dim=-1) for alpha in self.alpha_normal]
-        weights_reduce = [F.softmax(alpha, dim=-1) for alpha in self.alpha_reduce]
-
-        for cell in self.cells:
-            weights = weights_reduce if cell.reduction else weights_normal
-            s0, s1 = s1, cell(s0, s1, weights)  # forward cell by cell, reusing the two previous cell outputs (dynamic programming over the whole network)
-
-        out = self.gap(s1)
-        out = out.view(out.size(0), -1)  # flatten
-        logits = self.linear(out)
-        return logits
-
-    def loss(self, X, y):
-        logits = self(X)
-        return self.criterion(logits, y)
-
-    def print_alphas(self):
-        print("####### ALPHA #######")
-        print("# Alpha - normal")
-        for alpha in self.alpha_normal:
-            print(F.softmax(alpha, dim=-1))
-
-        print("\n# Alpha - reduce")
-        for alpha in self.alpha_reduce:
-            print(F.softmax(alpha, dim=-1))
-        print("#####################")
-
-    def genotype(self):
-        gene_normal = gt.parse(self.alpha_normal, k=2)
-        gene_reduce = gt.parse(self.alpha_reduce, k=2)
-        concat = range(2, 2 + self.n_nodes)  # concat all intermediate nodes
-
-        return gt.Genotype(normal=gene_normal, normal_concat=concat,
-                           reduce=gene_reduce, reduce_concat=concat)
-
-    def weights(self):
-        for k, v in self.named_parameters():
-            if 'alpha' not in k:
-                yield v
-
-    def named_weights(self):
-        for k, v in self.named_parameters():
-            if 'alpha' not in k:
-                yield k, v
-
-    def alphas(self):
-        for k, v in self.named_parameters():
-            if 'alpha' in k:
-                yield v
-
-    def named_alphas(self):
-        for k, v in self.named_parameters():
-            if 'alpha' in k:
-                yield k, v
diff --git a/models/darts_nets_imagenet/augment_cells.py b/models/darts_nets_imagenet/augment_cells.py
index 7fca30d..c9305db 100644
--- a/models/darts_nets_imagenet/augment_cells.py
+++ b/models/darts_nets_imagenet/augment_cells.py
@@ -1,9 +1,28 @@
 """ CNN cell for network augmentation """
 import torch
 import torch.nn as nn
-from models.darts_nets_imagenet import ops
 from utils import *
+from models.darts_nets_imagenet import ops

+def to_dag(C_in, gene, reduction):
+    """ generate discrete ops from gene """
+    dag = nn.ModuleList()
+    for edges in gene:
+        row = nn.ModuleList()
+        for op_name, s_idx in edges:
+            # reduction cell & from input nodes => stride = 2
+            stride = 2 if reduction and s_idx < 2 else 1
+            op = ops.OPS[op_name](C_in, stride, True)
+            if not isinstance(op, ops.Identity):  # Identity does not use drop path
+                op = nn.Sequential(
+                    op,
+                    ops.DropPath_()
+                )
+            op.s_idx = s_idx
+            row.append(op)
+        dag.append(row)
+
+    return dag

 class AugmentCell(nn.Module):
     """ Cell for augmentation
diff --git a/models/darts_nets_imagenet/ops.py b/models/darts_nets_imagenet/ops.py
index 3794dba..f07bef4 100644
--- a/models/darts_nets_imagenet/ops.py
+++ b/models/darts_nets_imagenet/ops.py
@@ -186,24 +186,4 @@ def forward(self, x):
         return out


-class MixedOp(nn.Module):
-    """ Mixed operation """
-    def __init__(self, C, stride):
-        super().__init__()
-        self._ops = nn.ModuleList()
-        for primitive in PRIMITIVES:
-            # OPS is a dict from primitive name to a lambda, so each op can be built by name lookup
-            op = OPS[primitive](C, stride, affine=False)
-            self._ops.append(op)
-
-    def forward(self, x, weights):
-        """
-        Args:
-            x: input
-            weights: weight for each operation
-        """
-        return sum(w * op(x) for w, op in zip(weights, self._ops))
-        # index = torch.multinomial(weights, 1)
-        # sum = self._ops[index](x)
-        # return sum
diff --git a/models/darts_nets_imagenet/search_cells.py b/models/darts_nets_imagenet/search_cells.py
deleted file mode 100644
index 790c794..0000000
--- a/models/darts_nets_imagenet/search_cells.py
+++ /dev/null
@@ -1,54 +0,0 @@
-""" CNN cell for architecture search """
-import torch
-import torch.nn as nn
-from models.darts_nets_imagenet import ops
-
-
-class SearchCell(nn.Module):
-    """ Cell for search
-    Each edge is mixed and continuous relaxed.
-    """
-
-    def __init__(self, n_nodes, C_pp, C_p, C, reduction_p, reduction):
-        """
-        Args:
-            n_nodes: # of intermediate n_nodes
-            C_pp: C_out[k-2]
-            C_p : C_out[k-1]
-            C   : C_in[k] (current)
-            reduction_p: flag for whether the previous cell is reduction cell or not
-            reduction: flag for whether the current cell is reduction cell or not
-        """
-        super().__init__()
-        self.reduction = reduction
-        self.n_nodes = n_nodes
-
-        # If previous cell is reduction cell, current input size does not match with
-        # output size of cell[k-2]. So the output[k-2] should be reduced by preprocessing.
-        if reduction_p:
-            self.preproc0 = ops.FactorizedReduce(C_pp, C, affine=False)
-        else:
-            self.preproc0 = ops.StdConv(C_pp, C, 1, 1, 0, affine=False)
-        self.preproc1 = ops.StdConv(C_p, C, 1, 1, 0, affine=False)
-
-        # generate dag
-        self.dag = nn.ModuleList()
-        for i in range(self.n_nodes):
-            self.dag.append(nn.ModuleList())
-            for j in range(2 + i):  # include 2 input nodes
-                # reduction should be used only for input node
-                stride = 2 if reduction and j < 2 else 1
-                op = ops.MixedOp(C, stride)  # during search, every edge is a mixed op
-                self.dag[i].append(op)
-
-    def forward(self, s0, s1, w_dag):
-        s0 = self.preproc0(s0)
-        s1 = self.preproc1(s1)
-
-        states = [s0, s1]
-        for edges, w_list in zip(self.dag, w_dag):  # iterate over the intermediate nodes
-            s_cur = sum(edges[i](s, w) for i, (s, w) in enumerate(zip(states, w_list)))  # a node's input is the sum of the outputs of all edges feeding it
-            states.append(s_cur)
-
-        s_out = torch.cat(states[2:], dim=1)
-        return s_out
diff --git a/models/darts_nets_imagenet/search_cnn.py b/models/darts_nets_imagenet/search_cnn.py
deleted file mode 100644
index 423cc1e..0000000
--- a/models/darts_nets_imagenet/search_cnn.py
+++ /dev/null
@@ -1,131 +0,0 @@
-""" CNN for architecture search """
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from models.darts_nets.search_cells import SearchCell
-import utils.genotypes as gt
-
-
-class SearchCNN(nn.Module):
-    """ Search CNN model """
-
-    def __init__(self, C_in, C, n_classes, n_layers, criterion, n_nodes=4, stem_multiplier=3):
-        """
-        Args:
-            C_in: # of input channels
-            C: # of starting model channels
-            n_classes: # of classes
-            n_layers: # of layers
-            n_nodes: # of intermediate nodes in Cell
-            stem_multiplier
-        """
-        super().__init__()
-        self.C_in = C_in
-        self.C = C
-        self.n_classes = n_classes
-        self.n_layers = n_layers
-        self.n_nodes = n_nodes
-        self.criterion = criterion
-
-        C_cur = stem_multiplier * C
-        self.stem = nn.Sequential(
-            nn.Conv2d(C_in, C_cur, 3, 1, 1, bias=False),
-            nn.BatchNorm2d(C_cur)
-        )
-
-        # for the first cell, stem is used for both s0 and s1
-        # [!] C_pp and C_p is output channel size, but C_cur is input channel size.
-        C_pp, C_p, C_cur = C_cur, C_cur, C
-
-        self.cells = nn.ModuleList()
-        reduction_p = False
-        for i in range(n_layers):
-            # Reduce featuremap size and double channels in 1/3 and 2/3 layer.
-            if i in [n_layers // 3, 2 * n_layers // 3]:
-                C_cur *= 2
-                reduction = True
-            else:
-                reduction = False
-
-            cell = SearchCell(n_nodes, C_pp, C_p, C_cur, reduction_p, reduction)
-            reduction_p = reduction
-            self.cells.append(cell)
-            C_cur_out = C_cur * n_nodes
-            C_pp, C_p = C_p, C_cur_out
-
-        self.gap = nn.AdaptiveAvgPool2d(1)
-        self.linear = nn.Linear(C_p, n_classes)
-
-        # initialize architect parameters: alphas
-        self._init_alphas()
-
-    def _init_alphas(self):
-        """
-        initialize architect parameters: alphas
-        """
-        n_ops = len(gt.PRIMITIVES)
-
-        self.alpha_normal = nn.ParameterList()
-        self.alpha_reduce = nn.ParameterList()
-
-        for i in range(self.n_nodes):
-            self.alpha_normal.append(nn.Parameter(1e-3 * torch.randn(i + 2, n_ops)))
-            self.alpha_reduce.append(nn.Parameter(1e-3 * torch.randn(i + 2, n_ops)))
-
-    def forward(self, x):
-        s0 = s1 = self.stem(x)
-
-        weights_normal = [F.softmax(alpha, dim=-1) for alpha in self.alpha_normal]
-        weights_reduce = [F.softmax(alpha, dim=-1) for alpha in self.alpha_reduce]
-
-        for cell in self.cells:
-            weights = weights_reduce if cell.reduction else weights_normal
-            s0, s1 = s1, cell(s0, s1, weights)  # forward cell by cell, reusing the two previous cell outputs (dynamic programming over the whole network)
-
-        out = self.gap(s1)
-        out = out.view(out.size(0), -1)  # flatten
-        logits = self.linear(out)
-        return logits
-
-    def loss(self, X, y):
-        logits = self(X)
-        return self.criterion(logits, y)
-
-    def print_alphas(self):
-        print("####### ALPHA #######")
-        print("# Alpha - normal")
-        for alpha in self.alpha_normal:
-            print(F.softmax(alpha, dim=-1))
-
-        print("\n# Alpha - reduce")
-        for alpha in self.alpha_reduce:
-            print(F.softmax(alpha, dim=-1))
-        print("#####################")
-
-    def genotype(self):
-        gene_normal = gt.parse(self.alpha_normal, k=2)
-        gene_reduce = gt.parse(self.alpha_reduce, k=2)
-        concat = range(2, 2 + self.n_nodes)  # concat all intermediate nodes
-
-        return gt.Genotype(normal=gene_normal, normal_concat=concat,
-                           reduce=gene_reduce, reduce_concat=concat)
-
-    def weights(self):
-        for k, v in self.named_parameters():
-            if 'alpha' not in k:
-                yield v
-
-    def named_weights(self):
-        for k, v in self.named_parameters():
-            if 'alpha' not in k:
-                yield k, v
-
-    def alphas(self):
-        for k, v in self.named_parameters():
-            if 'alpha' in k:
-                yield v
-
-    def named_alphas(self):
-        for k, v in self.named_parameters():
-            if 'alpha' in k:
-                yield k, v
diff --git a/run_exp.py b/run_exp.py
index 2721f3c..5bfc690 100644
--- a/run_exp.py
+++ b/run_exp.py
@@ -34,7 +34,6 @@
 np.random.seed(args.manual_seed)

 os.makedirs(args.path, exist_ok=True)
-run_config_path = '%s/run.config' % args.path
 run_config = ImagenetRunConfig(
     **args.__dict__
 )
@@ -43,12 +42,12 @@
 if args.dataset == 'imagenet':
     from models.darts_nets_imagenet.augment_cnn import AugmentCNNImageNet

-    net = AugmentCNNImageNet(num_classes=run_config.data_provider.n_classes, genotype=from_str(args.darts_gene),
+    net = AugmentCNNImageNet(num_classes=run_config.data_provider.n_classes, genotype=eval(args.darts_gene),
                              drop_out=args.dropout)
 elif args.dataset == 'cifar10':
     from models.darts_nets_cifar.augment_cnn import AugmentCNN

-    net = AugmentCNN(n_classes=run_config.data_provider.n_classes, genotype=from_str(args.darts_gene),
+    net = AugmentCNN(n_classes=run_config.data_provider.n_classes, genotype=eval(args.darts_gene),
                      drop_out=args.dropout)
 else:
     from models.normal_nets.proxyless_nets import proxyless_network
@@ -62,7 +61,6 @@
 # build run manager
 run_manager = RunManager(args.path, net, run_config)

-init_path = '%s/init' % args.path
 run_manager.load_model()

 output_dict = {}
diff --git a/run_manager.py b/run_manager.py
index 075ae20..c001d53 100644
--- a/run_manager.py
+++ b/run_manager.py
@@ -6,7 +6,6 @@
 from utils import *
 import apex

-
 class RunConfig:

     def __init__(self, dataset, test_batch_size, local_rank, world_size):
@@ -72,8 +71,7 @@ def __init__(self, path, net, run_config: RunConfig, out_log=True):
         self._logs_path, self._save_path = None, None
         self.best_acc = 0
         self.start_epoch = 0
-        self.net = nn.DataParallel(self.net).cuda()
-        # self.net.module.to(self.device)
+        self.net = apex.parallel.convert_syncbn_model(nn.DataParallel(self.net)).cuda()
         self.print_net_info()
         self.criterion = nn.CrossEntropyLoss()
         cudnn.benchmark = True
diff --git a/utils/__init__.py b/utils/__init__.py
index 5e59e99..039add0 100644
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -1,8 +1,10 @@
 from .my_modules import *
 from .pytorch_utils import *
 from .get_data_iter import *
-from .genotypes import *
 from .preproc import *
+from collections import namedtuple
+
+Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')

 def make_divisible(v, divisor, min_val=None):
diff --git a/utils/genotypes.py b/utils/genotypes.py
deleted file mode 100644
index 148fd04..0000000
--- a/utils/genotypes.py
+++ /dev/null
@@ -1,102 +0,0 @@
-""" Genotypes
-    Genotype: normal/reduce gene + normal/reduce cell output connection (concat)
-    gene: discrete ops information (w/o output connection)
-    dag: real ops (can be mixed or discrete, but Genotype has only discrete information itself)
-"""
-from collections import namedtuple
-import torch
-import torch.nn as nn
-from models.darts_nets_cifar import ops
-
-Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')
-
-PRIMITIVES = [
-    'max_pool_3x3',
-    'avg_pool_3x3',
-    'skip_connect',  # identity
-    'sep_conv_3x3',
-    'sep_conv_5x5',
-    'dil_conv_3x3',
-    'dil_conv_5x5',
-    'none'
-]
-
-
-def to_dag(C_in, gene, reduction):
-    """ generate discrete ops from gene """
-    dag = nn.ModuleList()
-    for edges in gene:
-        row = nn.ModuleList()
-        for op_name, s_idx in edges:
-            # reduction cell & from input nodes => stride = 2
-            stride = 2 if reduction and s_idx < 2 else 1
-            op = ops.OPS[op_name](C_in, stride, True)
-            if not isinstance(op, ops.Identity):  # Identity does not use drop path
-                op = nn.Sequential(
-                    op,
-                    ops.DropPath_()
-                )
-            op.s_idx = s_idx
-            row.append(op)
-        dag.append(row)
-
-    return dag
-
-
-def from_str(s):
-    """ generate genotype from string
-    e.g. "Genotype(
-            normal=[[('sep_conv_3x3', 0), ('sep_conv_3x3', 1)],
-                    [('sep_conv_3x3', 1), ('dil_conv_3x3', 2)],
-                    [('sep_conv_3x3', 1), ('sep_conv_3x3', 2)],
-                    [('sep_conv_3x3', 1), ('dil_conv_3x3', 4)]],
-            normal_concat=range(2, 6),
-            reduce=[[('max_pool_3x3', 0), ('max_pool_3x3', 1)],
-                    [('max_pool_3x3', 0), ('skip_connect', 2)],
-                    [('max_pool_3x3', 0), ('skip_connect', 2)],
-                    [('max_pool_3x3', 0), ('skip_connect', 2)]],
-            reduce_concat=range(2, 6))"
-    """
-
-    genotype = eval(s)
-
-    return genotype
-
-
-def parse(alpha, k):
-    """
-    parse continuous alpha to discrete gene.
-    alpha is ParameterList:
-    ParameterList [
-        Parameter(n_edges1, n_ops),
-        Parameter(n_edges2, n_ops),
-        ...
-    ]
-
-    gene is list:
-    [
-        [('node1_ops_1', node_idx), ..., ('node1_ops_k', node_idx)],
-        [('node2_ops_1', node_idx), ..., ('node2_ops_k', node_idx)],
-        ...
-    ]
-    each node has two edges (k=2) in CNN.
-    """
-
-    gene = []
-    assert PRIMITIVES[-1] == 'none'  # assume last PRIMITIVE is 'none'
-
-    # 1) Convert the mixed op to discrete edge (single op) by choosing top-1 weight edge
-    # 2) Choose top-k edges per node by edge score (top-1 weight in edge)
-    for edges in alpha:
-        # edges: Tensor(n_edges, n_ops)
-        edge_max, primitive_indices = torch.topk(edges[:, :-1], 1)  # ignore 'none'
-        topk_edge_values, topk_edge_indices = torch.topk(edge_max.view(-1), k)
-        node_gene = []
-        for edge_idx in topk_edge_indices:
-            prim_idx = primitive_indices[edge_idx]
-            prim = PRIMITIVES[prim_idx]
-            node_gene.append((prim, edge_idx.item()))
-
-        gene.append(node_gene)
-
-    return gene
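
A note on the `from_str` → `eval` change above: with `utils/genotypes.py` deleted, `run_exp.py` now parses the `--darts_gene` string with a bare `eval`, which works only because `utils/__init__.py` defines the `Genotype` namedtuple that such strings reference. A minimal sketch of that round trip, using the example string from the removed `from_str` docstring (the final `assert` is just an illustrative check, not code from the repo):

```python
from collections import namedtuple

# Mirrors the definition added to utils/__init__.py in this diff.
Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat')

# Example genotype string from the removed from_str() docstring; in run_exp.py
# this arrives as args.darts_gene and is parsed with eval(args.darts_gene).
s = """Genotype(
    normal=[[('sep_conv_3x3', 0), ('sep_conv_3x3', 1)],
            [('sep_conv_3x3', 1), ('dil_conv_3x3', 2)],
            [('sep_conv_3x3', 1), ('sep_conv_3x3', 2)],
            [('sep_conv_3x3', 1), ('dil_conv_3x3', 4)]],
    normal_concat=range(2, 6),
    reduce=[[('max_pool_3x3', 0), ('max_pool_3x3', 1)],
            [('max_pool_3x3', 0), ('skip_connect', 2)],
            [('max_pool_3x3', 0), ('skip_connect', 2)],
            [('max_pool_3x3', 0), ('skip_connect', 2)]],
    reduce_concat=range(2, 6))"""

# eval() needs Genotype (and the builtin range) in scope, which is why
# utils/__init__.py now defines the namedtuple directly.
genotype = eval(s)
assert genotype.normal[0][0] == ('sep_conv_3x3', 0)
```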
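
For readers curious about the search procedure the README summarizes (sample architectures from a multinomial distribution, compare their relative performance, update the distribution, with no RL and no gradient through architecture parameters), here is a toy illustration of that idea only. It is not the authors' search code, which is to be released separately: `evaluate_arch` is a hypothetical stand-in for a proxy performance measurement, and the `0.01` update step is an arbitrary choice for the sketch.

```python
import torch

PRIMITIVES = ['max_pool_3x3', 'avg_pool_3x3', 'skip_connect', 'sep_conv_3x3',
              'sep_conv_5x5', 'dil_conv_3x3', 'dil_conv_5x5', 'none']

n_edges, n_ops = 14, len(PRIMITIVES)
# one multinomial distribution over ops per edge, initially uniform
probs = torch.full((n_edges, n_ops), 1.0 / n_ops)

def evaluate_arch(arch):
    """Hypothetical proxy score (e.g., validation accuracy after a few steps)."""
    return torch.rand(())  # placeholder for an actual measurement

for step in range(100):
    # sample two architectures: one op index per edge
    arch_a = torch.multinomial(probs, 1).squeeze(1)
    arch_b = torch.multinomial(probs, 1).squeeze(1)
    # compare RELATIVE performance and reward the winner's ops
    winner = arch_a if evaluate_arch(arch_a) >= evaluate_arch(arch_b) else arch_b
    probs[torch.arange(n_edges), winner] += 0.01
    probs /= probs.sum(dim=1, keepdim=True)  # renormalize each edge's distribution
```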