In [1]:
import sys
import os
import json
import torch
import numpy as np
sys.path.append(os.path.abspath('../'))


from torch.utils.data import Dataset, Subset
from torchvision.transforms import functional
from utils.transforms import HoromaTransforms
from utils.factories import ModelFactory, OptimizerFactory, TrainerFactory

In [2]:
def main(config, resume, test_run=False, helios_run=None, horoma_test=False):
    """
    Build the datasets, model, optimizer and trainer, then run the training.

    Dataset selection:
      * ``test_run`` falsy: Horoma datasets, no ``subset`` argument passed.
      * ``test_run`` and ``horoma_test`` truthy: Horoma datasets with
        ``subset=5``.
      * ``test_run`` truthy, ``horoma_test`` falsy: custom MNIST datasets
        (5000 unlabelled / 1000 labelled samples).

    :param config: the configuration of the optimizer, model and trainer.
    :param resume: path to the checkpoint of a model.
    :param test_run: whether it's a test run or not.
    :param helios_run: start datetime of a run on helios.
    :param horoma_test: during a test run, use a small horoma subset instead
    of the custom mnist dataset. Ignored when ``test_run`` is falsy.
    """
    # Seed numpy and torch (CPU and every CUDA device).
    np.random.seed(config["numpy_seed"])
    torch_seed = config["torch_seed"]
    torch.manual_seed(torch_seed)
    torch.cuda.manual_seed_all(torch_seed)

    dataset_cfg = config["data"]["dataset"]

    # setup data_loader instances
    if test_run and not horoma_test:
        unlabelled = CustomMNIST(**dataset_cfg, subset=5000)
        labelled = CustomLabelledMNIST(**dataset_cfg, subset=1000)
    else:
        # A full run passes no `subset` keyword at all; the horoma test run
        # restricts both datasets to 5 samples.
        size_limit = {"subset": 5} if test_run else {}

        unlabelled = HoromaDataset(
            **dataset_cfg,
            split='train_overlapped',
            transforms=HoromaTransforms(),
            **size_limit
        )

        # The labelled set only takes the data directory from the config and
        # is never flattened.
        labelled = HoromaDataset(
            data_dir=dataset_cfg['data_dir'],
            flattened=False,
            split='valid_overlapped',
            transforms=HoromaTransforms(),
            **size_limit
        )

    model = ModelFactory.get(config)

    print(model)
    print()

    # Only parameters that require gradients are handed to the optimizer.
    trainable_params = (p for p in model.parameters() if p.requires_grad)
    optimizer = OptimizerFactory.get(config, trainable_params)

    trainer_cls = TrainerFactory.get(config)
    trainer = trainer_cls(
        model,
        optimizer,
        resume=resume,
        config=config,
        unlabelled=unlabelled,
        labelled=labelled,
        helios_run=helios_run,
        **config['trainer']['options']
    )

    trainer.train()

In [3]:
class HoromaDataset(Dataset):
    """
    Dataset over the raw image file of one Horoma split.

    Images are read through a read-only ``np.memmap`` of shape
    ``(nb_examples, 32, 32, 3)`` and dtype ``uint8``. For non-training splits
    that ship a ``<split>_y.txt`` file, string labels are mapped to integer
    indices into ``map_labels``.
    """

    def __init__(self, data_dir, split="train", subset=None, skip=0,
                 flattened=False, transforms=None):
        """
        Initialize the horoma dataset.

        :param data_dir: Path to the directory containing the samples.
        :param split: Which split to use. [train, valid, test,
        train_overlapped, valid_overlapped]
        :param subset: Number of examples to keep after skipping.
        Default: all remaining examples.
        :param skip: How many element to skip before taking the subset.
        :param flattened: If True return the images in a flatten format.
        :param transforms: Transforms to apply on the dataset before using it.
        :raises ValueError: if ``split`` is not one of the supported splits.
        """
        nb_channels = 3
        height = 32
        width = 32
        datatype = "uint8"

        # Number of examples memory-mapped per split. Every split is currently
        # capped to 10; the previously used values are kept in the comments.
        examples_per_split = {
            "train": 10,             # previously 150900
            "valid": 10,             # previously 480
            "test": 10,              # previously 498
            "train_overlapped": 10,  # previously 544749
            "valid_overlapped": 10,  # previously 1331
        }
        if split not in examples_per_split:
            # Raising a bare string is itself a TypeError in Python 3, so the
            # message has to be wrapped in a real exception type.
            raise ValueError(
                "Dataset: Invalid split. "
                "Must be [train, valid, test, train_overlapped, valid_overlapped]"
            )
        self.nb_examples = examples_per_split[split]

        filename_x = os.path.join(data_dir, "{}_x.dat".format(split))
        filename_y = os.path.join(data_dir, "{}_y.txt".format(split))

        filename_region_ids = os.path.join(data_dir,
                                           "{}_regions_id.txt".format(split))
        self.region_ids = np.loadtxt(filename_region_ids, dtype=object)

        # The same window [skip, skip + subset) is applied to the images, the
        # region ids and the labels so that they stay index-aligned.
        stop = None if subset is None else skip + subset
        window = slice(skip, stop)

        self.targets = None
        # Only set when labels are loaded; initialised so the attribute
        # always exists.
        self.map_labels = None
        if os.path.exists(filename_y) and not split.startswith("train"):
            pre_targets = np.loadtxt(filename_y, dtype='U2')[window]

            # map_labels holds the sorted distinct labels of the selected
            # window; targets holds, for each example, the index of its label
            # in map_labels.
            self.map_labels, self.targets = np.unique(
                pre_targets, return_inverse=True
            )

        # Debug output of the memmap geometry.
        print(self.nb_examples)
        print(height)
        print(width)
        print(nb_channels)
        self.data = np.memmap(
            filename_x,
            dtype=datatype,
            mode="r",
            shape=(self.nb_examples, height, width, nb_channels)
        )

        # NOTE: the image memmap is bounded by nb_examples whereas region ids
        # and labels are read in full from their text files, so with the cap
        # above they can be longer than the data; __len__ follows the data.
        self.data = self.data[window]
        self.region_ids = self.region_ids[window]

        self.flattened = flattened

        self.transforms = transforms

    def __len__(self):
        """Return the number of images kept after applying skip/subset."""
        return len(self.data)

    def __getitem__(self, index):
        """
        Return the example at ``index``.

        :param index: position within the skip/subset window.
        :return: ``(img, target)`` when labels were loaded, where ``target``
        is a one-element ``torch.Tensor`` holding the label index; otherwise
        just ``img``.
        """
        img = self.data[index]
        if self.transforms:
            img = self.transforms(img)

        if self.flattened:
            # reshape works for both NumPy arrays (no transform applied) and
            # torch tensors, including non-contiguous ones, unlike view(-1).
            img = img.reshape(-1)

        if self.targets is not None:
            return img, torch.Tensor([self.targets[index]])
        return img


In [4]:
# Load the experiment configuration. The context manager guarantees the file
# handle is closed, which the bare json.load(open(...)) form did not.
with open('../configs/cnnautoencoder_clusters_kmeans.json') as config_file:
    config = json.load(config_file)
print(config)

{'name': 'cnnautoencoder_cluster_kmeans_l20', 'n_gpu': 1, 'numpy_seed': 1, 'torch_seed': 1, 'wall_time': 8, 'model': {'type': 'ConvolutionalAutoEncoder', 'args': {'code_size': 20, 'dropout': 0.1, 'cnn1_out_channels': 10, 'cnn1_kernel_size': 5, 'cnn2_out_channels': 20, 'cnn2_kernel_size': 5, 'lin2_in_channels': 50, 'maxpool_kernel': 2, 'loss_fct': 'MSELoss'}}, 'optimizer': {'type': 'Adam', 'args': {'lr': 0.0001, 'weight_decay': 0, 'amsgrad': False}}, 'trainer': {'epochs': 100, 'save_dir': 'saved/', 'log_dir': 'logs/', 'save_period': 10, 'type': 'ClusterKMeansTrainer', 'options': {'n_clusters': 100, 'kmeans_interval': 0, 'kmeans_headstart': 0, 'kmeans_weight': 10.0}}, 'data': {'dataset': {'data_dir': '/rap/jvb-000-aa/COURS2019/etudiants/data/horoma', 'flattened': False}, 'dataloader': {'split': 0.9, 'train': {'batch_size': 128, 'shuffle': True}, 'valid': {'batch_size': 128, 'shuffle': False}}}}


In [5]:
# Test run on a subsample of the horoma dataset: only the first 5 samples of
# train_overlapped and valid_overlapped are used.
main(config, resume=None, test_run=True, helios_run=None, horoma_test=True)



10
32
32
3
10
32
32
3
ConvolutionalAutoEncoder(
  (loss_fct): MSELoss()
  (dropout): Dropout(p=0.1)
  (encode_cnn_1): Conv2d(3, 10, kernel_size=(5, 5), stride=(1, 1))
  (encode_cnn_2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (encode_lin_1): Linear(in_features=500, out_features=50, bias=True)
  (encode_lin_2): Linear(in_features=50, out_features=20, bias=True)
  (decode_lin_1): Linear(in_features=20, out_features=500, bias=True)
  (decode_lin_2): Linear(in_features=500, out_features=3072, bias=True)
)



ValueError: num_samples should be a positive integeral value, but got num_samples=0