In [1]:
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from collections import OrderedDict
from torch.utils.data import DataLoader, random_split
import torchvision as tv
from pytorch_lightning.callbacks import EarlyStopping
import optuna
from optuna.integration import PyTorchLightningPruningCallback
import math
import pandas as pd

In [2]:
class FashionMNIST(pl.LightningModule):
    def __init__(self, trial: optuna.trial.Trial):
        super(FashionMNIST, self).__init__()
        self.BATCH_SIZE = 256
        self.INPUT_FEATURES = 28*28
        self.OUTPUT_FEATURES = 10
        
        layers = []
        layers.append(('flatten', torch.nn.Flatten()))
        
        # параметры модели
        n_layers = trial.suggest_int("n_layers", 1, 10) # количество скрытых слоев
        max_features = trial.suggest_int("max_features", 0.5 * self.INPUT_FEATURES, 2 * self.INPUT_FEATURES) # количество нейронов в первом скрытом слое
        activaion = trial.suggest_categorical("activation", ['ReLU', 'RReLU', 'LeakyReLU']) # функция активации
        regul = trial.suggest_categorical('regularization', ['batchnorm', 'dropout']) # регуляризация
        self.optimizer = trial.suggest_categorical('optimizer', ['SGD', 'Adam', 'RMSprop']) # Оптимизатор
        self.lr = trial.suggest_loguniform('learning_rate', 0.005, 0.5) # Learning rate
        
        # Архитектура сети:
        #    сеть состоит из n_layers скрытых слоев
        #    первый слой стостоит из max_features нейронов
        #    далее, количество нейронов линейно убывает до количества выходов
        
        input_features = self.INPUT_FEATURES
        delta_features = math.floor((max_features - self.OUTPUT_FEATURES) / n_layers)
        for i in range(1, n_layers+1): 
            if i == 1:
                in_features = self.INPUT_FEATURES
                out_features = max_features
            elif i == n_layers:
                in_features = prev_features
                out_features = self.OUTPUT_FEATURES
            else:
                in_features = prev_features
                out_features = prev_features - delta_features
            
            if regul == 'batchnorm':
                layers.append((f'batchnorm{i}', torch.nn.BatchNorm1d(in_features)))
            elif regul == 'dropout':
                layers.append((f'dropout{i}', torch.nn.Dropout()))
            else:
                raise ValueError
            
            layers.append((f'lin{i}', torch.nn.Linear(in_features, out_features)))
            prev_features = out_features
            
            if i != n_layers:
                if activaion == 'ReLU':
                    layers.append((f'ReLU{i}', torch.nn.ReLU()))
                elif activaion == 'RReLU':
                    layers.append((f'RReLU{i}', torch.nn.RReLU()))
                elif activaion == 'LeakyReLU':
                    layers.append((f'LeakyReLU{i}', torch.nn.LeakyReLU()))
                else:
                    raise ValueError
                    
        self.model = torch.nn.Sequential(OrderedDict(layers))
        

    def forward(self, x):
        return self.model(x)
    
    def configure_optimizers(self):
        if self.optimizer == 'SGD':
            opt = torch.optim.SGD(self.parameters(), lr=self.lr)
        elif self.optimizer == 'Adam':
            opt = torch.optim.Adam(self.parameters(), lr=self.lr)
        elif self.optimizer == 'RMSprop':
            opt = torch.optim.RMSprop(self.parameters(), lr=self.lr)
        
        return opt

    def cross_entropy_loss(self, logits, labels):
        loss = torch.nn.CrossEntropyLoss()
        return loss(logits, labels)
    
    def training_step(self, batch, batch_idx):
        data, target = batch
        output = self.forward(data)
        loss = self.cross_entropy_loss(output, target)
        logs = {'loss': loss}
        return {'loss': loss, 'log': logs}
    
    def validation_step(self, batch, batch_idx):
        data, target = batch
        output = self.forward(data)
        loss = self.cross_entropy_loss(output, target)
        pred = output.argmax(dim=1, keepdim=True)
        correct = pred.eq(target.view_as(pred)).sum().item()
        accuracy = correct / data.size(0)
        
        return {'val_loss': loss, "accuracy": accuracy}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        accuracy = sum(x["accuracy"] for x in outputs) / len(outputs)
        self.val_accuracy = accuracy
        logs = {'val_loss': avg_loss, 'accuracy': accuracy}
        return {'avg_val_loss': avg_loss, 'log': logs}
    
    def test_step(self, batch, batch_nb):
        data, target = batch
        output = self.forward(data)
        loss = self.cross_entropy_loss(output, target)
        pred = output.argmax(dim=1, keepdim=True)
        correct = pred.eq(target.view_as(pred)).sum().item()
        accuracy = correct / data.size(0)
        
        return {'test_loss': loss, 'test_accuracy': accuracy}
    
    def test_epoch_end(self, outputs):
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        accuracy = sum(x["test_accuracy"] for x in outputs) / len(outputs)
        self.test_accuracy = accuracy
        
        logs = {'test_loss': avg_loss, 'test_accuracy': accuracy}
        return {'avg_test_loss': avg_loss, 'log': logs, 'progress_bar': logs}
    
    def prepare_data(self):
        self.train_set, self.val_set = random_split(tv.datasets.FashionMNIST('../..', train=True, download=True, transform=tv.transforms.ToTensor()), [55000, 5000])
        self.test_set = tv.datasets.FashionMNIST('../..', train=False, download=True, transform=tv.transforms.ToTensor())
    
    def train_dataloader(self):
        return DataLoader(self.train_set, batch_size=self.BATCH_SIZE, shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_set, batch_size=self.BATCH_SIZE)
    
    def test_dataloader(self):
        return DataLoader(self.test_set, batch_size=self.BATCH_SIZE)

In [3]:
MIN_ACCURACY = 0.88

In [4]:
def objective(trial: optuna.trial.Trial):
    clf = FashionMNIST(trial)
    trainer = pl.Trainer(
        gpus=[0],
        max_epochs=30,
        early_stop_callback=True
    )
    trainer.fit(clf)
    
    trainer.test()
    
    global MIN_ACCURACY
    if clf.test_accuracy > MIN_ACCURACY:
        trainer.save_checkpoint(f'./models/{trial.number}_{clf.test_accuracy:.2}.ckpt')
        MIN_ACCURACY = clf.test_accuracy
    
    return clf.val_accuracy
    

In [5]:
study = optuna.create_study(
    direction="maximize",
    study_name='FashionMNIST',
    storage='sqlite:///FashionMNIST.db',
    load_if_exists=True
)

[32m[I 2020-04-07 23:02:09,769][0m Using an existing study with name 'FashionMNIST' instead of creating a new one.[0m


In [None]:
study.optimize(objective, timeout=9*60*60)

In [7]:
pd.set_option('display.max_rows', None)

In [8]:
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))

In [9]:
df.sort_values(by='value', ascending=False)

Unnamed: 0,number,value,params_activation,params_learning_rate,params_max_features,params_n_layers,params_optimizer,params_regularization,state
640,640,0.90872,ReLU,0.005415,1361,3,SGD,batchnorm,COMPLETE
689,689,0.905044,ReLU,0.007355,1293,3,SGD,batchnorm,COMPLETE
588,588,0.904963,ReLU,0.005525,1423,4,SGD,batchnorm,COMPLETE
740,740,0.904745,ReLU,0.005964,1383,3,SGD,batchnorm,COMPLETE
659,659,0.904435,ReLU,0.006143,1394,3,SGD,batchnorm,COMPLETE
599,599,0.903941,ReLU,0.017831,1419,4,SGD,batchnorm,COMPLETE
681,681,0.903228,ReLU,0.006338,1313,3,SGD,batchnorm,COMPLETE
189,189,0.902642,LeakyReLU,0.005066,1477,5,SGD,batchnorm,COMPLETE
574,574,0.902252,ReLU,0.005006,1474,4,SGD,batchnorm,COMPLETE
42,42,0.902252,ReLU,0.005034,981,7,RMSprop,batchnorm,COMPLETE


Выводы:
* Лучшая точность получилась около 0.9
* Лучшей функцией активации оказалася ReLU; LeakyReLU - чуть хуже, RReLU хороших результатов не дал
* Лучшим оптимизатором - SGD
* Лучшая регуляризация - batchnorm
* Лучшие результаты были достигнуты сетями с небольшим количеством слоев (3-4), есть подозрение что на сетки с большим количеством слоев надо больше, чем 30 эпох.
* Хороший результат дает увеличение количества нейронов в первом скрытом слое, по сравнению с входами
* Чем меньше learning rate (за редкими, особо удачными исключениями) - тем лучше обучилась модель

In [12]:
df.loc[599]

number                         599
value                     0.903941
params_activation             ReLU
params_learning_rate     0.0178311
params_max_features           1419
params_n_layers                  4
params_optimizer               SGD
params_regularization    batchnorm
state                     COMPLETE
Name: 599, dtype: object

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'test_loss': tensor(0.3392, device='cuda:0'), 'test_accuracy': 0.8962890625}
----------------------------------------------------------------------------------------------------