# Наумкин Владимир, С01-119.

## Задача 1. Анализ модели CNN.

### Подключим библиотеки

In [1]:
from tqdm.notebook import tqdm
import numpy as np
import torch
from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms

### Уберём предупреждения

In [2]:
import warnings
# Suppress every Python warning from this point on (keeps the notebook output clean).
warnings.filterwarnings("ignore")

### Зададим устройство исполнения кода (вычисления провожу на своём ПК)

In [3]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: the notebook displays the selected device as the cell output.
device

device(type='cpu')

### Возьмём код из 15 семинара для обучения модели

In [4]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """Run one optimisation step on a single mini-batch.

    The batch is moved to the device that holds the model's parameters, so
    the step runs wherever the model currently lives instead of depending on
    a notebook-level ``device`` variable.

    Args:
        model: ``torch.nn.Module`` to train; it is switched to training mode.
        x_batch: Input tensor of the batch.
        y_batch: Target tensor of the batch.
        optimizer: Already constructed optimizer whose ``step()`` is applied.
        loss_function: Callable ``(output, target) -> scalar loss tensor``.

    Returns:
        The batch loss as a Python number (``loss.item()``).
    """
    model.train()
    model.zero_grad()

    # Follow the model rather than a global: the weights determine where the
    # forward pass has to run.
    model_device = next(model.parameters()).device

    output = model(x_batch.to(model_device))

    loss = loss_function(output, y_batch.to(model_device))
    loss.backward()

    optimizer.step()
    return loss.cpu().item()

In [5]:
def train_epoch(train_generator, model, loss_function, optimizer, callback = None):
    """Train the model for one pass over ``train_generator``.

    Args:
        train_generator: Iterable yielding ``(batch_of_x, batch_of_y)`` pairs.
        model: Model forwarded to ``train_on_batch``.
        loss_function: Loss forwarded to ``train_on_batch``.
        optimizer: Optimizer forwarded to ``train_on_batch``.
        callback: Optional callable invoked as ``callback(model, batch_loss)``
            after every batch.

    Returns:
        The mean per-sample loss of the epoch (batch losses weighted by the
        batch length), or ``nan`` when the generator yields no samples.
    """
    epoch_loss = 0
    total = 0
    for batch_of_x, batch_of_y in train_generator:
        # train_on_batch moves both tensors to the compute device itself,
        # so no extra transfer is done here.
        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)

        if callback is not None:
            callback(model, batch_loss)

        epoch_loss += batch_loss*len(batch_of_x)
        total += len(batch_of_x)

    if total == 0:
        # Nothing was seen: report "no value" instead of dividing by zero.
        return float('nan')

    return epoch_loss/total

In [6]:
def trainer(count_of_epoch,
            batch_size,
            dataset,
            model,
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):
    """Train ``model`` on ``dataset`` for ``count_of_epoch`` epochs.

    Args:
        count_of_epoch: Number of passes over the dataset.
        batch_size: Mini-batch size of the shuffled data loader.
        dataset: Dataset handed to ``torch.utils.data.DataLoader``.
        model: Model whose parameters are optimised.
        loss_function: Loss forwarded to ``train_epoch``.
        optimizer: Optimizer factory, called as
            ``optimizer(model.parameters(), lr=lr)``.
        lr: Learning rate passed to the optimizer factory.
        callback: Optional per-batch hook forwarded to ``train_epoch``.

    Returns:
        None. Progress and the latest epoch loss are shown via tqdm bars.
    """
    optima = optimizer(model.parameters(), lr=lr)

    epoch_bar = tqdm(range(count_of_epoch), desc='epoch')
    epoch_bar.set_postfix({'train epoch loss': np.nan})
    for _ in epoch_bar:
        # A fresh shuffled loader is built for every epoch.
        loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

        # Number of batches, counting a trailing partial batch if there is one.
        full_batches, leftover = divmod(len(dataset), batch_size)
        batch_bar = tqdm(loader, leave=False, total=full_batches+(leftover > 0))

        mean_loss = train_epoch(train_generator=batch_bar,
                                model=model,
                                loss_function=loss_function,
                                optimizer=optima,
                                callback=callback)

        epoch_bar.set_postfix({'train epoch loss': mean_loss})

### И вынесем код из 15 семинара для проверки качества в отдельную функцию
При этом исправим недочёты, которые обсуждались в видеозаписи 1-го семинара 2-го семестра 2021–2022 учебного года: добавим вызов `model.eval()` и контекст `torch.no_grad()`.

In [7]:
def check_model(batch_size, dataset, model, loss_function):
    """Evaluate ``model`` on ``dataset`` and report the mean per-sample loss.

    The model is switched to evaluation mode (and left in it). Batches are
    moved to the device of the model's parameters, so the result does not
    depend on a notebook-level ``device`` variable.

    Args:
        batch_size: Batch size of the (unshuffled) evaluation loader.
        dataset: Dataset yielding ``(x, y)`` samples.
        model: ``torch.nn.Module`` to evaluate.
        loss_function: Callable ``(output, target) -> scalar loss tensor``.

    Returns:
        The loss averaged over all samples (batch losses weighted by batch
        length), or ``nan`` for an empty dataset. The value is also printed
        as ``loss: <value>``.
    """
    model.eval()

    # Where the weights live; None for a parameter-less model, in which case
    # the batches are left where they are.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    batch_generator = torch.utils.data.DataLoader(dataset = dataset, batch_size=batch_size)

    test_loss = 0
    total = 0
    # Neither the forward pass nor the loss needs gradients here.
    with torch.no_grad():
        for x_batch, y_batch in batch_generator:
            if model_device is not None:
                x_batch = x_batch.to(model_device)
                y_batch = y_batch.to(model_device)

            output = model(x_batch)

            test_loss += loss_function(output, y_batch).cpu().item()*len(x_batch)
            total += len(x_batch)

    # Guard against an empty dataset instead of dividing by zero.
    test_loss = test_loss / total if total else float('nan')

    print('loss: {}'.format(test_loss))
    return test_loss

### Аналогично возьмём код для отслеживания обучения модели

In [8]:
class callback():
    """TensorBoard logging hook called after every training batch.

    Every call logs the training loss. Every ``delimeter``-th call also
    evaluates the model on ``dataset`` and logs the mean test loss and a
    text classification report. The model graph is logged on the first
    evaluation only: it is identical each time, so repeating ``add_graph``
    would only repeat the same work.

    Args:
        writer: Object providing ``add_scalar``, ``add_graph`` and
            ``add_text`` (a ``SummaryWriter`` in this notebook).
        dataset: Evaluation dataset yielding ``(x, y)`` samples.
        loss_function: Callable ``(output, target) -> scalar loss tensor``.
        delimeter: Evaluate every this many calls.
        batch_size: Batch size of the evaluation loader.
    """
    def __init__(self, writer, dataset, loss_function, delimeter = 100, batch_size=64):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size

        self.dataset = dataset

        # Set once the model graph has been written to the writer.
        self._graph_logged = False

    def forward(self, model, loss):
        """Log ``loss`` for this step and, periodically, the test metrics.

        Args:
            model: Model being trained; must expose a ``device`` attribute.
                It is put into evaluation mode when an evaluation runs.
            loss: Training loss of the batch that was just processed.
        """
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        if self.step % self.delimeter != 0:
            return

        model.eval()

        if not self._graph_logged:
            # The first sample is reshaped into a batch holding one
            # single-channel 28x28 image and used as the tracing input.
            self.writer.add_graph(model, self.dataset[0][0].view(1,1,28,28).to(model.device))
            self._graph_logged = True

        batch_generator = torch.utils.data.DataLoader(dataset = self.dataset,
                                                      batch_size=self.batch_size)

        pred = []
        real = []
        test_loss = 0
        # Neither the forward pass nor the loss needs gradients here.
        with torch.no_grad():
            for x_batch, y_batch in batch_generator:
                x_batch = x_batch.to(model.device)
                y_batch = y_batch.to(model.device)

                output = model(x_batch)

                test_loss += self.loss_function(output, y_batch).cpu().item()*len(x_batch)

                pred.extend(torch.argmax(output, dim=-1).tolist())
                real.extend(y_batch.tolist())

        test_loss /= len(self.dataset)

        self.writer.add_scalar('LOSS/test', test_loss, self.step)
        self.writer.add_text('REPORT/test', str(classification_report(real, pred)), self.step)

    def __call__(self, model, loss):
        """Make the instance usable as ``callback(model, loss)``."""
        return self.forward(model, loss)

## Итак, приступим к построению разных вариантов модели CNN

Упрощение модели из семинара 15

In [9]:
class CNN_simple(torch.nn.Module):
    """Baseline CNN: two 3x3 convolutions followed by two linear layers.

    The first linear layer takes 16*24*24 features, which matches a
    single-channel 28x28 input after the two unpadded 3x3 convolutions
    (28 -> 26 -> 24). The network produces 10 outputs per sample.
    """

    def __init__(self):
        super().__init__()

        # (name, module) pairs in execution order.
        stack = (
            ('conv1', torch.nn.Conv2d(1, 6, kernel_size=3)),    # 28 -> 26
            ('relu1', torch.nn.ReLU()),
            ('conv2', torch.nn.Conv2d(6, 16, kernel_size=3)),   # 26 -> 24
            ('relu2', torch.nn.ReLU()),
            ('flatten', torch.nn.Flatten()),
            ('linear1', torch.nn.Linear(16 * 24 * 24, 120)),
            ('relu3', torch.nn.ReLU()),
            ('linear2', torch.nn.Linear(120, 10)),
        )
        self.layers = torch.nn.Sequential()
        for name, module in stack:
            self.layers.add_module(name, module)

    @property
    def device(self):
        """Device of the first parameter (``None`` if there are no parameters)."""
        first = next(self.parameters(), None)
        return None if first is None else first.device

    def forward(self, input):
        """Apply the layer stack to ``input`` and return its output."""
        return self.layers(input)

Увеличение размера ядра

In [10]:
class CNN_kernel(torch.nn.Module):
    """Variant of the baseline CNN with 5x5 convolution kernels.

    The first linear layer takes 16*20*20 features, which matches a
    single-channel 28x28 input after the two unpadded 5x5 convolutions
    (28 -> 24 -> 20). The network produces 10 outputs per sample.
    """

    def __init__(self):
        super().__init__()

        # (name, module) pairs in execution order.
        stack = (
            ('conv1', torch.nn.Conv2d(1, 6, kernel_size=5)),    # 28 -> 24
            ('relu1', torch.nn.ReLU()),
            ('conv2', torch.nn.Conv2d(6, 16, kernel_size=5)),   # 24 -> 20
            ('relu2', torch.nn.ReLU()),
            ('flatten', torch.nn.Flatten()),
            ('linear1', torch.nn.Linear(16 * 20 * 20, 120)),
            ('relu3', torch.nn.ReLU()),
            ('linear2', torch.nn.Linear(120, 10)),
        )
        self.layers = torch.nn.Sequential()
        for name, module in stack:
            self.layers.add_module(name, module)

    @property
    def device(self):
        """Device of the first parameter (``None`` if there are no parameters)."""
        first = next(self.parameters(), None)
        return None if first is None else first.device

    def forward(self, input):
        """Apply the layer stack to ``input`` and return its output."""
        return self.layers(input)

Увеличение числа слоёв

In [11]:
class CNN_layers(torch.nn.Module):
    """Variant of the baseline CNN with one extra linear layer (120 -> 84 -> 10).

    The first linear layer takes 16*24*24 features, which matches a
    single-channel 28x28 input after the two unpadded 3x3 convolutions
    (28 -> 26 -> 24). The network produces 10 outputs per sample.
    """

    def __init__(self):
        super().__init__()

        # (name, module) pairs in execution order.
        stack = (
            ('conv1', torch.nn.Conv2d(1, 6, kernel_size=3)),    # 28 -> 26
            ('relu1', torch.nn.ReLU()),
            ('conv2', torch.nn.Conv2d(6, 16, kernel_size=3)),   # 26 -> 24
            ('relu2', torch.nn.ReLU()),
            ('flatten', torch.nn.Flatten()),
            ('linear1', torch.nn.Linear(16 * 24 * 24, 120)),
            ('relu3', torch.nn.ReLU()),
            ('linear2', torch.nn.Linear(120, 84)),
            ('relu4', torch.nn.ReLU()),
            ('linear3', torch.nn.Linear(84, 10)),
        )
        self.layers = torch.nn.Sequential()
        for name, module in stack:
            self.layers.add_module(name, module)

    @property
    def device(self):
        """Device of the first parameter (``None`` if there are no parameters)."""
        first = next(self.parameters(), None)
        return None if first is None else first.device

    def forward(self, input):
        """Apply the layer stack to ``input`` and return its output."""
        return self.layers(input)

Добавление пулинга

In [12]:
class CNN_pooling(torch.nn.Module):
    """Variant of the baseline CNN with 2x2 max pooling after each convolution.

    The first linear layer takes 16*5*5 features, which matches a
    single-channel 28x28 input (28 -> 26 -> 13 -> 11 -> 5). The network
    produces 10 outputs per sample.
    """

    def __init__(self):
        super().__init__()

        # (name, module) pairs in execution order.
        stack = (
            ('conv1', torch.nn.Conv2d(1, 6, kernel_size=3)),    # 28 -> 26
            ('relu1', torch.nn.ReLU()),
            ('pool1', torch.nn.MaxPool2d(kernel_size=2)),       # 26 -> 13
            ('conv2', torch.nn.Conv2d(6, 16, kernel_size=3)),   # 13 -> 11
            ('relu2', torch.nn.ReLU()),
            ('pool2', torch.nn.MaxPool2d(kernel_size=2)),       # 11 -> 5
            ('flatten', torch.nn.Flatten()),
            ('linear1', torch.nn.Linear(16 * 5 * 5, 120)),
            ('relu3', torch.nn.ReLU()),
            ('linear2', torch.nn.Linear(120, 10)),
        )
        self.layers = torch.nn.Sequential()
        for name, module in stack:
            self.layers.add_module(name, module)

    @property
    def device(self):
        """Device of the first parameter (``None`` if there are no parameters)."""
        first = next(self.parameters(), None)
        return None if first is None else first.device

    def forward(self, input):
        """Apply the layer stack to ``input`` and return its output."""
        return self.layers(input)

Добавление BatchNorm

In [13]:
class CNN_batchnorm(torch.nn.Module):
    """Variant of the baseline CNN with four batch-normalisation layers.

    BatchNorm2d follows each convolution, and BatchNorm1d precedes each
    linear layer. The first linear layer takes 16*24*24 features, which
    matches a single-channel 28x28 input (28 -> 26 -> 24). The network
    produces 10 outputs per sample.
    """

    def __init__(self):
        super().__init__()

        # (name, module) pairs in execution order.
        stack = (
            ('conv1', torch.nn.Conv2d(1, 6, kernel_size=3)),    # 28 -> 26
            ('batchnorm1', torch.nn.BatchNorm2d(6)),
            ('relu1', torch.nn.ReLU()),
            ('conv2', torch.nn.Conv2d(6, 16, kernel_size=3)),   # 26 -> 24
            ('batchnorm2', torch.nn.BatchNorm2d(16)),
            ('relu2', torch.nn.ReLU()),
            ('flatten', torch.nn.Flatten()),
            ('batchnorm3', torch.nn.BatchNorm1d(16 * 24 * 24)),
            ('linear1', torch.nn.Linear(16 * 24 * 24, 120)),
            ('relu3', torch.nn.ReLU()),
            ('batchnorm4', torch.nn.BatchNorm1d(120)),
            ('linear2', torch.nn.Linear(120, 10)),
        )
        self.layers = torch.nn.Sequential()
        for name, module in stack:
            self.layers.add_module(name, module)

    @property
    def device(self):
        """Device of the first parameter (``None`` if there are no parameters)."""
        first = next(self.parameters(), None)
        return None if first is None else first.device

    def forward(self, input):
        """Apply the layer stack to ``input`` and return its output."""
        return self.layers(input)

Добавление dropout

In [45]:
class CNN_dropout(torch.nn.Module):
    """Variant of the baseline CNN with three dropout layers.

    Dropout with p=0.1 follows each convolution's ReLU, and dropout with
    p=0.5 sits between the first linear layer and its ReLU. The first linear
    layer takes 16*24*24 features, which matches a single-channel 28x28
    input (28 -> 26 -> 24). The network produces 10 outputs per sample.
    """

    def __init__(self):
        super().__init__()

        # (name, module) pairs in execution order.
        stack = (
            ('conv1', torch.nn.Conv2d(1, 6, kernel_size=3)),    # 28 -> 26
            ('relu1', torch.nn.ReLU()),
            ('dropout1', torch.nn.Dropout(p=0.1)),
            ('conv2', torch.nn.Conv2d(6, 16, kernel_size=3)),   # 26 -> 24
            ('relu2', torch.nn.ReLU()),
            ('dropout2', torch.nn.Dropout(p=0.1)),
            ('flatten', torch.nn.Flatten()),
            ('linear1', torch.nn.Linear(16 * 24 * 24, 120)),
            ('dropout3', torch.nn.Dropout(p=0.5)),
            ('relu3', torch.nn.ReLU()),
            ('linear2', torch.nn.Linear(120, 10)),
        )
        self.layers = torch.nn.Sequential()
        for name, module in stack:
            self.layers.add_module(name, module)

    @property
    def device(self):
        """Device of the first parameter (``None`` if there are no parameters)."""
        first = next(self.parameters(), None)
        return None if first is None else first.device

    def forward(self, input):
        """Apply the layer stack to ``input`` and return its output."""
        return self.layers(input)

## Скачаем датасет FashionMNIST

In [16]:
# Train and test splits of FashionMNIST, stored under ./FashionMNIST
# (download=True is set); ToTensor is applied to every sample.
FashionMNIST_train = datasets.FashionMNIST('./FashionMNIST', train=True, download=True, transform=transforms.ToTensor())
FashionMNIST_test = datasets.FashionMNIST('./FashionMNIST', train=False, download=True, transform=transforms.ToTensor())

## Проводим обучение моделей

In [17]:
# Shared by all experiments below.
loss_function = torch.nn.CrossEntropyLoss()
# The optimizer class itself, not an instance: trainer() instantiates it
# with the model's parameters and the learning rate.
optimizer = torch.optim.Adam

Упрощённая модель

In [20]:
# Baseline model on the selected device, with its own TensorBoard run directory.
model = CNN_simple()
model.to(device)
writer = SummaryWriter(log_dir = 'tensorboard1/CNN_simple')
# delimeter = 10: the callback evaluates on the test set every 10 training steps.
call = callback(writer, FashionMNIST_test, loss_function, delimeter = 10)

In [28]:
model # structure check (omitted from the final report for the later models)

CNN_simple(
  (layers): Sequential(
    (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
    (relu1): ReLU()
    (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
    (relu2): ReLU()
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear1): Linear(in_features=9216, out_features=120, bias=True)
    (relu3): ReLU()
    (linear2): Linear(in_features=120, out_features=10, bias=True)
  )
)

До обучения

In [30]:
# Test loss of the baseline model before training.
check_model(64, FashionMNIST_test, model, loss_function)

loss: 2.302855361175537


2.302855361175537

Обучение

In [31]:
# Train the baseline model: 5 epochs, batch size 64, Adam with lr = 0.001;
# `call` logs the run to TensorBoard.
trainer(count_of_epoch = 5,
        batch_size = 64,
        dataset = FashionMNIST_train,
        model = model,
        loss_function = loss_function,
        optimizer = optimizer,
        lr = 0.001,
        callback = call)

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

После обучения

In [32]:
# Test loss of the baseline model after training.
check_model(64, FashionMNIST_test, model, loss_function)

loss: 0.2769430208683014


0.2769430208683014

### Аналогично все другие модели (поменьше текста)

In [33]:
# Larger-kernel variant with its own TensorBoard run directory.
model1 = CNN_kernel()
model1.to(device)
writer1 = SummaryWriter(log_dir = 'tensorboard1/CNN_kernel')
# delimeter = 10: the callback evaluates on the test set every 10 training steps.
call1 = callback(writer1, FashionMNIST_test, loss_function, delimeter = 10)

In [34]:
# Test loss of the larger-kernel model before training.
check_model(64, FashionMNIST_test, model1, loss_function)

loss: 2.3055130191802977


2.3055130191802977

In [35]:
# Same training setup as the baseline: 5 epochs, batch size 64, lr = 0.001.
trainer(count_of_epoch = 5, batch_size = 64, dataset = FashionMNIST_train, model = model1, loss_function = loss_function, optimizer = optimizer, lr = 0.001, callback = call1)

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

In [36]:
# Test loss of the larger-kernel model after training.
check_model(64, FashionMNIST_test, model1, loss_function)

loss: 0.2823917105436325


0.2823917105436325

Увеличение размера ядра немного ухудшило результат (loss на тесте 0.2824 против 0.2769 у упрощённой модели).

In [37]:
# Extra-linear-layer variant with its own TensorBoard run directory.
model2 = CNN_layers()
model2.to(device)
writer2 = SummaryWriter(log_dir = 'tensorboard1/CNN_layers')
# delimeter = 10: the callback evaluates on the test set every 10 training steps.
call2 = callback(writer2, FashionMNIST_test, loss_function, delimeter = 10)

In [38]:
# Test loss of the extra-layer model before training.
check_model(64, FashionMNIST_test, model2, loss_function)

loss: 2.304488439178467


2.304488439178467

In [39]:
# Same training setup as the baseline: 5 epochs, batch size 64, lr = 0.001.
trainer(count_of_epoch = 5, batch_size = 64, dataset = FashionMNIST_train, model = model2, loss_function = loss_function, optimizer = optimizer, lr = 0.001, callback = call2)

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

In [40]:
# Test loss of the extra-layer model after training.
check_model(64, FashionMNIST_test, model2, loss_function)

loss: 0.2868716926336288


0.2868716926336288

Увеличение числа слоёв ухудшило результат.

In [41]:
# Max-pooling variant with its own TensorBoard run directory.
model3 = CNN_pooling()
model3.to(device)
writer3 = SummaryWriter(log_dir = 'tensorboard1/CNN_pooling')
# delimeter = 10: the callback evaluates on the test set every 10 training steps.
call3 = callback(writer3, FashionMNIST_test, loss_function, delimeter = 10)

In [42]:
# Test loss of the pooling model before training.
check_model(64, FashionMNIST_test, model3, loss_function)

loss: 2.303406509399414


2.303406509399414

In [43]:
# Same training setup as the baseline: 5 epochs, batch size 64, lr = 0.001.
trainer(count_of_epoch = 5, batch_size = 64, dataset = FashionMNIST_train, model = model3, loss_function = loss_function, optimizer = optimizer, lr = 0.001, callback = call3)

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

In [44]:
# Test loss of the pooling model after training.
check_model(64, FashionMNIST_test, model3, loss_function)

loss: 0.3431677439928055


0.3431677439928055

Добавление пулинга значительно ухудшило результат.

In [46]:
# BatchNorm variant with its own TensorBoard run directory.
model4 = CNN_batchnorm()
model4.to(device)
writer4 = SummaryWriter(log_dir = 'tensorboard1/CNN_batchnorm')
# delimeter = 10: the callback evaluates on the test set every 10 training steps.
call4 = callback(writer4, FashionMNIST_test, loss_function, delimeter = 10)

In [47]:
# Test loss of the BatchNorm model before training.
check_model(64, FashionMNIST_test, model4, loss_function)

loss: 2.3043930114746094


2.3043930114746094

In [48]:
# Same training setup as the baseline: 5 epochs, batch size 64, lr = 0.001.
trainer(count_of_epoch = 5, batch_size = 64, dataset = FashionMNIST_train, model = model4, loss_function = loss_function, optimizer = optimizer, lr = 0.001, callback = call4)

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

In [49]:
# Test loss of the BatchNorm model after training.
check_model(64, FashionMNIST_test, model4, loss_function)

loss: 0.31530283238887785


0.31530283238887785

Добавление BatchNorm тоже достаточно сильно ухудшило результат.

In [50]:
# Dropout variant with its own TensorBoard run directory.
model5 = CNN_dropout()
model5.to(device)
writer5 = SummaryWriter(log_dir = 'tensorboard1/CNN_dropout')
# delimeter = 10: the callback evaluates on the test set every 10 training steps.
call5 = callback(writer5, FashionMNIST_test, loss_function, delimeter = 10)

In [51]:
# Test loss of the dropout model before training.
check_model(64, FashionMNIST_test, model5, loss_function)

loss: 2.300823692703247


2.300823692703247

In [52]:
# Same training setup as the baseline: 5 epochs, batch size 64, lr = 0.001.
trainer(count_of_epoch = 5, batch_size = 64, dataset = FashionMNIST_train, model = model5, loss_function = loss_function, optimizer = optimizer, lr = 0.001, callback = call5)

epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 0/938 [00:00<?, ?it/s]

In [53]:
# Test loss of the dropout model after training.
check_model(64, FashionMNIST_test, model5, loss_function)

loss: 0.30473360900878904


0.30473360900878904

Добавление dropout тоже ухудшило модель.

## Выводы

Как ни странно, самая простая модель дала наилучший результат. Увеличение размера ядра свёртки или количества линейных слоёв лишь немного ухудшило результат. Более сильный негативный эффект был при добавлении dropout или BatchNorm. Пулинг сделал ещё хуже.

Стоит отметить, что все изменения вносились в простую модель по отдельности, а параметры не перебирались. То есть при комбинировании усложнений модели и подборе параметров, возможно, получилось бы улучшить качество аппроксимации выборки.

Результаты отслеживания обучения модели я смотрю так:

1 Запускаем cmd в папке task1 и исполняем команду

tensorboard --logdir tensorboard1

2 В браузере переходим по адресу

localhost:6006

### После просмотра графиков tensorboard можно дополнить выводы:

Loss на тесте у модели с BatchNorm к концу обучения начал расти, хотя в первой половине обучения был ощутимо ниже, чем у остальных моделей, — то есть модель переобучилась. Похожая картина получается при сравнении исходной модели и модели с увеличенным количеством слоёв: в первой половине обучения это изменение планомерно улучшало модель, но потом loss изменённой модели стал периодически достигать loss исходной модели (это хорошо видно при smoothing 0.88, если оставить только эти 2 графика).

Также если посмотреть classification_report в разделе text, то можно сравнить accuracy моделей. Выпишу сюда:

simple, kernel, layers - 0.90

pooling - 0.88 (причём долго было 0.87)

batchnorm - 0.91 (было достигнуто практически сразу, иногда менялось на 0.90)

dropout - 0.89 (постепенно увеличивается 0.87 -> 0.88 -> 0.89)

#### Ещё одним признаком переобучения модели с batchnorm является стабильное уменьшение loss на train (при начавшемся увеличении на test)

В общем, если пытаться улучшить базовую модель, то вероятно лучше всего добавить BatchNorm (но, возможно, не во всех 4 местах) и dropout (тоже подумать, где и с каким p будет оптимально вставить в структуру модели), а также попробовать подобрать размер дополнительного линейного слоя. Предложенная комбинация BatchNorm и dropout, по моему мнению, позволит избавиться от отдельных недостатков этих изменений. А именно, избежать переобучения, но ускорить обучение (то есть ускорить dropout через BatchNorm, сохранив качество работы на неизвестных данных).