**Использование псевдоразметки. ДЗ.**

In [1]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import random
import numpy as np

In [2]:
torch.manual_seed(123)
torch.cuda.manual_seed(123)
np.random.seed(123)
random.seed(123)
torch.backends.cudnn.deterministic = True

Начнем с загрузки датасета. Речевые данные (и модели, обучаемые на них) очень тяжелые, поэтому мы обойдемся чем-нибудь попроще.

In [3]:
pwd

'/home/kontsevaya/speech'

In [4]:
train_dataset = \
    datasets.MNIST('./data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
test_dataset = \
    datasets.MNIST('./data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [5]:
len(train_dataset), len(test_dataset)

(60000, 10000)

Итак, трейн состоит из 60000 картинок цифр. Для того, чтобы получше увидеть эффект от псевдолейблов, мы оставим только 100 этих картинок в качестве размеченных данных. Остальные 59900 будут в качестве неразмеченных. 

На масштабах 100 записей могут проявиться неприятные эффекты, если какие-то из классов не будут достаточно хорошо представлены. Чтобы этого избежать, будем аккуратно семплировать. Самый простой вариант - просто случайно разделять, пока не получится удачное разбиение.

Для начала определим удачность разбиения. Будем считать размеченный датасет хорошим, если из 100 примеров в нем есть хотя бы по 8 представителей каждого класса. Напишите функцию, которая делает такую проверку.

In [43]:
AT_LEAST = 8

def check_dataset(dataset):
    label_dict = {i: 0 for i in range(10)}
    for item in dataset:
        label = item[1]
        label_dict[label] +=1
        if all(v >= AT_LEAST for v in label_dict.values()):
            return True
    return False

In [44]:
sampling_iteration = 0
while True:
    labeled_train_dataset, unlabeled_train_dataset = torch.utils.data.random_split(train_dataset, [100, 59900])
    if check_dataset(labeled_train_dataset):
        break
    sampling_iteration += 1
    print(sampling_iteration)
print(f'Split the dataset after {sampling_iteration} resamplings')

1
2
3
4
5
6
7
8
9
10
11
12
Split the dataset after 12 resamplings


In [46]:
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=64, shuffle=False)
labeled_train_loader = torch.utils.data.DataLoader(
    labeled_train_dataset, batch_size=64, shuffle=True)
unlabeled_train_loader = torch.utils.data.DataLoader(
    unlabeled_train_dataset, batch_size=64, shuffle=False)

Теперь, когда мы получили данные, определим архитектуру сети. Возьмем простую сверточную сетку с droupout'ом.

In [47]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, kernel_size=5)
        self.conv2 = nn.Conv2d(20, 40, kernel_size=5)
        self.dropout = nn.Dropout2d(p=0.5)
        self.fc1 = nn.Linear(640, 150)
        self.fc2 = nn.Linear(150, 10)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = x.view(-1, 1, 28, 28)
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.dropout(self.conv2(x)), 2))
        x = x.view(-1, 640)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.log_softmax(x)
        return x

Опишем вспомогательные функции.

In [48]:
def train(epoch_idx, model, optimizer, train_loader, loss_func=F.nll_loss):
    model.train()
    for batch_idx, (x, target) in enumerate(train_loader):
        x, target = x.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(x)
        loss = loss_func(output, target)
        loss.backward()
        optimizer.step()

In [49]:
def test(epoch_idx, model, test_loader):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for x, target in test_loader:
            x, target = x.cuda(), target.cuda()
            output = model(x)
            test_loss += F.nll_loss(output, target, size_average=False).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).long().cpu().sum()

    test_loss /= len(test_loader.dataset)
    print('Epoch {}: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        epoch_idx, test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [50]:
def predict(model, loader):
    model.eval()
    result = []
    with torch.no_grad():
        for x, _ in loader:
            result.append(model(x.cuda()))
    return torch.cat(result)

Создадим модель и обучим ее на нашем размеченном датасете.

In [51]:
model = Net().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [52]:
for i in range(400):
    train(i, model, optimizer, labeled_train_loader)
    if i % 10 == 0:
        test(i, model, test_loader)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch 0: Average loss: 2.2824, Accuracy: 1829/10000 (18%)
Epoch 10: Average loss: 1.8417, Accuracy: 4466/10000 (45%)
Epoch 20: Average loss: 0.9893, Accuracy: 6422/10000 (64%)
Epoch 30: Average loss: 0.6253, Accuracy: 7761/10000 (78%)
Epoch 40: Average loss: 0.6356, Accuracy: 8159/10000 (82%)
Epoch 50: Average loss: 0.7345, Accuracy: 7889/10000 (79%)
Epoch 60: Average loss: 0.6410, Accuracy: 8368/10000 (84%)
Epoch 70: Average loss: 0.6432, Accuracy: 8450/10000 (84%)
Epoch 80: Average loss: 0.6390, Accuracy: 8456/10000 (85%)
Epoch 90: Average loss: 0.6925, Accuracy: 8425/10000 (84%)
Epoch 100: Average loss: 0.6618, Accuracy: 8478/10000 (85%)
Epoch 110: Average loss: 0.7043, Accuracy: 8455/10000 (85%)
Epoch 120: Average loss: 0.8320, Accuracy: 8346/10000 (83%)
Epoch 130: Average loss: 0.7793, Accuracy: 8409/10000 (84%)
Epoch 140: Average loss: 0.7055, Accuracy: 8547/10000 (85%)
Epoch 150: Average loss: 0.7872, Accuracy: 8444/10000 (84%)
Epoch 160: Average loss: 0.8246, Accuracy: 8453/100

In [67]:
predict(model, list(unlabeled_train_dataset)).shape

torch.Size([59900, 10])

Теперь попробуем побить этот результат с помощью псевдолейблов. Напишем функцию, которая принимает модель и возращает DataLoader с хард-лейблами, и запустим обучение.

In [88]:
def get_pseudo_loader(model):
    dataset = list(unlabeled_train_dataset)
    model.eval()
    with torch.no_grad():
        for i, (x, _) in enumerate(dataset):
            pred = model(x.cuda())
            dataset[i] = (x, np.argmax(pred.cpu()))
    return torch.utils.data.DataLoader(
        dataset, batch_size=64, shuffle=True)

In [89]:
model_hard = Net().cuda()
model_hard.load_state_dict(model.state_dict())
optimizer_hard = torch.optim.SGD(model_hard.parameters(), lr=0.1)

In [90]:
hard_labeled_loader = get_pseudo_loader(model)
for i in range(10):
    train(i, model_hard, optimizer_hard, hard_labeled_loader)
    train(i, model_hard, optimizer_hard, labeled_train_loader)
    test(i, model_hard, test_loader)



Epoch 0: Average loss: 0.5427, Accuracy: 8667/10000 (87%)
Epoch 1: Average loss: 0.5580, Accuracy: 8707/10000 (87%)
Epoch 2: Average loss: 0.5911, Accuracy: 8694/10000 (87%)
Epoch 3: Average loss: 0.5596, Accuracy: 8697/10000 (87%)
Epoch 4: Average loss: 0.6305, Accuracy: 8710/10000 (87%)
Epoch 5: Average loss: 0.7311, Accuracy: 8625/10000 (86%)
Epoch 6: Average loss: 0.6505, Accuracy: 8644/10000 (86%)
Epoch 7: Average loss: 0.6118, Accuracy: 8696/10000 (87%)
Epoch 8: Average loss: 0.6262, Accuracy: 8633/10000 (86%)
Epoch 9: Average loss: 0.5810, Accuracy: 8704/10000 (87%)


**Итеративная псевдоразметка.**

Мы уже видим небольшое улучшение, но можно пойти дальше.

In [91]:
model_hard_iter = Net().cuda()
model_hard_iter.load_state_dict(model.state_dict())
optimizer_hard_iter = torch.optim.SGD(model_hard_iter.parameters(), lr=0.1)

In [92]:
for i in range(20):
    hard_labeled_loader = get_pseudo_loader(model_hard_iter)
    train(i, model_hard_iter, optimizer_hard_iter, hard_labeled_loader)
    train(i, model_hard_iter, optimizer_hard_iter, labeled_train_loader)
    test(i, model_hard_iter, test_loader)

Epoch 0: Average loss: 0.5711, Accuracy: 8702/10000 (87%)
Epoch 1: Average loss: 0.6094, Accuracy: 8830/10000 (88%)
Epoch 2: Average loss: 0.5249, Accuracy: 8918/10000 (89%)
Epoch 3: Average loss: 0.6092, Accuracy: 8883/10000 (89%)
Epoch 4: Average loss: 0.6089, Accuracy: 8943/10000 (89%)
Epoch 5: Average loss: 0.5638, Accuracy: 8986/10000 (90%)
Epoch 6: Average loss: 0.6328, Accuracy: 8969/10000 (90%)
Epoch 7: Average loss: 0.5571, Accuracy: 9037/10000 (90%)
Epoch 8: Average loss: 0.5933, Accuracy: 9035/10000 (90%)
Epoch 9: Average loss: 0.6800, Accuracy: 9017/10000 (90%)
Epoch 10: Average loss: 0.6030, Accuracy: 9060/10000 (91%)
Epoch 11: Average loss: 0.6247, Accuracy: 9055/10000 (91%)
Epoch 12: Average loss: 0.6847, Accuracy: 9061/10000 (91%)
Epoch 13: Average loss: 0.7165, Accuracy: 9058/10000 (91%)
Epoch 14: Average loss: 0.6640, Accuracy: 9060/10000 (91%)
Epoch 15: Average loss: 0.7630, Accuracy: 9056/10000 (91%)
Epoch 16: Average loss: 0.6991, Accuracy: 9073/10000 (91%)
Epoch 1

**Оценивание.**

В предыдущем пункте нужно получить accuracy 91% или выше (5 баллов).

Следующие шаги:

Модифицировать функцию `get_pseudo_loader`, чтобы она могла возвращать софт-лейблы (+1 балл).

Правильно запустить обучение - в качестве лосса используем KL-дивергенцию. Получить accuracy 90% или выше. (+3 балла).

Интуитивно кажется, что модель не должна ничему учиться, т.к. ее выход будет полностью совпадать с софт-лейблами. Напишите (текстом), почему тем не менее удается сильно выиграть относительно бейзлайна. (+1 балл).

In [102]:
model_soft_iter = Net().cuda()
model_soft_iter.load_state_dict(model.state_dict())
optimizer_soft_iter = torch.optim.SGD(model_soft_iter.parameters(), lr=0.1)

In [103]:
def get_pseudo_loader(model, soft=False):
    dataset = list(unlabeled_train_dataset)
    model.eval()
    with torch.no_grad():
        for i, (x, _) in enumerate(dataset):
            pred = model(x.cuda())
            if soft:
                dataset[i] = (x, pred.cpu().squeeze())
            else:
                dataset[i] = (x, np.argmax(pred.cpu()))
    return torch.utils.data.DataLoader(
        dataset, batch_size=64, shuffle=True)

In [101]:
criterion = torch.nn.KLDivLoss(reduction='batchmean')

for i in range(20):
    soft_labeled_loader = get_pseudo_loader(model_soft_iter, soft=True)
    train(i, model_soft_iter, optimizer_soft_iter, soft_labeled_loader, loss_func=criterion)
    train(i, model_soft_iter, optimizer_soft_iter, labeled_train_loader)
    test(i, model_soft_iter, test_loader)

Epoch 0: Average loss: 0.8990, Accuracy: 8538/10000 (85%)
Epoch 1: Average loss: 0.9005, Accuracy: 8479/10000 (85%)
Epoch 2: Average loss: 0.8583, Accuracy: 8558/10000 (86%)
Epoch 3: Average loss: 0.8596, Accuracy: 8556/10000 (86%)
Epoch 4: Average loss: 0.8613, Accuracy: 8561/10000 (86%)
Epoch 5: Average loss: 0.8625, Accuracy: 8554/10000 (86%)
Epoch 6: Average loss: 0.8713, Accuracy: 8553/10000 (86%)
Epoch 7: Average loss: 0.8674, Accuracy: 8562/10000 (86%)
Epoch 8: Average loss: 0.8914, Accuracy: 8560/10000 (86%)
Epoch 9: Average loss: 0.8268, Accuracy: 8550/10000 (86%)
Epoch 10: Average loss: 0.8276, Accuracy: 8550/10000 (86%)
Epoch 11: Average loss: 0.8300, Accuracy: 8550/10000 (86%)
Epoch 12: Average loss: 0.8299, Accuracy: 8556/10000 (86%)
Epoch 13: Average loss: 0.8937, Accuracy: 8482/10000 (85%)
Epoch 14: Average loss: 0.8544, Accuracy: 8526/10000 (85%)
Epoch 15: Average loss: 0.8503, Accuracy: 8539/10000 (85%)
Epoch 16: Average loss: 0.8524, Accuracy: 8538/10000 (85%)
Epoch 1

In [105]:
criterion = torch.nn.KLDivLoss(reduction='sum')

for i in range(20):
    soft_labeled_loader = get_pseudo_loader(model_soft_iter, soft=True)
    train(i, model_soft_iter, optimizer_soft_iter, soft_labeled_loader, loss_func=criterion)
    train(i, model_soft_iter, optimizer_soft_iter, labeled_train_loader)
    test(i, model_soft_iter, test_loader)

Epoch 0: Average loss: 0.9000, Accuracy: 8545/10000 (85%)
Epoch 1: Average loss: 0.8910, Accuracy: 8566/10000 (86%)
Epoch 2: Average loss: 0.8929, Accuracy: 8569/10000 (86%)
Epoch 3: Average loss: 0.9373, Accuracy: 8445/10000 (84%)
Epoch 4: Average loss: 0.9397, Accuracy: 8447/10000 (84%)
Epoch 5: Average loss: 0.9428, Accuracy: 8450/10000 (84%)
Epoch 6: Average loss: 0.9515, Accuracy: 8454/10000 (85%)
Epoch 7: Average loss: 0.9446, Accuracy: 8459/10000 (85%)
Epoch 8: Average loss: 0.9502, Accuracy: 8461/10000 (85%)
Epoch 9: Average loss: 0.9476, Accuracy: 8478/10000 (85%)
Epoch 10: Average loss: 0.9488, Accuracy: 8479/10000 (85%)
Epoch 11: Average loss: 0.9482, Accuracy: 8479/10000 (85%)
Epoch 12: Average loss: 0.9474, Accuracy: 8480/10000 (85%)
Epoch 13: Average loss: 0.9432, Accuracy: 8490/10000 (85%)
Epoch 14: Average loss: 0.9431, Accuracy: 8491/10000 (85%)
Epoch 15: Average loss: 0.9479, Accuracy: 8491/10000 (85%)
Epoch 16: Average loss: 0.9482, Accuracy: 8492/10000 (85%)
Epoch 1