# MNIST without Acquisition function

Based on [this GitHub notebook](https://github.com/jiuntian/pytorch-mnist-example/blob/master/pytorch-mnist.ipynb).

In [None]:
import torch
import torchvision

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Mini-batch sizes for the training and evaluation loaders.
batch_size_train = 512
batch_size_test = 1024

# Per-image preprocessing: convert to a tensor, then normalise with the
# (mean, std) pair below (presumably the MNIST training-set statistics).
image_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
])

# Training and test splits; files are downloaded into 'dataset/' if missing.
train_dataset = torchvision.datasets.MNIST(
    'dataset/', train=True, download=True, transform=image_transform)
test_dataset = torchvision.datasets.MNIST(
    'dataset/', train=False, download=True, transform=image_transform)

# Shuffling batch iterators over both splits.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size_train, shuffle=True)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size_test, shuffle=True)


## Looking at the data

In [None]:
# import library
import matplotlib.pyplot as plt
# Pull a single batch from the test loader to sanity-check the pipeline.
example_datas, labels = next(iter(test_loader))
# First image of the batch, first (only) channel -> 2-D array for imshow.
sample = example_datas[0][0]
# Display the digit together with its label.
plt.imshow(sample, cmap='gray', interpolation='none')
print("Label: " + str(labels[0]))

## initializing the neural network

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CNN(nn.Module):
    """Small convolutional classifier for single-channel 28x28 images.

    Architecture: conv(1->10, k=5) -> max-pool(2) -> ReLU ->
    conv(10->20, k=5) -> channel dropout -> max-pool(2) -> ReLU ->
    flatten(320) -> linear(320->50) -> ReLU -> dropout -> optional
    uncertainty dropout -> linear(50->10) -> log-softmax.

    Attributes:
        dropout: When True, an extra dropout layer (p=0.3) is applied before
            the last linear layer regardless of train/eval mode. It is
            toggled from outside the class to obtain stochastic forward
            passes for dropout-based uncertainty estimates.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5, stride=1)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5, stride=1)
        self.conv2_drop = nn.Dropout2d()
        # Not used by forward(); kept so the attribute stays available.
        self.conv2_drop_for_unc = nn.Dropout2d(p=0.3)
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)
        self.dropout = False

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10).

        Args:
            x: Image batch of shape (batch, 1, 28, 28); the flatten step
                below requires 20 * 4 * 4 = 320 features per sample.
        """
        x = self.conv1(x)
        x = F.max_pool2d(x, 2)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.conv2_drop(x)
        x = F.max_pool2d(x, 2)
        x = F.relu(x)
        x = x.view(-1, 320)
        x = self.fc1(x)
        x = F.relu(x)
        # F.dropout defaults to training=True; tie it to the module mode so
        # that model.eval() really yields deterministic predictions.
        x = F.dropout(x, training=self.training)
        # for the dropout based uncertainty method
        x = self.custom_dropout(x)
        x = self.fc2(x)
        # Explicit class dimension; the implicit-dim form is deprecated.
        return F.log_softmax(x, dim=1)

    def custom_dropout(self, x):
        """Apply the uncertainty dropout (p=0.3) when `self.dropout` is set.

        The dropout is forced on (training=True) independently of the
        module's train/eval mode, because it is meant to be sampled at
        inference time; with the flag off the input is returned unchanged.
        """
        if self.dropout:
            return F.dropout(x, p=0.3, training=True)
        return x

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: displayed as the cell output when run in a notebook.
device

In [None]:
## create model and optimizer
# Hyper-parameters of the SGD-with-momentum optimizer.
learning_rate = 0.01
momentum = 0.5
# Fresh network placed on the selected device, plus an optimizer over its
# parameters.
model = CNN().to(device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                      momentum=momentum)

In [None]:
from tqdm import tqdm_notebook as tqdm
##define train function
def train(model, device, train_loader, optimizer, epoch, log_interval=10000):
    """Train `model` for one pass over `train_loader`.

    The progress bar shows the running mean of the per-sample NLL loss over
    all samples seen so far in this pass.

    Args:
        model: Network returning log-probabilities (fed to `F.nll_loss`).
        device: Device the batches are moved to before the forward pass.
        train_loader: Iterable of `(data, target)` batches with a length.
        optimizer: Optimizer stepping the parameters of `model`.
        epoch: Unused; kept for interface compatibility.
        log_interval: Unused; kept for interface compatibility.
    """
    model.train()
    progress = tqdm(train_loader, total=len(train_loader))
    running_loss = 0.0
    samples_seen = 0
    for data, target in progress:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        # nll_loss returns the batch mean, so weight it by the actual batch
        # size (the last batch may be smaller) to get a true running mean.
        batch_len = data.size(0)
        running_loss += loss.item() * batch_len
        samples_seen += batch_len
        progress.set_postfix(loss=running_loss / samples_seen)
##define test function
def test(model, device, test_loader):
    """Evaluate `model` on `test_loader`, print a summary, return accuracy.

    Args:
        model: Network returning log-probabilities (fed to `F.nll_loss`).
        device: Device the batches are moved to before the forward pass.
        test_loader: Loader of `(data, target)` batches exposing `.dataset`.

    Returns:
        Fraction of correctly classified samples, relative to
        `len(test_loader.dataset)`.
    """
    model.eval()
    n_samples = len(test_loader.dataset)
    total_loss = 0
    n_correct = 0
    with torch.no_grad():
        for batch, labels in test_loader:
            batch = batch.to(device)
            labels = labels.to(device)
            log_probs = model(batch)
            # Summed (not averaged) so the mean over the dataset is exact.
            total_loss += F.nll_loss(log_probs, labels, reduction='sum').item()
            # Predicted class = index of the largest log-probability.
            predicted = log_probs.argmax(dim=1, keepdim=True)
            n_correct += predicted.eq(labels.view_as(predicted)).sum().item()
    mean_loss = total_loss / n_samples

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        mean_loss, n_correct, n_samples,
        100. * n_correct / n_samples))

    return n_correct / n_samples

In [None]:
# Baseline: train on the complete training set, evaluating after every epoch.
num_epoch = 2
for epoch in range(1, num_epoch + 1):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

# MNIST with acquisition function

In this part we train the CNN with several different acquisition functions to see which one works best.

-----------------------------

We will compare the acquisition functions by plotting the highest accuracy achieved against the number of data points requested by each acquisition function.

In [None]:
import numpy as np
import random
import pickle
from collections import Counter

# entropy

In [None]:
def entropy(model, dataset, batch_size=1028):
    """Compute the predictive entropy (in bits) of every sample in `dataset`.

    The model output is passed through a softmax over dim 1 before the
    entropy is taken, so both logits and log-probabilities are accepted.
    The model is switched to eval mode and left in that mode.

    Args:
        model: Network mapping an input batch to per-class scores.
        dataset: Dataset yielding `(input, label)` pairs; labels are ignored.
        batch_size: Number of samples scored per forward pass.

    Returns:
        1-D numpy array with one entropy value per sample, in dataset order
        (an empty array for an empty dataset).
    """
    model.eval()

    # Run on whatever device the model lives on instead of relying on a
    # module-level `device`; parameter-free models fall back to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    per_batch = []
    with torch.no_grad():
        for inputs, _ in data_loader:
            outputs = model(inputs.to(model_device))
            probabilities = F.softmax(outputs, dim=1)
            # The small epsilon keeps log2 finite for zero probabilities.
            batch_entropy = -torch.sum(probabilities * torch.log2(probabilities + 1e-10), dim=1)
            per_batch.append(batch_entropy.cpu().numpy())

    if not per_batch:
        return np.array([])
    return np.concatenate(per_batch)

In [None]:
def get_labels_from_pool_entropy(model, pool, current_labeled_dataset, amount_of_labels):
    """Move the highest-entropy samples from `pool` to the labelled set.

    Every sample in `pool` is scored with `entropy`; the `amount_of_labels`
    samples with the largest scores are appended to `current_labeled_dataset`
    (in ascending score order) and removed from `pool.data` / `pool.targets`.

    Args:
        model: Network used to score the pool.
        pool: Dataset with tensor attributes `data` and `targets` that also
            supports `len()` and integer indexing; modified in place.
        current_labeled_dataset: List extended with the selected samples.
        amount_of_labels: Number of samples to move; if it exceeds the pool
            size, the whole pool is moved.
    """
    # A slice of [-0:] would select *everything*, so handle "nothing to do"
    # explicitly.
    if amount_of_labels <= 0 or len(pool) == 0:
        return

    scores = np.asarray(entropy(model, pool))

    # argsort is ascending, so the most uncertain samples are at the end.
    chosen = np.argsort(scores)[-amount_of_labels:]

    # Read the samples before the pool is shrunk, while indices are valid.
    selected = [pool[i] for i in chosen]

    # Drop all chosen rows in one pass with a boolean mask instead of
    # re-concatenating the full tensor once per removed index.
    keep = torch.ones(len(pool.data), dtype=torch.bool, device=pool.data.device)
    keep[torch.as_tensor(chosen, dtype=torch.long, device=pool.data.device)] = False
    pool.data = pool.data[keep]
    pool.targets = pool.targets[keep]

    current_labeled_dataset.extend(selected)

    return


In [None]:
def active_learning_loop_entropy(model, device, full_labeled_dataset, optimizer, target_accuracy=0.90, growth_rate=1000, N=5, batch_size_train=512):
    """Run entropy-based active learning until `target_accuracy` is reached.

    Each round moves `growth_rate` samples from the pool to the labelled set
    via `get_labels_from_pool_entropy`, trains for `N` epochs on the labelled
    set and evaluates after every epoch on the module-level `test_loader`.
    The loop stops when the accuracy of the last epoch of a round reaches the
    target, or when the pool has no samples left.

    Args:
        model: Network to train.
        device: Device used for training and evaluation.
        full_labeled_dataset: Pool to acquire samples from; shrunk in place.
        optimizer: Optimizer over the parameters of `model`.
        target_accuracy: Test accuracy (fraction) at which to stop.
        growth_rate: Samples acquired per round.
        N: Training epochs per round.
        batch_size_train: Batch size of the training loader.

    Returns:
        Tuple `(accuracies, label_counts_per_epoch)`: `accuracies` maps the
        labelled-set size to the list of test accuracies measured at that
        size; `label_counts_per_epoch` maps the epoch index at acquisition
        time to a `{label: count}` dict of the labelled set.
    """
    active_learning_dataset = []  # Initialize the active learning dataset
    current_epoch = 0
    accuracies = dict()
    label_counts_per_epoch = dict()
    accuracy = 0.0  # defined even if a round runs zero epochs (N == 0)

    while True:
        # Without this guard the loop would spin forever once every sample
        # has been acquired and the target is still out of reach.
        if len(full_labeled_dataset) == 0:
            print(f"pool is exhausted after {current_epoch} epochs; an accuracy of {target_accuracy} was not reached")
            break

        print(f"getting {growth_rate} number of datapoints 'labeled'")
        get_labels_from_pool_entropy(model, full_labeled_dataset, active_learning_dataset, growth_rate)
        label_counts_per_epoch[current_epoch] = dict(
            Counter(label for _, label in active_learning_dataset))

        labeled_size = len(active_learning_dataset)
        print(f"Training on current dataset for {N} epochs on {labeled_size} datapoints")
        active_learning_loader = torch.utils.data.DataLoader(active_learning_dataset, batch_size=batch_size_train, shuffle=True)
        for epoch in range(current_epoch, current_epoch + N):
            train(model, device, active_learning_loader, optimizer, epoch)
            accuracy = test(model, device, test_loader)
            accuracies.setdefault(labeled_size, []).append(accuracy)
        current_epoch += N

        if accuracy >= target_accuracy:
            print(f"after {current_epoch} epochs and {len(active_learning_dataset)} number of datapoints an accuracy of {target_accuracy} is reached")
            break

    return accuracies, label_counts_per_epoch
        

       

In [None]:
# Reload the training split so the experiment starts from a complete pool
# (the acquisition step removes samples from the dataset object in place).
train_dataset = torchvision.datasets.MNIST(
    'dataset/', train=True, download=True, transform=image_transform)

# Untrained network with its own optimizer for the entropy strategy.
model1 = CNN().to(device)
optimizer = optim.SGD(model1.parameters(), lr=learning_rate, momentum=momentum)

nrpooledvalues_acc_entropy, label_counts_per_epoch_entropy = active_learning_loop_entropy(
    model1, device, train_dataset, optimizer)

# Persist both result dictionaries for later analysis.
for filename, payload in (
    ('nrpooledvalues_acc_entropy.pkl', nrpooledvalues_acc_entropy),
    ('label_counts_per_epoch_entropy.pkl', label_counts_per_epoch_entropy),
):
    with open(filename, 'wb') as file:
        pickle.dump(payload, file)

# dropout difference

In [None]:
def dropout_diff(model, dataset, batch_size=1028):
    """Score each sample by how much the uncertainty dropout changes the output.

    For every batch the model is run once with `model.dropout = True` and
    once with `model.dropout = False`; the score of a sample is the mean
    squared difference between the two outputs over dim 1. The model is put
    in eval mode and `model.dropout` is always left as False.

    Args:
        model: Network whose forward pass honours a boolean `dropout` flag.
        dataset: Dataset yielding `(input, label)` pairs; labels are ignored.
        batch_size: Number of samples scored per forward pass.

    Returns:
        Numpy array of shape `(len(dataset), 1)` in dataset order (an empty
        1-D array for an empty dataset).
    """
    model.eval()
    criterion = nn.MSELoss(reduction='none')

    # Run on whatever device the model lives on instead of relying on a
    # module-level `device`; parameter-free models fall back to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    per_batch = []
    try:
        with torch.no_grad():
            for inputs, _ in data_loader:
                inputs = inputs.to(model_device)  # transfer once, use twice

                model.dropout = True
                outputs_wdropout = model(inputs)
                model.dropout = False
                outputs_normal = model(inputs)

                loss = criterion(outputs_wdropout, outputs_normal)
                # keepdim=True keeps the (batch, 1) layout callers flatten.
                loss_per_item = torch.mean(loss, dim=1, keepdim=True)
                per_batch.append(loss_per_item.cpu().numpy())
    finally:
        # Never leave the stochastic path enabled, even if a forward pass
        # raised halfway through a batch.
        model.dropout = False

    if not per_batch:
        return np.array([])
    return np.concatenate(per_batch, axis=0)

In [None]:
def get_labels_from_dropout_diff(model, pool, current_labeled_dataset, amount_of_labels):
    """Move the samples with the largest dropout difference to the labelled set.

    Every sample in `pool` is scored with `dropout_diff`; the
    `amount_of_labels` samples with the largest scores are appended to
    `current_labeled_dataset` (in ascending score order) and removed from
    `pool.data` / `pool.targets`.

    Args:
        model: Network used to score the pool.
        pool: Dataset with tensor attributes `data` and `targets` that also
            supports `len()` and integer indexing; modified in place.
        current_labeled_dataset: List extended with the selected samples.
        amount_of_labels: Number of samples to move; if it exceeds the pool
            size, the whole pool is moved.
    """
    # A slice of [-0:] would select *everything*, so handle "nothing to do"
    # explicitly.
    if amount_of_labels <= 0 or len(pool) == 0:
        return

    # dropout_diff returns one row per sample; flatten to a 1-D score vector.
    scores = np.asarray(dropout_diff(model, pool)).reshape(-1)

    # argsort is ascending, so the largest differences are at the end.
    chosen = np.argsort(scores)[-amount_of_labels:]

    # Read the samples before the pool is shrunk, while indices are valid.
    selected = [pool[i] for i in chosen]

    # Drop all chosen rows in one pass with a boolean mask instead of
    # re-concatenating the full tensor once per removed index.
    keep = torch.ones(len(pool.data), dtype=torch.bool, device=pool.data.device)
    keep[torch.as_tensor(chosen, dtype=torch.long, device=pool.data.device)] = False
    pool.data = pool.data[keep]
    pool.targets = pool.targets[keep]

    current_labeled_dataset.extend(selected)

    return

In [None]:
def active_learning_loop_dropout_diff(model, device, full_labeled_dataset, optimizer, target_accuracy=0.90, growth_rate=1000, N=5, batch_size_train=512):
    """Run dropout-difference active learning until `target_accuracy` is reached.

    Each round moves `growth_rate` samples from the pool to the labelled set
    via `get_labels_from_dropout_diff`, trains for `N` epochs on the labelled
    set and evaluates after every epoch on the module-level `test_loader`.
    The loop stops when the accuracy of the last epoch of a round reaches the
    target, or when the pool has no samples left.

    Args:
        model: Network to train; must expose a boolean `dropout` attribute.
        device: Device used for training and evaluation.
        full_labeled_dataset: Pool to acquire samples from; shrunk in place.
        optimizer: Optimizer over the parameters of `model`.
        target_accuracy: Test accuracy (fraction) at which to stop.
        growth_rate: Samples acquired per round.
        N: Training epochs per round.
        batch_size_train: Batch size of the training loader.

    Returns:
        Tuple `(accuracies, label_counts_per_epoch)`: `accuracies` maps the
        labelled-set size to the list of test accuracies measured at that
        size; `label_counts_per_epoch` maps the epoch index at acquisition
        time to a `{label: count}` dict of the labelled set.
    """
    active_learning_dataset = []  # Initialize the active learning dataset
    current_epoch = 0
    accuracies = dict()
    label_counts_per_epoch = dict()
    accuracy = 0.0  # defined even if a round runs zero epochs (N == 0)

    while True:
        # Without this guard the loop would spin forever once every sample
        # has been acquired and the target is still out of reach.
        if len(full_labeled_dataset) == 0:
            print(f"pool is exhausted after {current_epoch} epochs; an accuracy of {target_accuracy} was not reached")
            break

        print(f"getting {growth_rate} number of datapoints 'labeled'")
        get_labels_from_dropout_diff(model, full_labeled_dataset, active_learning_dataset, growth_rate)
        label_counts_per_epoch[current_epoch] = dict(
            Counter(label for _, label in active_learning_dataset))

        labeled_size = len(active_learning_dataset)
        print(f"Training on current dataset for {N} epochs on {labeled_size} datapoints")
        active_learning_loader = torch.utils.data.DataLoader(active_learning_dataset, batch_size=batch_size_train, shuffle=True)
        # Shows whether the uncertainty dropout flag is off before training.
        print(model.dropout)
        for epoch in range(current_epoch, current_epoch + N):
            train(model, device, active_learning_loader, optimizer, epoch)
            accuracy = test(model, device, test_loader)
            accuracies.setdefault(labeled_size, []).append(accuracy)
        current_epoch += N

        if accuracy >= target_accuracy:
            print(f"after {current_epoch} epochs and {len(active_learning_dataset)} number of datapoints an accuracy of {target_accuracy} is reached")
            break

    return accuracies, label_counts_per_epoch
        

       

In [None]:
# Reload the training split so the experiment starts from a complete pool
# (the acquisition step removes samples from the dataset object in place).
train_dataset = torchvision.datasets.MNIST(
    'dataset/', train=True, download=True, transform=image_transform)

# Untrained network with its own optimizer for the dropout-difference strategy.
model2 = CNN().to(device)
optimizer = optim.SGD(model2.parameters(), lr=learning_rate, momentum=momentum)

nrpooledvalues_acc_dropout_diff, label_counts_per_epoch_dropout_diff = active_learning_loop_dropout_diff(
    model2, device, train_dataset, optimizer)

# Persist both result dictionaries for later analysis.
for filename, payload in (
    ('nrpooledvalues_acc_dropout_diff.pkl', nrpooledvalues_acc_dropout_diff),
    ('label_counts_per_epoch_dropout_diff.pkl', label_counts_per_epoch_dropout_diff),
):
    with open(filename, 'wb') as file:
        pickle.dump(payload, file)



# random

In [None]:
def get_labels_from_pool_random(model, pool, current_labeled_dataset, amount_of_labels):
    """Move uniformly sampled items from `pool` to the labelled set.

    Args:
        model: Unused; kept so all acquisition functions share a signature.
        pool: Dataset with tensor attributes `data` and `targets` that also
            supports `len()` and integer indexing; modified in place.
        current_labeled_dataset: List extended with the sampled items.
        amount_of_labels: Number of items to move; clamped to the pool size
            so a nearly empty pool no longer raises in `random.sample`.
    """
    pool_size = len(pool)
    n_samples = max(0, min(amount_of_labels, pool_size))
    indexes = random.sample(range(pool_size), n_samples)

    # Read the samples before the pool is shrunk, while indices are valid.
    data_points = [pool[i] for i in indexes]

    # Drop all sampled rows in one pass with a boolean mask instead of
    # re-concatenating the full tensor once per removed index.
    keep = torch.ones(len(pool.data), dtype=torch.bool, device=pool.data.device)
    keep[torch.as_tensor(indexes, dtype=torch.long, device=pool.data.device)] = False
    pool.data = pool.data[keep]
    pool.targets = pool.targets[keep]

    current_labeled_dataset.extend(data_points)

    return

In [None]:
def random_learning_loop(model, device, full_labeled_dataset, optimizer, target_accuracy=0.90, growth_rate=1000, N=5, batch_size_train=512):
    """Run the random-acquisition baseline until `target_accuracy` is reached.

    Each round moves `growth_rate` randomly chosen samples from the pool to
    the labelled set via `get_labels_from_pool_random`, trains for `N` epochs
    on the labelled set and evaluates after every epoch on the module-level
    `test_loader`. The loop stops when the accuracy of the last epoch of a
    round reaches the target, or when the pool has no samples left.

    Args:
        model: Network to train.
        device: Device used for training and evaluation.
        full_labeled_dataset: Pool to acquire samples from; shrunk in place.
        optimizer: Optimizer over the parameters of `model`.
        target_accuracy: Test accuracy (fraction) at which to stop.
        growth_rate: Samples acquired per round.
        N: Training epochs per round.
        batch_size_train: Batch size of the training loader.

    Returns:
        Dict mapping the labelled-set size to the list of test accuracies
        measured at that size.
    """
    active_learning_dataset = []  # Initialize the active learning dataset
    current_epoch = 0
    accuracies = dict()
    accuracy = 0.0  # defined even if a round runs zero epochs (N == 0)

    while True:
        # Without this guard the loop would spin forever once every sample
        # has been acquired and the target is still out of reach.
        if len(full_labeled_dataset) == 0:
            print(f"pool is exhausted after {current_epoch} epochs; an accuracy of {target_accuracy} was not reached")
            break

        print(f"getting {growth_rate} number of datapoints 'labeled'")
        get_labels_from_pool_random(model, full_labeled_dataset, active_learning_dataset, growth_rate)

        labeled_size = len(active_learning_dataset)
        print(f"Training on current dataset for {N} epochs on {labeled_size} datapoints")
        active_learning_loader = torch.utils.data.DataLoader(active_learning_dataset, batch_size=batch_size_train, shuffle=True)
        for epoch in range(current_epoch, current_epoch + N):
            train(model, device, active_learning_loader, optimizer, epoch)
            accuracy = test(model, device, test_loader)
            accuracies.setdefault(labeled_size, []).append(accuracy)
        current_epoch += N

        if accuracy >= target_accuracy:
            print(f"after {current_epoch} epochs and {len(active_learning_dataset)} number of datapoints an accuracy of {target_accuracy} is reached")
            break

    return accuracies
        

       

In [None]:
# The random baseline needs the same starting conditions as the other
# strategies: a complete pool, an untrained network and an optimizer bound to
# that network. Reusing `model` (already trained on the full training set)
# with the optimizer of a different network would neither train the model
# nor give a comparable curve.
train_dataset = torchvision.datasets.MNIST('dataset/',
                                           train=True,
                                           download=True,
                                           transform=image_transform)

model3 = CNN().to(device)
optimizer = optim.SGD(model3.parameters(), lr=learning_rate,
                      momentum=momentum)

nrpooledvalues_acc_random = random_learning_loop(model3, device, train_dataset, optimizer)

# Persist the accuracy curve for later analysis.
filename = 'nrpooledvalues_acc_random.pkl'
with open(filename, 'wb') as file:
    pickle.dump(nrpooledvalues_acc_random, file)
