In [1]:
# This program takes the importance-weight estimate w produced by RLLS (see Anqi Liu's work)
# and combines it with an active-learning strategy to reduce labeling cost.

In [1]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torchvision import datasets, transforms
import numpy as np
from mnist_for_labelshift import MNIST_SHIFT
from cifar10_for_labelshift import CIFAR10_SHIFT
import torchvision
from resnet import *
import cvxpy as cp
from sklearn.metrics import f1_score
import os
import copy
import tensorflow as tf
import matplotlib.pyplot as plt
import heapq

In [2]:
# Global experiment configuration, read by name in the helper functions and cells below.
log_interval = 1000  # train()/train_al() print the loss every `log_interval` batches
use_cuda = 0 # we use cpu
device = torch.device("cuda" if use_cuda else "cpu")
lr = 0.01  # SGD learning rate
momentum = 0.5  # SGD momentum
iterations = 1  # reassigned in the active-learning cell before it is used
sigma = 0  # forwarded to MNIST_SHIFT when building the h_0 dataset
batch_size = 64
test_batch_size = 1000  # not referenced by any of the visible cells
shift_type = 2  # forwarded to MNIST_SHIFT together with shift_para
shift_para = 0.2
shift_para_aux = None
#model = 'MLP'
epochs_estimation = 1  # epochs used to train h_0 for weight estimation
epochs_training = 1
epochs_validation = 1  # train_validate_test validates from this epoch onward
labda = [1.0]
data_name = 'mnist'
n_class = 10
# Extra DataLoader options; only populated when CUDA is enabled.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

In [3]:
# # test 1
# training_size = 30000
# testing_size = 15000
# testsize_range = [5000]

In [4]:
# Split sizes handed to MNIST_SHIFT; testsize_range[0] is the number of test
# samples used when estimating mu_y in the importance-weight cell.
training_size = 20000
testing_size = 15000
testsize_range = [5000]

In [5]:
# models
class Net(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) applied to flattened input.

    Args:
        D_in: number of input features after flattening.
        H: width of the hidden layer.
        D_out: number of output logits.
    """

    def __init__(self, D_in, H, D_out):
        super(Net, self).__init__()
        self.D_in, self.H, self.D_out = D_in, H, D_out
        # Layers are created in forward order so parameter initialisation
        # consumes the random generator in the same sequence as the layers run.
        hidden_layer = nn.Linear(D_in, H)
        output_layer = nn.Linear(H, D_out)
        self.model = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        """Flatten `x` to rows of `D_in` features and return the raw logits."""
        flat = x.view(-1, self.D_in)
        return self.model(flat)

class ConvNet(nn.Module):
    """Small CNN: two conv+pool stages followed by three fully connected layers.

    The flatten step expects 16 feature maps of size 5x5 per sample, which is
    what 3-channel 32x32 inputs produce after the two conv/pool stages.
    """

    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the 10 raw class logits for a batch of images."""
        # Feature extractor: conv -> ReLU -> shared 2x2 max-pool, twice.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        x = x.view(-1, 16 * 5 * 5)
        # Classifier head: ReLU after the first two dense layers only.
        for dense in (self.fc1, self.fc2):
            x = F.relu(dense(x))
        return self.fc3(x)

In [6]:
def train(model, device, train_loader, optimizer, epoch, weight=None):
    """Run one pass over `train_loader`, taking one optimizer step per batch.

    Args:
        model: network to optimise; switched to training mode.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable of (data, target) batches with a `.dataset`.
        optimizer: optimizer bound to `model`'s parameters.
        epoch: epoch number, used only in the progress message.
        weight: optional per-class weight tensor for the cross-entropy loss;
            None selects the unweighted loss.
    """
    model.train()
    # The loss object depends only on `weight`, so one instance serves every batch.
    criterion = nn.CrossEntropyLoss() if weight is None else nn.CrossEntropyLoss(weight)

    for batch_idx, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        # Report progress only on every `log_interval`-th batch.
        if batch_idx % log_interval != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(inputs), len(train_loader.dataset),
            100. * batch_idx / len(train_loader), loss.item()))

In [7]:
def train_al(model, device, train_loader, extra_data, optimizer, epoch, weight):
    """Train one epoch on the weighted source data, then on the queried samples.

    The first phase iterates `train_loader` with a class-weighted cross-entropy
    loss. The second phase iterates `extra_data` with the unweighted loss and
    takes one optimizer step per entry.

    Args:
        model: network to optimise; switched to training mode.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable of (data, target) batches with a `.dataset`.
        extra_data: sized sequence of (data, target) pairs; each pair is fed
            to the model as one batch.
        optimizer: optimizer bound to `model`'s parameters.
        epoch: epoch number, used only in the progress messages.
        weight: per-class weight tensor for the first-phase loss.
    """
    model.train()

    # Phase 1: importance-weighted loss on the regular training batches.
    weighted_criterion = nn.CrossEntropyLoss(weight)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = weighted_criterion(model(data), target)
        loss.backward()
        optimizer.step()

        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    # Phase 2: unweighted loss on the extra (queried) samples.
    plain_criterion = nn.CrossEntropyLoss()
    num_extra = len(extra_data)
    for extra_idx, (data, target) in enumerate(extra_data):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = plain_criterion(model(data), target)
        loss.backward()
        optimizer.step()
        # Report the real position in extra_data; the index used to be
        # hard-coded to 0, so every line claimed 0 samples / 0% progress.
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, extra_idx * len(data), num_extra,
            100. * extra_idx / num_extra, loss.item()))

    
    

In [8]:
def test(model, device, test_loader, weight=None):
    model.eval()
    test_loss = 0
    correct = 0
    prediction = np.empty([0,1])
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            #print("output type: ", type(output))
            if weight is None:
                criterion = nn.CrossEntropyLoss(reduction='sum')
            else:
                criterion = nn.CrossEntropyLoss(weight, reduction='sum')

            loss = criterion(output, target)
            test_loss += loss.item()# sum up batch loss
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability

            correct += pred.eq(target.view_as(pred)).sum().item()
            pred = pred.cpu().numpy()
            prediction = np.concatenate((prediction, pred))

    test_loss /= len(test_loader.dataset)
    print('Average loss: {:.4f}, Accuracy: {}/{} ({:.5f}%)\n'.format(test_loss, correct, len(test_loader.dataset),float(100. * correct / len(test_loader.dataset))))
    return prediction, float(100. * correct / len(test_loader.dataset)), test_loss



In [26]:
def test_query(model, device, test_loader, query_size):
    model.eval()
    test_loss = 0
    correct = 0
    prediction = np.empty([0,1])
    
    probs_list = []
    uncertainty_queue = []
    
    id_data_target = {}
    id_data = 0
    
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            #print("output type: ", type(output))
            criterion = nn.CrossEntropyLoss(reduction='sum')
            
            loss = criterion(output, target)
            test_loss += loss.item()# sum up batch loss
            
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            probs = output.max(1, keepdim=True)[0]
            
            #print("size of probs tensor = ", probs.size())
            #print("probs.cpu().numpy() = ", probs.cpu().numpy())
            sum_output_tensor = torch.sum(output, 1)
            sum_output = sum_output_tensor.cpu().numpy()[0]
            
            prob = probs.cpu().numpy()[0][0] / sum_output
            
            heapq.heappush(probs_list, (prob, id_data))
            id_data_target[id_data] = (data, target)
            id_data += 1
            
            correct += pred.eq(target.view_as(pred)).sum().item()
            pred = pred.cpu().numpy()
            prediction = np.concatenate((prediction, pred))
    
    for i in range(query_size):
        prob, id_data = heapq.heappop(probs_list)
        uncertainty_queue.append(id_data_target[id_data])
    
    test_loss /= len(test_loader.dataset)
    print('Average loss: {:.4f}, Accuracy: {}/{} ({:.5f}%)\n'.format(test_loss, correct, len(test_loader.dataset),float(100. * correct / len(test_loader.dataset))))
    return prediction, 100. * correct / len(test_loader.dataset), test_loss, uncertainty_queue
    



In [10]:
# compute weight for Q(y)/P(y) = w
def compute_w_opt(C_yy, mu_y, mu_train_y, rho, labda=1):
    """Estimate the label-shift importance weights w = 1 + labda * theta.

    Solves
        minimize   pnorm(C_yy @ theta - b) + rho * pnorm(theta)
        subject to theta >= -1
    with b = mu_y - mu_train_y, using cp.pnorm at its default order.

    Args:
        C_yy: square array; its first dimension sets the size of theta.
        mu_y: predicted label distribution on the target data.
        mu_train_y: predicted label distribution on the training data.
        rho: regularisation strength on theta.
        labda: scale applied to theta when forming w.

    Returns:
        Array of estimated weights, one per class.

    Raises:
        RuntimeError: if the solver leaves theta without a value.
    """
    n = C_yy.shape[0]
    theta = cp.Variable(n)
    b = mu_y - mu_train_y
    # `@` is the matrix product; using `*` for it is deprecated since CVXPY 1.1.
    objective = cp.Minimize(cp.pnorm(C_yy @ theta - b) + rho * cp.pnorm(theta))
    constraints = [-1 <= theta]
    prob = cp.Problem(objective, constraints)

    prob.solve()
    if theta.value is None:
        # Without this check the arithmetic below fails with an opaque TypeError.
        raise RuntimeError(
            'compute_w_opt: solver returned no solution (status: {})'.format(prob.status))
    w = 1 + theta.value * labda

    print('Estimated w is', w)

    return w

In [11]:
def compute_true_w(train_labels, test_labels, n_class, m_train, m_test):
    """Return the true per-class weights: test label frequency / train label frequency.

    Args:
        train_labels: array of training labels.
        test_labels: array of test labels.
        n_class: number of classes; labels are compared against 0..n_class-1.
        m_train: divisor for the training counts.
        m_test: divisor for the test counts.

    Returns:
        Array of length `n_class` with the frequency ratio per class.
    """
    def label_frequencies(labels, total):
        # Fraction of `labels` equal to each class index, one class at a time.
        return np.array([float(np.count_nonzero(labels == cls)) / total
                         for cls in range(n_class)])

    mu_y_train = label_frequencies(train_labels, m_train)
    mu_y_test = label_frequencies(test_labels, m_test)
    true_w = mu_y_test / mu_y_train
    print('True w is', true_w)
    return true_w

In [12]:
def compute_3deltaC(n_class, n_train, delta):
    """Return 3 * (L / (3 * n_train) + sqrt(L / n_train)) with L = 2 * log(2 * n_class / delta)."""
    log_term = 2*np.log(2*n_class/delta)
    linear_part = log_term/(3*n_train)
    root_part = np.sqrt(log_term/n_train)
    return 3*(linear_part + root_part)

In [13]:
def choose_alpha(n_class, C_yy, mu_y, mu_y_train, rho, true_w):
    """Pick the regularisation scale whose estimated weights are closest to `true_w`.

    For each candidate scale, the weights are estimated with
    `compute_w_opt(..., scale * rho)` and compared with `true_w` by mean
    squared error over the classes.

    Args:
        n_class: number of classes (length of each weight vector).
        C_yy, mu_y, mu_y_train: forwarded unchanged to `compute_w_opt`.
        rho: base regularisation strength multiplied by each candidate.
        true_w: reference weight vector of length `n_class`.

    Returns:
        The candidate scale with the smallest mean squared error.
    """
    alpha = [10, 1, 0.1, 0.01, 0.001, 0.0001]
    w2 = np.zeros((len(alpha), n_class))
    for row, scale in enumerate(alpha):
        w2[row, :] = compute_w_opt(C_yy, mu_y, mu_y_train, scale * rho)
    # Broadcasting subtracts true_w from every row. This replaces
    # np.matlib.repmat, which needs `import numpy.matlib` that this file
    # never performs.
    mse2 = np.sum(np.square(true_w - w2), 1) / n_class
    best = np.argmin(mse2)
    print("choose_alpha - mse2, ", mse2)
    return alpha[best]

In [14]:
def acc_perclass(y, predictions, n_class):
    """Return the per-class accuracy of `predictions` against the labels `y`.

    Args:
        y: array of true labels.
        predictions: sequence of arrays that `np.concatenate` joins into one
            flat prediction vector (for example an (N, 1) array).
        n_class: number of classes; classes are 0..n_class-1.

    Returns:
        Array of length `n_class`; classes with no sample in `y` get 0.
    """
    flat_predictions = np.concatenate(predictions)
    acc = np.zeros(n_class)

    for cls in range(n_class):
        is_cls = (y == cls)
        support = float(len(np.where(is_cls)[0]))
        if support == 0:
            # No sample of this class: keep the 0 already stored.
            continue
        hits = float(len(np.where((flat_predictions == cls) & is_cls)[0]))
        acc[cls] = hits / support
    return acc

In [15]:
def train_validate_test(device, use_cuda, w, train_model, init_state, train_loader, test_loader, validate_loader, test_labels, n_class):
    """Train with class weights `w`, keep the best validated model, then test it.

    The model is reset to `init_state` and trained for `epochs_training`
    epochs with the weighted loss. From epoch `epochs_validation` onward it is
    validated, and the weights with the lowest validation loss so far are
    saved to ./checkpoint/ckpt.pt. The saved weights are reloaded for testing.

    Args:
        device: device passed to train()/test().
        use_cuda: when truthy, `w` and the model are moved to CUDA.
        w: per-class weights (anything torch.tensor accepts).
        train_model: the network to train.
        init_state: state dict loaded into `train_model` before training.
        train_loader, test_loader, validate_loader: data loaders for each phase.
        test_labels: true labels of the test set, used for the F1 scores.
        n_class: number of classes for the per-class accuracy.

    Returns:
        (acc, f1_macro, f1_micro, acc_per_class).
    """
    w = torch.tensor(w)
    train_model.load_state_dict(init_state)
    if use_cuda:
        w = w.cuda().float()
        train_model.cuda()
    else:
        w = w.float()

    # Start from +inf so the first validated epoch always produces a
    # checkpoint; a fixed threshold could leave this run without one and the
    # reload below would pick up a stale file from an earlier run.
    best_loss = float('inf')
    checkpoint_path = './checkpoint/ckpt.pt'
    checkpoint_written = False
    # model = train_model.to(device)#ConvNet().to(device)
    print("train_validate_test --- best_loss = ", best_loss)
    optimizer = optim.SGD(train_model.parameters(), lr=lr, momentum=momentum, weight_decay=5e-4)
    for epoch in range(1, epochs_training + 1):
        print("i am in for loop for train_validate_test!!!")
        train(train_model, device, train_loader, optimizer, epoch, weight=w)

        if epoch >= epochs_validation:
            # validation
            _, _, loss = test(train_model, device, validate_loader, weight=w)
            print("saving model ---- loss = ", loss)
            if loss < best_loss:
                print('saving model')
                state = {
                    'model': train_model.state_dict(),
                    }
                if not os.path.isdir('checkpoint'):
                    os.mkdir('checkpoint')
                torch.save(state, checkpoint_path)
                best_loss = loss
                checkpoint_written = True

    print('\nTesting on test set')
    if checkpoint_written:
        # read checkpoint
        print('Reading model')
        checkpoint = torch.load(checkpoint_path)
        train_model.load_state_dict(checkpoint['model'])
    else:
        # Nothing was saved in this run (no validated epoch, or a NaN loss):
        # test the weights currently in memory instead of an unrelated file.
        print('No checkpoint written in this run; testing current weights')
    predictions, acc, _ = test(train_model, device, test_loader)
    f1 = f1_score(test_labels, predictions, average='macro')
    f2 = f1_score(test_labels, predictions, average='micro')
    acc_per_class = acc_perclass(test_labels, predictions, n_class)

    print('F1-score-micro:', f2)
    print('F1-score-macro:', f1)
    return acc, f1, f2, acc_per_class

In [16]:
# Echo the experiment settings so they are recorded next to the results.
print('Shift-type:', shift_type)
print('shift parameters: ', shift_para)
#shift type


# num_paras sizes the first dimension of tw_tensor in the importance-weight cell.
num_paras = len(testsize_range)
print("num_paras = ", num_paras)
print("testsize_range = ", testsize_range)
num_labda = len(labda)
print("num_labda = ", num_labda)
print("labda = ", labda)

Shift-type: 2
shift parameters:  0.2
num_paras =  1
testsize_range =  [5000]
num_labda =  1
labda =  [1.0]


In [17]:
# Train h_0, the classifier whose predictions are later used to estimate the importance weights.

# Build the dataset (downloaded if missing), then split it into test and training parts.
# Disabled alternative that passed training_size for both size arguments:
# raw_data = MNIST_SHIFT('data/mnist', training_size, training_size, 1, sigma, target_label=2, train=True, download=True, \
#             transform=transforms.Compose([ \
#                            transforms.ToTensor(), \
#                            transforms.Normalize((0.1307,), (0.3081,)) \
#                            ]))
raw_data_h0 = MNIST_SHIFT('data/mnist', training_size, testing_size, 1, sigma, target_label=2, train=True, download=True, \
            transform=transforms.Compose([ \
                           transforms.ToTensor(), \
                           transforms.Normalize((0.1307,), (0.3081,)) \
                           ]))

# Separate into training and testing: indices [0, m_test_h0) are taken as the
# test part and [m_test_h0, m_h0) as the training part.
m_h0 = len(raw_data_h0)
print("Raw Data Size = ", m_h0)

m_test_h0 = raw_data_h0.get_testsize()
test_indices_h0 = range(m_test_h0)
test_data_h0 = data.Subset(raw_data_h0, test_indices_h0) # type dataset
print('Test Data size = ', m_test_h0)

m_train_h0 = m_h0 -  m_test_h0
train_data_h0 = data.Subset(raw_data_h0, range(m_test_h0, m_h0)) # type dataset
print('Training Data size = ', m_train_h0)

# get labels for future use
test_labels_h0 = raw_data_h0.get_test_label() # numpy array
train_labels_h0 = raw_data_h0.get_train_label() # numpy array

# base model for h_0: 784 inputs, 256 hidden units, 10 outputs
D_in = 784
base_model = Net(D_in, 256, 10)
base_model = base_model.to(device)
# Data preparation ends here. h_0 is trained on the training part only;
# this cell does not create a separate validation split.
train_loader_h0 = data.DataLoader(train_data_h0, batch_size=batch_size, shuffle=True, **kwargs)
#model = ResNet18(**kwargs).to(device)#ConvNet().to(device)
optimizer = optim.SGD(base_model.parameters(), lr=lr, momentum=momentum, weight_decay=5e-4)
print('\nTraining using training_data1, testing on training_data2 to estimate weights.')
# Unweighted training: train() is called without a `weight` argument.
for epoch in range(1, epochs_estimation + 1):
    train(base_model, device, train_loader_h0, optimizer, epoch)
print("Finish training for h_0")

Raw Data Size =  35000
Test Data size =  15000
Training Data size =  20000

Training using training_data1, testing on training_data2 to estimate weights.
Finish training for h_0


In [18]:
# Calculate importance weights from h_0's predictions on the shifted dataset.

# One row of true weights per entry of testsize_range.
# NOTE(review): the column count 10 is hard-coded instead of using n_class.
tw_tensor = torch.zeros([num_paras, 10])
# assuming training size >> testing size
#raw_data = MNIST_SHIFT('data/mnist', training_size, training_size, shift_type, shift_para, parameter_aux = shift_para_aux,target_label=2, train=True, download=True,
raw_data = MNIST_SHIFT('data/mnist', training_size, testing_size, shift_type, shift_para, parameter_aux = shift_para_aux,target_label=2, train=True, download=True,
        transform=transforms.Compose([
                   transforms.ToTensor(),
                   transforms.Normalize((0.1307,), (0.3081,))
                   ]))
# Model for the importance-weighted training stage.
# NOTE(review): train_model_w and init_state are created here but not used by any visible cell.
D_in = 784
train_model_w = Net(D_in, 256, 10)
train_model_w = train_model_w.to(device)
init_state = copy.deepcopy(train_model_w.state_dict())

# Separate into training and testing: indices [0, m_test) are taken as the
# test part and [m_test, m) as the training part.
m = len(raw_data)
print("All Data Size", m)

m_test = raw_data.get_testsize()
print('All Test Size,', m_test)
test_indices = range(m_test)

m_train = m -  m_test
print('All Train Size, ', m_train)

range_m_test = testsize_range[0]
print('test_size_range = ', range_m_test)

range_test_data = data.Subset(raw_data, test_indices[0 : testsize_range[0]]) # the first testsize_range[0] test samples
train_data = data.Subset(raw_data, range(m_test, m))

# get labels for future use
range_test_labels = raw_data.get_test_label()[0 : testsize_range[0]] # labels of the first testsize_range[0] test samples
print("test_labels shape = ", range_test_labels.shape)

train_labels = raw_data.get_train_label() # labels of all train data
print("train_labels shape = ", train_labels.shape)

# Data preparation ends here.
# Run h_0 (base_model) over the training part, unshuffled so that the
# predictions line up with train_labels.
test_loader_train_data = data.DataLoader(train_data, batch_size=batch_size, shuffle=False, **kwargs) # all train data
print('\nTesting on training_data2 to estimate C_yy.')
predictions, acc, _ = test(base_model, device, test_loader_train_data)
# compute C_yy: C_yy[i, j] = fraction of training samples predicted as i whose label is j
C_yy = np.zeros((n_class, n_class))
#print(m_train_v)
# test() returns an (N, 1) array; flatten it to (N,) so it compares elementwise with train_labels.
predictions = np.concatenate(predictions)

for i in range(n_class):
    for j in range(n_class):
        C_yy[i,j] = float(len(np.where((predictions== i)&(train_labels==j))[0]))/m_train

# Predicted label distribution of h_0 on the training part.
mu_y_train_hat = np.zeros(n_class)
for i in range(n_class):
    mu_y_train_hat[i] = float(len(np.where(predictions == i)[0]))/m_train

# print(mu_y_train)
# print(C_yy)
# prediction on x_test to estimate mu_y
print('\nTesting on test data to estimate mu_y.')
range_test_loader = data.DataLoader(range_test_data, batch_size=batch_size, shuffle=False, **kwargs)
predictions, acc, _ = test(base_model, device, range_test_loader)
# predictions is left as an (N, 1) array here; np.where(...)[0] still yields
# one row index per matching sample, so the counts are correct.
mu_y = np.zeros(n_class)
for i in range(n_class):
    mu_y[i] = float(len(np.where(predictions == i)[0]))/range_m_test
# print(mu_y)

# Regularisation strength: rho scaled by a fixed alpha (choose_alpha() is not called here).
rho = compute_3deltaC(n_class, m_train, 0.05)
alpha = 0.0001
w2 = compute_w_opt(C_yy, mu_y, mu_y_train_hat, alpha * rho)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("Our weight: ", w2)
print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

# Compare against the true weights, computed from the labels of the whole test part.
# All test
test_data = data.Subset(raw_data, test_indices)
#print("test_data size: ", len(test_data))
test_labels = raw_data.get_test_label()
m_test = raw_data.get_testsize()
test_loader = data.DataLoader(test_data, batch_size=batch_size, shuffle=False, **kwargs)
# compute the true w
true_w = compute_true_w(train_labels, test_labels, n_class, m_train, m_test)
tw_tensor[0,:] = torch.tensor(true_w)
#print('True w is', true_w)
mse2 = np.sum(np.square(true_w - w2))/n_class
print('Mean square error of (true_w, w2) = ', mse2)

All Data Size 35000
All Test Size, 15000
All Train Size,  20000
test_size_range =  5000
test_labels shape =  (5000,)
train_labels shape =  (20000,)

Testing on training_data2 to estimate C_yy.
Average loss: 0.3880, Accuracy: 17780/20000 (88.90000%)


Testing on test data to estimate mu_y.
Average loss: 0.3744, Accuracy: 4452/5000 (89.04000%)

Estimated w is [1.04082098 1.28267475 0.49072941 1.08733626 1.08435536 1.14561165
 1.13243625 1.15993498 1.08858884 1.11248658]
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Our weight:  [1.04082098 1.28267475 0.49072941 1.08733626 1.08435536 1.14561165
 1.13243625 1.15993498 1.08858884 1.11248658]
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
True w is [1.13363029 1.25636672 0.49188325 1.13322091 1.09420692 1.02867784
 1.11723879 1.13106525 1.10644734 1.13081122]
Mean square error of (true_w, w2) =  0.0026902123403365315


In [19]:
# Separate into training and testing, then train a fresh base model for the active-learning stage.
m = len(raw_data) # all data
print("All Data Size, ", m)

m_test = raw_data.get_testsize() # all test data
print('All Test Size,', m_test)
test_indices = range(m_test)

test_data = data.Subset(raw_data, range(m_test))
test_labels = raw_data.get_test_label()
# batch_size=1: every batch from this loader holds a single sample.
# shuffle=True: samples are visited in random order, not in the order of test_labels.
test_loader = data.DataLoader(test_data, batch_size=1, shuffle=True, **kwargs)

train_data = data.Subset(raw_data, range(m_test, m)) # all train data (all data - all test data)
train_labels = raw_data.get_train_label()
# batch_size=1 here means one optimizer step per training sample.
train_loader = data.DataLoader(train_data, batch_size=1, shuffle=True, **kwargs)

# Train the base model. This rebinds base_model and optimizer, replacing the
# h_0 model and its optimizer created in the earlier cell.
D_in = 784
base_model = Net(D_in, 256, 10)
base_model = base_model.to(device)
#model = ResNet18(**kwargs).to(device)#ConvNet().to(device)
optimizer = optim.SGD(base_model.parameters(), lr=lr, momentum=momentum, weight_decay=5e-4)
# Unweighted training: train() is called without a `weight` argument.
for epoch in range(1, epochs_training + 1):
    train(base_model, device, train_loader, optimizer, epoch)
print("Finish training for base model")

All Data Size,  35000
All Test Size, 15000
Finish training for base model


In [20]:
# query_size = 20
# prediction, accuracy, test_loss, uncertainty_queue = test_query(base_model, device, test_loader, query_size)
# extra_data = []
# for item in uncertainty_queue:
#     data = item[0]
#     target = item[1]
#     extra_data.append((data, target))
# w2_tensor = torch.from_numpy(w2).float()
# train_al(base_model, device, train_loader, extra_data, optimizer, epoch, w2_tensor)


In [21]:
# prediction, accuracy, test_loss, uncertainty_queue = test_query(base_model, device, test_loader, query_size)

In [22]:
# query_size = 20
# #prediction, accuracy, test_loss, uncertainty_queue = test_query(base_model, device, test_loader, query_size)
# extra_data = []
# for item in uncertainty_queue:
#     data = item[0]
#     target = item[1]
#     extra_data.append((data, target))
# w2_tensor = torch.from_numpy(w2).float()
# train_al(base_model, device, train_loader, extra_data, optimizer, epoch, w2_tensor)



In [23]:
# prediction, accuracy, test_loss, uncertainty_queue = test_query(base_model, device, test_loader, query_size)

In [24]:
# # test 1
# query_size = 20
# iterations = 5
# for i in range(iterations):
#     print("Current iteration = ", i)
#     prediction, accuracy, test_loss, uncertainty_queue = test_query(base_model, device, test_loader, query_size)
#     extra_data = []
#     for item in uncertainty_queue:
#         data = item[0]
#         target = item[1]
#         extra_data.append((data, target))
#     w2_tensor = torch.from_numpy(w2).float()
#     train_al(base_model, device, train_loader, extra_data, optimizer, epoch, w2_tensor)
    
# prediction, accuracy, test_loss, uncertainty_queue = test_query(base_model, device, test_loader, query_size)


In [27]:
# Active-learning loop: each round evaluates the model on the test pool, takes
# the `query_size` least-confident samples, and retrains on the weighted
# training data plus those samples.
query_size = 50
iterations = 10
# The estimated importance weights do not change between rounds; convert once.
w2_tensor = torch.from_numpy(w2).float()
for i in range(iterations):
    print("Current iteration = ", i)
    prediction, accuracy, test_loss, uncertainty_queue = test_query(base_model, device, test_loader, query_size)
    # Copy the queried (sample, label) pairs. No top-level variable named
    # `data` is assigned here, because that would overwrite the
    # `torch.utils.data as data` alias that the earlier cells depend on.
    extra_data = [(sample, label) for sample, label in uncertainty_queue]
    # Pass the 1-based round number for logging rather than relying on an
    # `epoch` loop variable left over from another cell.
    train_al(base_model, device, train_loader, extra_data, optimizer, i + 1, w2_tensor)

# Final evaluation after the last round.
prediction, accuracy, test_loss, uncertainty_queue = test_query(base_model, device, test_loader, query_size)



Current iteration =  0
Average loss: 0.3782, Accuracy: 13633/15000 (90.88667%)

Current iteration =  1
Average loss: 0.3783, Accuracy: 13476/15000 (89.84000%)

Current iteration =  2
Average loss: 0.2966, Accuracy: 13920/15000 (92.80000%)

Current iteration =  3


Average loss: 0.5648, Accuracy: 13414/15000 (89.42667%)

Current iteration =  4
Average loss: 0.3161, Accuracy: 13825/15000 (92.16667%)

Current iteration =  5
Average loss: 0.3997, Accuracy: 13785/15000 (91.90000%)

Current iteration =  6


Average loss: 0.3201, Accuracy: 14018/15000 (93.45333%)

Current iteration =  7
Average loss: 0.4211, Accuracy: 13593/15000 (90.62000%)

Current iteration =  8
Average loss: 0.2914, Accuracy: 13885/15000 (92.56667%)

Current iteration =  9


Average loss: 0.2660, Accuracy: 14100/15000 (94.00000%)

Average loss: 0.2326, Accuracy: 14150/15000 (94.33333%)

