In [1]:
from __future__ import print_function
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import random
import os
import argparse
import numpy as np
from PreResNet_dlh import *
from sklearn.mixture import GaussianMixture
import dataloader_cifar_dlh0617 as dataloader

import time
import datetime

In [2]:
parser = argparse.ArgumentParser(description='PyTorch CIFAR Training')
parser.add_argument('--batch_size', default=64, type=int, help='train batchsize') 
parser.add_argument('--lr', '--learning_rate', default=0.02, type=float, help='initial learning rate')
parser.add_argument('--noise_mode',  default='sym')
parser.add_argument('--alpha', default=4, type=float, help='parameter for Beta')
parser.add_argument('--lambda_u', default=25, type=float, help='weight for unsupervised loss')
parser.add_argument('--p_threshold', default=0.5, type=float, help='clean probability threshold')
parser.add_argument('--T', default=0.5, type=float, help='sharpening temperature')
parser.add_argument('--num_epochs', default=300, type=int)
parser.add_argument('--t_w', default=10, type=int)

parser.add_argument('--r', default=0.8, type=float, help='noise ratio')
parser.add_argument('--id', default='')
parser.add_argument('--seed', default=123)
parser.add_argument('--gpuid', default=0, type=int)
parser.add_argument('--data_path', default='./cifar-10', type=str, help='path to dataset')
parser.add_argument('--dataset', default='cifar10', type=str)

args = parser.parse_args(args = ['--data_path', 'data/CIFAR100',
                                 '--dataset', 'cifar100',
                                 '--t_w', '30',
                                 '--lambda_u','150',
                                 '--lr','0.02',
                                 '--noise_mode','asym',
                                 '--r','0.4',
                                 '--batch_size','64',
                                 '--num_epochs','500'])


In [3]:
torch.cuda.set_device(args.gpuid)
random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

In [4]:
samples = 50000
test_samples = 10000
if args.dataset == 'cifar10':
    n_class = 10
elif args.dataset == 'cifar100':
    n_class = 100
else:
    raise
t_w = args.t_w
feature_num = 512

In [5]:
def train(epoch,net,net2,optimizer,labeled_trainloader,unlabeled_trainloader, loss_x):
    net.train()
    net2.eval() #fix one network and train the other    
    feature_temp = np.zeros((samples, feature_num))
    
    if not isinstance(loss_x, torch.Tensor):
        loss_x = torch.from_numpy(loss_x)
    loss_x = (loss_x - loss_x.min()) / (loss_x.max() - loss_x.min())
    mask_rand = torch.logical_and(torch.rand(len(loss_x),) >= loss_x, torch.rand(len(loss_x),) < 0.0)
    
    unlabeled_train_iter = iter(unlabeled_trainloader)    
    num_iter = (len(labeled_trainloader.dataset)//args.batch_size)+1
    for batch_idx, (ind_x, inputs_x, inputs_x2, labels_x, w_x) in enumerate(labeled_trainloader):      
        try:
            ind_u, inputs_u, inputs_u2 = unlabeled_train_iter.next()
        except:
            unlabeled_train_iter = iter(unlabeled_trainloader)
            ind_u, inputs_u, inputs_u2 = unlabeled_train_iter.next()                 
        batch_size = inputs_x.size(0)
        
        
        label_rand = torch.randint(low=0, high = n_class, size=labels_x.size())
        labels_x = torch.where(mask_rand[ind_x], label_rand, labels_x)
        # Transform label to one-hot
        labels_x = torch.zeros(batch_size, n_class).scatter_(1, labels_x.view(-1,1), 1)         
        
        w_x = torch.minimum(torch.maximum(w_x,torch.tensor(0.)),torch.tensor(1.))
        w_x = w_x.view(-1,1).type(torch.FloatTensor) 

        inputs_x, inputs_x2, labels_x, w_x = inputs_x.cuda(), inputs_x2.cuda(), labels_x.cuda(), w_x.cuda()
        inputs_u, inputs_u2 = inputs_u.cuda(), inputs_u2.cuda()

        with torch.no_grad():
            # label co-guessing of unlabeled samples
            fe_u11, outputs_u11 = net(inputs_u)
            fe_u12, outputs_u12 = net(inputs_u2)
            fe_u21, outputs_u21 = net2(inputs_u)
            fe_u22, outputs_u22 = net2(inputs_u2)            
            
            pu = (torch.softmax(outputs_u11, dim=1) + torch.softmax(outputs_u12, dim=1) +
                  torch.softmax(outputs_u21, dim=1) + torch.softmax(outputs_u22, dim=1)) / 4       
            ptu = pu**(1/args.T) # temparature sharpening
            
            targets_u = ptu / ptu.sum(dim=1, keepdim=True) # normalize
            targets_u = targets_u.detach()       
            
            # label refinement of labeled samples
            fe_x, outputs_x = net(inputs_x)
            fe_x2, outputs_x2 = net(inputs_x2)            
            
            px = (torch.softmax(outputs_x, dim=1) + torch.softmax(outputs_x2, dim=1)) / 2
            px = w_x*labels_x + (1-w_x)*px              
            ptx = px**(1/args.T) # temparature sharpening 
                       
            targets_x = ptx / ptx.sum(dim=1, keepdim=True) # normalize           
            targets_x = targets_x.detach()       
        feature_temp[ind_x] = fe_x.cpu().detach().numpy()
        # mixmatch
        l = np.random.beta(args.alpha, args.alpha)        
        l = max(l, 1-l)
                
        all_inputs = torch.cat([inputs_x, inputs_x2, inputs_u, inputs_u2], dim=0)
        all_targets = torch.cat([targets_x, targets_x, targets_u, targets_u], dim=0)

        idx = torch.randperm(all_inputs.size(0))

        input_a, input_b = all_inputs, all_inputs[idx]
        target_a, target_b = all_targets, all_targets[idx]
        
        mixed_input = l * input_a + (1 - l) * input_b        
        mixed_target = l * target_a + (1 - l) * target_b
                
        feature, logits = net(mixed_input)
        logits_x = logits[:batch_size*2]
        logits_u = logits[batch_size*2:]        
           
        Lx, Lu, lamb = criterion(logits_x, mixed_target[:batch_size*2], logits_u, mixed_target[batch_size*2:], epoch+batch_idx/num_iter, t_w)
        
        # regularization
        prior = torch.ones(n_class)/n_class
        prior = prior.cuda()        
        pred_mean = torch.softmax(logits, dim=1).mean(0)
        penalty = torch.sum(prior*torch.log(prior/pred_mean))

        loss_o = loss_ortho(fe_x)
        
        
        loss = Lx + lamb * Lu  + penalty + loss_o * 1e1
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        sys.stdout.write('\r')
        sys.stdout.write('%s:%.1f-%s | Epoch [%3d/%3d] Iter[%3d/%3d]\t Labeled loss: %.2f  Unlabeled loss: %.2f, loss_o : %.2f'
                %(args.dataset, args.r, args.noise_mode, epoch, args.num_epochs, batch_idx+1, num_iter, Lx.item(), Lu.item(), loss_o.item()))
        sys.stdout.flush()
    return feature_temp
    

def warmup(epoch,net,optimizer,dataloader):
    net.train()
    num_iter = (len(dataloader.dataset)//dataloader.batch_size)+1
    for batch_idx, (ind, inputs, labels, ) in enumerate(dataloader):      
        inputs, labels = inputs.cuda(), labels.cuda() 
        optimizer.zero_grad()
        _, outputs = net(inputs)               
        loss = CEloss(outputs, labels)      
        if args.noise_mode=='asym':  # penalize confident prediction for asymmetric noise
            penalty = conf_penalty(outputs)
            L = loss + penalty      
        elif args.noise_mode=='sym':   
            L = loss
        L.backward()  
        optimizer.step() 

        sys.stdout.write('\r')
        sys.stdout.write('%s:%.1f-%s | Epoch [%3d/%3d] Iter[%3d/%3d]\t CE-loss: %.4f'
                %(args.dataset, args.r, args.noise_mode, epoch, args.num_epochs, batch_idx+1, num_iter, loss.item()))
        sys.stdout.flush()

def test(epoch,net1,net2):
    net1.eval()
    net2.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (ind, inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.cuda(), targets.cuda()
            fes1, outputs1 = net1(inputs)
            fes2, outputs2 = net2(inputs)           
            outputs = outputs1+outputs2
            _, predicted = torch.max(outputs, 1)            
                       
            total += targets.size(0)
            correct += predicted.eq(targets).cpu().sum().item()                 
    acc = 100.*correct/total
    print("\n| Test Epoch #%d\t Accuracy: %.2f%%\n" %(epoch,acc))  
    test_log.write('Epoch:%d   Accuracy:%.2f\n'%(epoch,acc))
    test_log.flush()  


def eval_train_dlh(model, all_loss, noise_label):    
    model.eval()
    Y_onehot = np.eye(n_class)[noise_label].astype(np.float32)
    feature_temp = np.zeros((samples, feature_num), dtype = np.float32)
    
    score = torch.zeros(samples)   
    with torch.no_grad():
        for batch_idx, (ind, inputs, targets) in enumerate(eval_loader):
            inputs, targets = inputs.cuda(), targets.cuda() 
            feature, outputs = model(inputs) 
            loss = CE(outputs, targets)   
            score[ind] = loss.cpu()
            feature_temp[ind] = feature.cpu().detach().numpy()
    score = (score-score.min())/(score.max()-score.min()) 
    score = score.cpu().detach().numpy()
    
    clean_mask = np.zeros((samples,),dtype = np.bool)
    for j_ in range(n_class):
        class_mask = np.array(noise_label) == j_
        c_n = class_mask.sum()
        if c_n > 1:
            thres = np.sort(score[class_mask])[int((c_n-1) * 0.4)]
            clean_mask[np.logical_and(score < thres, class_mask)] = True
    feature_gpu = torch.from_numpy(feature_temp[clean_mask]).cuda()

    Y_onehot_gpu = torch.from_numpy(Y_onehot[clean_mask],).cuda()


    W = torch.linalg.lstsq(feature_gpu, Y_onehot_gpu).solution
    W_cpu = W.cpu().detach().numpy()

    f_prob = feature_temp @ W_cpu   
    
    losses = np.sum((Y_onehot - f_prob) ** 2, axis = -1)
    # losses = (losses - losses.min()) / (losses.max() - losses.min())
        
    all_loss.append(torch.from_numpy(losses))  
    
    prob = 1 - losses
    
    return prob,all_loss, feature_temp, f_prob, score


def linear_rampup(current, t_w, rampup_length=16):
    current = np.clip((current-t_w) / rampup_length, 0.0, 1.0)
    return args.lambda_u*float(current)

class SemiLoss(object):
    def __call__(self, outputs_x, targets_x, outputs_u, targets_u, epoch, t_w):
        probs_u = torch.softmax(outputs_u, dim=1)

        Lx = -torch.mean(torch.sum(F.log_softmax(outputs_x, dim=1) * targets_x, dim=1))
        Lu = torch.mean((probs_u - targets_u)**2)

        return Lx, Lu, linear_rampup(epoch,t_w)

class Orthogonal_loss(nn.Module):
    def __init__(self,):
        super(Orthogonal_loss, self).__init__()
        
    def forward(self, x, ):
        n = x.size(0)
        m = x.size(1)

        I = torch.eye(m).cuda()
        e = x - x.mean(dim=0, keepdims = True)
        m_nonz = (e.sum(dim = 0) != 0).sum()
        
        cov = e.T @ e
        
        cov2 = cov ** 2
        
        select_i = torch.argmax(cov2 - cov2 * I, dim = 1)
        cov_m = (F.one_hot(select_i, num_classes = m) * cov2).sum()
        cov_i = (I * cov).sum()
        
        result = (cov_m-cov_i) / (m_nonz*n)
        return result
    
class NegEntropy(object):
    def __call__(self,outputs):
        probs = torch.softmax(outputs, dim=1)
        return torch.mean(torch.sum(probs.log()*probs, dim=1))

def relevant_hard_np(x,):
    n = x.shape[0]
    nz = x.shape[1]
    e = x - x.mean(axis = 0,keepdims = True)

    cov = e.T @ e

    sigma = (e ** 2).sum(axis = 0, keepdims = True)
    r = cov / (sigma.T @ sigma) ** 0.5

    r = r ** 2
    r[np.isnan(r)] = 0.0
    
    return np.mean(np.max(r - r * np.eye(nz), axis = -1))
    
def create_model():
    model = ResNet18(num_classes=n_class)
    model = model.cuda()
    return model

In [6]:
stats_log=open('./checkpoint/%s_%.1f_%s_%s'%(args.dataset,args.r,args.noise_mode,
                                             str(datetime.date.today())+'_'+str(time.localtime().tm_hour))+'_stats.txt','w') 
test_log=open('./checkpoint/%s_%.1f_%s_%s'%(args.dataset,args.r,args.noise_mode,
                                            str(datetime.date.today())+'_'+str(time.localtime().tm_hour))+'_acc.txt','w')     

In [7]:
loader = dataloader.cifar_dataloader(args.dataset,r=args.r,noise_mode=args.noise_mode,batch_size=args.batch_size,num_workers=5,\
    root_dir=args.data_path,log=stats_log,noise_file='%s/%.1f_%s.json'%(args.data_path,args.r,args.noise_mode))

print('| Building net')
net1 = create_model()
net2 = create_model()
cudnn.benchmark = True

criterion = SemiLoss()
optimizer1 = optim.SGD(net1.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
optimizer2 = optim.SGD(net2.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)

scheduler1 = optim.lr_scheduler.MultiStepLR(optimizer1, milestones=[300,400], gamma=0.1)
scheduler2 = optim.lr_scheduler.MultiStepLR(optimizer2, milestones=[300,400], gamma=0.1)

CE = nn.CrossEntropyLoss(reduction='none')
CEloss = nn.CrossEntropyLoss()

loss_ortho = Orthogonal_loss()

if args.noise_mode=='asym':
    conf_penalty = NegEntropy()

all_loss = [[],[]] # save the history of losses from two networks

| Building net


In [None]:
for epoch in range(args.num_epochs+1):

    test_dataset, test_loader = loader.run('test')
    eval_dataset, eval_loader = loader.run('eval_train')   
    
    if epoch<t_w:       
        warmup_dataset, warmup_trainloader = loader.run('warmup')
        print('Warmup Net1')
        warmup(epoch,net1,optimizer1,warmup_trainloader)    
        print('\nWarmup Net2')
        warmup(epoch,net2,optimizer2,warmup_trainloader) 
   
    else:        
        start = time.time()
        prob1,all_loss[0], feature_temp_1_eval, f_prob_1, score_1 = eval_train_dlh(net1,all_loss[0], eval_dataset.noise_label)   
        prob2,all_loss[1], feature_temp_2_eval, f_prob_2, score_2 = eval_train_dlh(net2,all_loss[1], eval_dataset.noise_label)  
        
                
        r_ = 0.4
        
        thres_1 = np.sort(prob1)[int(samples * r_)]
        thres_2 = np.sort(prob2)[int(samples * r_)]
        
        pred1 = (prob1 > thres_1)
        pred2 = (prob2 > thres_2)       

        print('Train Net1')
        labeled_traindataset, labeled_trainloader, unlabeled_traindataset, unlabeled_trainloader = loader.run('train',pred2,prob2) # co-divide
        feature_temp_1 = train(epoch,net1,net2,optimizer1,
                                labeled_trainloader, unlabeled_trainloader,
                                score_2[labeled_traindataset.predidx]) # train net1  
        print('\nloss_b : %.2f'%relevant_hard_np(feature_temp_1[:10000]))
        print('\nTrain Net2')
        labeled_traindataset, labeled_trainloader, unlabeled_traindataset, unlabeled_trainloader = loader.run('train',pred1,prob1) # co-divide
        feature_temp_2 = train(epoch,net2,net1,optimizer2,
                               labeled_trainloader, unlabeled_trainloader, 
                               score_1[labeled_traindataset.predidx]) # train net2        
        end = time.time()

    test(epoch,net1,net2)   
    scheduler1.step()
    scheduler2.step()

Warmup Net1
cifar100:0.4-asym | Epoch [  0/500] Iter[391/391]	 CE-loss: 4.0796
Warmup Net2
cifar100:0.4-asym | Epoch [  0/500] Iter[391/391]	 CE-loss: 4.0079
| Test Epoch #0	 Accuracy: 17.20%

Warmup Net1
cifar100:0.4-asym | Epoch [  1/500] Iter[391/391]	 CE-loss: 3.7126
Warmup Net2
cifar100:0.4-asym | Epoch [  1/500] Iter[391/391]	 CE-loss: 3.8704
| Test Epoch #1	 Accuracy: 23.12%

Warmup Net1
cifar100:0.4-asym | Epoch [  2/500] Iter[391/391]	 CE-loss: 3.5102
Warmup Net2
cifar100:0.4-asym | Epoch [  2/500] Iter[391/391]	 CE-loss: 3.5878
| Test Epoch #2	 Accuracy: 29.11%

Warmup Net1
cifar100:0.4-asym | Epoch [  3/500] Iter[391/391]	 CE-loss: 3.3831
Warmup Net2
cifar100:0.4-asym | Epoch [  3/500] Iter[391/391]	 CE-loss: 3.2158
| Test Epoch #3	 Accuracy: 33.27%

Warmup Net1
cifar100:0.4-asym | Epoch [  4/500] Iter[391/391]	 CE-loss: 3.1601
Warmup Net2
cifar100:0.4-asym | Epoch [  4/500] Iter[391/391]	 CE-loss: 3.2395
| Test Epoch #4	 Accuracy: 39.59%

Warmup Net1
cifar100:0.4-asym | Epo

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  clean_mask = np.zeros((samples,),dtype = np.bool)


Train Net1
labeled data has a size of 29999
unlabeled data has a size of 20001
cifar100:0.4-asym | Epoch [ 30/500] Iter[469/469]	 Labeled loss: 2.03  Unlabeled loss: 0.00, loss_o : 0.30
loss_b : 0.21

Train Net2
labeled data has a size of 29999
unlabeled data has a size of 20001
cifar100:0.4-asym | Epoch [ 30/500] Iter[469/469]	 Labeled loss: 2.68  Unlabeled loss: 0.00, loss_o : 0.39
| Test Epoch #30	 Accuracy: 65.04%

Train Net1
labeled data has a size of 29999
unlabeled data has a size of 20001
cifar100:0.4-asym | Epoch [ 31/500] Iter[469/469]	 Labeled loss: 3.07  Unlabeled loss: 0.00, loss_o : 0.41
loss_b : 0.23

Train Net2
labeled data has a size of 29999
unlabeled data has a size of 20001
cifar100:0.4-asym | Epoch [ 31/500] Iter[469/469]	 Labeled loss: 2.66  Unlabeled loss: 0.00, loss_o : 0.54
| Test Epoch #31	 Accuracy: 64.82%

Train Net1
labeled data has a size of 29999
unlabeled data has a size of 20001
cifar100:0.4-asym | Epoch [ 32/500] Iter[469/469]	 Labeled loss: 3.36  Unla

In [None]:
end-start