In [1]:
from __future__ import print_function
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.models as models
import random
import os
import argparse
import numpy as np
import dataloader_clothing1M as dataloader
from sklearn.mixture import GaussianMixture

import datetime

In [2]:
parser = argparse.ArgumentParser(description='PyTorch Clothing1M Training')
parser.add_argument('--batch_size', default=32, type=int, help='train batchsize') 
parser.add_argument('--lr', '--learning_rate', default=0.002, type=float, help='initial learning rate')
parser.add_argument('--alpha', default=0.5, type=float, help='parameter for Beta')
parser.add_argument('--lambda_u', default=0, type=float, help='weight for unsupervised loss')
parser.add_argument('--p_threshold', default=0.5, type=float, help='clean probability threshold')
parser.add_argument('--T', default=0.5, type=float, help='sharpening temperature')
parser.add_argument('--num_epochs', default=80, type=int)
parser.add_argument('--id', default='clothing1m')
parser.add_argument('--data_path', default='data/Clothing1M/data', type=str, help='path to dataset')
parser.add_argument('--seed', default=123)
parser.add_argument('--gpuid', default=0, type=int)
parser.add_argument('--num_class', default=14, type=int)
parser.add_argument('--num_batches', default=1000, type=int)
parser.add_argument('--t_w', default=1, type=int)
parser.add_argument('--xi', default=0.04, type=float)
parser.add_argument('--eta', default=10., type=float)
parser.add_argument('--nc', default=1., type=float)
parser.add_argument('--nv', default=0.2, type=float)

args = parser.parse_args(args = ['--data_path', 'autodl-tmp/clothing1m',
                                 '--lambda_u','0',
                                 '--lr','0.002',
                                 '--batch_size','32',
                                 '--num_epochs','100'])

torch.cuda.set_device(args.gpuid)
random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

In [3]:
t_w = args.t_w
feature_num = 2048

In [4]:
# Training
def train(epoch,net,net2,optimizer,labeled_trainloader,unlabeled_trainloader,loss_x):
    net.train()
    net2.eval() #fix one network and train the other
    
    if not isinstance(loss_x, torch.Tensor):
        loss_x = torch.from_numpy(loss_x)
    loss_x = (loss_x - loss_x.min()) / (loss_x.max() - loss_x.min())
    mask_rand = torch.logical_and(torch.rand(len(loss_x),) >= loss_x, torch.rand(len(loss_x),) < args.xi)
    
    unlabeled_train_iter = iter(unlabeled_trainloader)    
    num_iter = (len(labeled_trainloader.dataset)//args.batch_size)+1
    for batch_idx, (ind_x, inputs_x, inputs_x2, labels_x, w_x) in enumerate(labeled_trainloader):      
        try:
            ind_u, inputs_u, inputs_u2 = unlabeled_train_iter.next()
        except:
            unlabeled_train_iter = iter(unlabeled_trainloader)
            ind_u, inputs_u, inputs_u2 = unlabeled_train_iter.next()                 
        batch_size = inputs_x.size(0)
        
        label_rand = torch.randint(low=0, high = args.num_class, size=labels_x.size())
        labels_x = torch.where(mask_rand[ind_x], label_rand, labels_x)
        # Transform label to one-hot
        labels_x = torch.zeros(batch_size, args.num_class).scatter_(1, labels_x.view(-1,1), 1)        
        w_x = w_x.view(-1,1).type(torch.FloatTensor) 

        inputs_x, inputs_x2, labels_x, w_x = inputs_x.cuda(), inputs_x2.cuda(), labels_x.cuda(), w_x.cuda()
        inputs_u, inputs_u2 = inputs_u.cuda(), inputs_u2.cuda()

        with torch.no_grad():
            # label co-guessing of unlabeled samples
            outputs_u11 = net(inputs_u)
            outputs_u12 = net(inputs_u2)
            outputs_u21 = net2(inputs_u)
            outputs_u22 = net2(inputs_u2)            
            
            pu = (torch.softmax(outputs_u11, dim=1) + torch.softmax(outputs_u12, dim=1) + torch.softmax(outputs_u21, dim=1) + torch.softmax(outputs_u22, dim=1)) / 4       
            ptu = pu**(1/args.T) # temparature sharpening
            
            targets_u = ptu / ptu.sum(dim=1, keepdim=True) # normalize
            targets_u = targets_u.detach()       
            
            # label refinement of labeled samples
            fe_x, outputs_x = forward_wf(net, inputs_x)
            outputs_x2 = net(inputs_x2)            
            
            px = (torch.softmax(outputs_x, dim=1) + torch.softmax(outputs_x2, dim=1)) / 2
            px = w_x*labels_x + (1-w_x)*px              
            ptx = px**(1/args.T) # temparature sharpening 
                       
            targets_x = ptx / ptx.sum(dim=1, keepdim=True) # normalize           
            targets_x = targets_x.detach()       
        
        # mixmatch
        l = np.random.beta(args.alpha, args.alpha)        
        l = max(l, 1-l)        
        
        all_inputs = torch.cat([inputs_x, inputs_x2, inputs_u, inputs_u2], dim=0)
        all_targets = torch.cat([targets_x, targets_x, targets_u, targets_u], dim=0)

        idx = torch.randperm(all_inputs.size(0))

        input_a, input_b = all_inputs, all_inputs[idx]
        target_a, target_b = all_targets, all_targets[idx]
        
        mixed_input = l * input_a[:batch_size*2] + (1 - l) * input_b[:batch_size*2]        
        mixed_target = l * target_a[:batch_size*2] + (1 - l) * target_b[:batch_size*2]
                
        logits = net(mixed_input)
        
        Lx = -torch.mean(torch.sum(F.log_softmax(logits, dim=1) * mixed_target, dim=1))
        
        # regularization
        prior = torch.ones(args.num_class)/args.num_class
        prior = prior.cuda()        
        pred_mean = torch.softmax(logits, dim=1).mean(0)
        penalty = torch.sum(prior*torch.log(prior/pred_mean))
       
        loss_o = loss_ortho(fe_x)
        
        loss = Lx + penalty + loss_o * args.eta
        
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        sys.stdout.write('\r')
        sys.stdout.write('Clothing1M | Epoch [%3d/%3d] Iter[%3d/%3d]\t  Labeled loss: %.4f \t loss_o: %.4f'
                %(epoch, args.num_epochs, batch_idx+1, num_iter, Lx.item(), loss_o.item()))
        sys.stdout.flush()
    
def warmup(net,optimizer,dataloader):
    net.train()
    for batch_idx, (ind, inputs, labels, path) in enumerate(dataloader):      
        inputs, labels = inputs.cuda(), labels.cuda() 
        optimizer.zero_grad()
        outputs = net(inputs)              
        loss = CEloss(outputs, labels)  
        
        penalty = conf_penalty(outputs)
        L = loss + penalty       
        L.backward()  
        optimizer.step() 

        sys.stdout.write('\r')
        sys.stdout.write('|Warm-up: Iter[%3d/%3d]\t CE-loss: %.4f  Conf-Penalty: %.4f'
                %(batch_idx+1, args.num_batches, loss.item(), penalty.item()))
        sys.stdout.flush()
    
def val(net,val_loader,k):
    net.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (ind, inputs, targets) in enumerate(val_loader):
            inputs, targets = inputs.cuda(), targets.cuda()
            outputs = net(inputs)
            _, predicted = torch.max(outputs, 1)         
                       
            total += targets.size(0)
            correct += predicted.eq(targets).cpu().sum().item()              
    acc = 100.*correct/total
    print("\n| Validation\t Net%d  Acc: %.2f%%" %(k,acc))  
    if acc > best_acc[k-1]:
        best_acc[k-1] = acc
        print('| Saving Best Net%d ...'%k)
        save_point = './checkpoint/%s_net%d.pth.tar'%(args.id,k)
        torch.save(net.state_dict(), save_point)
    return acc

def test(net1,net2,test_loader):
    net1.eval()
    net2.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (ind, inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.cuda(), targets.cuda()
            outputs1 = net1(inputs)       
            outputs2 = net2(inputs)           
            outputs = outputs1+outputs2
            _, predicted = torch.max(outputs, 1)            
                       
            total += targets.size(0)
            correct += predicted.eq(targets).cpu().sum().item()                    
    acc = 100.*correct/total
    print("\n| Test Acc: %.2f%%\n" %(acc))  
    return acc    


def eval_train(model,):
    model.eval()

    num_samples = args.num_batches*args.batch_size
    losses = np.zeros(num_samples)
    feature_temp = np.zeros((num_samples, feature_num),)
    noise_label = np.zeros(num_samples, dtype = np.int64)
    paths = []
    n = 0
    with torch.no_grad():
        for batch_idx, (ind, inputs, targets, path) in enumerate(eval_loader):
            noise_label[ind] = targets.detach().numpy()
            inputs, targets = inputs.cuda(), targets.cuda() 
            feature, outputs = forward_wf(model, inputs) 
            loss = CE(outputs, targets)  
            losses[ind] = loss.cpu().detach().numpy()
            feature_temp[ind] = feature.cpu().detach().numpy()
            
            for b in range(inputs.size(0)):
                paths.append(path[b])
            sys.stdout.write('\r')
            sys.stdout.write('| Evaluating loss Iter %3d\t' %(batch_idx)) 
            sys.stdout.flush()
    
    Y_onehot = np.eye(args.num_class)[noise_label]
    
    losses = (losses-losses.min())/(losses.max()-losses.min())    
    losses = losses.ravel()
    
    if args.nc < 1:
        clean_mask = np.zeros((num_samples,),dtype = np.bool)
        for j_ in range(args.num_class):
            class_mask = np.array(noise_label) == j_
            c_n = class_mask.sum()
            if c_n > 1:
                thres = np.sort(losses[class_mask])[int((c_n-1) * args.nc)]
                clean_mask[np.logical_and(losses < thres, class_mask)] = True
        feature_gpu = torch.from_numpy(feature_temp[clean_mask]).cuda()

        Y_onehot_gpu = torch.from_numpy(Y_onehot[clean_mask]).cuda()
    else:
        feature_gpu = torch.from_numpy(feature_temp).cuda()
        Y_onehot_gpu = torch.from_numpy(Y_onehot).cuda()

    W = torch.linalg.lstsq(feature_gpu, Y_onehot_gpu).solution
    W_cpu = W.cpu().detach().numpy()

    f_prob = feature_temp @ W_cpu   
    
    score = np.sum((Y_onehot - f_prob) ** 2, axis = -1)
            
    prob = 1 - score
    prob = (prob-prob.min())/(prob.max()-prob.min())  
        
    return prob, losses, paths  


class NegEntropy(object):
    def __call__(self,outputs):
        probs = torch.softmax(outputs, dim=1)
        return torch.mean(torch.sum(probs.log()*probs, dim=1))

    
class Orthogonal_loss(nn.Module):
    def __init__(self,):
        super(Orthogonal_loss, self).__init__()
        
    def forward(self, x, ):
        n = x.size(0)
        m = x.size(1)

        I = torch.eye(m).cuda()
        e = x - x.mean(dim=0, keepdims = True)
        m_nonz = (e.sum(dim = 0) != 0).sum()
        
        cov = e.T @ e
        
        cov2 = cov ** 2
        
        select_i = torch.argmax(cov2 - cov2 * I, dim = 1)
        cov_m = (F.one_hot(select_i, num_classes = m) * cov2).sum()
        cov_i = (I * cov).sum()
        
        result = (cov_m-cov_i) / (m_nonz*n)
        return result
    
def create_model():
    model = models.resnet50(pretrained=True)
    model.fc = nn.Linear(2048,args.num_class)
    model = model.cuda()
    return model     

def forward_wf(net, x):
    import torch.nn.functional as F
    out = x
    out = net.conv1(out)
    out = net.bn1(out)
    out = F.relu(out)
    
    out = net.layer1(out)
    out = net.layer2(out)
    out = net.layer3(out)
    out = net.layer4(out)
    out = F.adaptive_avg_pool2d(out, (1,1))
    feature = out.view(out.size(0), -1)
    out = net.fc(feature)
    
    return feature, out

log=open('./checkpoint/%s_%s.txt'%(args.id,str(datetime.datetime.now().date())+'-'+str(datetime.datetime.now().hour)),'w')     
log.flush()

loader = dataloader.clothing_dataloader(root=args.data_path,batch_size=args.batch_size,num_workers=5,num_batches=args.num_batches)

print('| Building net')
net1 = create_model()
net2 = create_model()
cudnn.benchmark = True

optimizer1 = optim.SGD(net1.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-3)
optimizer2 = optim.SGD(net2.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-3)

scheduler1 = optim.lr_scheduler.MultiStepLR(optimizer1, milestones=[40,80], gamma=0.1)
scheduler2 = optim.lr_scheduler.MultiStepLR(optimizer2, milestones=[40,80], gamma=0.1)
                      
CE = nn.CrossEntropyLoss(reduction='none')
CEloss = nn.CrossEntropyLoss()
conf_penalty = NegEntropy()

loss_ortho = Orthogonal_loss()


| Building net


In [5]:
best_acc = [0,0]
for epoch in range(args.num_epochs+1):   
        
    if epoch<1:     # warm up  
        warmup_dataset, train_loader = loader.run('warmup')
        print('Warmup Net1')
        warmup(net1,optimizer1,train_loader)     
        warmup_dataset, train_loader = loader.run('warmup')
        print('\nWarmup Net2')
        warmup(net2,optimizer2,train_loader)                  
    else:      
        print('\n==== net 1 evaluate next epoch training data loss ====') 
        eval_dataset, eval_loader = loader.run('eval_train')  # evaluate training data loss for next epoch  
        prob1,losses1,paths1 = eval_train(net1) 
        print('\n==== net 2 evaluate next epoch training data loss ====') 
        eval_dataset, eval_loader = loader.run('eval_train')  
        prob2,losses2,paths2 = eval_train(net2) 
                
        thres_1 = np.sort(prob1)[int(args.num_batches*args.batch_size * args.nv)]
        thres_2 = np.sort(prob2)[int(args.num_batches*args.batch_size * args.nv)]
        
        pred1 = (prob1 > thres_1)
        pred2 = (prob2 > thres_2)      
                
        print('\n\nTrain Net1')
        labeled_traindataset, labeled_trainloader, unlabeled_traindataset, unlabeled_trainloader = loader.run('train',pred2,prob2,paths=paths2) # co-divide
        train(epoch,net1,net2,optimizer1,labeled_trainloader, unlabeled_trainloader,losses2)              # train net1
        print('\nTrain Net2')
        labeled_traindataset, labeled_trainloader, unlabeled_traindataset, unlabeled_trainloader = loader.run('train',pred1,prob1,paths=paths1) # co-divide
        train(epoch,net2,net1,optimizer2,labeled_trainloader, unlabeled_trainloader,losses1)              # train net2
    
    val_dataset, val_loader = loader.run('val') # validation
    acc1 = val(net1,val_loader,1)
    acc2 = val(net2,val_loader,2)   
    
    scheduler1.step()
    scheduler2.step()    
    
    log.write('Validation Epoch:%d      Acc1:%.2f  Acc2:%.2f\n'%(epoch,acc1,acc2))
    log.flush() 


test_dataset, test_loader = loader.run('test')
net1.load_state_dict(torch.load('./checkpoint/%s_net1.pth.tar'%args.id))
net2.load_state_dict(torch.load('./checkpoint/%s_net2.pth.tar'%args.id))
acc = test(net1,net2,test_loader)     



log.write('Test Accuracy:%.2f\n'%(acc))
log.flush() 

Warmup Net1
|Warm-up: Iter[1000/1000]	 CE-loss: 1.3454  Conf-Penalty: -2.1645
Warmup Net2
|Warm-up: Iter[1000/1000]	 CE-loss: 1.5236  Conf-Penalty: -2.2743
| Validation	 Net1  Acc: 68.24%
| Saving Best Net1 ...

| Validation	 Net2  Acc: 67.04%
| Saving Best Net2 ...

==== net 1 evaluate next epoch training data loss ====
| Evaluating loss Iter 999	

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  clean_mask = np.zeros((num_samples,),dtype = np.bool)



==== net 2 evaluate next epoch training data loss ====
| Evaluating loss Iter 999	

Train Net1
labeled data has a size of 25599
unlabeled data has a size of 6401
Clothing1M | Epoch [  1/100] Iter[800/800]	  Labeled loss: 1.1372 	 loss_o: 0.0334
Train Net2
labeled data has a size of 25599
unlabeled data has a size of 6401
Clothing1M | Epoch [  1/100] Iter[800/800]	  Labeled loss: 0.6710 	 loss_o: 0.0336
| Validation	 Net1  Acc: 68.23%

| Validation	 Net2  Acc: 69.49%
| Saving Best Net2 ...

==== net 1 evaluate next epoch training data loss ====
| Evaluating loss Iter 999	
==== net 2 evaluate next epoch training data loss ====
| Evaluating loss Iter 999	

Train Net1
labeled data has a size of 25599
unlabeled data has a size of 6401
Clothing1M | Epoch [  2/100] Iter[800/800]	  Labeled loss: 0.7295 	 loss_o: 0.0372
Train Net2
labeled data has a size of 25599
unlabeled data has a size of 6401
Clothing1M | Epoch [  2/100] Iter[800/800]	  Labeled loss: 0.5033 	 loss_o: 0.0293
| Validation	 N