In [1]:
from __future__ import print_function
from __future__ import division

import numpy as np
import argparse
import random
import shutil
import time
import warnings


import torch
import torch.nn as nn

import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim as optim

import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed

import torchvision
from torchvision import datasets, models, transforms
import os
import re
import sys
import copy

In [2]:



#################################   set args  #######################


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')

parser = argparse.ArgumentParser(description='DL19_FinalProject_PyTorch')

parser.add_argument('--model', type=str, default='densenet',
                    help='type of cnn ("resnet", "alexnet","vgg","squeezenet","densenet","inception")')
parser.add_argument('--model-folder', type=str, default='/scratch/by783/DL_Final_models/',
                    help='path to store model files')

parser.add_argument('--model-file', type=str, default = '190425_raw_vggae_fromscratch_s.pt',
                    help='path to autoencoder')

parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--save-folder', type=str, default='/scratch/by783/DL_Final_models/',
                    help='path to save the final model')

parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
parser.add_argument('--num-classes', type=int, default=1000,
                    help='number of classes')
parser.add_argument('--epochs', type=int, default=25,
                    help='upper epoch limit')
parser.add_argument("--feature-pinning", type=str, default='False',
                    help="pin all the conv layers.")
parser.add_argument('--noise-level', type=float, default=0.3,
                    help='add noise to input')
parser.add_argument('--dataset-path', type=str, default='/scratch/by783/DL_Final/ssl_data_96',
                    help='path to dataset')
#args = parser.parse_args()

############################################

_StoreAction(option_strings=['--dataset-path'], dest='dataset_path', nargs=None, const=None, default='/scratch/by783/DL_Final/ssl_data_96', type=<class 'str'>, choices=None, help='path to dataset', metavar=None)

In [3]:
args=parser.parse_args('--model densenet --model-folder /home/by783/Self_Jupyter/DL_Final_Project_Models/  --model-file 190421_raw_densenet.pt --batch-size 160 --save-folder /scratch/by783/DL_Final_models/ --save 190502_d_aug_try.pt --epochs 50 --feature-pinning False --noise-level 0.3'.split())
# batch_size 128 2GPU ok
# 160 max

In [4]:
model_name = args.model

model_load_path = args.model_folder + args.model_file

save_path = args.save_folder + args.save

feature_pinning=str2bool(args.feature_pinning)
num_classes = args.num_classes

num_epochs = args.epochs
loader_batch_size = args.batch_size
loader_image_path = args.dataset_path
noise_level = args.noise_level

In [5]:
###################### report environment ###################################


sys.stdout.write("PyTorch Version: {}\n".format(torch.__version__))
sys.stdout.write("Torchvision Version:{}\n ".format(torchvision.__version__))

if torch.cuda.is_available():
    sys.stdout.write('GPU mode\n')
else:
    sys.stdout.write('Warning, CPU mode, pls check')

PyTorch Version: 0.4.1
Torchvision Version:0.2.2
 GPU mode


In [6]:
def image_loader(path, batch_size):
    transform = {
        'train':transforms.Compose([
            transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(15),
            transforms.RandomAffine(15),
            transforms.RandomResizedCrop(size=224,scale=(0.8, 1.0)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'val':transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    }
    sup_train_data = datasets.ImageFolder('{}/{}/train'.format(path, 'supervised'), transform=transform['train'])
    sup_val_data = datasets.ImageFolder('{}/{}/val'.format(path, 'supervised'), transform=transform['val'])
    #unsup_data = datasets.ImageFolder('{}/{}/'.format(path, 'unsupervised'), transform=transform)
    # source code: https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py
    # Main idea:
    data_loader_sup_train = torch.utils.data.DataLoader(
        sup_train_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0
    )
    data_loader_sup_val = torch.utils.data.DataLoader(
        sup_val_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0
    )
    '''
    data_loader_unsup = torch.utils.data.DataLoader(
        unsup_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0
    )

    print('sup_train_data.class_to_idx==sup_val_data.class_to_idx: ',
          sup_train_data.class_to_idx == sup_val_data.class_to_idx)
    '''
    return data_loader_sup_train, data_loader_sup_val#, data_loader_unsup, sup_train_data.class_to_idx

In [7]:
####### load data, input_size is used ####

dataloaders={}

#dataloaders['train'], dataloaders['val'], data_loader_unsup, class_to_idx_dict = image_loader(loader_image_path,loader_batch_size)

dataloaders['train'], dataloaders['val'] = image_loader(loader_image_path,loader_batch_size)

In [8]:
for data, label in dataloaders['train']:
    print(data.shape)
    break

torch.Size([160, 3, 224, 224])


In [9]:
model_ft = torch.load(model_load_path)

In [10]:
model_ft.features[0].weight.requires_grad 

False

In [11]:
model_ft

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplac

In [12]:
model_ft.features.denseblock4.denselayer16.conv2.weight.requires_grad

False

In [13]:
model_ft.classifier.weight.requires_grad

True

In [14]:
def pin_features(model, pinning):
    if pinning:
        for param in model.parameters():
            param.requires_grad = False
        model.classifier.weight.requires_grad = True
        
    else:
        for param in model.parameters():
            param.requires_grad = True

In [15]:
pin_features(model_ft, feature_pinning)

In [16]:
model_ft.features.denseblock4.denselayer16.conv2.weight.requires_grad

True

In [17]:
model_ft.classifier.weight.requires_grad

True

In [18]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
    
    since = time.time()
    val_acc_history = []
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            # 切换phase重置loss
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                print('batch_size checker')

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    # Special case for inception because in training it has an auxiliary output. In train
                    #   mode we calculate the loss by summing the final output and the auxiliary output
                    #   but in testing we only consider the final output.
                    if is_inception and phase == 'train':
                        # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4*loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            sys.stdout.write('{} Loss: {:.4f} Acc: {:.4f}\n'.format(phase, epoch_loss, epoch_acc))
            sys.stdout.write('training time: {:.0f}s\n'.format( time.time() - since ))

            # deep copy the model
            if phase == 'val':
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    with open(save_path, 'wb') as f:
                        torch.save(model, f)
                
                val_acc_history.append(epoch_acc)
                with open(save_path+'_val_acc', 'w') as f:
                    for item in val_acc_history:
                        f.write("%s\n" % item)

        print()

    time_elapsed = time.time() - since
    sys.stdout.write('Training complete in {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))
    sys.stdout.write('Best val Acc: {:4f}\n'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [20]:
print('number of GPUS:', torch.cuda.device_count())

if torch.cuda.device_count() > 1:
    model_ft = nn.DataParallel(model_ft)

model_ft = model_ft.to(device)

number of GPUS: 2


In [21]:
params_to_update = model_ft.parameters()
sys.stdout.write("Params to learn:\n")
if feature_pinning:
    params_to_update = []
    for name,param in model_ft.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            sys.stdout.write("\t{}\n".format(name))
else:
    for name,param in model_ft.named_parameters():
        if param.requires_grad == True:
            sys.stdout.write("\t{}\n".format(name))

Params to learn:
	module.features.conv0.weight
	module.features.norm0.weight
	module.features.norm0.bias
	module.features.denseblock1.denselayer1.norm1.weight
	module.features.denseblock1.denselayer1.norm1.bias
	module.features.denseblock1.denselayer1.conv1.weight
	module.features.denseblock1.denselayer1.norm2.weight
	module.features.denseblock1.denselayer1.norm2.bias
	module.features.denseblock1.denselayer1.conv2.weight
	module.features.denseblock1.denselayer2.norm1.weight
	module.features.denseblock1.denselayer2.norm1.bias
	module.features.denseblock1.denselayer2.conv1.weight
	module.features.denseblock1.denselayer2.norm2.weight
	module.features.denseblock1.denselayer2.norm2.bias
	module.features.denseblock1.denselayer2.conv2.weight
	module.features.denseblock1.denselayer3.norm1.weight
	module.features.denseblock1.denselayer3.norm1.bias
	module.features.denseblock1.denselayer3.conv1.weight
	module.features.denseblock1.denselayer3.norm2.weight
	module.features.denseblock1.denselayer3.

In [22]:
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)

# Setup the loss fxn
criterion = nn.CrossEntropyLoss()
############################################ training ###########################
sys.stdout.write('Begin to train...\n')
# Train and evaluate
model_ft, hist = train_model(model_ft, dataloaders, criterion, optimizer_ft, num_epochs=num_epochs, is_inception=(model_name=="inception"))

sys.stdout.write('Finished')


Begin to train...
Epoch 0/49
----------
batch_size checker
batch_size checker
batch_size checker
batch_size checker
batch_size checker


KeyboardInterrupt: 