In [8]:
import torch
import torch.nn as nn
import torchvision
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import time
from typing import Type, Any, Callable, Union, List, Optional
from torch import Tensor 
from torchsummary import summary
import pickle
if torch.cuda.is_available():
    print("Using GPUs")
    device = torch.device("cuda") 
else:
    device = torch.device("cpu")

Using GPUs


In [9]:
torch.manual_seed(43)
batch_size = 128

### for CIFAR 10
# stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
## for CIFAR 100
stats = ((0.507, 0.487, 0.441), (0.267, 0.256, 0.276))

transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(*stats),
    torchvision.transforms.RandomCrop(32, padding=4, padding_mode='constant'),
    torchvision.transforms.RandomHorizontalFlip(p=0.5)
])

train_set = torchvision.datasets.CIFAR100(root="data", train=True, download=True, transform=transform)
# train_set, _ = torch.utils.data.random_split(train_set, [1, len(train_set)-1]) # For sanity checking
train_size = len(train_set)
test_set = torchvision.datasets.CIFAR100(root="data", train=False, download=True, transform=transform)
test_set, validation_set = torch.utils.data.random_split(test_set, [5000, 5000])
test_size = len(test_set)
validation_size = len(validation_set)


train_loader = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True, num_workers=4, pin_memory=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size, num_workers=4, pin_memory=True)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size, num_workers=4, pin_memory=True)

data_loaders = {"train": train_loader, "test": test_loader, "validation": validation_loader}
dataset_sizes = {"train": train_size, "test": test_size, "validation": validation_size}
print(dataset_sizes)

Files already downloaded and verified
Files already downloaded and verified
{'train': 50000, 'test': 5000, 'validation': 5000}


In [10]:
from models import ResNet, DenseNet, DSNet

#### Train Configurations, based on DSNet and ResNet paper
model_n = 8
epochs = 100 
milestones = [int(epochs*0.5), int(epochs*0.75)]
momentum = 0.9
weight_decay = 0.0005
gamma = 0.1
lr = 0.1

In [11]:
from pathlib import Path
Path("./_results").mkdir(exist_ok=True)

f1 = open('./_results/results_1.txt', 'w')
f2 = open('./_results/results_2.txt', 'w')

# def print(*args):
#     for arg in args:
#         f1.write(str(arg) + ' ')
#     f1.write('\n')
#     f1.flush()

def print2(*args):
    for arg in args:
        f2.write(str(arg) + ' ')
    f2.write('\n')
    f2.flush()
    
def mprint(b):
    gb = b / (10 ** 9)
    print('{} GB'.format(gb))
    
def print_n_params(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(total_params)
               

In [12]:
n_experiments = 5
model_names = ['DSNet', 'ResNet', 'DenseNet']
model_fns = [
    lambda: DSNet(model_n, num_classes=100, device=device),
    lambda: ResNet(model_n, num_classes=100, device=device),
    lambda: DenseNet(growth_rate=16, block_config=(2 * model_n, 2 * model_n, 2 * model_n),
                       num_init_features=16, bn_size=2, num_classes=100)
]

In [13]:
# Find params
for n, fn in zip(model_names, model_fns):
    model = fn()
    print(n)
    print_n_params(model)

DSNet
778212
ResNet
766116
DenseNet
771228


In [5]:

results = {model_name: [{'train': [], 'test': None} for _ in range(n_experiments)] for model_name in model_names}

i = 0
model_name = model_names[i]
model_fn = model_fns[i]

m1 = torch.cuda.memory_allocated()
print('m1:', m1)

model = model_fn()
model.to(device)

m2 = torch.cuda.memory_allocated()
print('m2:')
mprint(m2)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=gamma)

### Train loop + validation/ also test at the end
print("Configuration: ", "model:", model_name, " model_n:", model_n, " batch size:", batch_size, 
      " optimizer:SGD", " lr:", lr, " epochs:", epochs)

print("----------------------------- Train --------------------------------")
for epoch in range(epochs):
    start_time = time.time()
    print("Epoch {}/{}".format(epoch+1, epochs))
    print("-" * 30)


    epoch_loss = {"train": 0.0, "validation": 0.0}
    epoch_acc = {"train": 0.0, "validation": 0.0}

    running_loss = {"train": 0.0, "validation": 0.0}
    running_corrects = {"train": 0, "validation": 0}

    for phase in ["train", "validation"]:
        if phase == "train":
            model.train(True)
        else:
            model.train(False)

        for data in data_loaders[phase]:
            inputs, labels = data 

            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad() # clear all gradients

            outputs = model(inputs) # batch_size x num_classes
            _, preds = torch.max(outputs.data, 1) # values, indices
            loss = loss_fn(outputs, labels)
            
            print('before backward:')
            mprint(torch.cuda.memory_allocated())

            if phase == "train":
                loss.backward()  # compute gradients
                optimizer.step() # update weights/biases
            
            print('after backward:')
            mprint(torch.cuda.memory_allocated())
            
            running_loss[phase] += loss.data.item() * inputs.size(0)
            running_corrects[phase] += torch.sum(preds == labels.data).item()

        epoch_loss[phase] = running_loss[phase] / dataset_sizes[phase]
        epoch_acc[phase] =  running_corrects[phase] / dataset_sizes[phase]

    # Visualize the loss and accuracy values.
    results_dic = {
        'time': np.round(time.time()-start_time, 5),
        'train_loss': np.round(epoch_loss["train"], 5),
        'train_acc': np.round(epoch_acc["train"], 5),
        'val_loss': np.round(epoch_loss["validation"], 5),
        'val_acc': np.round(epoch_acc["validation"], 5),
    }
    print(results_dic)
    results[model_name][n]['train'].append(results_dic)

    scheduler.step()



### evaluating the model with test set
print("----------------------------- Test --------------------------------")
with torch.no_grad():
    model.eval()
    running_loss = 0
    running_corrects = 0

    for data in test_loader:
        inputs, labels = data 

        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad() # clear all gradients

        outputs = model(inputs) # batch_size x num_classes
        _, preds = torch.max(outputs.data, 1) # values, indices
        loss = loss_fn(outputs, labels)

        running_loss += loss.data.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data).item()

# Visualize the loss and accuracy values.
results_dic = {
'time': np.round(time.time()-start_time, 5),
'test_loss': np.round(running_loss/ dataset_sizes['test'], 5),
'test_acc': np.round(running_corrects/ dataset_sizes['test'], 5),
}
print(results_dic)
results[model_name][n]['test'] = results_dic

print2('Experiment {}'.format(n))
print2('test_acc', np.round(running_corrects/ dataset_sizes['test'], 5))

with open('./_results/results_3.pk', 'wb') as f:
    pickle.dump(results, f)
    f.flush()

m1: 0
m2:
0.001166848 GB
Configuration:  model: ResNet  model_n: 3  batch size: 128  optimizer:SGD  lr: 0.1  epochs: 100
----------------------------- Train --------------------------------
Epoch 1/100
------------------------------
before backward:
0.196360192 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.19863552 GB
after backward:
0.005069312 GB
before backward:
0.198

KeyboardInterrupt: 

In [None]:
with open('./_results/conf_bound_8/results_3.pk', 'rb') as f:
    l_results = pickle.load(f)

In [None]:
l_results['DSNet'][0]['train'] 

In [None]:
l_results['DSNet'][0]['test']