In [1]:
# Notebook setup cell: imports, experiment configuration, and import-path selection.
import sys
# `imp` was removed in Python 3.12; importlib.reload is the supported replacement
# and keeps the name `reload` available to later cells.
from importlib import reload

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as tv_datasets
import torchvision.transforms as tv_transforms


# ========= hyper parameters =========
num_epochs = 256
batch_size = 32
num_workers = 2

# choose path, dataset, model
run_on_local = 0 # 1 for True, 0 for False
dataset1 = 'CIFAR10' # choose between 'MNIST' or 'CIFAR10'
# Index into `modelnames` below: 0/2/4/6/8/10 for MNIST, 1/3/5/7/9/11 for CIFAR10.
# Negative indices count from the end of the list (-2 selects 'C5L4_BEST_cifar10').
model_id = -2
# Run log 2025-03-08: ResNet18 and VGG16 on CIFAR10 (v/v4)
modelnames = ['C3L2_MNIST', 'C3L2_cifar10', 'C5L3_MNIST', 'C5L3_cifar10', 'ResNet18_MNIST', 'ResNet18_cifar10' ,'ResNet20_MNIST', 'ResNet20_cifar10', 'ResNet50_MNIST', 'ResNet50_cifar10', 'VGG16_MNIST', 'VGG16_cifar10', 'ResNet20_omni', 'C5L4_base_cifar10', 'C5L4_BEST_cifar10', 'C5L4_BEST_cifar10_nodrp']

optimizer_id = 0 # 0 for Adam, 1 for SGD
data_aug = True

# ===================================


# some experimental setup
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
kaggle = "/kaggle/input/cifar-10-dlhw1-2/data"
local = "../data"

# Pick the dataset root and extend sys.path so that `mymodels` can be imported,
# depending on whether the notebook runs locally or on Kaggle.
if run_on_local:
    data_path = local
    sys.path.append('./code')
else:
    data_path = kaggle
    sys.path.append('/kaggle/input')

# Must come after the sys.path.append above: `mymodels` is only importable from there.
from mymodels import models
print('successfully load all pac')


In [8]:

# Main training entry point used for tuning parameters. The logic is simple:

# choose optimizer -> choose data-loading strategy -> prepare data -> initialize model -> training loop -> print training info -> visualize training results.

def _build_transforms(dataset1, data_aug):
    """Build the train/test torchvision pipelines for ``dataset1``.

    Args:
        dataset1: ``'MNIST'`` or ``'CIFAR10'``.
        data_aug: Truthy to enable the extra training-time augmentation.
            For CIFAR10 the geometric augmentations (rotation, flip, translation)
            are always applied to the training split; ``data_aug`` only adds
            ``ColorJitter``. For MNIST it toggles rotation and horizontal flip.

    Returns:
        dict with keys ``"train"`` and ``"test"`` mapping to ``Compose`` pipelines.
        The test pipeline is always ``ToTensor`` followed by ``Normalize``.

    Raises:
        ValueError: if ``dataset1`` is not one of the supported names.
    """
    if dataset1 == 'MNIST':
        norm_args, norm_kwargs = ((0.1307,), (0.3081,)), {}
        train_aug = []
        if data_aug:
            train_aug = [
                tv_transforms.RandomRotation(15),
                tv_transforms.RandomHorizontalFlip(),
                # tv_transforms.RandomAffine(0, translate=(0.1, 0.1)),
                # tv_transforms.RandomResizedCrop(28, scale=(0.9, 1.1)),
            ]
    elif dataset1 == 'CIFAR10':
        norm_args, norm_kwargs = (), dict(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        train_aug = [
            tv_transforms.RandomRotation(degrees=15),
            tv_transforms.RandomHorizontalFlip(),
            tv_transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        ]
        if data_aug:
            train_aug.append(
                tv_transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)
            )
    else:
        # Fail early instead of hitting an unbound `transformation` further down.
        raise ValueError(f"unsupported dataset {dataset1!r}; expected 'MNIST' or 'CIFAR10'")

    def _tail():
        # Fresh ToTensor/Normalize instances for each pipeline.
        return [tv_transforms.ToTensor(), tv_transforms.Normalize(*norm_args, **norm_kwargs)]

    return {
        "train": tv_transforms.Compose(train_aug + _tail()),
        "test": tv_transforms.Compose(_tail()),
    }


def _evaluate(net, data_loader, criterion):
    """Run one pass over ``data_loader`` in eval mode.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the per-sample average of
        ``criterion`` and ``accuracy`` is the fraction of correct top-1 predictions.
    """
    # eval() disables dropout and makes BatchNorm use its running statistics.
    net.eval()
    loss_sum, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for img, target in data_loader:
            img, target = img.to(device), target.to(device)
            pred = net(img)
            n_samples = len(target)
            # criterion returns the batch mean; weight by batch size so that a
            # smaller final batch does not skew the epoch average.
            loss_sum += criterion(pred, target).item() * n_samples
            correct += (torch.argmax(pred, dim=1) == target).sum().item()
            total += n_samples
    return loss_sum / total, correct / total


def _plot_training_curves(accuracy_test, loss_test, loss_train, save_path):
    """Plot train loss, test loss and test accuracy per epoch and save to ``save_path``.

    ``accuracy_test`` holds fractions in [0, 1]; they are converted to percent for
    display so the values match the ``Accuracy (%)`` axis label.
    Nothing is drawn when the history is empty.
    """
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec

    if not accuracy_test:
        # No completed epoch: min()/max() below would fail on empty lists.
        return

    accuracy_pct = [100 * acc for acc in accuracy_test]

    fig = plt.figure(figsize=(10, 10))
    gs = gridspec.GridSpec(2, 2,
                           height_ratios=[5, 5],
                           width_ratios=[5, 5],
                           hspace=0.2, wspace=0.2)
    ax1 = fig.add_subplot(gs[0, 0])
    ax2 = fig.add_subplot(gs[0, 1])
    ax3 = fig.add_subplot(gs[1, :])

    loss_panels = (
        (ax1, loss_train, 'g', 'Train Loss', 'Training Loss Progression'),
        (ax2, loss_test, 'r', 'Test Loss', 'Validation Loss Progression'),
    )
    for ax, series, color, label, title in loss_panels:
        ax.plot(series, marker='o', linestyle='-', color=color, label=label)
        ax.set_xlabel('Epochs')
        ax.set_ylabel('Loss Value')
        ax.set_title(title)
        ax.legend()
        ax.grid(True)
        ax.set_xlim(0, len(series))
        ax.set_ylim(min(series)*0.9, max(series)*1.1)

    ax3.plot(accuracy_pct, marker='o', linestyle='-', color='b', label='Test Accuracy')
    ax3.set_xlabel('Epochs')
    ax3.set_ylabel('Accuracy (%)')
    ax3.set_title('Model Prediction Accuracy')
    ax3.legend()
    ax3.grid(True)
    ax3.set_xlim(0, len(accuracy_pct))
    # Cap the upper limit at 100% once accuracy gets close to it.
    ax3.set_ylim(min(accuracy_pct)*0.95, 100 if max(accuracy_pct)>95 else max(accuracy_pct)*1.05)

    plt.tight_layout()
    plt.savefig(save_path, bbox_inches='tight', dpi=300)
    plt.show()


def main(num_epochs = num_epochs, batch_size = batch_size, num_workers = num_workers, dataset1 = dataset1, data_path = data_path, model_id = model_id, optimizer_id = optimizer_id, data_aug = data_aug, modelnames = modelnames):
    """Train one model on MNIST/CIFAR10, evaluate every epoch, save weights, stats and a plot.

    Defaults are bound to the notebook-level configuration values at the time this
    function is defined; pass arguments explicitly to override them.

    Args:
        num_epochs: Number of passes over the training split.
        batch_size: Batch size for both the train and test loaders.
        num_workers: DataLoader worker processes.
        dataset1: ``'MNIST'`` or ``'CIFAR10'`` (name of the torchvision dataset class).
        data_path: Dataset root directory; the dataset is downloaded there if missing.
        model_id: Index into ``modelnames`` (negative indices allowed).
        optimizer_id: 0 for Adam, 1 for SGD.
        data_aug: Enables the extra training augmentation (see ``_build_transforms``).
        modelnames: Attribute names looked up on the ``models`` module.

    Side effects:
        Writes ``weights_<name>.pth`` (into ``/kaggle/working/`` when that directory
        exists, otherwise the current directory), three ``stats_<name>_*.npy`` files
        and ``train_result.png`` in the current directory, and prints per-epoch metrics.

    Raises:
        ValueError: if ``dataset1`` is not supported.
    """
    import os

    # choose optimizer, both options use popular parameters
    optim_name = ["Adam", "SGD"]
    optim_kwargs = {"Adam": dict(lr=3e-4, weight_decay=1e-6,),
        "SGD": dict(lr = 1e-3, momentum = 0.9)}

    # choose the data transformation strategy (with or without extra augmentation)
    transformation = _build_transforms(dataset1, data_aug)

    # ======= prepare datasets ========
    dataset_cls = getattr(tv_datasets, dataset1)
    dataset, loader = {}, {}
    for data_type in ("train", "test"):
        is_train = data_type=="train"
        # download=True creates the data files under data_path if there are none
        dataset[data_type] = dataset_cls(
            root=data_path, train=is_train, download=True, transform=transformation[data_type],
        )
        # Only the training split is shuffled.
        loader[data_type] = torch.utils.data.DataLoader(
            dataset[data_type], batch_size=batch_size, shuffle=is_train, num_workers=num_workers
        )

    # ======== training loop =========

    # always re-import models when changing settings: avoid using weights from before.
    # NOTE(review): .to()/.parameters() are called directly on the attribute, so
    # models.<name> is presumably an already-built nn.Module instance — confirm.
    print('successfully load all pac')
    net = getattr(models, modelnames[model_id])

    # move to device
    net.to(device)

    # the network optimizer
    optimizer = getattr(optim, optim_name[optimizer_id])(net.parameters(), **optim_kwargs[optim_name[optimizer_id]])

    # loss function (default reduction: mean over the batch)
    criterion = nn.CrossEntropyLoss()

    # storing statistics for plotting/recording:
    loss_train = []
    loss_test = []
    accuracy_test = []

    for epoch in range(num_epochs):
        # Re-enter training mode every epoch: _evaluate() switches the net to eval mode.
        net.train()
        running_loss, seen = 0.0, 0
        for img, target in loader["train"]:
            img, target = img.to(device), target.to(device)

            pred = net(img)
            loss = criterion(pred, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # loss is already a batch mean; accumulate the per-sample sum so the
            # epoch value is a true average, independent of batch_size.
            running_loss += loss.item() * len(target)
            seen += len(target)

        # evaluate on the test split after every epoch
        testing_loss, test_accuracy = _evaluate(net, loader["test"], criterion)

        loss_train.append(running_loss / seen)
        loss_test.append(testing_loss)
        accuracy_test.append(test_accuracy)

        print(f"Epchs: {epoch+1}, train loss: {loss_train[-1]:.2f}, test loss: {loss_test[-1]:.2f}, test Accuracy: {100 * accuracy_test[-1]:.2f}%")

    print("Finished Training")

    # ===== save trained weights =====
    save_name = f'{modelnames[model_id]}_{optim_name[optimizer_id]}_{int(data_aug)}aug'
    # Use the Kaggle output folder when present; otherwise fall back to the current
    # directory so a local run does not crash after training has finished.
    weights_dir = '/kaggle/working/' if os.path.isdir('/kaggle/working/') else ''
    PATH = weights_dir + 'weights_' + save_name +'.pth'
    torch.save(net.state_dict(), PATH)

    # save accuracy/loss statistics (accuracy is stored as a fraction in [0, 1])
    np.save('stats_' + save_name+'_accuracy_test.npy', np.array(accuracy_test))
    np.save('stats_' + save_name+'_loss_train.npy', np.array(loss_train))
    np.save('stats_' + save_name+'_loss_test.npy', np.array(loss_test))
    # reload
    # loaded_data = np.load('data.npy').tolist()

    # plotting training_loss and testing_accuracy curve:
    _plot_training_curves(accuracy_test, loss_test, loss_train, 'train_result.png')


In [None]:
# # ======== experiment 1 ===========
# # C5L3 base-Adam-dropout all
# # ========= hyper parameters =========
# num_epochs = 10
# batch_size = 64
# num_workers = 2

# # choose path, dataset, model
# run_on_local = 0 # 1 for True, 0 for False
# dataset1 = 'CIFAR10' # choose between 'MNIST' or 'CIFAR10'
# model_id = -3 # 0/2/4/6/8/10 for MNIST, 1/3/5/7/9/11 for CIFAR10
# # 2025-3-8 running: resenet18 and cgg16 on cifar10 v/v4
# modelnames = ['C3L2_MNIST', 'C3L2_cifar10', 'C5L3_MNIST', 'C5L3_cifar10', 'ResNet18_MNIST', 'ResNet18_cifar10' ,'ResNet20_MNIST', 'ResNet20_cifar10', 'ResNet50_MNIST', 'ResNet50_cifar10', 'VGG16_MNIST', 'VGG16_cifar10', 'ResNet20_omni', 'C5L4_base_cifar10', 'C5L4_BEST_cifar10', 'C5L4_BEST_cifar10_nodrp']

# optimizer_id = 0 # 0 for Adam, 1 for SGD
# data_aug = False # False = No extra data aug

# # ======= trainer ======
# main(num_epochs = num_epochs, batch_size = batch_size, num_workers = num_workers,  dataset1 = dataset1, data_path = data_path, model_id = model_id, optimizer_id = optimizer_id, data_aug = data_aug)

In [None]:
# # ======== experiment 2 ===========
# # C5L3 base-SGD-dropout all-no extra aug
# # ========= hyper parameters =========
# # num_epochs = 150
# batch_size = 64
# num_workers = 2

# # choose path, dataset, model
# run_on_local = 0 # 1 for True, 0 for False
# dataset1 = 'CIFAR10' # choose between 'MNIST' or 'CIFAR10'
# model_id = -3 # 0/2/4/6/8/10 for MNIST, 1/3/5/7/9/11 for CIFAR10
# # 2025-3-8 running: resenet18 and cgg16 on cifar10 v/v4
# modelnames = ['C3L2_MNIST', 'C3L2_cifar10', 'C5L3_MNIST', 'C5L3_cifar10', 'ResNet18_MNIST', 'ResNet18_cifar10' ,'ResNet20_MNIST', 'ResNet20_cifar10', 'ResNet50_MNIST', 'ResNet50_cifar10', 'VGG16_MNIST', 'VGG16_cifar10', 'ResNet20_omni', 'C5L4_base_cifar10', 'C5L4_BEST_cifar10', 'C5L4_BEST_cifar10_nodrp']

# optimizer_id = 1 # 0 for Adam, 1 for SGD
# data_aug = True

# # ======= trainer ======
# main(num_epochs = num_epochs, batch_size = batch_size, num_workers = num_workers,  dataset1 = dataset1, data_path = data_path, model_id = model_id, optimizer_id = optimizer_id, data_aug = data_aug)

In [None]:
# # ======== experiment 3 ===========
# # C5L3 base-Adam-dropout linear-BatchNorm
# # ========= hyper parameters =========
# num_epochs = 150
# batch_size = 64
# num_workers = 2

# # choose path, dataset, model
# run_on_local = 0 # 1 for True, 0 for False
# dataset1 = 'CIFAR10' # choose between 'MNIST' or 'CIFAR10'
# model_id = -2 # 0/2/4/6/8/10 for MNIST, 1/3/5/7/9/11 for CIFAR10
# # 2025-3-8 running: resenet18 and cgg16 on cifar10 v/v4
# modelnames = ['C3L2_MNIST', 'C3L2_cifar10', 'C5L3_MNIST', 'C5L3_cifar10', 'ResNet18_MNIST', 'ResNet18_cifar10' ,'ResNet20_MNIST', 'ResNet20_cifar10', 'ResNet50_MNIST', 'ResNet50_cifar10', 'VGG16_MNIST', 'VGG16_cifar10', 'ResNet20_omni', 'C5L4_base_cifar10', 'C5L4_BEST_cifar10', 'C5L4_BEST_cifar10_nodrp']

# optimizer_id = 0 # 0 for Adam, 1 for SGD
# data_aug = True

# # ======= trainer ======
# main(num_epochs = num_epochs, batch_size = batch_size, num_workers = num_workers,  dataset1 = dataset1, data_path = data_path, model_id = model_id, optimizer_id = optimizer_id, data_aug = data_aug)

In [None]:
# # ======== experiment 4 ===========
# # C5L3 base-Adam-dropout none-BatchNorm
# # ========= hyper parameters =========
# num_epochs = 150
# batch_size = 64
# num_workers = 2

# # choose path, dataset, model
# run_on_local = 0 # 1 for True, 0 for False
# dataset1 = 'CIFAR10' # choose between 'MNIST' or 'CIFAR10'
# model_id = -1 # 0/2/4/6/8/10 for MNIST, 1/3/5/7/9/11 for CIFAR10
# # 2025-3-8 running: resenet18 and cgg16 on cifar10 v/v4
# modelnames = ['C3L2_MNIST', 'C3L2_cifar10', 'C5L3_MNIST', 'C5L3_cifar10', 'ResNet18_MNIST', 'ResNet18_cifar10' ,'ResNet20_MNIST', 'ResNet20_cifar10', 'ResNet50_MNIST', 'ResNet50_cifar10', 'VGG16_MNIST', 'VGG16_cifar10', 'ResNet20_omni', 'C5L4_base_cifar10', 'C5L4_BEST_cifar10', 'C5L4_BEST_cifar10_nodrp']

# optimizer_id = 0 # 0 for Adam, 1 for SGD
# data_aug = True

# # ======= trainer ======
# main(num_epochs = num_epochs, batch_size = batch_size, num_workers = num_workers,  dataset1 = dataset1, data_path = data_path, model_id = model_id, optimizer_id = optimizer_id, data_aug = data_aug)

In [None]:
# ======== experiment 5 ===========
# C5L3 base-Adam-dropout linear-BatchNorm
# ========= hyper parameters =========
num_epochs, batch_size, num_workers = 150, 64, 2

# choose path, dataset, model
run_on_local = 0  # 1 for True, 0 for False
dataset1 = 'CIFAR10'  # choose between 'MNIST' or 'CIFAR10'
# Index into `modelnames`: 0/2/4/6/8/10 for MNIST, 1/3/5/7/9/11 for CIFAR10;
# -2 picks the second-to-last entry ('C5L4_BEST_cifar10').
model_id = -2
# Run log 2025-03-08: ResNet18 and VGG16 on CIFAR10 (v/v4)
modelnames = [
    'C3L2_MNIST', 'C3L2_cifar10',
    'C5L3_MNIST', 'C5L3_cifar10',
    'ResNet18_MNIST', 'ResNet18_cifar10',
    'ResNet20_MNIST', 'ResNet20_cifar10',
    'ResNet50_MNIST', 'ResNet50_cifar10',
    'VGG16_MNIST', 'VGG16_cifar10',
    'ResNet20_omni',
    'C5L4_base_cifar10', 'C5L4_BEST_cifar10', 'C5L4_BEST_cifar10_nodrp',
]

optimizer_id = 0  # 0 for Adam, 1 for SGD
data_aug = False

# ======= trainer ======
# `modelnames` is not passed here, so main() uses the default it was defined with.
main(
    num_epochs=num_epochs,
    batch_size=batch_size,
    num_workers=num_workers,
    dataset1=dataset1,
    data_path=data_path,
    model_id=model_id,
    optimizer_id=optimizer_id,
    data_aug=data_aug,
)

In [None]:
# # ======== experiment 6 ===========
# # C5L3 base-Adam-dropout linear-BatchNorm-extra aug-more epo
# # ========= hyper parameters =========
# num_epochs = 256
# batch_size = 64
# num_workers = 2

# # choose path, dataset, model
# run_on_local = 0 # 1 for True, 0 for False
# dataset1 = 'CIFAR10' # choose between 'MNIST' or 'CIFAR10'
# model_id = -2 # 0/2/4/6/8/10 for MNIST, 1/3/5/7/9/11 for CIFAR10
# # 2025-3-8 running: resenet18 and cgg16 on cifar10 v/v4
# modelnames = ['C3L2_MNIST', 'C3L2_cifar10', 'C5L3_MNIST', 'C5L3_cifar10', 'ResNet18_MNIST', 'ResNet18_cifar10' ,'ResNet20_MNIST', 'ResNet20_cifar10', 'ResNet50_MNIST', 'ResNet50_cifar10', 'VGG16_MNIST', 'VGG16_cifar10', 'ResNet20_omni', 'C5L4_base_cifar10', 'C5L4_BEST_cifar10', 'C5L4_BEST_cifar10_nodrp']

# optimizer_id = 0 # 0 for Adam, 1 for SGD
# data_aug = True

# # ======= trainer ======
# main(num_epochs = num_epochs, batch_size = batch_size, num_workers = num_workers,  dataset1 = dataset1, data_path = data_path, model_id = model_id, optimizer_id = optimizer_id, data_aug = data_aug)