In [11]:
from new_model import *
from tqdm import tqdm

In [12]:
seed = 777
random.seed(seed)
np.random.seed(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(seed)
if device == 'cuda':
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## 1. CIFAR 10 데이터셋 다운로드

In [13]:
#Data 다운로드 경로지정
DATA_ROOT = "../data_cifar10"
batch_size = 128


#Data Process 
transform_train = transforms.Compose([transforms.RandomCrop(32, padding=4),transforms.RandomHorizontalFlip(),
                                      transforms.ToTensor(),transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) 

transform_val = transforms.Compose([transforms.ToTensor(), 
                                    transforms.Normalize((0.4914, 0.4822, 0.4465),(0.2023, 0.1994, 0.2010))])

transform_test = transforms.Compose([transforms.ToTensor(),
                                     transforms.Normalize((0.4914, 0.4822, 0.4465),(0.2023, 0.1994, 0.2010))])

train_CIFAR10 = torchvision.datasets.CIFAR10(root='../data', train=True, download=True, transform=transform_train)

test_CIFAR10 = torchvision.datasets.CIFAR10(root='../data', train=False, download=True, transform=transform_test)

num_train = int(1.0 * len(train_CIFAR10) * 95 / 100)
num_val = len(train_CIFAR10) - num_train
train_CIFAR10, val_CIFAR10 = torch.utils.data.random_split(train_CIFAR10, [num_train, num_val])

train_loader = torch.utils.data.DataLoader(train_CIFAR10, batch_size=batch_size, shuffle=True, num_workers=2) #num_workers는 데이터 로드시 sub process 몇개 쓸거냐 
val_loader = torch.utils.data.DataLoader(val_CIFAR10, batch_size=batch_size,shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(test_CIFAR10, batch_size=batch_size, shuffle=True, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


## 2. CIFAR 100 데이터셋 다운로드(참고)

In [14]:
#Data 다운로드 경로지정
DATA_ROOT = "../data_cifar100"
batch_size = 128

#Data Process
transform_train = transforms.Compose([transforms.RandomCrop(32, padding=4),transforms.RandomHorizontalFlip(),
                                      transforms.ToTensor(),transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) 

transform_val = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) 

transform_test = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) 

train_CIFAR100 = torchvision.datasets.CIFAR100(root='../data', train=True, download=True, transform=transform_train)

test_CIFAR100 = torchvision.datasets.CIFAR100(root='../data', train=False, download=True, transform=transform_test)

num_train = int(1.0 * len(train_CIFAR100) * 95 / 100)
num_val = len(train_CIFAR100) - num_train
train_CIFAR100, val_CIFAR100 = torch.utils.data.random_split(train_CIFAR100, [num_train, num_val])

train_loader = torch.utils.data.DataLoader(train_CIFAR100, batch_size=batch_size, shuffle=True, num_workers=2) #num_workers는 데이터 로드시 sub process 몇개 쓸거냐 
val_loader = torch.utils.data.DataLoader(val_CIFAR100, batch_size=batch_size,shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(test_CIFAR100, batch_size=batch_size, shuffle=True, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


## 4. train_model과 test_model 저장

In [15]:
def train_model(model, mode):    
    # total number of training epochs
    EPOCHS = 200
    CHECKPOINT_PATH = "./saved_model"
    
    best_val_acc = 0
    current_learning_rate = INITIAL_LR

    print("==> Training starts!")

    start = time.time()
    optimizer = torch.optim.SGD(model.parameters(), lr=INITIAL_LR, momentum=MOMENTUM, weight_decay=REG)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones = [60, 120, 160], gamma = 0.2)
    
    for i in tqdm(range(0, EPOCHS)):
        model.train()
        
        # this help you compute the training accuracy
        total_examples = 0
        correct_examples = 0
        train_loss = 0

        
        '''
        train loop
        '''
        # Train the model for 1 epoch.
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device).float(), targets.to(device).long() #inputs과 targets는 gpu로 계산

            # compute the output and loss
            y_preds = model(inputs)        
            loss = criterion(y_preds,targets)
            train_loss += loss.item()

            # zero the gradient
            optimizer.zero_grad()

            # backpropagation
            loss.backward()

            # apply gradient and update the weights
            optimizer.step()
            


            # count the number of correctly predicted samples in the current batch
            y_preds_class = torch.argmax(y_preds, dim=1)
            correct_examples += (targets == y_preds_class).sum().item()
            total_examples += targets.size(0)
        
        # scheduler
        scheduler.step()
        avg_loss_tr = train_loss / len(train_loader) ###
        avg_acc_tr = correct_examples / total_examples ### 

        
                                                      
        '''
        validation loop
        '''
                                                      
        # switch to eval mode
        model.eval()
        total_examples = 0
        correct_examples = 0
        val_loss = 0 # again, track the validation loss if you want
        
        # disable gradient during validation, which can save GPU memory
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(val_loader):
                inputs, targets = inputs.to(device).float(), targets.to(device).long()

                # compute the output and loss
                y_preds = model(inputs)        
                loss = criterion(y_preds,targets)
                val_loss += loss.item()

                # count the number of correctly predicted samples in the current batch
                y_preds_class = torch.argmax(y_preds, dim=1)
                correct_examples += (targets == y_preds_class).sum().item()
                total_examples += targets.size(0)

        avg_loss_val = val_loss / len(val_loader)
        avg_acc_val = correct_examples / total_examples

                                                      
        # save the model checkpoint
        if avg_acc_val > best_val_acc :
            best_val_acc = avg_acc_val
            if not os.path.exists(CHECKPOINT_PATH):
                os.makedirs(CHECKPOINT_PATH)

            state = {'state_dict': model.state_dict(),
                     'epoch': i,
                     'lr': current_learning_rate}
            
            torch.save(state, os.path.join(CHECKPOINT_PATH, '{}.pth'.format(mode)))
        
        if i % 10 == 9 :  #epoch를 5개씩 출력해서 봐보겠다
            end = time.time()
            diff_time = round(end - start,2)
            print("Epoch %d:" %(i+1), f"progress time is {diff_time} sec")
            print("Training loss: %.4f, Training accuracy: %.4f" %(avg_loss_tr, avg_acc_tr)) 
            print("Validation loss: %.4f, Validation accuracy: %.4f" % (avg_loss_val, avg_acc_val))
        time.sleep(0.1)
    print(f"==> Optimization finished! Best validation accuracy: {best_val_acc:.4f}")                                    

    return model #결국 history 변수를 지정해서 epoch 1개씩 loss와 accuracy값을 저장하려고 return 해야됨

In [16]:
def test_model(model):
    model.to(device)
    model.eval()

    total_examples = 0
    correct_examples = 0
    softmax = torch.nn.Softmax(dim=1)

    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            pred = model(inputs)
            total_examples += inputs.shape[0]

            out = softmax(pred)
            out = torch.max(out, 1)

            correct_examples += torch.sum(targets==out[1]).cpu().data.numpy().tolist()

    avg_acc = correct_examples / total_examples
    print("Total examples is {}, correct examples is {}; Test accuracy: {}".format(total_examples, correct_examples, avg_acc))

In [17]:
INITIAL_LR = 0.1

# momentum for optimizer
MOMENTUM = 0.9

# L2 regularization strength
REG = 5e-4

# Define loss
criterion = nn.CrossEntropyLoss()

## 5. Wide_resnet model을 train 시켜보자
- net : 28x1, 28x2, 28x4, 28x10

## CIFAR10

In [7]:
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
model = Wide_ResNet(28, 10, 0.3, 10).to(device)
print("The model is deployed to", device)

| Wide-Resnet 28x10
The model is deployed to cuda:3


In [8]:
train_model(model, "resnet28x10_cifar10")

==> Training starts!


  5%|▌         | 10/200 [44:10<14:00:57, 265.56s/it]

Epoch 10: progress time is 2650.66 sec
Training loss: 0.5409, Training accuracy: 0.8158
Validation loss: 0.6667, Validation accuracy: 0.7664


 10%|█         | 20/200 [1:28:33<13:18:02, 266.02s/it]

Epoch 20: progress time is 5313.55 sec
Training loss: 0.3928, Training accuracy: 0.8657
Validation loss: 0.6820, Validation accuracy: 0.7768


 15%|█▌        | 30/200 [2:12:50<12:32:41, 265.66s/it]

Epoch 30: progress time is 7970.85 sec
Training loss: 0.3065, Training accuracy: 0.8934
Validation loss: 0.5902, Validation accuracy: 0.8100


 20%|██        | 40/200 [2:57:03<11:47:09, 265.18s/it]

Epoch 40: progress time is 10623.41 sec
Training loss: 0.3085, Training accuracy: 0.8943
Validation loss: 0.7783, Validation accuracy: 0.7704


 25%|██▌       | 50/200 [3:40:50<10:58:13, 263.29s/it]

Epoch 50: progress time is 13250.25 sec
Training loss: 0.2733, Training accuracy: 0.9064
Validation loss: 0.4347, Validation accuracy: 0.8496


 30%|███       | 60/200 [4:24:45<10:14:48, 263.49s/it]

Epoch 60: progress time is 15885.05 sec
Training loss: 0.2711, Training accuracy: 0.9082
Validation loss: 0.4642, Validation accuracy: 0.8464


 35%|███▌      | 70/200 [5:08:44<9:31:38, 263.83s/it]

Epoch 70: progress time is 18524.56 sec
Training loss: 0.0360, Training accuracy: 0.9893
Validation loss: 0.2195, Validation accuracy: 0.9344


 40%|████      | 80/200 [5:52:31<8:45:38, 262.82s/it]

Epoch 80: progress time is 21151.69 sec
Training loss: 0.0512, Training accuracy: 0.9843
Validation loss: 0.2268, Validation accuracy: 0.9252


 45%|████▌     | 90/200 [6:36:20<8:01:58, 262.90s/it]

Epoch 90: progress time is 23779.96 sec
Training loss: 0.0576, Training accuracy: 0.9813
Validation loss: 0.2411, Validation accuracy: 0.9256


 50%|█████     | 100/200 [7:20:05<7:17:50, 262.70s/it]

Epoch 100: progress time is 26405.64 sec
Training loss: 0.0524, Training accuracy: 0.9833
Validation loss: 0.2222, Validation accuracy: 0.9312


 55%|█████▌    | 110/200 [8:04:00<6:34:56, 263.29s/it]

Epoch 110: progress time is 29040.2 sec
Training loss: 0.0663, Training accuracy: 0.9787
Validation loss: 0.3375, Validation accuracy: 0.9088


 60%|██████    | 120/200 [8:47:52<5:51:14, 263.43s/it]

Epoch 120: progress time is 31672.53 sec
Training loss: 0.0662, Training accuracy: 0.9788
Validation loss: 0.2250, Validation accuracy: 0.9284


 65%|██████▌   | 130/200 [9:31:46<5:07:17, 263.39s/it]

Epoch 130: progress time is 34306.86 sec
Training loss: 0.0033, Training accuracy: 0.9998
Validation loss: 0.1500, Validation accuracy: 0.9576


 70%|███████   | 140/200 [10:15:38<4:23:19, 263.33s/it]

Epoch 140: progress time is 36938.14 sec
Training loss: 0.0026, Training accuracy: 0.9999
Validation loss: 0.1315, Validation accuracy: 0.9600


 75%|███████▌  | 150/200 [10:59:26<3:39:19, 263.19s/it]

Epoch 150: progress time is 39566.63 sec
Training loss: 0.0022, Training accuracy: 1.0000
Validation loss: 0.1213, Validation accuracy: 0.9632


 80%|████████  | 160/200 [11:43:14<2:55:00, 262.51s/it]

Epoch 160: progress time is 42193.91 sec
Training loss: 0.0021, Training accuracy: 1.0000
Validation loss: 0.1457, Validation accuracy: 0.9572


 85%|████████▌ | 170/200 [12:27:06<2:11:36, 263.21s/it]

Epoch 170: progress time is 44826.31 sec
Training loss: 0.0019, Training accuracy: 1.0000
Validation loss: 0.1331, Validation accuracy: 0.9600


 90%|█████████ | 180/200 [13:10:56<1:27:40, 263.01s/it]

Epoch 180: progress time is 47456.05 sec
Training loss: 0.0018, Training accuracy: 1.0000
Validation loss: 0.1217, Validation accuracy: 0.9644


 95%|█████████▌| 190/200 [13:54:43<43:46, 262.66s/it]

Epoch 190: progress time is 50083.15 sec
Training loss: 0.0024, Training accuracy: 1.0000
Validation loss: 0.1244, Validation accuracy: 0.9652


100%|██████████| 200/200 [14:38:33<00:00, 263.57s/it]

Epoch 200: progress time is 52713.68 sec
Training loss: 0.0023, Training accuracy: 1.0000
Validation loss: 0.1324, Validation accuracy: 0.9620
==> Optimization finished! Best validation accuracy: 0.9672





Wide_ResNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
  (M_relu): M_relu()
  (layer1): Sequential(
    (0): M_BasicBlock(
      (conv1): Conv2d(16, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (M_relu1): M_relu()
      (dropout): Dropout(p=0.3, inplace=False)
      (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (M_relu2): M_relu()
      (shortcut): Sequential(
        (0): Conv2d(16, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): M_BasicBlock(
      (conv1): Conv2d(160, 160, kernel_si

## CIFAR100

In [18]:
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
model = Wide_ResNet(28, 10, 0.3, 100).to(device)
print("The model is deployed to", device)

| Wide-Resnet 28x10
The model is deployed to cuda:3


In [19]:
train_model(model, "resnet28x10_cifar100")

==> Training starts!


  5%|▌         | 10/200 [44:11<14:01:02, 265.59s/it]

Epoch 10: progress time is 2651.37 sec
Training loss: 1.6415, Training accuracy: 0.5423
Validation loss: 2.4632, Validation accuracy: 0.3964


 10%|█         | 20/200 [1:28:35<13:18:36, 266.20s/it]

Epoch 20: progress time is 5314.94 sec
Training loss: 1.0495, Training accuracy: 0.6927
Validation loss: 1.6678, Validation accuracy: 0.5532


 15%|█▌        | 30/200 [2:12:53<12:32:39, 265.65s/it]

Epoch 30: progress time is 7972.93 sec
Training loss: 0.8776, Training accuracy: 0.7388
Validation loss: 1.5876, Validation accuracy: 0.5832


 20%|██        | 40/200 [2:57:07<11:47:03, 265.15s/it]

Epoch 40: progress time is 10626.99 sec
Training loss: 0.8305, Training accuracy: 0.7529
Validation loss: 1.7076, Validation accuracy: 0.5588


 25%|██▌       | 50/200 [3:41:01<10:58:25, 263.37s/it]

Epoch 50: progress time is 13261.83 sec
Training loss: 0.7872, Training accuracy: 0.7664
Validation loss: 1.6541, Validation accuracy: 0.5832


 30%|███       | 60/200 [4:24:57<10:14:59, 263.57s/it]

Epoch 60: progress time is 15897.77 sec
Training loss: 0.7558, Training accuracy: 0.7756
Validation loss: 1.6346, Validation accuracy: 0.5772


 35%|███▌      | 70/200 [5:08:54<9:30:40, 263.39s/it]

Epoch 70: progress time is 18533.93 sec
Training loss: 0.0543, Training accuracy: 0.9916
Validation loss: 0.8496, Validation accuracy: 0.7860


 40%|████      | 80/200 [5:52:54<8:47:53, 263.94s/it]

Epoch 80: progress time is 21174.07 sec
Training loss: 0.1371, Training accuracy: 0.9666
Validation loss: 1.1948, Validation accuracy: 0.7044


 45%|████▌     | 90/200 [6:36:49<8:03:23, 263.67s/it]

Epoch 90: progress time is 23809.04 sec
Training loss: 0.1343, Training accuracy: 0.9664
Validation loss: 1.1873, Validation accuracy: 0.7036


 50%|█████     | 100/200 [7:20:46<7:19:17, 263.57s/it]

Epoch 100: progress time is 26446.72 sec
Training loss: 0.1225, Training accuracy: 0.9700
Validation loss: 1.1596, Validation accuracy: 0.7084


 55%|█████▌    | 110/200 [8:04:48<6:35:39, 263.78s/it]

Epoch 110: progress time is 29088.66 sec
Training loss: 0.1241, Training accuracy: 0.9689
Validation loss: 1.2716, Validation accuracy: 0.6968


 60%|██████    | 120/200 [8:48:45<5:51:28, 263.61s/it]

Epoch 120: progress time is 31725.66 sec
Training loss: 0.1151, Training accuracy: 0.9708
Validation loss: 1.2361, Validation accuracy: 0.7028


 65%|██████▌   | 130/200 [9:32:37<5:07:32, 263.61s/it]

Epoch 130: progress time is 34357.47 sec
Training loss: 0.0077, Training accuracy: 0.9996
Validation loss: 0.8770, Validation accuracy: 0.7888


 70%|███████   | 140/200 [10:16:30<4:23:25, 263.42s/it]

Epoch 140: progress time is 36990.33 sec
Training loss: 0.0079, Training accuracy: 0.9997
Validation loss: 0.8285, Validation accuracy: 0.7852


 75%|███████▌  | 150/200 [11:00:25<3:39:46, 263.72s/it]

Epoch 150: progress time is 39625.6 sec
Training loss: 0.0083, Training accuracy: 0.9997
Validation loss: 0.8632, Validation accuracy: 0.7832


 80%|████████  | 160/200 [11:44:25<2:56:03, 264.09s/it]

Epoch 160: progress time is 42265.73 sec
Training loss: 0.0090, Training accuracy: 0.9997
Validation loss: 0.8636, Validation accuracy: 0.7856


 85%|████████▌ | 170/200 [12:28:12<2:11:02, 262.10s/it]

Epoch 170: progress time is 44892.74 sec
Training loss: 0.0078, Training accuracy: 0.9997
Validation loss: 0.8532, Validation accuracy: 0.7904


 90%|█████████ | 180/200 [13:11:47<1:27:08, 261.41s/it]

Epoch 180: progress time is 47507.9 sec
Training loss: 0.0083, Training accuracy: 0.9998
Validation loss: 0.8746, Validation accuracy: 0.7832


 95%|█████████▌| 190/200 [13:55:28<43:36, 261.66s/it]

Epoch 190: progress time is 50128.46 sec
Training loss: 0.0088, Training accuracy: 0.9998
Validation loss: 0.8482, Validation accuracy: 0.7856


100%|██████████| 200/200 [14:39:05<00:00, 263.73s/it]

Epoch 200: progress time is 52745.14 sec
Training loss: 0.0081, Training accuracy: 0.9997
Validation loss: 0.8346, Validation accuracy: 0.7908
==> Optimization finished! Best validation accuracy: 0.7996





Wide_ResNet(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
  (M_relu): M_relu()
  (layer1): Sequential(
    (0): M_BasicBlock(
      (conv1): Conv2d(16, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (M_relu1): M_relu()
      (dropout): Dropout(p=0.3, inplace=False)
      (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (M_relu2): M_relu()
      (shortcut): Sequential(
        (0): Conv2d(16, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): M_BasicBlock(
      (conv1): Conv2d(160, 160, kernel_si

In [20]:
send_email('resnet28x10_cifar100 finished')

## test_model을 통해 결과확인

In [10]:
#wide_resnet 28x1의 결과값
checkpoint = torch.load("./saved_model/resnet28x10_cifar10.pth")
model.load_state_dict(checkpoint['state_dict'])
test_model(model)

Total examples is 10000, correct examples is 9572; Test accuracy: 0.9572


In [21]:
#wide_resnet 28x1의 결과값
checkpoint = torch.load("./saved_model/resnet28x10_cifar100.pth")
model.load_state_dict(checkpoint['state_dict'])
test_model(model)

Total examples is 10000, correct examples is 8002; Test accuracy: 0.8002
