# Data Loading

In [1]:
import torch
import torch.nn as nn
from data.dataset import data_loader

%load_ext autoreload

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
train_loader, val_loader = data_loader(
    data_dir='./data/datasets', batch_size=128
)
test_loader = data_loader(
    data_dir='./data/datasets', batch_size=128,
    test=True
)

len(train_loader), len(val_loader), len(test_loader)

Files already downloaded and verified
Files already downloaded and verified


(352, 40, 79)

In [6]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, downsample=False):
        super(ResidualBlock, self).__init__()
        
        stride = 2 if downsample else 1
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels)
        )
        
        self.downsample = downsample
        if downsample:
            self.downsampleLayer = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=2, padding=0, bias=False),
                nn.BatchNorm2d(out_channels)
            )
        
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        if self.downsample:
            residual = self.downsampleLayer(x)
        out += residual
        out = nn.ReLU()(out)
        return out

In [86]:
def get_layer(in_channels, out_channels, num_blocks):
    downsample = in_channels != out_channels
    blocks = [ResidualBlock(in_channels, out_channels, downsample=downsample)]

    for _ in range(num_blocks - 1):
        blocks.append(ResidualBlock(out_channels, out_channels))
    return nn.Sequential(*blocks)


class ResNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ResNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(16),
        )
        self.layer2 = get_layer(16, 16, 9)
        self.layer3 = get_layer(16, 32, 9)
        self.layer4 = get_layer(32, 64, 9)
        self.avgpool = nn.AvgPool2d(kernel_size=8)
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64, num_classes),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = self.fc(x)
        
        return x

In [72]:
def train_model(epochs):
    EPOCHS = epochs
    train_samples_num = 45000
    val_samples_num = 5000
    train_costs, val_costs = [], []
    
    #Training phase.    
    for epoch in range(EPOCHS):

        train_running_loss = 0
        correct_train = 0
        
        model.train().cuda()
        
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            """ for every mini-batch during the training phase, we typically want to explicitly set the gradients 
            to zero before starting to do backpropragation """
            optimizer.zero_grad()
            
            # Start the forward pass
            prediction = model(inputs)
                        
            loss = criterion(prediction, labels)
          
            # do backpropagation and update weights with step()
            loss.backward()         
            optimizer.step()
            
            # print('outputs on which to apply torch.max ', prediction)
            # find the maximum along the rows, use dim=1 to torch.max()
            _, predicted_outputs = torch.max(prediction.data, 1)
            
            # Update the running corrects 
            correct_train += (predicted_outputs == labels).float().sum().item()
            
            ''' Compute batch loss
            multiply each average batch loss with batch-length. 
            The batch-length is inputs.size(0) which gives the number total images in each batch. 
            Essentially I am un-averaging the previously calculated Loss '''
            train_running_loss += (loss.data.item() * inputs.shape[0])


        train_epoch_loss = train_running_loss / train_samples_num
        
        train_costs.append(train_epoch_loss)
        
        train_acc =  correct_train / train_samples_num

        # Now check trained weights on the validation set
        val_running_loss = 0
        correct_val = 0
      
        model.eval().cuda()
    
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                # Forward pass.
                prediction = model(inputs)

                # Compute the loss.
                loss = criterion(prediction, labels)

                # Compute validation accuracy.
                _, predicted_outputs = torch.max(prediction.data, 1)
                correct_val += (predicted_outputs == labels).float().sum().item()

            # Compute batch loss.
            val_running_loss += (loss.data.item() * inputs.shape[0])

            val_epoch_loss = val_running_loss / val_samples_num
            val_costs.append(val_epoch_loss)
            val_acc =  correct_val / val_samples_num
        
        info = "[Epoch {}/{}]: train-loss = {:0.6f} | train-acc = {:0.3f} | val-loss = {:0.6f} | val-acc = {:0.3f}"
        
        print(info.format(epoch+1, EPOCHS, train_epoch_loss, train_acc, val_epoch_loss, val_acc))
        
        torch.save(model.state_dict(), './content/checkpoint_gpu_{}'.format(epoch + 1)) 
                                                                
    torch.save(model.state_dict(), './content/resnet-56_weights_gpu')  
        
    return train_costs, val_costs

In [87]:
model = ResNet(10)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [81]:
train_costs, val_costs = train_model(10)

[Epoch 1/10]: train-loss = 2.212842 | train-acc = 0.239 | val-loss = 0.003716 | val-acc = 0.252
[Epoch 2/10]: train-loss = 2.153141 | train-acc = 0.301 | val-loss = 0.003555 | val-acc = 0.299
[Epoch 3/10]: train-loss = 2.115063 | train-acc = 0.340 | val-loss = 0.003238 | val-acc = 0.354
[Epoch 4/10]: train-loss = 2.094359 | train-acc = 0.362 | val-loss = 0.002937 | val-acc = 0.339
[Epoch 5/10]: train-loss = 2.072569 | train-acc = 0.384 | val-loss = 0.003337 | val-acc = 0.394
[Epoch 6/10]: train-loss = 2.027759 | train-acc = 0.429 | val-loss = 0.003370 | val-acc = 0.431
[Epoch 7/10]: train-loss = 1.966469 | train-acc = 0.490 | val-loss = 0.003141 | val-acc = 0.493
[Epoch 8/10]: train-loss = 1.908447 | train-acc = 0.549 | val-loss = 0.002718 | val-acc = 0.508
[Epoch 9/10]: train-loss = 1.859220 | train-acc = 0.599 | val-loss = 0.002966 | val-acc = 0.585
[Epoch 10/10]: train-loss = 1.825647 | train-acc = 0.633 | val-loss = 0.003153 | val-acc = 0.635


In [88]:
train_costs, val_costs = train_model(175)

[Epoch 1/175]: train-loss = 2.216754 | train-acc = 0.225 | val-loss = 0.003512 | val-acc = 0.254
[Epoch 2/175]: train-loss = 2.164184 | train-acc = 0.289 | val-loss = 0.003693 | val-acc = 0.311
[Epoch 3/175]: train-loss = 2.116097 | train-acc = 0.338 | val-loss = 0.002944 | val-acc = 0.353
[Epoch 4/175]: train-loss = 2.070444 | train-acc = 0.385 | val-loss = 0.003479 | val-acc = 0.408
[Epoch 5/175]: train-loss = 2.038387 | train-acc = 0.417 | val-loss = 0.002930 | val-acc = 0.371
[Epoch 6/175]: train-loss = 2.010324 | train-acc = 0.445 | val-loss = 0.003282 | val-acc = 0.435
[Epoch 7/175]: train-loss = 1.978497 | train-acc = 0.478 | val-loss = 0.003238 | val-acc = 0.508
[Epoch 8/175]: train-loss = 1.931758 | train-acc = 0.525 | val-loss = 0.003315 | val-acc = 0.514
[Epoch 9/175]: train-loss = 1.896959 | train-acc = 0.561 | val-loss = 0.002715 | val-acc = 0.566
[Epoch 10/175]: train-loss = 1.862562 | train-acc = 0.596 | val-loss = 0.002929 | val-acc = 0.595
[Epoch 11/175]: train-loss = 

In [5]:
model = ResNet().cuda()
model.load_state_dict(torch.load('models/resnet-v2_20240417_013340/checkpoint_173'))



NameError: name 'model' is not defined

In [90]:
test_samples_num = 10000
correct = 0 

model.eval().cuda()

with  torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Make predictions.
        prediction = model(inputs)

        # Retrieve predictions indexes.
        _, predicted_class = torch.max(prediction.data, 1)

        # Compute number of correct predictions.
        correct += (predicted_class == labels).float().sum().item()

test_accuracy = correct / test_samples_num
print('Test accuracy: {}'.format(test_accuracy))

Test accuracy: 0.8159


In [100]:
a = ResNet(10)
a.load_state_dict(torch.load('./content/resnet-56_weights_gpu'))

<All keys matched successfully>

In [102]:
a.eval().cuda()

ResNet(
  (layer1): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (layer2): Sequential(
    (0): ResidualBlock(
      (conv1): Sequential(
        (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (conv2): Sequential(
        (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): ResidualBlock(
      (conv1): Sequential(
        (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
      )
      (conv2):

In [103]:
test_samples_num = 10000
correct = 0 

a.eval().cuda()

with  torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Make predictions.
        prediction = a(inputs)

        # Retrieve predictions indexes.
        _, predicted_class = torch.max(prediction.data, 1)

        # Compute number of correct predictions.
        correct += (predicted_class == labels).float().sum().item()

test_accuracy = correct / test_samples_num
print('Test accuracy: {}'.format(test_accuracy))

Test accuracy: 0.3684


In [285]:
inputs, labels = next(iter(test_loader))
inputs, labels = inputs.cuda(), labels.cuda()
pred = model(inputs)

loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(pred, labels).item()
print(f"Total batch loss: {loss:.4f}")

Total batch loss: 1.6217


In [286]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1, 3], gamma=0.1)

In [310]:
num_training = sum(inputs.size(0) for inputs, _ in train_loader)
num_val = sum(inputs.size(0) for inputs, _ in val_loader)

def train_epoch(model):
    running_loss = 0.
    num_correct = 0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.cuda(), labels.cuda()
        
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        running_loss += loss.item() * len(inputs)
        
        class_outputs = torch.argmax(outputs, dim=1)
        num_correct += (class_outputs == labels).sum()
        
        optimizer.step()
    scheduler.step()
    
    avg_loss = running_loss / num_training
    avg_acc = num_correct / num_training
    
    return avg_loss, avg_acc

In [304]:
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

In [311]:
EPOCHS = 3
best_val_loss = torch.inf
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter(f"models/resnet_{timestamp}")

print('Starting training...')

for epoch in range(EPOCHS):
    model.train(True)
    avg_train_loss, avg_train_acc = train_epoch(model)
    model.eval()

    running_val_loss = 0.0
    val_correct = 0
    with torch.no_grad():
        for val_inputs, val_labels in val_loader:
            val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()
            
            val_outputs = model(val_inputs)
            val_loss = loss_fn(val_outputs, val_labels)
            running_val_loss += val_loss * len(val_inputs)
            
            _, class_val_outputs = torch.max(val_outputs.data, dim=1)
            val_correct += (class_val_outputs == val_labels).int().sum().item()
    avg_val_loss = running_val_loss / num_val
    avg_val_acc = val_correct / num_val
    
    print(f"[Epoch {epoch+1}/{EPOCHS}]: train-loss = {avg_train_loss:.4f} | train-acc = {avg_train_acc:.3f} "
          f"| val-loss = {avg_val_loss:.4f} | val-acc = {avg_val_acc}")

    writer.add_scalars('Training vs Validation Loss',
                       {'Training': avg_train_loss, 'Validation': avg_val_loss},
                       epoch + 1)
    writer.add_scalars('Training vs Validation Accuracy',
                       {'Training': avg_train_acc, 'Validation': avg_val_acc},
                       epoch + 1)
    writer.flush()

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model_path = f"models/resnet_{timestamp}/resnet_{epoch}"
        torch.save(model.state_dict(), model_path)

Starting training...
[Epoch 1/3]: train-loss = 1.5002 | train-acc = 0.961 | val-loss = 1.6199 | val-acc = 0.8398
[Epoch 2/3]: train-loss = 1.4994 | train-acc = 0.962 | val-loss = 1.6183 | val-acc = 0.8428


KeyboardInterrupt: 

In [7]:

from model import ResNet
model = ResNet().cuda()
model.load_state_dict(torch.load('models/resnet-v2_20240417_013340/checkpoint_173'))

model.modules()

<generator object Module.modules at 0x7f3760ca0930>

In [11]:
for m in model.modules():
    # print(m.bias)
    p

AttributeError: 'ResNet' object has no attribute 'bias'

tensor([17,  0,  9, 12, 16,  7, 11, 18, 13, 19, 15,  4, 10, 14,  3,  8,  2,  5,
         1,  6])