# Serial Execution

Resnet 50 Model with 20 CPUs for Resnet 50

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision.models import resnet50, ResNet50_Weights
import time
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
num_cpu_cores = torch.get_num_threads()
print("Number of CPU cores:", num_cpu_cores)

Number of CPU cores: 20


In [3]:
data_dir = '/home/hindupur.v/varsha_hpp/dataset/lung_colon_image_set'

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [5]:
weights = ResNet50_Weights.DEFAULT
model = resnet50(weights=weights)
num_ftrs = model.fc.in_features
num_classes = 5
model.fc = nn.Linear(num_ftrs, num_classes)  # 5 classes
model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [6]:
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

class LungColonCancerDataset(datasets.ImageFolder):
    def __init__(self, root_dir, transform=None):
        super(LungColonCancerDataset, self).__init__(root=root_dir, transform=transform)


In [7]:
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.cuda()
            labels = labels.cuda()
            
            # Use AMP for evaluation
            with amp.autocast():
                outputs = model(inputs)
            
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return accuracy, precision, recall, f1

In [8]:
def plot_learning_curves(train_losses, valid_losses, valid_accuracies):
    plt.plot(train_losses, label='Train Loss')
    plt.plot(valid_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Validation Loss')
    plt.show()

    plt.plot(valid_accuracies, label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.title('Validation Accuracy')
    plt.show()

In [9]:
def train(num_workers, batch_size):
    print("Starting training...")

    transform = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # train dataset
    train_dataset = LungColonCancerDataset(root_dir=os.path.join(data_dir, 'train'), transform=transform)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True)

    # valid dataset
    validation_dataset = LungColonCancerDataset(root_dir=os.path.join(data_dir, 'valid'), transform=transform)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    
    # test dataset
    test_dataset = LungColonCancerDataset(root_dir=os.path.join(data_dir, 'test'), transform=transform)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    train_losses = []
    valid_losses = []
    valid_accuracies = []

    start_time = time.time()
    print("Model Training Start Time:", start_time)

    for epoch in range(10):  # number of epochs
        epoch_start_time = time.time()  # Start time for the epoch
        running_loss = 0.0
        epoch_train_losses = []
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            epoch_train_losses.append(loss.item())  # Appending the loss for all the batch

            if batch_idx % 10 == 0:  # Log every 10 batches
                print(f"Epoch {epoch+1} Batch {batch_idx+1}: Loss {loss.item()}")

        train_losses.append(sum(epoch_train_losses) / len(epoch_train_losses))

        epoch_duration = time.time() - epoch_start_time
        print("Each Epoch Duration:", epoch_duration)

        # Validation
        model.eval()
        valid_loss = 0.0
        correct = 0
        with torch.no_grad():
            for inputs, labels in validation_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                valid_loss += loss.item()
                _, preds = torch.max(outputs, 1)
                correct += torch.sum(preds == labels.data).item()

        valid_loss = valid_loss / len(validation_loader.dataset)
        valid_acc = correct / len(validation_loader.dataset)
        valid_losses.append(valid_loss)
        valid_accuracies.append(valid_acc)
        print(f'Epoch {epoch+1}, Validation Loss: {valid_loss}, Validation Acc: {valid_acc}')

    print("Test Results:")
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += torch.sum(preds == labels.data).item()

    test_loss = test_loss / len(test_loader.dataset)
    test_acc = correct / len(test_loader.dataset)
    print(f'Test Loss: {test_loss}, Test Acc: {test_acc}')

    end_time = time.time()
    print("Model Training End Time:", end_time)

    duration = end_time - start_time
    print("Training duration:", duration, "seconds")

    return model


In [10]:
# Calling the training function to get the trained model executing serially
num_workers = 20 # instead of 28 allocated resources we are using 20
batch_size = 1000
trained_model = train(num_workers, batch_size)

Starting training...
Model Training Start Time: 1713146766.8438027
Epoch 1 Batch 1: Loss 1.5834465026855469
Epoch 1 Batch 11: Loss 1.4695097208023071
Each Epoch Duration: 454.13565945625305
Epoch 1, Validation Loss: 0.0016140839099884033, Validation Acc: 0.6972
Epoch 2 Batch 1: Loss 1.312660813331604
Epoch 2 Batch 11: Loss 0.6506096124649048
Each Epoch Duration: 312.9125328063965
Epoch 2, Validation Loss: 0.00028225359618663787, Validation Acc: 0.9352
Epoch 3 Batch 1: Loss 0.18943431973457336
Epoch 3 Batch 11: Loss 0.1286216527223587
Each Epoch Duration: 312.2545623779297
Epoch 3, Validation Loss: 0.00010889855772256851, Validation Acc: 0.9664
Epoch 4 Batch 1: Loss 0.07314293831586838
Epoch 4 Batch 11: Loss 0.06757137179374695
Each Epoch Duration: 313.1047863960266
Epoch 4, Validation Loss: 9.073022864758968e-05, Validation Acc: 0.974
Epoch 5 Batch 1: Loss 0.04860764741897583
Epoch 5 Batch 11: Loss 0.045507095754146576
Each Epoch Duration: 313.2925455570221
Epoch 5, Validation Loss: 6.

Resnet 50 Model with 20 CPUs for Resnet 50

Conclusion:

The average time execution for each epoch is given below for Resnet 50:

Epoch 1 Duration: 304.2916314601898

Epoch 2 Duration: 292.7004778385162

Epoch 3 Duration: 312.2545623779297

Epoch 4 Duration: 313.1047863960266

Epoch 5 Duration: 313.2925455570221

Epoch 6 Duration: 314.3557872772217

Epoch 7 Duration: 312.7239208221435

Epoch 8 Duration: 313.0403878688812

Epoch 9 Duration: 311.6718096733093

Epoch 10 Duration: 319.4998972415924

So, the average time execution for each epoch for ResNet 50 is 312.09 seconds.
