In [104]:
# Container image passed as `base_image` to the pipeline components defined below.
# The tag pins the PyTorch release and its CUDA/cuDNN runtime variant.
BASE_IMAGE = "pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime"

In [105]:
import kfp
import kfp.dsl as dsl
from kfp.dsl import Input, Output
from kfp.dsl import Dataset, Artifact
from kfp.dsl import Model, Metrics, ClassificationMetrics

from typing import NamedTuple

In [106]:
@dsl.component(
    base_image=BASE_IMAGE,
)
def load_mnist_data(
    train_images_pickle: Output[Dataset],
    train_labels_pickle: Output[Dataset],
    test_images_pickle: Output[Dataset],
    test_labels_pickle: Output[Dataset],
):
    """Download a small MNIST subset and pickle it into four output artifacts.

    The first 1000 training samples and the first 100 test samples are kept.
    Images and labels are stored as separate Python lists, one pickle file
    per artifact. No transform is passed to the dataset, so the images are
    stored exactly as torchvision yields them (presumably PIL images --
    confirm against the torchvision version in the base image).

    Args:
        train_images_pickle: Receives the pickled list of training images.
        train_labels_pickle: Receives the pickled list of training labels.
        test_images_pickle: Receives the pickled list of test images.
        test_labels_pickle: Receives the pickled list of test labels.
    """
    import torch
    import torchvision
    import numpy as np
    import pickle

    def _images_then_labels(samples):
        # Two passes over the subset: one collecting images, one collecting labels.
        images = [image for image, _ in samples]
        labels = [label for _, label in samples]
        return images, labels

    # The dataset is downloaded into the current working directory.
    train_indices = list(range(1000))
    full_train = torchvision.datasets.MNIST('.', train=True, download=True)
    train_set = torch.utils.data.Subset(full_train, train_indices)

    test_indices = list(range(100))
    full_test = torchvision.datasets.MNIST('.', train=False, download=True)
    test_set = torch.utils.data.Subset(full_test, test_indices)
    print("MNIST dataset has been downloaded")

    train_images, train_labels = _images_then_labels(train_set)
    test_images, test_labels = _images_then_labels(test_set)
    print("MNIST dataset has been separated")

    # Write each list to the path of its output artifact.
    artifact_payloads = (
        (train_images_pickle, train_images),
        (train_labels_pickle, train_labels),
        (test_images_pickle, test_images),
        (test_labels_pickle, test_labels),
    )
    for artifact, payload in artifact_payloads:
        with open(artifact.path, 'wb') as sink:
            pickle.dump(payload, sink)

    print("MNIST dataset has been pickled.")


In [107]:
@dsl.component(base_image=BASE_IMAGE)
def preprocess_data(
    train_images_pickle: Input[Dataset],
    train_labels_pickle: Input[Dataset],
    test_images_pickle: Input[Dataset],
    test_labels_pickle: Input[Dataset],
    train_images_prep: Output[Dataset],
    train_labels_prep: Output[Dataset],
    test_images_prep: Output[Dataset],
    test_labels_prep: Output[Dataset],
    
) -> NamedTuple("outputs", input_size=int, num_labels=int):
    """Pass the pickled dataset through and report its basic dimensions.

    The four input pickles are loaded and re-written unchanged to the four
    ``*_prep`` output artifacts; no resizing or normalisation happens here.

    Args:
        train_images_pickle: Pickled list of training images.
        train_labels_pickle: Pickled list of training labels.
        test_images_pickle: Pickled list of test images.
        test_labels_pickle: Pickled list of test labels.
        train_images_prep: Receives the training images.
        train_labels_prep: Receives the training labels.
        test_images_prep: Receives the test images.
        test_labels_prep: Receives the test labels.

    Returns:
        A named tuple ``(input_size, num_labels)`` where ``input_size`` is the
        number of training samples and ``num_labels`` is the number of
        distinct label values found in the training labels.

    Raises:
        ValueError: If an image list and its label list differ in length.
    """
    import numpy as np
    import pickle
    from typing import NamedTuple

    def _load(artifact):
        # NOTE: pickle.load executes arbitrary code; these files are expected
        # to come only from the upstream step of this same pipeline.
        with open(artifact.path, "rb") as file:
            return pickle.load(file)

    def _dump(payload, artifact):
        with open(artifact.path, "wb") as file:
            pickle.dump(payload, file)

    train_images = _load(train_images_pickle)
    train_labels = _load(train_labels_pickle)
    test_images = _load(test_images_pickle)
    test_labels = _load(test_labels_pickle)

    # A length mismatch would otherwise be silently truncated by zip() later on.
    if len(train_images) != len(train_labels):
        raise ValueError(
            f"train images/labels length mismatch: {len(train_images)} vs {len(train_labels)}"
        )
    if len(test_images) != len(test_labels):
        raise ValueError(
            f"test images/labels length mismatch: {len(test_images)} vs {len(test_labels)}"
        )

    input_size = len(train_images)
    # Count distinct classes, not label entries (which would just repeat input_size).
    num_labels = int(len(np.unique(train_labels)))

    _dump(train_images, train_images_prep)
    _dump(train_labels, train_labels_prep)
    _dump(test_images, test_images_prep)
    _dump(test_labels, test_labels_prep)

    outputs = NamedTuple("outputs", input_size=int, num_labels=int)
    return outputs(input_size, num_labels)

In [126]:
@dsl.component(base_image=BASE_IMAGE)
def train(
    input_size: int,
    num_labels: int,
    epochs: int,
    train_images_prep: Input[Dataset],
    train_labels_prep: Input[Dataset],
    model_artifact: Output[Model],
    log: Output[Artifact],
):
    """Train a small CNN on the pickled training set and save its weights.

    Args:
        input_size: Accepted for interface compatibility; progress logging
            uses the actual size of the loaded dataset instead.
        num_labels: Accepted for interface compatibility; the network has a
            fixed 10-way output layer.
        epochs: Number of passes over the training set (must be >= 1).
        train_images_prep: Pickled list of training images (expected to be
            PIL images, as produced when MNIST is loaded without a transform).
        train_labels_prep: Pickled list of integer training labels.
        model_artifact: Directory artifact receiving ``model.pth`` and
            ``optimizer.pth`` (state dicts).
        log: Directory artifact receiving ``train_log.json`` with the
            recorded loss history.

    Raises:
        ValueError: If ``epochs`` < 1, the training set is empty, or images
            and labels differ in length.
    """
    import json
    import os
    import pickle
    from datetime import datetime

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    import torchvision
    from torch.utils.data import DataLoader, TensorDataset

    # The network is defined inline because the component must be
    # self-contained; `evaluate` has to declare the same architecture.
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            # Normalise over the class dimension explicitly.
            return F.log_softmax(x, dim=1)

    batch_size_train = 100
    learning_rate = 0.01
    momentum = 0.5
    log_interval = 10

    if epochs < 1:
        raise ValueError(f"epochs must be >= 1, got {epochs}")

    random_seed = 1
    torch.backends.cudnn.enabled = False
    torch.manual_seed(random_seed)

    # NOTE: pickle.load executes arbitrary code; these files are expected to
    # come only from the upstream step of this same pipeline.
    with open(train_images_prep.path, "rb") as file:
        train_images = pickle.load(file)

    with open(train_labels_prep.path, "rb") as file:
        train_labels = pickle.load(file)

    if len(train_images) == 0:
        raise ValueError("training set is empty")
    if len(train_images) != len(train_labels):
        raise ValueError(
            f"images/labels length mismatch: {len(train_images)} vs {len(train_labels)}"
        )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Running on {device}.")

    # Convert each image to a (1, 28, 28) float tensor; labels become one long tensor.
    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize((28, 28)),
        torchvision.transforms.ToTensor(),
    ])
    image_tensor = torch.stack([transform(image) for image in train_images])
    label_tensor = torch.tensor(train_labels, dtype=torch.long)
    dataset = TensorDataset(image_tensor, label_tensor)
    train_loader = DataLoader(dataset, batch_size=batch_size_train, shuffle=True)
    num_samples = len(dataset)

    network = Net().to(device)
    optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                          momentum=momentum)

    # The artifact path is used as a directory, so it must exist before saving.
    os.makedirs(model_artifact.path, exist_ok=True)

    train_losses = []
    train_counter = []

    print("started training")
    network.train()

    for epoch in range(1, epochs + 1):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = network(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), num_samples,
                    100. * batch_idx / len(train_loader), loss.item()))
                train_losses.append(loss.item())
                train_counter.append(
                    (batch_idx * batch_size_train) + ((epoch - 1) * num_samples))

        # Checkpoint once per epoch so the final weights are always on disk.
        torch.save(network.state_dict(), model_artifact.path + '/model.pth')
        torch.save(optimizer.state_dict(), model_artifact.path + '/optimizer.pth')

    # Persist the loss history so the `log` output artifact is not empty.
    log_dir = f"{log.path}/logs/fit/{datetime.now().strftime('%Y%m%d-%H%M%S')}"
    os.makedirs(log_dir, exist_ok=True)
    with open(os.path.join(log_dir, "train_log.json"), "w") as file:
        json.dump({"train_counter": train_counter, "train_losses": train_losses}, file)

    print("finished training")

    
      

In [127]:
@dsl.component(
    base_image=BASE_IMAGE,
    packages_to_install=["scikit-learn"],
)
def evaluate(
    model_artifact: Input[Model],
    metrics: Output[ClassificationMetrics],
    scalar_metrics: Output[Metrics],
    test_images_prep: Input[Dataset],
    test_labels_prep: Input[Dataset],
):
    """Evaluate the trained CNN on the pickled test set and record metrics.

    Args:
        model_artifact: Directory artifact containing ``model.pth`` (a state
            dict for the ``Net`` architecture declared below).
        metrics: Receives the confusion matrix over the ten digit classes.
        scalar_metrics: Receives the overall ``accuracy``.
        test_images_prep: Pickled list of test images (expected to be PIL
            images, as produced when MNIST is loaded without a transform).
        test_labels_prep: Pickled list of integer test labels.

    Raises:
        ValueError: If the test set is empty or images and labels differ in
            length.
    """
    import pickle

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torchvision
    from sklearn.metrics import classification_report, confusion_matrix

    # Must match the architecture used at training time; it is repeated here
    # because the component has to be self-contained.
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
            self.fc2 = nn.Linear(50, 10)

        def forward(self, x):
            x = F.relu(F.max_pool2d(self.conv1(x), 2))
            x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
            x = x.view(-1, 320)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, training=self.training)
            x = self.fc2(x)
            # Normalise over the class dimension explicitly.
            return F.log_softmax(x, dim=1)

    batch_size_test = 100

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Running on {device}.")

    # NOTE: pickle.load executes arbitrary code; these files are expected to
    # come only from the upstream step of this same pipeline.
    with open(test_images_prep.path, "rb") as file:
        test_images = pickle.load(file)

    with open(test_labels_prep.path, "rb") as file:
        test_labels = pickle.load(file)

    if len(test_images) == 0:
        raise ValueError("test set is empty")
    if len(test_images) != len(test_labels):
        raise ValueError(
            f"images/labels length mismatch: {len(test_images)} vs {len(test_labels)}"
        )

    # Same image-to-tensor conversion as in training: (1, 28, 28) floats in [0, 1].
    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize((28, 28)),
        torchvision.transforms.ToTensor(),
    ])
    image_tensor = torch.stack([transform(image) for image in test_images])
    all_labels = [int(label) for label in test_labels]

    # load_state_dict() mutates the module in place and does not return it,
    # so the network has to be instantiated first. map_location lets weights
    # saved on a GPU be loaded on a CPU-only node.
    network = Net()
    state_dict = torch.load(
        model_artifact.path + '/model.pth', map_location=device, weights_only=True
    )
    network.load_state_dict(state_dict)
    network.to(device)

    # Evaluation mode disables dropout.
    network.eval()

    # Human-readable names for the ten MNIST digit classes.
    classes = ('zero','one','two','three','four','five','six','seven','eight','nine')
    class_ids = list(range(len(classes)))

    all_preds = []
    # No gradients are needed for inference.
    with torch.no_grad():
        for start in range(0, len(image_tensor), batch_size_test):
            images = image_tensor[start:start + batch_size_test].to(device)
            outputs = network(images)
            # Predicted class = index of the largest log-probability.
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().tolist())

    # Passing `labels` keeps the report valid even if a digit is absent from
    # the (small) test subset; zero_division=0 silences the resulting warnings.
    print(classification_report(
        all_labels, all_preds, labels=class_ids, target_names=classes, zero_division=0
    ))

    # Populate the declared metric outputs.
    matrix = confusion_matrix(all_labels, all_preds, labels=class_ids)
    metrics.log_confusion_matrix(list(classes), matrix.tolist())

    correct = sum(1 for truth, pred in zip(all_labels, all_preds) if truth == pred)
    scalar_metrics.log_metric("accuracy", correct / len(all_labels))


In [128]:
@dsl.pipeline(
    name="pytorch_mnist_pipeline_v3",
)
def pytorch_mnist_pipeline_v3(epochs: int):
    # Four-step pipeline: load -> preprocess -> train -> evaluate.
    # `epochs` is forwarded to the training step.

    # Step 1: download the MNIST subset.
    load_task = load_mnist_data()
    load_task.set_memory_limit("4G")
    load_task.set_memory_request("2G")
    load_task.set_cpu_limit("2")
    load_task.set_cpu_request("1")

    # Step 2: hand the pickled data to preprocessing.
    prep_task = preprocess_data(
        train_images_pickle=load_task.outputs["train_images_pickle"],
        train_labels_pickle=load_task.outputs["train_labels_pickle"],
        test_images_pickle=load_task.outputs["test_images_pickle"],
        test_labels_pickle=load_task.outputs["test_labels_pickle"],
    )
    prep_task.set_memory_limit("4G")
    prep_task.set_memory_request("4G")
    prep_task.set_cpu_limit("1")
    prep_task.set_cpu_request("1")
    prep_task.after(load_task)

    # Step 3: train the model. No resource requests or limits are set here.
    train_task = train(
        input_size=prep_task.outputs["input_size"],
        num_labels=prep_task.outputs["num_labels"],
        epochs=epochs,
        train_images_prep=prep_task.outputs["train_images_prep"],
        train_labels_prep=prep_task.outputs["train_labels_prep"],
    )
    train_task.after(prep_task)

    # Step 4: evaluate the trained model on the held-out test data.
    eval_task = evaluate(
        model_artifact=train_task.outputs["model_artifact"],
        test_images_prep=prep_task.outputs["test_images_prep"],
        test_labels_prep=prep_task.outputs["test_labels_prep"],
    )
    eval_task.set_memory_limit("4G")
    eval_task.set_memory_request("4G")
    eval_task.set_cpu_limit("1")
    eval_task.set_cpu_request("1")
    eval_task.after(train_task)


# Create a KFP client; no host or credentials are passed, so the client's
# own defaults are used (presumably in-cluster configuration -- confirm for
# the environment this notebook runs in).
client = kfp.Client()
# Submit the pipeline function as a run with epochs=2 under the named experiment.
# Left as the last expression of the cell so the returned run result is displayed.
client.create_run_from_pipeline_func(
    pytorch_mnist_pipeline_v3,
    arguments={"epochs": 2},
    experiment_name="pytorch_mnist_pipeline_v3",
)

RunPipelineResult(run_id=8b1b0c14-7af9-450f-a2a1-97e9b2a800a6)