# Initialization

In [4]:
import torch
import time

In [5]:
# TimeTracker
class TimeTracker:
    """Simple stopwatch that reports elapsed time in whole milliseconds.

    Timing begins as soon as the tracker is constructed. Call ``stop()`` to
    print and obtain the elapsed time; ``start()`` may be called again to
    restart the measurement on the same instance.
    """

    def __init__(self, name):
        """Create the tracker and immediately start timing.

        Args:
            name: Label shown in the printed start/stop banner lines.
        """
        self.name = name
        self.start_time = None
        self.start()

    def start(self):
        """Record the current clock reading as the start of the measurement."""
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments the way a
        # time.time() difference can. Note that start_time is therefore a
        # relative counter value, not an epoch timestamp.
        self.start_time = time.perf_counter()
        print("############# Started Time Tracking:", self.name, "#############")

    def stop(self):
        """Print and return the time elapsed since the most recent ``start()``.

        Returns:
            int: Elapsed time in milliseconds, truncated to a whole number.
        """
        end_time = time.perf_counter()
        duration = int((end_time - self.start_time) * 1000)

        print("Duration:", duration, "ms")
        print("############# Stopped Time Tracking:", self.name, "#############")

        return duration

In [6]:
# Sequential matrix multiply
def MatrixMultiplySeq(A, B):
    """Multiply two 2-D tensors with an explicit, element-by-element triple loop.

    The loop is deliberately not vectorized: this function serves as the slow
    sequential baseline, so it must not be replaced by ``torch.matmul``.

    Args:
        A: 2-D tensor of shape (n, k).
        B: 2-D tensor of shape (k, m).

    Returns:
        A tensor of shape (n, m), allocated with ``torch.zeros`` defaults,
        holding the matrix product of ``A`` and ``B``.

    Raises:
        ValueError: If either input is not 2-D, or if the number of columns
            of ``A`` differs from the number of rows of ``B``.
    """
    if A.dim() != 2 or B.dim() != 2:
        raise ValueError("MatrixMultiplySeq expects two 2-D tensors")

    n_rows, n_inner = A.size(0), A.size(1)
    if B.size(0) != n_inner:
        raise ValueError(
            "Inner dimensions do not match: A has %d columns but B has %d rows"
            % (n_inner, B.size(0))
        )
    n_cols = B.size(1)

    # Size every loop from the dimension it actually walks, so rectangular
    # inputs are handled correctly instead of assuming square matrices.
    Result = torch.zeros((n_rows, n_cols))

    for row in range(n_rows):
        for col in range(n_cols):
            tmp_sum = 0
            for i in range(n_inner):
                tmp_sum += A[row, i] * B[i, col]
            Result[row, col] = tmp_sum

    return Result

In [7]:
# Check prerequisites: report whether PyTorch sees CUDA and how many GPUs it reports
cuda_ready = torch.cuda.is_available()
print("CUDA available: ", cuda_ready)
gpu_count = torch.cuda.device_count()
print("Available GPUs:", gpu_count)

CUDA available:  True
Available GPUs: 1


In [None]:
# Initialize size
matrix_size = 1000

# Seed PyTorch's global RNG so that every Restart & Run All generates the
# same input matrices and the printed results stay comparable between runs.
SEED = 42
torch.manual_seed(SEED)

# Create random values (two square matrix_size x matrix_size tensors)
A = torch.rand(matrix_size, matrix_size)
B = torch.rand(matrix_size, matrix_size)
print("A: ", A)
print("B: ", B)

# Run matrix multiplication on CPU (Sequential)

In [None]:
# The naive sequential approach takes very long, so only the top-left
# 100x100 corner of A and B is multiplied here.
# The measured duration shows why the size is limited.
matrix_size_Seq = 100
seq_window = slice(None, matrix_size_Seq)
A_Cpu_Seq = A[seq_window, seq_window]
B_Cpu_Seq = B[seq_window, seq_window]

# Begin timing (the tracker starts on construction)
tracker = TimeTracker("CPU Sequential")

# Multiply the two sub-matrices sequentially on the CPU
result_Cpu_Seq = MatrixMultiplySeq(A_Cpu_Seq, B_Cpu_Seq)

# Stop timing, then show the product
duration_Cpu_Seq = tracker.stop()
print("Result: ", result_Cpu_Seq)

# Exercise 1: Run custom kernel on GPU (Parallel)

To activate the development environment for the Microsoft Visual Studio C++ compiler (64-bit configuration), which is needed for compiling CUDA code, run the following command in the Terminal:
1. File > New > Terminal
2. Run: ```& "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64```

In [None]:
# Set prerequisites
from torch.utils.cpp_extension import load
import os

# MSVC x64 host tool directory that must be reachable through PATH.
MSVC_BIN_DIR = "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Tools\\MSVC\\14.37.32822\\bin\\Hostx64\\x64"


def ensure_on_path(directory):
    """Append ``directory`` to the PATH environment variable exactly once.

    Re-running the cell must not keep growing PATH, so nothing is changed
    when ``directory`` is already one of the ';'-separated entries. A missing
    or empty PATH is handled by setting it to ``directory`` alone.

    Args:
        directory: Directory to make available through PATH.
    """
    current = os.environ.get("PATH", "")
    # ';' is used on purpose: the directory is a Windows path, and this keeps
    # the same separator the cell has always appended with.
    if directory in current.split(";"):
        return
    os.environ["PATH"] = current + ";" + directory if current else directory


ensure_on_path(MSVC_BIN_DIR)

In [None]:
# Check prerequisites: report whether PyTorch's cpp_extension helper finds Ninja
ninja_found = torch.utils.cpp_extension.is_ninja_available()
print("Ninja available: ", ninja_found)

In [None]:
# Load custom CUDA-Kernel
print("Start load kernel")

# Load the extension from MatrixMultiplyKernel.cu; the callable used by the
# exercise is its `MatrixMultiply` attribute.
kernel_module = load(
    name="MatrixMultiply",
    sources=["MatrixMultiplyKernel.cu"],
    extra_cuda_cflags=["--expt-relaxed-constexpr"],
    # verbose=True,  # Activate for detailed build output
)
CustomMatrixMultiply = kernel_module.MatrixMultiply

print("Finish load kernel")

In [None]:
# Start tracker (timing begins when the TimeTracker is constructed)
tracker = TimeTracker("GPU_CustomKernel")

# Copy matrix to GPU
# TODO: Copy matrix A to GPU, e.g. A_Gpu_Custom = ...
# Your Code here

# TODO: Copy matrix B to GPU, e.g. B_Gpu_Custom = ...
# Your Code here

# Allocate a matrix of zeros directly on the GPU
# (device="cuda" avoids first building the tensor on the CPU and then copying it over)
result_Gpu_Custom = torch.zeros(matrix_size, matrix_size, device="cuda")

# TODO: Run CustomMatrixMultiply on GPU, e.g. CustomMatrixMultiply(...)
# Your Code here

# CUDA work is launched asynchronously, so it may still be running when the
# Python call returns. Wait for the GPU to finish before stopping the timer,
# otherwise the measured duration can miss the kernel's execution time.
torch.cuda.synchronize()

# Print results
duration_Gpu_Custom = tracker.stop()
print("Result: ", result_Gpu_Custom)

# Exercise 2: Run matrix multiplication on CPU (PyTorch)

In [None]:
# Start tracker (timing begins when the TimeTracker is constructed)
tracker = TimeTracker("CPU PyTorch")

# TODO: Run Matrix Multiply on CPU (Parallel), e.g. result_Cpu_PyTorch = ...
# NOTE: store the product in `result_Cpu_PyTorch` (exact spelling); the print
# statement below reads that name and raises NameError for any other spelling.
# Your Code here 

# Print results: stop the timer, keep the duration in ms, then show the product
duration_Cpu_PyTorch = tracker.stop()
print("Result: ", result_Cpu_PyTorch)

# Exercise 3: Run matrix multiplication on GPU (PyTorch)

In [None]:
# Start tracker (timing begins when the TimeTracker is constructed)
tracker = TimeTracker("GPU PyTorch")

# Copy matrix to GPU
# TODO: Copy matrix A to GPU, e.g. A_Gpu_PyTorch = ...
# Your Code here

# TODO: Copy matrix B to GPU, e.g. B_Gpu_PyTorch = ...
# Your Code here

# TODO: Run Matrix Multiply on GPU (Parallel), e.g. result_Gpu_PyTorch = ...
# NOTE: store the product in `result_Gpu_PyTorch` (exact spelling); the print
# statement below reads that name.
# Your Code here

# CUDA work is launched asynchronously, so it may still be running when the
# Python call returns. Wait for the GPU to finish before stopping the timer,
# otherwise the measured duration can miss the multiplication itself.
torch.cuda.synchronize()

# Print results
duration_Gpu_PyTorch = tracker.stop()
print("Result: ", result_Gpu_PyTorch)

# Summary

In [None]:
def _report_duration(label, milliseconds):
    """Print one summary line: the label followed by the duration and its unit."""
    print(label, str(milliseconds) + " ms")


# Inputs and the product computed in the GPU (PyTorch) exercise
print("Matrix size: ", matrix_size)
print("A: ", A)
print("B: ", B)
print("Result: ", result_Gpu_PyTorch)

# Measured durations of all four variants
_report_duration("Duration CPU - Limited to 100x100 Matrix (Sequential): ", duration_Cpu_Seq)
_report_duration("Duration GPU Custom (Parallel): ", duration_Gpu_Custom)
_report_duration("Duration CPU PyTorch (Parallel): ", duration_Cpu_PyTorch)
_report_duration("Duration GPU PyTorch (Parallel): ", duration_Gpu_PyTorch)