# Initialization

In [2]:
import torch
import time

In [3]:
# TimeTracker
class TimeTracker:
    """Simple stopwatch that reports elapsed time in milliseconds.

    Timing starts as soon as the instance is created. Call ``stop()`` to
    print and return the elapsed time; ``start()`` may be called again to
    restart the measurement.
    """

    def __init__(self, name):
        """Create the tracker and immediately start timing.

        Args:
            name: Label printed in the start/stop banners.
        """
        self.name = name
        self.start_time = None
        self.start()

    def start(self):
        """(Re)start the measurement and print the start banner."""
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments or by a
        # coarse wall-clock tick the way time.time() can.
        self.start_time = time.perf_counter()
        print("############# Started Time Tracking:", self.name, "#############")

    def stop(self):
        """Print and return the time elapsed since the last ``start()``.

        Returns:
            int: Elapsed time in whole milliseconds (fraction truncated).
        """
        end_time = time.perf_counter()
        duration = int((end_time - self.start_time) * 1000)

        print("Duration:", duration, "ms")
        print("############# Stopped Time Tracking:", self.name, "#############")

        return duration

In [4]:
# Sequential matrix multiply
def MatrixMultiplySeq(A, B):
    """Multiply two 2-D tensors with a naive, purely sequential triple loop.

    This is intentionally the slow reference implementation: every output
    element is computed one after another in Python.

    Args:
        A: 2-D tensor of shape (n, k).
        B: 2-D tensor of shape (k, m).

    Returns:
        Tensor of shape (n, m) containing the matrix product of A and B.
        It is allocated with ``torch.zeros`` using the default dtype and
        device, exactly as before.

    Raises:
        ValueError: If an input is not 2-D or the inner dimensions differ.
    """
    if A.dim() != 2 or B.dim() != 2:
        raise ValueError("MatrixMultiplySeq expects two 2-D tensors")

    rows, inner = A.size(0), A.size(1)
    cols = B.size(1)
    if B.size(0) != inner:
        raise ValueError(
            "Inner dimensions do not match: A is {}x{}, B is {}x{}".format(
                rows, inner, B.size(0), cols
            )
        )

    # Size the result from both operands so rectangular matrices work too
    # (previously everything was derived from A.size(0) only).
    Result = torch.zeros((rows, cols))

    for ROW in range(rows):
        for COL in range(cols):
            # Dot product of row ROW of A with column COL of B
            tmp_sum = 0
            for i in range(inner):
                tmp_sum += A[ROW, i] * B[i, COL]
            Result[ROW, COL] = tmp_sum

    return Result

In [5]:
# Check prerequisites: CUDA support and the number of visible GPUs
cuda_available = torch.cuda.is_available()
print("CUDA available: ", cuda_available)
gpu_count = torch.cuda.device_count()
print("Available GPUs:", gpu_count)

CUDA available:  True
Available GPUs: 1


In [6]:
# Initialize size
matrix_size = 1000

# Seed the global RNG so A and B (and every result derived from them) are
# identical on each "Restart Kernel -> Run All"
RANDOM_SEED = 0
torch.manual_seed(RANDOM_SEED)

# Create random values
A = torch.rand(matrix_size, matrix_size)
B = torch.rand(matrix_size, matrix_size)
print("A: ", A)
print("B: ", B)

A:  tensor([[0.1839, 0.8148, 0.3589,  ..., 0.5889, 0.0855, 0.7762],
        [0.2786, 0.5245, 0.8086,  ..., 0.0329, 0.4164, 0.6768],
        [0.9556, 0.5047, 0.2211,  ..., 0.1578, 0.7966, 0.4926],
        ...,
        [0.2736, 0.0969, 0.4181,  ..., 0.3368, 0.7930, 0.1242],
        [0.8412, 0.9579, 0.2056,  ..., 0.9053, 0.7438, 0.9192],
        [0.0948, 0.0088, 0.6437,  ..., 0.4494, 0.1825, 0.1127]])
B:  tensor([[0.9794, 0.1497, 0.9190,  ..., 0.5045, 0.5841, 0.6646],
        [0.3410, 0.3125, 0.8289,  ..., 0.1147, 0.4573, 0.5800],
        [0.4055, 0.4458, 0.1741,  ..., 0.8753, 0.3773, 0.8459],
        ...,
        [0.3813, 0.7618, 0.0725,  ..., 0.9794, 0.1912, 0.1182],
        [0.6624, 0.9045, 0.1557,  ..., 0.1353, 0.3082, 0.4754],
        [0.4569, 0.4951, 0.5666,  ..., 0.9017, 0.1788, 0.2586]])


# Run matrix multiplication on CPU (Sequential)

In [7]:
# The naive triple loop is extremely slow, so only the top-left
# matrix_size_Seq x matrix_size_Seq corner of A and B is multiplied here
# (the measured duration shows why)
matrix_size_Seq = 100
A_Cpu_Seq, B_Cpu_Seq = (
    full_matrix[:matrix_size_Seq, :matrix_size_Seq] for full_matrix in (A, B)
)

# Time the sequential CPU multiplication
tracker = TimeTracker("CPU Sequential")
result_Cpu_Seq = MatrixMultiplySeq(A_Cpu_Seq, B_Cpu_Seq)
duration_Cpu_Seq = tracker.stop()

# Print results
print("Result: ", result_Cpu_Seq)

############# Started Time Tracking: CPU Sequential #############
Duration: 18755 ms
############# Stopped Time Tracking: CPU Sequential #############
Result:  tensor([[26.6406, 25.1083, 25.5114,  ..., 25.0780, 26.5163, 23.1867],
        [28.0048, 28.2908, 26.9479,  ..., 25.4912, 26.7530, 25.1488],
        [24.3587, 24.6768, 25.2260,  ..., 22.7775, 25.9049, 23.4128],
        ...,
        [26.1021, 24.8202, 25.7486,  ..., 24.5460, 23.8377, 23.4695],
        [22.3583, 21.7431, 22.1539,  ..., 20.3475, 21.0651, 19.5448],
        [26.8412, 26.5715, 25.6966,  ..., 24.4177, 26.0884, 24.8622]])


# Exercise 1: Run custom kernel on GPU (Parallel)

In [8]:
# Set prerequisites
from torch.utils.cpp_extension import load
import os

# Directory of the MSVC x64 compiler binaries; adjust it to the local
# Visual Studio installation/version if it differs.
MSVC_BIN_DIR = "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\VC\\Tools\\MSVC\\14.37.32822\\bin\\Hostx64\\x64"

# Extend PATH only on Windows and only once: re-running this cell must not
# keep appending the same entry, and on other systems a ";C:\..." suffix
# would corrupt the last PATH entry.
if os.name == "nt":
    path_entries = os.environ.get("PATH", "").split(os.pathsep)
    if MSVC_BIN_DIR not in path_entries:
        os.environ["PATH"] = os.pathsep.join(path_entries + [MSVC_BIN_DIR])

In [9]:
# Check prerequisites: is the Ninja build system available?
ninja_available = torch.utils.cpp_extension.is_ninja_available()
print("Ninja available: ", ninja_available)

Ninja available:  True


In [10]:
# Build and load the custom CUDA kernel from MatrixMultiplyKernel.cu
print("Start load kernel")
matrix_multiply_extension = load(
    name="MatrixMultiply",
    sources=["MatrixMultiplyKernel.cu"],
    extra_cuda_cflags=["--expt-relaxed-constexpr"],
    # verbose=True,  # Enable for detailed build output
)
# Expose the extension's MatrixMultiply entry point under the notebook name
CustomMatrixMultiply = matrix_multiply_extension.MatrixMultiply
print("Finish load kernel")

Start load kernel
Finish load kernel


In [11]:
# Start tracker (the measurement includes the host-to-device copies below)
tracker = TimeTracker("GPU_CustomKernel")

# Copy matrix to GPU
# TODO: Copy matrix A to GPU, e.g. A_Gpu_Custom = ...
# Your Code here

# TODO: Copy matrix B to GPU, e.g. B_Gpu_Custom = ...
# Your Code here

# Allocate a zero-filled result matrix directly on the GPU
# (avoids creating it on the CPU first and copying it over)
result_Gpu_Custom = torch.zeros(matrix_size, matrix_size, device="cuda")

# TODO: Run CustomMatrixMultiply on GPU, e.g. CustomMatrixMultiply(...)
# Your Code here

# CUDA work is queued asynchronously; wait until the GPU has finished so the
# measured duration covers the actual computation and not just the launch
torch.cuda.synchronize()

# Print results
duration_Gpu_Custom = tracker.stop()
print("Result: ", result_Gpu_Custom)

############# Started Time Tracking: GPU_CustomKernel #############
Duration: 191 ms
############# Stopped Time Tracking: GPU_CustomKernel #############
Result:  tensor([[253.9339, 250.9885, 249.6272,  ..., 261.6343, 248.2964, 252.8956],
        [259.5894, 260.4133, 248.2497,  ..., 262.2768, 250.4139, 256.7153],
        [246.5911, 249.2997, 237.7090,  ..., 255.3274, 246.4156, 236.3468],
        ...,
        [253.4617, 246.9597, 245.1122,  ..., 257.6799, 244.6112, 240.4261],
        [258.8254, 257.8255, 249.5302,  ..., 258.0111, 249.5849, 246.4512],
        [247.7889, 240.9982, 234.5899,  ..., 245.8352, 241.0048, 233.2119]],
       device='cuda:0')


# Exercise 2: Run matrix multiplication on CPU (PyTorch)

In [12]:
# Start tracker
tracker = TimeTracker("CPU PyTorch")

# TODO: Run Matrix Multiply on CPU (Parallel), e.g. result_Cpu_PyTorch = ...
# NOTE: store the product in result_Cpu_PyTorch - that exact name is printed below
# Your Code here

# Stop the tracker and print results
duration_Cpu_PyTorch = tracker.stop()
print("Result: ", result_Cpu_PyTorch)

############# Started Time Tracking: CPU PyTorch #############
Duration: 73 ms
############# Stopped Time Tracking: CPU PyTorch #############
Result:  tensor([[253.9340, 250.9885, 249.6272,  ..., 261.6343, 248.2964, 252.8955],
        [259.5894, 260.4132, 248.2497,  ..., 262.2769, 250.4138, 256.7155],
        [246.5911, 249.2997, 237.7092,  ..., 255.3273, 246.4155, 236.3471],
        ...,
        [253.4615, 246.9597, 245.1119,  ..., 257.6801, 244.6113, 240.4261],
        [258.8253, 257.8257, 249.5302,  ..., 258.0111, 249.5849, 246.4514],
        [247.7888, 240.9982, 234.5901,  ..., 245.8354, 241.0048, 233.2120]])


# Exercise 3: Run matrix multiplication on GPU (PyTorch)

In [13]:
# Start tracker (the measurement includes the host-to-device copies below)
tracker = TimeTracker("GPU PyTorch")

# Copy matrix to GPU
# TODO: Copy matrix A to GPU, e.g. A_Gpu_PyTorch = ...
# Your Code here

# TODO: Copy matrix B to GPU, e.g. B_Gpu_PyTorch = ...
# Your Code here

# TODO: Run Matrix Multiply on GPU (Parallel), e.g. result_Gpu_PyTorch = ...
# NOTE: store the product in result_Gpu_PyTorch - that exact name is printed below
# Your Code here

# CUDA work is queued asynchronously; wait until the GPU has finished so the
# measured duration covers the actual computation and not just the launch
torch.cuda.synchronize()

# Print results
duration_Gpu_PyTorch = tracker.stop()
print("Result: ", result_Gpu_PyTorch)

############# Started Time Tracking: GPU PyTorch #############
Duration: 177 ms
############# Stopped Time Tracking: GPU PyTorch #############
Result:  tensor([[253.9339, 250.9885, 249.6272,  ..., 261.6343, 248.2964, 252.8956],
        [259.5894, 260.4133, 248.2497,  ..., 262.2768, 250.4139, 256.7153],
        [246.5911, 249.2997, 237.7090,  ..., 255.3274, 246.4156, 236.3468],
        ...,
        [253.4617, 246.9597, 245.1122,  ..., 257.6799, 244.6112, 240.4261],
        [258.8254, 257.8255, 249.5302,  ..., 258.0111, 249.5849, 246.4512],
        [247.7889, 240.9982, 234.5899,  ..., 245.8352, 241.0048, 233.2119]],
       device='cuda:0')


# Summary

In [20]:
# Recap of the inputs and of the result from the GPU PyTorch run
print("Matrix size: ", matrix_size)
print("A: ", A)
print("B: ", B)
print("Result: ", result_Gpu_PyTorch)

# Measured durations of all four variants, in milliseconds
print("Duration CPU - Limited to 100x100 Matrix (Sequential): ", f"{duration_Cpu_Seq} ms")
print("Duration GPU Custom (Parallel): ", f"{duration_Gpu_Custom} ms")
print("Duration CPU PyTorch (Parallel): ", f"{duration_Cpu_PyTorch} ms")
print("Duration GPU PyTorch (Parallel): ", f"{duration_Gpu_PyTorch} ms")

Matrix size:  1000
A:  tensor([[0.1839, 0.8148, 0.3589,  ..., 0.5889, 0.0855, 0.7762],
        [0.2786, 0.5245, 0.8086,  ..., 0.0329, 0.4164, 0.6768],
        [0.9556, 0.5047, 0.2211,  ..., 0.1578, 0.7966, 0.4926],
        ...,
        [0.2736, 0.0969, 0.4181,  ..., 0.3368, 0.7930, 0.1242],
        [0.8412, 0.9579, 0.2056,  ..., 0.9053, 0.7438, 0.9192],
        [0.0948, 0.0088, 0.6437,  ..., 0.4494, 0.1825, 0.1127]])
B:  tensor([[0.9794, 0.1497, 0.9190,  ..., 0.5045, 0.5841, 0.6646],
        [0.3410, 0.3125, 0.8289,  ..., 0.1147, 0.4573, 0.5800],
        [0.4055, 0.4458, 0.1741,  ..., 0.8753, 0.3773, 0.8459],
        ...,
        [0.3813, 0.7618, 0.0725,  ..., 0.9794, 0.1912, 0.1182],
        [0.6624, 0.9045, 0.1557,  ..., 0.1353, 0.3082, 0.4754],
        [0.4569, 0.4951, 0.5666,  ..., 0.9017, 0.1788, 0.2586]])
Result:  tensor([[253.9339, 250.9885, 249.6272,  ..., 261.6343, 248.2964, 252.8956],
        [259.5894, 260.4133, 248.2497,  ..., 262.2768, 250.4139, 256.7153],
        [246.591