CPU: Rolled vs Unrolled Loops
======

In [1]:
import numpy as np
import time

In [2]:
# Benchmark configuration shared by the timing cells in this notebook.
SEED = 0    # fixed seed so a fresh Restart & Run All regenerates the same X and W256
np.random.seed(SEED)
iterations = 10    # number of timed repetitions per benchmark
# Inputs: 4096 rows x 1024 columns, drawn uniformly from [0, 1) and then cast to float32.
X = np.random.random((4096, 1024)).astype('float32')
# Weights: 1024 rows x 256 columns, drawn the same way.
W256 = np.random.random((1024, 256)).astype('float32')

In [3]:
def MeanAndStandardError(sum_x, sum_x_squared, n):
    """Return (mean, standard error of the mean) from running sums.

    Parameters
    ----------
    sum_x : sum of the n observations.
    sum_x_squared : sum of the squares of the n observations.
    n : number of observations.

    Returns
    -------
    (mean, standard_error_of_mean). The standard error is NaN when n == 1,
    because the sample variance is undefined for a single observation.

    Raises
    ------
    ZeroDivisionError if n is the Python number 0.
    """
    mean = sum_x / n
    if np.ndim(n) == 0 and n == 1:
        # n / (n - 1) below would divide by zero; report "undefined" instead.
        return (mean, np.nan)
    mean_squared = sum_x_squared / n
    # E[x^2] - E[x]^2 can round to a tiny negative number when the observations
    # are (nearly) identical; clamp at 0 so the square root below is not NaN.
    population_variance = np.maximum(mean_squared - (mean**2), 0.0)
    # Rescale the population variance by n / (n - 1) to get the sample variance.
    sample_variance = (n / (n - 1)) * population_variance
    # var((1/n) * x) = ((1/n)**2) * var(x)    [for 1 observation]
    # n * var(x) / (n**2) = var(x) / n    [for n observations]
    standard_error_of_mean = np.sqrt(sample_variance / n)
    return (mean, standard_error_of_mean)

In [4]:
# rolled loop: 1 accumulator: Z[i,j]
# Benchmark: compute Z = X @ W with a pure-Python triple loop, one multiply-add
# per inner pass, and repeat the whole product `iterations` times to collect
# timing statistics.
# W is the first column of W256, kept 2-D (one column) so W[k, j] works with j == 0.
W = np.expand_dims(W256[:, 0], axis = 1)    # only 1 feature
# Running sums of the elapsed time and its square, consumed by MeanAndStandardError.
cumulative_elapsed_time = 0.0
cumulative_elapsed_time_squared = 0.0
for iteration in range(iterations):
    # The float32 output is re-zeroed before the timer starts, so allocation is not timed.
    Z = np.zeros((X.shape[0], W.shape[1])).astype('float32')
    start_time = time.perf_counter()
    for i in range(Z.shape[0]):
        for j in range(Z.shape[1]):
            k = 0
            while (k < X.shape[1]):
                # Every product is added straight into the array element Z[i, j].
                Z[i, j] += (X[i, k] * W[k, j])
                k += 1
    elapsed_time = (time.perf_counter() - start_time)
    cumulative_elapsed_time += elapsed_time
    cumulative_elapsed_time_squared += (elapsed_time**2)
# Print one element of the last result, then (mean, standard error) of the elapsed time.
print(Z[-1, 0], MeanAndStandardError(cumulative_elapsed_time, cumulative_elapsed_time_squared, iterations))

249.84029 (3.290670942980796, 0.0060688748885015095)


In [5]:
# unrolled loop: 4 accumulators { dot_product0, dot_product1, dot_product2, dot_product3 }
# Benchmark: the same X @ W product as the rolled loop, but each pass of the inner
# loop handles 4 consecutive k values, each feeding its own float32 accumulator.
# W is not redefined here: this cell uses whatever W is currently in the notebook
# namespace (the rolled-loop cell sets it to the first column of W256).
# The inner loop reads X[i, k + 3] while k advances by 4, so the inner dimension
# must be a multiple of 4; check once, outside the timed region.
assert X.shape[1] % 4 == 0, "unrolled loop requires X.shape[1] to be a multiple of 4"
# Running sums of the elapsed time and its square, consumed by MeanAndStandardError.
cumulative_elapsed_time = 0.0
cumulative_elapsed_time_squared = 0.0
for iteration in range(iterations):
    # The float32 output is re-zeroed before the timer starts, so allocation is not timed.
    Z = np.zeros((X.shape[0], W.shape[1])).astype('float32')
    start_time = time.perf_counter()
    for i in range(Z.shape[0]):
        for j in range(Z.shape[1]):
            dot_product0 = np.float32(0)
            dot_product1 = np.float32(0)
            dot_product2 = np.float32(0)
            dot_product3 = np.float32(0)
            k = 0
            while (k < X.shape[1]):
                dot_product0 += (X[i, k + 0] * W[k + 0, j])
                dot_product1 += (X[i, k + 1] * W[k + 1, j])
                dot_product2 += (X[i, k + 2] * W[k + 2, j])
                dot_product3 += (X[i, k + 3] * W[k + 3, j])
                k += 4
            # The four partial sums are combined once per output element, so the
            # float32 additions happen in a different order than in the rolled loop.
            Z[i, j] = dot_product0 + dot_product1 + dot_product2 + dot_product3
    elapsed_time = (time.perf_counter() - start_time)
    cumulative_elapsed_time += elapsed_time
    cumulative_elapsed_time_squared += (elapsed_time**2)
# Print one element of the last result, then (mean, standard error) of the elapsed time.
print(Z[-1, 0], MeanAndStandardError(cumulative_elapsed_time, cumulative_elapsed_time_squared, iterations))

249.84035 (2.2719831331749445, 0.005825099914086097)


CPU vs GPU
======

In [6]:
# Benchmark: time the X @ W product as a bias-free Keras Dense layer on every
# logical device TensorFlow reports, using the full 256-column weight matrix.
W = W256    # all 256 features this time, not just the first column
import tensorflow as tf
from tensorflow.keras import Sequential, layers, initializers
# Restrict TensorFlow to one thread, both across ops and within an op.
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)
devices = tf.config.list_logical_devices()
for logical_device in devices:
    print(logical_device)
    with tf.device(logical_device.name):
        # One Dense layer whose kernel is fixed to W and that has no bias term.
        dense_layer = layers.Dense(
            W.shape[1],
            input_shape = (W.shape[0],),
            kernel_initializer = initializers.constant(W),
            use_bias = False,
        )
        model = Sequential([dense_layer])
        # Untimed first call, so one-off initialization is kept out of the measurements.
        Z = model.predict(X, batch_size = X.shape[0])
        total_seconds = 0.0
        total_seconds_squared = 0.0
        for _ in range(iterations):
            tic = time.perf_counter()
            Z = model.predict(X, batch_size = X.shape[0])
            lap = (time.perf_counter() - tic)
            total_seconds += lap
            total_seconds_squared += (lap**2)
        # Print one element of the last result, then (mean, standard error) of the elapsed time.
        print(Z[-1, 0], MeanAndStandardError(total_seconds, total_seconds_squared, iterations))


LogicalDevice(name='/device:CPU:0', device_type='CPU')
249.84029 (0.05882289730943739, 0.0012454675702734093)
LogicalDevice(name='/device:GPU:0', device_type='GPU')
249.83072 (0.05133588059106842, 0.0012193925211895044)
