<a href="https://colab.research.google.com/github/usm-cos-432/InClass/blob/master/chapter2/MatMultiplyPerformance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### In Class Exercise

Make sure you understand the code, then run it on Google Colab.

Be prepared to explain how the different vectorization and PyTorch techniques contribute to performance.



Code adapted from this [blog post](https://sanjayasubedi.com.np/python/efficient-matrix-multiplication-in-python/).

In [None]:
import torch, numpy as np, datetime
# Device handles: `cuda` targets the GPU, `cpu` the host processor.
# Creating a torch.device does not touch the hardware; a GPU is only
# required once a tensor is actually moved to `cuda`.
cuda = torch.device("cuda")
cpu = torch.device("cpu")

In [None]:
import numpy as np

# Benchmark inputs: two float32 matrices filled with standard-normal samples.
# `a` is 2000 x 784 and `b` is 784 x 10, so their product is 2000 x 10.
a = np.random.normal(0.0, 1.0, (2000, 784)).astype(np.float32)
b = np.random.normal(0.0, 1.0, (784, 10)).astype(np.float32)

# Reference result from NumPy's own matmul; every implementation below is
# validated against it.
expected = np.matmul(a, b)

In [None]:
def py_matmul1(a, b):
    """Multiply two 2-D arrays using three nested pure-Python loops.

    This is the unvectorized baseline: every scalar multiply-add is
    executed individually by the interpreter.

    Args:
        a: 2-D array of shape (n, m).
        b: 2-D array of shape (m, p).

    Returns:
        NumPy array of shape (n, p) holding the matrix product. It is
        allocated with np.zeros, so its dtype is float64.

    Raises:
        AssertionError: if the column count of ``a`` differs from the row
            count of ``b``.
    """
    n_rows, inner_a = a.shape
    inner_b, n_cols = b.shape
    assert inner_a == inner_b, f"{inner_a} != {inner_b}"

    product = np.zeros((n_rows, n_cols))
    for row in range(n_rows):
        for col in range(n_cols):
            # Accumulate the dot product of row `row` of a and column `col` of b.
            for idx in range(inner_b):
                product[row, col] += a[row, idx] * b[idx, col]

    return product

# %time is an IPython line magic: it runs the statement once and reports how
# long it took.
%time result = py_matmul1(a, b)
# Validate against the NumPy reference product `expected`.
assert result.shape == expected.shape
# rtol=1e-01 tolerates up to 10% relative difference per element (on top of
# allclose's default absolute tolerance); on failure both arrays are shown.
assert np.allclose(result, expected, rtol=1e-01), (result, expected)

In [None]:
def py_matmul2(a, b):
    """Multiply two 2-D arrays, computing each output cell with np.dot.

    Only the two outer loops remain in Python; the innermost loop over the
    shared dimension is delegated to a single vectorized dot product.

    Args:
        a: 2-D array of shape (n, m).
        b: 2-D array of shape (m, p).

    Returns:
        NumPy array of shape (n, p) holding the matrix product. It is
        allocated with np.zeros, so its dtype is float64.

    Raises:
        AssertionError: if the column count of ``a`` differs from the row
            count of ``b``.
    """
    n_rows, inner_a = a.shape
    inner_b, n_cols = b.shape
    assert inner_a == inner_b, f"{inner_a} != {inner_b}"

    product = np.zeros((n_rows, n_cols))
    for row_idx, row_vec in enumerate(a):
        for col_idx in range(n_cols):
            # One dot product of a row of a with a column of b per cell.
            product[row_idx, col_idx] = np.dot(row_vec, b[:, col_idx])

    return product

# %time is an IPython line magic: it runs the statement once and reports how
# long it took.
%time result = py_matmul2(a, b)
# Validate against the NumPy reference product `expected`.
assert result.shape == expected.shape
# rtol=1e-01 tolerates up to 10% relative difference per element (on top of
# allclose's default absolute tolerance); on failure both arrays are shown.
assert np.allclose(result, expected, rtol=1e-01), (result, expected)

In [None]:
def py_matmul3(a, b):
    """Multiply two 2-D arrays, computing one output row per np.dot call.

    A single Python loop over the rows of ``a`` remains; each iteration
    multiplies that row by the whole of ``b`` in one vectorized call.

    Args:
        a: 2-D array of shape (n, m).
        b: 2-D array of shape (m, p).

    Returns:
        NumPy array of shape (n, p) holding the matrix product. It is
        allocated with np.zeros, so its dtype is float64.

    Raises:
        AssertionError: if the column count of ``a`` differs from the row
            count of ``b``.
    """
    n_rows, inner_a = a.shape
    inner_b, n_cols = b.shape
    assert inner_a == inner_b, f"{inner_a} != {inner_b}"

    product = np.zeros((n_rows, n_cols))
    for row_idx, row_vec in enumerate(a):
        # Vector-matrix product fills an entire output row at once.
        product[row_idx] = np.dot(row_vec, b)

    return product

# %time is an IPython line magic: it runs the statement once and reports how
# long it took.
%time result = py_matmul3(a, b)
# Validate against the NumPy reference product `expected`.
assert result.shape == expected.shape
# rtol=1e-01 tolerates up to 10% relative difference per element (on top of
# allclose's default absolute tolerance); on failure both arrays are shown.
assert np.allclose(result, expected, rtol=1e-01), (result, expected)

In [None]:
def py_matmul4(a, b):
    """Multiply two 2-D arrays with a single call to np.matmul.

    No Python-level loop remains; the whole product is computed by NumPy.

    Args:
        a: 2-D array of shape (n, m).
        b: 2-D array of shape (m, p).

    Returns:
        The matrix product of shape (n, p), as returned by np.matmul.

    Raises:
        AssertionError: if the column count of ``a`` differs from the row
            count of ``b``.
    """
    # Unpacking into two names also enforces that each input is 2-D.
    _, inner_a = a.shape
    inner_b, _ = b.shape
    assert inner_a == inner_b, f"{inner_a} != {inner_b}"

    return np.matmul(a, b)
    

# %time is an IPython line magic: it runs the statement once and reports how
# long it took.
%time result = py_matmul4(a, b)
# Validate against the NumPy reference product `expected`.
assert result.shape == expected.shape
# rtol=1e-01 tolerates up to 10% relative difference per element (on top of
# allclose's default absolute tolerance); on failure both arrays are shown.
assert np.allclose(result, expected, rtol=1e-01), (result, expected)

**The runtime type must support a GPU in order for this last bit of code to run.**

In [None]:
def py_matmul5(a, b):
    """Multiply two 2-D tensors with a single call to torch.matmul.

    Args:
        a: 2-D tensor of shape (n, m).
        b: 2-D tensor of shape (m, p).

    Returns:
        The matrix product of shape (n, p), as returned by torch.matmul.

    Raises:
        AssertionError: if the column count of ``a`` differs from the row
            count of ``b``.
    """
    # Unpacking into two names also enforces that each input is 2-D.
    _, inner_a = a.shape
    inner_b, _ = b.shape
    assert inner_a == inner_b, f"{inner_a} != {inner_b}"

    return torch.matmul(a, b)
    
t_a = torch.tensor(a).to(cuda)
t_b = torch.tensor(b).to(cuda)
torch.cuda.synchronize()
%time result = py_matmul5(t_a, t_b)
assert result.shape == expected.shape
result_cpu = result.cpu()
assert np.allclose(result_cpu, expected, rtol=1e-01), (result_cpu, expected)