In [2]:
import numpy as np
from pylab import imshow, show
from timeit import default_timer as timer
from numba import autojit

from accelerate import cuda
from numba import *

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 30 days


In [34]:
# --- Lattice / flow configuration -------------------------------------------
grid_size_x = 128
grid_size_y = 1024
inlet_velocity = 0.1
directions = 9

# Per-direction weights; index k pairs with column k of `ei` below.
directional_weights = np.array([16, 4, 1, 4, 1, 4, 1, 4, 1]) / 36

# Unit velocity components per direction, stacked into a (2, directions) array.
unit_x_velocities = np.array([0, 1, 1, 0, -1, -1, -1, 0, 1])
unit_y_velocities = np.array([0, 0, 1, 1, 1, 0, -1, -1, -1])
ei = np.stack([unit_x_velocities, unit_y_velocities])

# Coordinate mesh on the unit square; X and Y have shape (grid_size_y, grid_size_x).
x = np.linspace(0, 1, grid_size_x)
y = np.linspace(0, 1, grid_size_y)
X, Y = np.meshgrid(x, y)

# Two-component field over the mesh, every entry set to the inlet velocity.
U = np.ones(X.shape + (2,)) * inlet_velocity

In [25]:
def equil(a, b, c, w, rho):
    """Return the equilibrium distribution value for one direction at one node.

    Evaluates ``w * rho * (1 + 3*a + 4.5*b - 1.5*c)``.

    In this notebook the callers pass:
      a   -- projection of the velocity onto the direction vector (``np.dot(U, ei)``)
      b   -- ``a`` squared
      c   -- squared velocity magnitude (``U_x**2 + U_y**2``)
      w   -- weight of the direction
      rho -- density at the node

    The ``1.5 * c`` term is subtracted, which keeps this function in agreement
    with the NumPy reference loop in this notebook (``- 3 / 2 * c``).
    """
    return w * rho * (1 + 3.0 * a + 4.5 * b - 1.5 * c)

# Compile `equil` as a CUDA device function (callable from kernels only).
# The return-type keyword is `restype`; five float64 scalars in, one float64 out.
equil_gpu = cuda.jit(restype=float64, argtypes=[float64, float64, float64, float64, float64], device=True)(equil)

@cuda.jit(argtypes=[float64[:,:,:], float64[:,:,:], float64[:,:], float64[:,:], float64[:,:,:], float64[:]])
def equil_kernel(a, b, c, rho, f_prev, w):
    """Fill ``f_prev[x, y, i]`` with ``equil_gpu(a[x,y,i], b[x,y,i], c[x,y], w[i], rho[x,y])``.

    a, b   -- 3-D float64 inputs indexed [x, y, direction]
    c, rho -- 2-D float64 inputs indexed [x, y]
    f_prev -- 3-D float64 output, written in place
    w      -- 1-D float64 per-direction weights; its length sets the number
              of directions processed

    Each thread starts at its 2-D global index and advances by the total
    number of launched threads along each axis, so any launch configuration
    covers the whole array.
    """
    # Bounds come from the kernel's own arguments, not from notebook globals,
    # so the kernel stays correct if the host arrays are rebuilt or resized.
    n_x = f_prev.shape[0]
    n_y = f_prev.shape[1]
    n_dirs = w.shape[0]

    start_x, start_y = cuda.grid(2)
    stride_x = cuda.gridDim.x * cuda.blockDim.x
    stride_y = cuda.gridDim.y * cuda.blockDim.y

    for x in range(start_x, n_x, stride_x):
        for y in range(start_y, n_y, stride_y):
            # c and rho do not depend on the direction: load them once per cell.
            c_xy = c[x, y]
            rho_xy = rho[x, y]
            for i in range(n_dirs):
                f_prev[x, y, i] = equil_gpu(a[x, y, i], b[x, y, i], c_xy, w[i], rho_xy)

In [97]:
# Distribution buffers: mesh-shaped with a trailing axis of length `directions`.
f_init = np.zeros(X.shape + (directions,))
f_prev = np.zeros(X.shape + (directions,))
f_next = np.zeros(X.shape + (directions,))
f_eq = np.zeros(X.shape + (directions,))

# CUDA launch configuration (threads per block, blocks per grid).
blockdim = (8, 8)
griddim = (8, 8)

# Kernel inputs: unit density, velocity projected on each direction,
# its square, and the squared velocity magnitude.
rho = np.ones(X.shape)
a = np.dot(U, ei)
b = np.power(a, 2)
c = np.power(U[..., 0], 2) + np.power(U[..., 1], 2)

# Timed region: upload the output buffer, run the kernel, copy the result back.
start = timer()
d_prev = cuda.to_device(f_prev)
equil_kernel[griddim, blockdim](a, b, c, rho, d_prev, directional_weights)
d_prev.to_host()
dt = timer() - start

print(dt)

0.04015848321338922


In [102]:
# Distribution buffers: mesh-shaped with a trailing axis of length `directions`.
f_init = np.zeros(X.shape + (directions,))
f_prev = np.zeros(X.shape + (directions,))
f_next = np.zeros(X.shape + (directions,))
f_eq = np.zeros(X.shape + (directions,))

# Launch configuration values; not used by the NumPy loop in this cell.
blockdim = (32, 8)
griddim = (32, 16)

# Inputs: unit density, velocity projected on each direction,
# its square, and the squared velocity magnitude.
rho = np.ones(X.shape)
a = np.dot(U, ei)
b = np.power(a, 2)
c = np.power(U[..., 0], 2) + np.power(U[..., 1], 2)

# Timed region: NumPy reference, one vectorised update per direction.
start = timer()
for i in range(directions):
    f_prev[..., i] = directional_weights[i] * rho * (
        1 + 3.0 * a[..., i] + 4.5 * b[..., i] - 1.5 * c
    )
dt = timer() - start

print(dt)

0.06360582890795285
