In [None]:
import numpy as np
import pycuda.driver as cuda

In [8]:
import cv2
def resize_cpu(image, dsize):
    return cv2.resize(image, dsize=dsize)

In [24]:
import cv2
image = cv2.imread("./lena.png")
image = resize_cpu(image, (1280, 760))

In [26]:
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import cv2
def resize_cpu(image, dsize):
    return cv2.resize(image, dsize=dsize)

mod = SourceModule \
    (
    """
    #include <stdint.h>
    
__global__ void gpuResize( uint8_t* input, int iWidth, int iHeight, uint8_t* output, int oWidth, int oHeight )
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const float2 scale = make_float2( float(iWidth) / float(oWidth), float(iHeight) / float(oHeight) );
    
    if( x >= oWidth || y >= oHeight )
        return;
    
    const int dx = ((float)x * scale.x);
    const int dy = ((float)y * scale.y);
    //printf("%d, %d\\n ", dx, dy);
    const uint8_t px = input[ dy * iWidth + dx ];
    output[y*oWidth+x] = px;
}
""")

input = cv2.imread("./lena.png", 0)
print(input.shape)
print(input[0])
#exit(0)
input = resize_cpu(input, (1280, 760))
#input = np.ones((512, 512), dtype=np.uint8)
target_size = 512
ref = resize_cpu(input, (target_size, target_size))
output = np.zeros((target_size, target_size), dtype=np.uint8) # specify output type to uint8
print(input.shape)
print(output.shape)
gpuResize = mod.get_function("gpuResize")

gpuResize(drv.In(input), np.int32(input.shape[1]), np.int32(input.shape[0]), drv.Out(output), np.int32(output.shape[1]), np.int32(output.shape[0]),block=(8, 8, 1), grid=(256, 256, 1))
# print(input[0:16, 0:16].sum() / 16 / 16)
# print(input[0:16, 16:32].sum() / 16 / 16)
# print(input[7:9, 7:9].mean())
print(output[0])
print(ref[0])
# print((output == ref).all())


(512, 512)
[169 169 168 167 169 164 170 167 170 167 169 167 163 169 166 162 164 163
 167 167 162 164 163 164 161 164 163 161 163 162 161 164 162 165 164 166
 166 173 167 173 173 173 173 177 177 181 179 176 178 178 174 180 175 173
 170 168 159 157 161 150 140 133 128 117 110 110 106 103 101 106 112 107
 112 110 114 115 116 116 116 122 120 119 118 117 116 119 118 120 118 119
 118 119 122 120 118 120 119 120 120 118 115 116 118 119 119 122 126 123
 125 128 132 130 125 134 132 132 134 133 132 136 136 140 140 134 135 139
 142 142 142 141 145 140 139 141 141 139 143 145 142 144 143 142 139 141
 144 138 140 137 142 140 140 140 139 141 139 139 144 144 144 144 140 145
 142 144 143 144 144 145 144 145 144 146 145 144 144 144 146 144 143 144
 143 143 141 144 144 144 145 145 142 142 143 142 144 145 143 144 146 146
 144 146 145 144 143 145 145 146 142 141 147 142 143 147 143 142 140 142
 141 137 142 142 143 143 140 143 141 139 144 142 143 144 140 144 143 143
 143 143 142 143 143 144 145 144 143 145

In [33]:
%timeit -n 1000 gpuResize(drv.In(input), np.int32(input.shape[1]), np.int32(input.shape[0]), drv.Out(output), np.int32(output.shape[1]), np.int32(output.shape[0]),block=(8, 8, 1), grid=(256, 256, 1))

318 µs ± 27.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [36]:
output.shape

(512, 512)

In [35]:
%timeit -n 1000 output = resize_cpu(input, (target_size, target_size))

311 µs ± 24.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [37]:
output.shape

(512, 512)

In [38]:
input.shape

(760, 1280)

In [30]:
target_size

512

In [None]:
img = np.array([[1,1,2,2],
               [1,1,2,2],
               [1,1,3,4],
               [1,1,3,4]], dtype=np.uint8)
print(img.shape)
resized = cv2.resize(img, (2, 2))
print(resized)

In [15]:
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

mod = SourceModule \
    (
    """
    #include <stdint.h>
    
__global__ void gpuResize( uint8_t* input, int iWidth, int iHeight, uint8_t* output, int oWidth, int oHeight )
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const float2 scale = make_float2( float(iWidth) / float(oWidth), float(iHeight) / float(oHeight) );
    
    if( x >= oWidth || y >= oHeight )
        return;
    
    const int dx = ((float)x * scale.x);
    const int dy = ((float)y * scale.y);

    int count = 0;
    int sum = 0;
    for (size_t i = dx; i < dx + scale.x; ++i) {
       for (size_t j = dy; j < dy + scale.y; ++j) {
          if (i >= iWidth || j >= iHeight) { 
                 continue;
          }
          const uint8_t px = input[j * iWidth + i];
          sum += px;
          count += 1;
       }
    }
    output[y*oWidth+x] =  (int)floor((float) sum / count + 0.5f);
}
""")

#a = np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8)
input = np.array([[1,1,2,2],
               [1,1,2,2],
               [3,3,4,3],
               [3,3,4,2]], dtype=np.uint8)
output = np.zeros((2, 2), dtype=np.uint8)

input = np.array([[3, 106, 107, 40, 148, 112, 254, 151],
                [62, 173, 91, 93, 33, 111, 139, 25],
                [99, 137, 80, 231, 101, 204, 74, 219],
                [240, 173, 85, 14, 40, 230, 160, 152],
                [230, 200, 177, 149, 173, 239, 103, 74],
                [19, 50, 209, 82, 241, 103, 3, 87],
                [252, 191, 55, 154, 171, 107, 6, 123],
                [7, 101, 168, 85, 115, 103, 32, 11]],
                dtype=np.uint8)
output = np.zeros((input.shape[1]//2, input.shape[0]//2), dtype=np.uint8)

input = cv2.imread("./lena.png", 0)
print(input.shape)
ref = resize_cpu(input, (512, 512))
output = np.zeros((512, 512))
print(input.shape)
print(output.shape)
gpuResize = mod.get_function("gpuResize")

gpuResize(drv.In(input), np.int32(input.shape[1]), np.int32(input.shape[0]), drv.Out(output), np.int32(output.shape[1]), np.int32(output.shape[0]),block=(8, 8, 1), grid=(256, 256, 1))
print(output[0][0])
print(ref[0][0])

(512, 512)
(512, 512)
(512, 512)
-1.3206821641083835e-117
169


In [None]:
import numpy as np
import cv2

def standardize_cpu(image):
    rgb_means = [0.485, 0.456, 0.406]
    rgb_means_array = np.reshape(np.array(rgb_means), (1, 1, len(rgb_means)))

    rgb_stddevs = [0.229, 0.224, 0.225]
    rgb_stddevs_array = np.reshape(np.array(rgb_stddevs), (1, 1, len(rgb_stddevs)))

    image = image / 255.0
    image -= rgb_means_array
    image /= rgb_stddevs_array
    return image

import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

mod = SourceModule \
    (
        """                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                      
__global__ void standardize(float *dest_r, float *dest_g, float *dest_b, float *r_img, float *g_img, float *b_img)                                                                       
{                                                                                                                                                                                        

unsigned int idx = threadIdx.x+(blockIdx.x*(blockDim.x*blockDim.y));

dest_r[idx] = ((r_img[idx]/255 - 0.485)) / 0.229;
dest_g[idx] = ((g_img[idx]/255 - 0.456)) / 0.224;
dest_b[idx] = ((b_img[idx]/255 - 0.406)) / 0.225;                                                                                                                                                                                  
}                                                                                                                                                                                        
                                                                                                                                                                                         
""")
a = np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8)
#a = np.load("/home/nvidia/resized.npy")
#a = cv2.imread("./lena2.png")
# r_img = a[:, :, 0].reshape(262144, order='F').astype(np.float32)
# g_img = a[:, :, 1].reshape(262144, order='F').astype(np.float32)
# b_img = a[:, :, 2].reshape(262144, order='F').astype(np.float32)
r_img = a[:, :, 0].reshape(262144).astype(np.float32)
g_img = a[:, :, 1].reshape(262144).astype(np.float32)
b_img = a[:, :, 2].reshape(262144).astype(np.float32)
dest_r=np.zeros_like(r_img).astype(np.float32)
dest_g=np.zeros_like(g_img).astype(np.float32)
dest_b=np.zeros_like(b_img).astype(np.float32)
standardize = mod.get_function("standardize")
standardize(drv.Out(dest_r), drv.Out(dest_g), drv.Out(dest_b), drv.In(r_img), drv.In(g_img),drv.In(b_img),block=(1024, 1, 1), grid=(256, 1, 1))

res_cpu = standardize_cpu(a)
# dest_r=np.reshape(dest_r,(512,512), order='F')
# print(np.allclose(dest_r, res_cpu[:,:,0]))
# dest_g=np.reshape(dest_g,(512,512), order='F')
# print(np.allclose(dest_g, res_cpu[:,:,1]))
# dest_b=np.reshape(dest_b,(512,512), order='F')
# print(np.allclose(dest_b, res_cpu[:,:,2]))

res_gpu_2 = np.concatenate((dest_r, dest_g, dest_b), axis=None)
res_cpu_2 = res_cpu.transpose((2, 0, 1)).ravel()
print(np.allclose(res_gpu_2, res_cpu_2))

In [None]:
standardize_cpu(image)

In [None]:
import cv2
image = cv2.imread("./lena.png")
image2 = cv2.resize(image, (256, 256))
cv2.imwrite("./lena2.png", image2)
image = image2
print(image.shape)

In [None]:
standardize(image)

In [None]:
image

In [None]:
import cv2
import numpy as np
def standardize_cpu(image):
    rgb_means = [0.485, 0.456, 0.406]
    rgb_means_array = np.reshape(np.array(rgb_means), (1, 1, len(rgb_means)))

    rgb_stddevs = [0.229, 0.224, 0.225]
    rgb_stddevs_array = np.reshape(np.array(rgb_stddevs), (1, 1, len(rgb_stddevs)))

    image = image / 255.0
    image -= rgb_means_array
    image /= rgb_stddevs_array
    return image

In [None]:
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import scipy.misc as scm
import matplotlib.pyplot as p

mod = SourceModule \
    (
        """
#include<stdio.h>
#define INDEX(a, b) a*512+b

__global__ void standardize(float *dest_r, float *dest_g, float *dest_b, float *r_img, float *g_img, float *b_img)
{

unsigned int idx = threadIdx.x+(blockIdx.x*(blockDim.x*blockDim.y));

  unsigned int a = idx/512;
  unsigned int b = idx%512;

dest_r[INDEX(a, b)] = ((r_img[INDEX(a, b)]/255 - 0.485)) / 0.229;
dest_g[INDEX(a, b)] = ((g_img[INDEX(a, b)]/255 - 0.456)) / 0.224;
dest_b[INDEX(a, b)] = ((b_img[INDEX(a, b)]/255 - 0.406)) / 0.225;


}

""")
a = cv2.imread('./lena.png').astype(np.float32)
r_img = a[:, :, 0].reshape(262144, order='F')
g_img = a[:, :, 1].reshape(262144, order='F')
b_img = a[:, :, 2].reshape(262144, order='F')
dest_r=r_img
dest_g=g_img
dest_b=b_img
standardize = mod.get_function("standardize")
standardize(drv.Out(dest_r), drv.Out(dest_g), drv.Out(dest_b), drv.In(r_img), drv.In(g_img),drv.In(b_img),block=(1024, 1, 1), grid=(256, 1, 1))

dest_r=np.reshape(dest_r,(512,512), order='F')
dest_b=np.reshape(dest_b,(512,512), order='F')
p.imshow(dest_b)
p.show()

In [None]:
image = cv2.imread("./lena.png")
res_cpu = standardize_cpu(image)
dest_r=np.reshape(dest_r,(512,512), order='F')
print(np.allclose(dest_r, res_cpu[:,:,0]))
dest_g=np.reshape(dest_g,(512,512), order='F')
print(np.allclose(dest_g, res_cpu[:,:,1]))
dest_b=np.reshape(dest_b,(512,512), order='F')
print(np.allclose(dest_b, res_cpu[:,:,2]))

In [None]:
res_cpu = res_cpu.transpose((2, 0, 1)).ravel()

In [None]:
image = np.concatenate((dest_r, dest_g, dest_b), axis=None)

In [None]:
res_cpu.shape

In [None]:
image.shape

In [None]:
np.allclose(res_cpu, image)

In [None]:
%timeit -n 100 standardize(drv.Out(dest_r), drv.Out(dest_g), drv.Out(dest_b), drv.In(r_img), drv.In(g_img),drv.In(b_img),block=(1024, 1, 1), grid=(256, 1, 1))

In [None]:
%timeit -n 100 standardize_cpu(image)

In [None]:
image = a

In [None]:
dest = np.concatenate((dest_r, dest_g, dest_b), axis=None)

In [None]:
dest.shape

In [None]:
dest_r

In [None]:
res[:,:,0].shape

In [None]:
dest_r.shape

In [None]:
dest_g=np.reshape(dest_g,(512,512), order='F')
np.allclose(dest_g, res[:,:,1])

In [None]:
dest_b.shape

In [None]:
dest_b

In [None]:
res[:,:,1]

In [None]:
b_img

In [None]:
a = cv2.imread('./lena.png').astype(np.float32)
r_img = a[:, :, 0].reshape(262144, order='F')
g_img = a[:, :, 1].reshape(262144, order='F')
b_img = a[:, :, 2].reshape(262144, order='F')

In [None]:
b_img

In [None]:
r_img

In [None]:
import numpy as np
import scipy.misc as scm
import matplotlib.pyplot as p

mod = SourceModule \
    (
        """
#include<stdio.h>
#define INDEX(a, b) a*512+b

__global__ void standardize(float *dest_r, float *r_img)
{

unsigned int idx = threadIdx.x+(blockIdx.x*(blockDim.x*blockDim.y));

  unsigned int a = idx/512;
  unsigned int b = idx%512;
  dest_r[INDEX(a, b)] = ((r_img[INDEX(a, b)]/255 - 0.485)) / 0.229;
  //dest_r[INDEX(a, b)] = ((r_img[INDEX(a, b)]/255 - 0.406)) / 0.225;
}

""")
a = cv2.imread('./lena.png').astype(np.float32)
r_img = a[:, :, 0].reshape(262144, order='F')
g_img = a[:, :, 1].reshape(262144, order='F')
b_img = a[:, :, 2].reshape(262144, order='F')
dest_r=r_img
dest_g=g_img
dest_b=np.zeros_like(b_img)
print(dest_b)
standardize = mod.get_function("standardize")
standardize(drv.Out(dest_r), drv.In(r_img),block=(1024, 1, 1), grid=(256, 1, 1))

dest_r=np.reshape(dest_r,(512,512), order='F')
dest_b=np.reshape(dest_b,(512,512), order='F')
p.imshow(dest_r)
p.show()

In [None]:
np.allclose(dest_r, res[:,:,0])

In [None]:
dest_r

In [None]:
res[:,:,0]

In [None]:
dest_r.shape

In [None]:
res_gpu_2[100]

In [None]:
x = res_cpu.transpose((2, 0, 1))

In [None]:
x.shape

In [None]:
dest_r=np.reshape(dest_r,(512,512), order='F')
print(np.allclose(dest_r, x[0, :,:]))

In [None]:
dest_r=np.reshape(dest_r,(512,512), order='C')
print(np.allclose(dest_r, res_cpu[:,:,0]))
dest_g=np.reshape(dest_g,(512,512), order='C')
print(np.allclose(dest_g, res_cpu[:,:,1]))
dest_b=np.reshape(dest_b,(512,512), order='C')
print(np.allclose(dest_b, res_cpu[:,:,2]))

In [None]:
a.shape

In [None]:
import matplotlib.pyplot as plt
plt.imshow(a)
plt.show()

In [None]:
import cv2

In [None]:
import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import scipy.misc as scm
import matplotlib.pyplot as p

mod = SourceModule \
    (
        """

__global__ void standardize(float *dest_r, float *dest_g, float *dest_b, float *r_img, float *g_img, float *b_img)
{

unsigned int idx = threadIdx.x+(blockIdx.x*(blockDim.x*blockDim.y));

dest_r[idx] = ((r_img[idx]/255 - 0.485)) / 0.229;
dest_g[idx] = ((g_img[idx]/255 - 0.456)) / 0.224;
dest_b[idx] = ((b_img[idx]/255 - 0.406)) / 0.225;


}

""")
a = cv2.imread('./lena.png').astype(np.float32)
r_img = a[:, :, 0].reshape(262144, order='F')
g_img = a[:, :, 1].reshape(262144, order='F')
b_img = a[:, :, 2].reshape(262144, order='F')
dest_r=r_img
dest_g=g_img
dest_b=b_img
standardize = mod.get_function("standardize")
standardize(drv.Out(dest_r), drv.Out(dest_g), drv.Out(dest_b), drv.In(r_img), drv.In(g_img),drv.In(b_img),block=(1024, 1, 1), grid=(256, 1, 1))

dest_r=np.reshape(dest_r,(512,512), order='F')
dest_b=np.reshape(dest_b,(512,512), order='F')
p.imshow(dest_b)
p.show()

In [None]:
import numpy as np
import cv2

def standardize_cpu(image):
    rgb_means = [0.485, 0.456, 0.406]
    rgb_means_array = np.reshape(np.array(rgb_means), (1, 1, len(rgb_means)))

    rgb_stddevs = [0.229, 0.224, 0.225]
    rgb_stddevs_array = np.reshape(np.array(rgb_stddevs), (1, 1, len(rgb_stddevs)))

    image = image / 255.0
    image -= rgb_means_array
    image /= rgb_stddevs_array
    return image

import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

mod = SourceModule \
    (
        """                                                                                                                                                                              
                                                                                                                                                              
                                                                                                                                                                                         
__global__ void standardize(float *dest_r, float *dest_g, float *dest_b, float *r_img, float *g_img, float *b_img)                                                                       
{                                                                                                                                                                                        
                                                                                                                                                                                         
unsigned int idx = threadIdx.x+(blockIdx.x*(blockDim.x*blockDim.y));

dest_r[idx] = ((r_img[idx]/255 - 0.485)) / 0.229;
dest_g[idx] = ((g_img[idx]/255 - 0.456)) / 0.224;
dest_b[idx] = ((b_img[idx]/255 - 0.406)) / 0.225;                                                                                                                                                                                  
}                                                                                                                                                                                        
                                                                                                                                                                                         
""")
#a = np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8)
a = np.load("/home/nvidia/resized.npy")
# r_img = a[:, :, 0].reshape(262144, order='F').astype(np.float32)
# g_img = a[:, :, 1].reshape(262144, order='F').astype(np.float32)
# b_img = a[:, :, 2].reshape(262144, order='F').astype(np.float32)
r_img = a[:, :, 0].reshape(262144).astype(np.float32)
g_img = a[:, :, 1].reshape(262144).astype(np.float32)
b_img = a[:, :, 2].reshape(262144).astype(np.float32)
dest_r=np.zeros_like(r_img).astype(np.float32)
dest_g=np.zeros_like(g_img).astype(np.float32)
dest_b=np.zeros_like(b_img).astype(np.float32)
standardize = mod.get_function("standardize")
standardize(drv.Out(dest_r), drv.Out(dest_g), drv.Out(dest_b), drv.In(r_img), drv.In(g_img),drv.In(b_img),block=(1024, 1, 1), grid=(256, 1, 1))

res_cpu = standardize_cpu(a)
# dest_r=np.reshape(dest_r,(512,512), order='F')
# print(np.allclose(dest_r, res_cpu[:,:,0]))
# dest_g=np.reshape(dest_g,(512,512), order='F')
# print(np.allclose(dest_g, res_cpu[:,:,1]))
# dest_b=np.reshape(dest_b,(512,512), order='F')
# print(np.allclose(dest_b, res_cpu[:,:,2]))

res_gpu_2 = np.concatenate((dest_r, dest_g, dest_b), axis=None)
res_cpu_2 = res_cpu.transpose((2, 0, 1)).ravel()
print(np.allclose(res_gpu_2, res_cpu_2))

In [None]:
import numpy as np
import cv2

def standardize_cpu(image):
    rgb_means = [0.485, 0.456, 0.406]
    rgb_means_array = np.reshape(np.array(rgb_means), (1, 1, len(rgb_means)))

    rgb_stddevs = [0.229, 0.224, 0.225]
    rgb_stddevs_array = np.reshape(np.array(rgb_stddevs), (1, 1, len(rgb_stddevs)))

    image = image / 255.0
    image -= rgb_means_array
    image /= rgb_stddevs_array
    return image

import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np

mod = SourceModule \
    (
        """
__global__ void standardize(float *dest_r, float *dest_g, float *dest_b, float *r_img, float *g_img, float *b_img, int image_size)
{
    int total_thread_count = blockDim.x * gridDim.x;
    int global_thread_id = threadIdx.x+(blockIdx.x*(blockDim.x));
    for(int i = global_thread_id; i < image_size; i += total_thread_count)
    {
        dest_r[i] = ((r_img[i]/255 - 0.485)) / 0.229;
        dest_g[i] = ((g_img[i]/255 - 0.456)) / 0.224;
        dest_b[i] = ((b_img[i]/255 - 0.406)) / 0.225;
    }
}
""")
w = 512
h = 512

a = np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)
#a = np.load("/home/nvidia/resized.npy")
# r_img = a[:, :, 0].reshape(262144, order='F').astype(np.float32)
# g_img = a[:, :, 1].reshape(262144, order='F').astype(np.float32)
# b_img = a[:, :, 2].reshape(262144, order='F').astype(np.float32)
r_img = a[:, :, 0].reshape(w * h).astype(np.float32)
g_img = a[:, :, 1].reshape(w * h).astype(np.float32)
b_img = a[:, :, 2].reshape(w * h).astype(np.float32)

r_img = np.ones(w * h).astype(np.float32)
g_img = np.ones(w * h).astype(np.float32) + 1
b_img = np.ones(w * h).astype(np.float32) + 2
a = np.stack([np.ones((w,h)), np.ones((w, h)) + 1, np.ones((w, h)) + 2], axis=2)

dest_r=np.zeros_like(r_img).astype(np.float32)
dest_g=np.zeros_like(g_img).astype(np.float32)
dest_b=np.zeros_like(b_img).astype(np.float32)

standardize = mod.get_function("standardize")
image_size = np.intc(w*h)
standardize(drv.Out(dest_r), drv.Out(dest_g), drv.Out(dest_b), drv.In(r_img), drv.In(g_img),drv.In(b_img), image_size,block=(1024, 1, 1), grid=(256, 1, 1))

res_cpu = standardize_cpu(a)
# dest_r=np.reshape(dest_r,(512,512), order='F')
# print(np.allclose(dest_r, res_cpu[:,:,0]))
# dest_g=np.reshape(dest_g,(512,512), order='F')
# print(np.allclose(dest_g, res_cpu[:,:,1]))
# dest_b=np.reshape(dest_b,(512,512), order='F')
# print(np.allclose(dest_b, res_cpu[:,:,2]))

res_gpu_2 = np.concatenate((dest_r, dest_g, dest_b), axis=None)
res_cpu_2 = res_cpu.transpose((2, 0, 1)).ravel()
print(np.allclose(res_gpu_2, res_cpu_2))

In [None]:
res_gpu_2

In [None]:
res_cpu

In [None]:
dest_r

In [None]:
res_cpu.shape

In [None]:
res_cpu

In [None]:
res_cpu[:,:,0]

In [None]:
np.allclose(res_cpu[:,:,0], dest_r.reshape(res_cpu[:,:,0].shape))

In [None]:
np.allclose(res_cpu[:,:,1], dest_g.reshape(res_cpu[:,:,1].shape))

In [None]:
res_cpu[:,:,1]

In [None]:
dest_g.reshape(res_cpu[:,:,2].shape)

In [None]:
x.shape

In [None]:
r_img.shape

In [None]:
x = np.stack([np.ones((w,h)), np.ones((w, h)) + 1, np.ones((w, h)) + 2], axis=2)

In [None]:
x.shape

In [None]:
a = np.ones((w, h)) + 1

In [None]:
b = np.ones((w, h)) + 2

In [None]:
c = np.ones((w, h))

In [None]:
x = np.concatenate(a, b, c)

In [None]:
a.shape