Test suite - Add QA pass/fail tests for F32 bit depth #549

Merged 13 commits on Jun 3, 2025
Changes from 4 commits
4 changes: 2 additions & 2 deletions api/rppt_tensor_color_augmentations.h
@@ -148,7 +148,7 @@ RppStatus rppt_gamma_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
-RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alpha, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alphaTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);

#ifdef GPU_SUPPORT
/*! \brief Blend augmentation on HIP backend for a NCHW/NHWC layout tensor
@@ -171,7 +171,7 @@ RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
-RppStatus rppt_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alpha, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
+RppStatus rppt_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alphaTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! \brief Color Twist augmentation on HOST backend for a NCHW/NHWC layout tensor
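The renamed alphaTensor parameter is a per-image array: one Rpp32f blend factor for each image in the batch. A minimal host-side call sketch, assuming the rpp.h umbrella header and descriptors, ROI tensor and handle that are already set up elsewhere (the wrapper function and its arguments are illustrative, not part of this PR):

```cpp
// Illustrative only: blend a batch with a per-image alpha of 0.5f.
#include <rpp.h>
#include <vector>

RppStatus blend_half_and_half(RppPtr_t srcPtr1, RppPtr_t srcPtr2,
                              RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr,
                              RpptROIPtr roiTensorPtrSrc, Rpp32u batchSize, rppHandle_t handle)
{
    std::vector<Rpp32f> alphaTensor(batchSize, 0.5f);   // one blend factor per image
    return rppt_blend_host(srcPtr1, srcPtr2, srcDescPtr, dstPtr, dstDescPtr,
                           alphaTensor.data(), roiTensorPtrSrc, RpptRoiType::XYWH, handle);
}
```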
8 changes: 4 additions & 4 deletions api/rppt_tensor_geometric_augmentations.h
@@ -500,7 +500,7 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr,
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HOST memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrDst ROI data in HOST memory, for each image in destination tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] cropRoiTensor crop co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] patchRoiTensor patch co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
@@ -509,7 +509,7 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr,
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
-RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrDst, RpptROIPtr cropRoi, RpptROIPtr patchRoi, RpptRoiType roiType, rppHandle_t rppHandle);
+RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrDst, RpptROIPtr cropRoiTensor, RpptROIPtr patchRoiTensor, RpptRoiType roiType, rppHandle_t rppHandle);

#ifdef GPU_SUPPORT
/*! \brief Crop and Patch augmentation on HIP backend for a NCHW/NHWC layout tensor
@@ -526,7 +526,7 @@ RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescP
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
- * \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
+ * \param [in] roiTensorPtrDst ROI data in HIP memory, for each image in destination tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] cropRoiTensor crop co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] patchRoiTensor patch co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
@@ -535,7 +535,7 @@ RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescP
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
-RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrDst, RpptROIPtr cropRoi, RpptROIPtr patchRoi, RpptRoiType roiType, rppHandle_t rppHandle);
+RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrDst, RpptROIPtr cropRoiTensor, RpptROIPtr patchRoiTensor, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

/*! \brief Flip voxel augmentation HOST
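Both crop_and_patch variants accept the ROI, crop and patch tensors in either XYWH or LTRB layout, selected by roiType. A small standalone sketch of how the two layouts relate, using plain structs for illustration and assuming an inclusive right/bottom-corner convention (this is not RPP's own conversion code):

```cpp
// Illustrative relation between the two ROI layouts named in the docs above.
struct RoiXywh { int x, y, roiWidth, roiHeight; };   // XYWH: top-left corner + size
struct RoiLtrb { int l, t, r, b; };                  // LTRB: top-left and bottom-right corners

// Assuming r/b are inclusive pixel coordinates.
inline RoiLtrb xywhToLtrb(const RoiXywh &roi)
{
    return { roi.x, roi.y, roi.x + roi.roiWidth - 1, roi.y + roi.roiHeight - 1 };
}

inline RoiXywh ltrbToXywh(const RoiLtrb &roi)
{
    return { roi.l, roi.t, roi.r - roi.l + 1, roi.b - roi.t + 1 };
}
```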
12 changes: 6 additions & 6 deletions src/include/common/cpu/rpp_cpu_simd_load_store.hpp
@@ -273,16 +273,16 @@ inline void rpp_mm256_print_ps(__m256 vPrintArray)
}
}

-inline __m256 rpp_pixel_check_0to1_avx(__m256 p)
+inline void rpp_pixel_check_0to1(__m256 *p, Rpp32s numVectors)
{
-    p = _mm256_min_ps(_mm256_max_ps(p, avx_p0), avx_p1);
-    return p;
+    for (int i = 0; i < numVectors; i++)
+        p[i] = _mm256_min_ps(_mm256_max_ps(p[i], avx_p0), avx_p1);
}

-inline __m128 rpp_pixel_check_0to1_sse(__m128 p)
+inline void rpp_pixel_check_0to1(__m128 *p, Rpp32s numVectors)
{
-    p = _mm_min_ps(_mm_max_ps(p, xmm_p0), xmm_p1);
-    return p;
+    for (int i = 0; i < numVectors; i++)
+        p[i] = _mm_min_ps(_mm_max_ps(p[i], xmm_p0), xmm_p1);
}

inline void rpp_saturate64_0to1_avx(__m256 *p)
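The change above folds the old rpp_pixel_check_0to1_avx / rpp_pixel_check_0to1_sse helpers into a pair of rpp_pixel_check_0to1 overloads that clamp numVectors registers in place, so callers can clamp a whole p[] block with one call. A standalone sketch of the AVX overload's behaviour, where int stands in for Rpp32s and the 0/1 constants are defined locally in place of avx_p0/avx_p1:

```cpp
// Minimal, self-contained illustration of the in-place [0,1] clamp (not the library source).
#include <immintrin.h>
#include <cstdio>

static inline void pixel_check_0to1(__m256 *p, int numVectors)
{
    const __m256 p0 = _mm256_set1_ps(0.0f);              // stands in for avx_p0
    const __m256 p1 = _mm256_set1_ps(1.0f);              // stands in for avx_p1
    for (int i = 0; i < numVectors; i++)
        p[i] = _mm256_min_ps(_mm256_max_ps(p[i], p0), p1);   // clamp every lane to [0, 1]
}

int main()
{
    float buf[8] = {-0.5f, 0.25f, 1.5f, 0.75f, 2.0f, -1.0f, 0.0f, 1.0f};
    __m256 v[1] = { _mm256_loadu_ps(buf) };
    pixel_check_0to1(v, 1);                              // out-of-range lanes become 0.0f or 1.0f
    _mm256_storeu_ps(buf, v[0]);
    for (int i = 0; i < 8; i++) printf("%.2f ", buf[i]); // 0.00 0.25 1.00 0.75 1.00 0.00 0.00 1.00
    printf("\n");
    return 0;
}
```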
12 changes: 12 additions & 0 deletions src/modules/tensor/cpu/kernel/blend.cpp
@@ -339,6 +339,8 @@ RppStatus blend_f32_f32_host_tensor(Rpp32f *srcPtr1,
p1[0] = _mm_fmadd_ps(_mm_sub_ps(p1[0], p2[0]), pMul, p2[0]); // alpha-blending adjustment
p1[1] = _mm_fmadd_ps(_mm_sub_ps(p1[1], p2[1]), pMul, p2[1]); // alpha-blending adjustment
p1[2] = _mm_fmadd_ps(_mm_sub_ps(p1[2], p2[2]), pMul, p2[2]); // alpha-blending adjustment
+//boundary checks for f32
+rpp_pixel_check_0to1(p1, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p1); // simd stores

srcPtr1Temp += 12;
@@ -403,6 +405,8 @@ RppStatus blend_f32_f32_host_tensor(Rpp32f *srcPtr1,
p1[0] = _mm_fmadd_ps(_mm_sub_ps(p1[0], p2[0]), pMul, p2[0]); // alpha-blending adjustment
p1[1] = _mm_fmadd_ps(_mm_sub_ps(p1[1], p2[1]), pMul, p2[1]); // alpha-blending adjustment
p1[2] = _mm_fmadd_ps(_mm_sub_ps(p1[2], p2[2]), pMul, p2[2]); // alpha-blending adjustment
+//boundary checks for f32
+rpp_pixel_check_0to1(p1, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp, p1); // simd stores

srcPtr1TempR += 4;
@@ -465,6 +469,8 @@ RppStatus blend_f32_f32_host_tensor(Rpp32f *srcPtr1,
rpp_simd_load(rpp_load4_f32_to_f32, srcPtr1Temp, p1); // simd loads
rpp_simd_load(rpp_load4_f32_to_f32, srcPtr2Temp, p2); // simd loads
p1[0] = _mm_fmadd_ps(_mm_sub_ps(p1[0], p2[0]), pMul, p2[0]); // alpha-blending adjustment
+//boundary checks for f32
+rpp_pixel_check_0to1(p1, 1);
rpp_simd_store(rpp_store4_f32_to_f32, dstPtrTemp, p1); // simd stores

srcPtr1Temp += 4;
@@ -572,6 +578,8 @@ RppStatus blend_f16_f16_host_tensor(Rpp16f *srcPtr1,
p1[0] = _mm_fmadd_ps(_mm_sub_ps(p1[0], p2[0]), pMul, p2[0]); // alpha-blending adjustment
p1[1] = _mm_fmadd_ps(_mm_sub_ps(p1[1], p2[1]), pMul, p2[1]); // alpha-blending adjustment
p1[2] = _mm_fmadd_ps(_mm_sub_ps(p1[2], p2[2]), pMul, p2[2]); // alpha-blending adjustment
+//Boundary checks for f16
+rpp_pixel_check_0to1(p1, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTemp_ps, dstPtrTemp_ps + 4, dstPtrTemp_ps + 8, p1); // simd stores

for(int cnt = 0; cnt < 4; cnt++)
@@ -656,6 +664,8 @@ RppStatus blend_f16_f16_host_tensor(Rpp16f *srcPtr1,
p1[0] = _mm_fmadd_ps(_mm_sub_ps(p1[0], p2[0]), pMul, p2[0]); // alpha-blending adjustment
p1[1] = _mm_fmadd_ps(_mm_sub_ps(p1[1], p2[1]), pMul, p2[1]); // alpha-blending adjustment
p1[2] = _mm_fmadd_ps(_mm_sub_ps(p1[2], p2[2]), pMul, p2[2]); // alpha-blending adjustment
+//boundary checks for f16
+rpp_pixel_check_0to1(p1, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp_ps, p1); // simd stores

for(int cnt = 0; cnt < 12; cnt++)
@@ -731,6 +741,8 @@ RppStatus blend_f16_f16_host_tensor(Rpp16f *srcPtr1,
rpp_simd_load(rpp_load4_f32_to_f32, srcPtr1Temp_ps, p1); // simd loads
rpp_simd_load(rpp_load4_f32_to_f32, srcPtr2Temp_ps, p2); // simd loads
p1[0] = _mm_fmadd_ps(_mm_sub_ps(p1[0], p2[0]), pMul, p2[0]); // alpha-blending adjustment
+//boundary checks for f16
+rpp_pixel_check_0to1(p1, 1);
rpp_simd_store(rpp_store4_f32_to_f32, dstPtrTemp_ps, p1); // simd stores

for(int cnt = 0; cnt < 4; cnt++)
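Each SIMD block above computes dst = (src1 - src2) * alpha + src2 with a fused multiply-add and, after this PR, clamps the F32/F16 result to [0, 1] before the store. A scalar reference of the same arithmetic (hypothetical helper, for illustration only):

```cpp
// Scalar equivalent of the SIMD alpha-blend + boundary check added above.
#include <algorithm>

inline float blend_and_clamp(float src1, float src2, float alpha)
{
    float dst = (src1 - src2) * alpha + src2;        // _mm_fmadd_ps(_mm_sub_ps(p1, p2), pMul, p2)
    return std::min(std::max(dst, 0.0f), 1.0f);      // rpp_pixel_check_0to1
}
```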
20 changes: 16 additions & 4 deletions src/modules/tensor/cpu/kernel/brightness.cpp
@@ -52,9 +52,9 @@ inline void compute_brightness_48_host(__m128 *p, __m128 *pBrightnessParams)

inline void compute_brightness_24_host(__m256 *p, __m256 *pBrightnessParams)
{
-p[0] = rpp_pixel_check_0to1_avx(_mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1])); // brightness adjustment
-p[1] = rpp_pixel_check_0to1_avx(_mm256_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1])); // brightness adjustment
-p[2] = rpp_pixel_check_0to1_avx(_mm256_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1])); // brightness adjustment
+p[0] = _mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
+p[1] = _mm256_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
+p[2] = _mm256_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
}

inline void compute_brightness_24_host(__m128 *p, __m128 *pBrightnessParams)
@@ -90,7 +90,7 @@ inline void compute_brightness_12_host(__m128 *p, __m128 *pBrightnessParams)

inline void compute_brightness_8_host(__m256 *p, __m256 *pBrightnessParams)
{
-p[0] = rpp_pixel_check_0to1_avx(_mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1])); // brightness adjustment
+p[0] = _mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
}

inline void compute_brightness_8_host(__m128 *p, __m128 *pBrightnessParams)
@@ -395,6 +395,8 @@ RppStatus brightness_f32_f32_host_tensor(Rpp32f *srcPtr,
__m256 p[3];
rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads
compute_brightness_24_host(p, pBrightnessParams); // brightness adjustment
+//Boundary check for f32
+rpp_pixel_check_0to1(p, 1);
rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores
#else
__m128 p[3];
@@ -450,6 +452,8 @@ RppStatus brightness_f32_f32_host_tensor(Rpp32f *srcPtr,
__m256 p[3];
rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
compute_brightness_24_host(p, pBrightnessParams); // brightness adjustment
+//Boundary check for f32
+rpp_pixel_check_0to1(p, 1);
rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores
#else
__m128 p[4];
@@ -506,6 +510,8 @@ RppStatus brightness_f32_f32_host_tensor(Rpp32f *srcPtr,

rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, p); // simd loads
compute_brightness_8_host(p, pBrightnessParams); // brightness adjustment
+//Boundary check for f32
+rpp_pixel_check_0to1(p, 1);
rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p); // simd stores
#else
__m128 p[1];
@@ -621,6 +627,8 @@ RppStatus brightness_f16_f16_host_tensor(Rpp16f *srcPtr,
__m256 p[3];
rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); // simd loads
compute_brightness_24_host(p, pBrightnessParams); // brightness adjustment
+//Boundary check for f16
+rpp_pixel_check_0to1(p, 1);
rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, p); // simd stores
#else
__m128 p[3];
@@ -692,6 +700,8 @@ RppStatus brightness_f16_f16_host_tensor(Rpp16f *srcPtr,
__m256 p[3];
rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); // simd loads
compute_brightness_24_host(p, pBrightnessParams); // brightness adjustment
+//Boundary check for f16
+rpp_pixel_check_0to1(p, 1);
rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, p); // simd stores
#else
__m128 p[4];
@@ -756,6 +766,8 @@ RppStatus brightness_f16_f16_host_tensor(Rpp16f *srcPtr,

rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads
compute_brightness_8_host(p, pBrightnessParams); // brightness adjustment
+//Boundary check for f16
+rpp_pixel_check_0to1(p, 1);
rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, p); // simd stores
#else
__m128 p[1];
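In brightness the clamp is no longer baked into compute_brightness_24_host / compute_brightness_8_host; the F32 and F16 tensor paths now call rpp_pixel_check_0to1 explicitly between the FMA and the store. A condensed, self-contained sketch of the resulting 8-pixel F32 pipeline (the function name and raw intrinsics are illustrative; the library goes through its rpp_load/rpp_store wrappers):

```cpp
// Illustrative F32 brightness path after this change:
// load, fused multiply-add, explicit [0,1] boundary check, store.
#include <immintrin.h>

void brightness_8_f32(const float *src, float *dst, float alpha, float beta)
{
    __m256 pParams0 = _mm256_set1_ps(alpha);                 // multiplicative brightness factor
    __m256 pParams1 = _mm256_set1_ps(beta);                  // additive brightness offset
    __m256 p = _mm256_loadu_ps(src);                         // ~ rpp_load8_f32_to_f32_avx
    p = _mm256_fmadd_ps(p, pParams0, pParams1);              // ~ compute_brightness_8_host
    p = _mm256_min_ps(_mm256_max_ps(p, _mm256_set1_ps(0.0f)),
                      _mm256_set1_ps(1.0f));                 // ~ rpp_pixel_check_0to1
    _mm256_storeu_ps(dst, p);                                // ~ rpp_store8_f32_to_f32_avx
}
```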
16 changes: 16 additions & 0 deletions src/modules/tensor/cpu/kernel/color_cast.cpp
@@ -371,6 +371,8 @@ RppStatus color_cast_f32_f32_host_tensor(Rpp32f *srcPtr,

rpp_simd_load(rpp_load12_f32pkd3_to_f32pln3, srcPtrTemp, p); // simd loads
compute_color_cast_12_host(p, pMul, pAdd); // color_cast adjustment
+//boundary checks for f32
+rpp_pixel_check_0to1(p, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores

srcPtrTemp += 12;
@@ -423,6 +425,8 @@ RppStatus color_cast_f32_f32_host_tensor(Rpp32f *srcPtr,

rpp_simd_load(rpp_load12_f32pln3_to_f32pln3, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
compute_color_cast_12_host(p, pMul, pAdd); // color_cast adjustment
+//boundary checks for f32
+rpp_pixel_check_0to1(p, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp, p); // simd stores

srcPtrTempR += 4;
@@ -471,6 +475,8 @@ RppStatus color_cast_f32_f32_host_tensor(Rpp32f *srcPtr,

rpp_simd_load(rpp_load12_f32pkd3_to_f32pln3, srcPtrTemp, p); // simd loads
compute_color_cast_12_host(p, pMul, pAdd); // color_cast adjustment
+//boundary checks for f32
+rpp_pixel_check_0to1(p, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp, p); // simd stores

srcPtrTemp += 12;
@@ -521,6 +527,8 @@ RppStatus color_cast_f32_f32_host_tensor(Rpp32f *srcPtr,

rpp_simd_load(rpp_load12_f32pln3_to_f32pln3, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
compute_color_cast_12_host(p, pMul, pAdd); // color_cast adjustment
+//boundary checks for f32
+rpp_pixel_check_0to1(p, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores

srcPtrTempR += 4;
@@ -633,6 +641,8 @@ RppStatus color_cast_f16_f16_host_tensor(Rpp16f *srcPtr,

rpp_simd_load(rpp_load12_f32pkd3_to_f32pln3, srcPtrTemp_ps, p); // simd loads
compute_color_cast_12_host(p, pMul, pAdd); // color_cast adjustment
+//boundary checks for f16
+rpp_pixel_check_0to1(p, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTemp_ps, dstPtrTemp_ps + 4, dstPtrTemp_ps + 8, p); // simd stores

for(int cnt = 0; cnt < 4; cnt++)
@@ -701,6 +711,8 @@ RppStatus color_cast_f16_f16_host_tensor(Rpp16f *srcPtr,

rpp_simd_load(rpp_load12_f32pln3_to_f32pln3, srcPtrTemp_ps, srcPtrTemp_ps + 4, srcPtrTemp_ps + 8, p); // simd loads
compute_color_cast_12_host(p, pMul, pAdd); // color_cast adjustment
+//boundary checks for f16
+rpp_pixel_check_0to1(p, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp_ps, p); // simd stores

for(int cnt = 0; cnt < 12; cnt++)
@@ -761,6 +773,8 @@ RppStatus color_cast_f16_f16_host_tensor(Rpp16f *srcPtr,

rpp_simd_load(rpp_load12_f32pkd3_to_f32pln3, srcPtrTemp_ps, p); // simd loads
compute_color_cast_12_host(p, pMul, pAdd); // color_cast adjustment
+//boundary checks for f16
+rpp_pixel_check_0to1(p, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp_ps, p); // simd stores

for(int cnt = 0; cnt < 12; cnt++)
@@ -825,6 +839,8 @@ RppStatus color_cast_f16_f16_host_tensor(Rpp16f *srcPtr,

rpp_simd_load(rpp_load12_f32pln3_to_f32pln3, srcPtrTemp_ps, srcPtrTemp_ps + 4, srcPtrTemp_ps + 8, p); // simd loads
compute_color_cast_12_host(p, pMul, pAdd); // color_cast adjustment
+//boundary checks for f16
+rpp_pixel_check_0to1(p, 3);
rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTemp_ps, dstPtrTemp_ps + 4, dstPtrTemp_ps + 8, p); // simd stores

for(int cnt = 0; cnt < 4; cnt++)