Skip to content

Commit 04b0d62

Browse files
r-abishek, HazarathKumarM, kiritigowda, LakshmiKumar23
authored
Test suite - Add QA pass/fail tests for F32 bit depth (#549)
* Add F32 QA Golden outputs
* modify Doxygen comments
* modify range check functions
* RPP F32 QA : Review Comments Resolution (#431)
* Modified SIMD print functions to use union
* remove redundant unions in print functions
* removed pixel checks
* remove pixel check in threshold
* resolve review comments
---------
Co-authored-by: HazarathKumarM <hazarathkumar@multicorewareinc.com>
Co-authored-by: Kiriti Gowda <kiritigowda@gmail.com>
Co-authored-by: HazarathKumarM <119284987+HazarathKumarM@users.noreply.github.com>
Co-authored-by: Lakshmi Kumar <lakshmi.kumar@amd.com>
1 parent 3c938e5 commit 04b0d62

21 files changed

+282
-102
lines changed

api/rppt_tensor_color_augmentations.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ RppStatus rppt_gamma_correction_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, Rpp
148148
* \retval RPP_SUCCESS Successful completion.
149149
* \retval RPP_ERROR* Unsuccessful completion.
150150
*/
151-
RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alpha, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
151+
RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alphaTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
152152

153153
#ifdef GPU_SUPPORT
154154
/*! \brief Blend augmentation on HIP backend for a NCHW/NHWC layout tensor
@@ -171,7 +171,7 @@ RppStatus rppt_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDes
171171
* \retval RPP_SUCCESS Successful completion.
172172
* \retval RPP_ERROR* Unsuccessful completion.
173173
*/
174-
RppStatus rppt_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alpha, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
174+
RppStatus rppt_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *alphaTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
175175
#endif // GPU_SUPPORT
176176

177177
/*! \brief Color Twist augmentation on HOST backend for a NCHW/NHWC layout tensor

api/rppt_tensor_geometric_augmentations.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr,
500500
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
501501
* \param [out] dstPtr destination tensor in HOST memory
502502
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
503-
* \param [in] roiTensorPtrSrc ROI data in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
503+
* \param [in] roiTensorPtrDst ROI data in HOST memory, for each image in destination tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
504504
* \param [in] cropRoiTensor crop co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
505505
* \param [in] patchRoiTensor patch co-ordinates in HOST memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
506506
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
@@ -509,7 +509,7 @@ RppStatus rppt_slice_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr,
509509
* \retval RPP_SUCCESS Successful completion.
510510
* \retval RPP_ERROR* Unsuccessful completion.
511511
*/
512-
RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrDst, RpptROIPtr cropRoi, RpptROIPtr patchRoi, RpptRoiType roiType, rppHandle_t rppHandle);
512+
RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrDst, RpptROIPtr cropRoiTensor, RpptROIPtr patchRoiTensor, RpptRoiType roiType, rppHandle_t rppHandle);
513513

514514
#ifdef GPU_SUPPORT
515515
/*! \brief Crop and Patch augmentation on HIP backend for a NCHW/NHWC layout tensor
@@ -526,7 +526,7 @@ RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescP
526526
* \param [in] srcDescPtr source tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = 1/3)
527527
* \param [out] dstPtr destination tensor in HIP memory
528528
* \param [in] dstDescPtr destination tensor descriptor (Restrictions - numDims = 4, offsetInBytes >= 0, dataType = U8/F16/F32/I8, layout = NCHW/NHWC, c = same as that of srcDescPtr)
529-
* \param [in] roiTensorPtrSrc ROI data in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
529+
* \param [in] roiTensorPtrDst ROI data in HIP memory, for each image in destination tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
530530
* \param [in] cropRoiTensor crop co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
531531
* \param [in] patchRoiTensor patch co-ordinates in HIP memory, for each image in source tensor (2D tensor of size batchSize * 4, in either format - XYWH(xy.x, xy.y, roiWidth, roiHeight) or LTRB(lt.x, lt.y, rb.x, rb.y))
532532
* \param [in] roiType ROI type used (RpptRoiType::XYWH or RpptRoiType::LTRB)
@@ -535,7 +535,7 @@ RppStatus rppt_crop_and_patch_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescP
535535
* \retval RPP_SUCCESS Successful completion.
536536
* \retval RPP_ERROR* Unsuccessful completion.
537537
*/
538-
RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrDst, RpptROIPtr cropRoi, RpptROIPtr patchRoi, RpptRoiType roiType, rppHandle_t rppHandle);
538+
RppStatus rppt_crop_and_patch_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, RpptROIPtr roiTensorPtrDst, RpptROIPtr cropRoiTensor, RpptROIPtr patchRoiTensor, RpptRoiType roiType, rppHandle_t rppHandle);
539539
#endif // GPU_SUPPORT
540540

541541
/*! \brief Flip voxel augmentation HOST

src/include/common/cpu/rpp_cpu_simd_load_store.hpp

Lines changed: 69 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -169,17 +169,37 @@ const __m256i avx_pxMaskR = _mm256_setr_epi8(0, 0x80, 0x80, 3, 0x80, 0x80, 6, 0x
169169
const __m256i avx_pxMaskG = _mm256_setr_epi8(0x80, 1, 0x80, 0x80, 4, 0x80, 0x80, 7, 0x80, 0x80, 10, 0x80, 0x80, 13, 0x80, 0x80, 16, 0x80, 0x80, 19, 0x80, 0x80, 22, 0x80, 0x80, 25, 0x80, 0x80, 28, 0x80, 0x80, 0x80);
170170
const __m256i avx_pxMaskB = _mm256_setr_epi8(0x80, 0x80, 2, 0x80, 0x80, 5, 0x80, 0x80, 8, 0x80, 0x80, 11, 0x80, 0x80, 14, 0x80, 0x80, 17, 0x80, 0x80, 20, 0x80, 0x80, 23, 0x80, 0x80, 26, 0x80, 0x80, 29, 0x80, 0x80);
171171

172+
// Union for handling 128-bit SIMD data (SSE).
173+
union RppSIMD128
174+
{
175+
__m128i m128iVal;
176+
__m128 m128Val;
177+
char i8[16];
178+
short i16[8];
179+
int i32[4];
180+
float f32[4];
181+
};
182+
183+
// Union for handling 256-bit SIMD data (AVX).
184+
union RppSIMD256
185+
{
186+
__m256i m256iVal;
187+
__m256 m256Val;
188+
unsigned char u8[32];
189+
short i16[16];
190+
int i32[8];
191+
float f32[8];
192+
};
193+
172194
// Print helpers
173195

174-
inline void rpp_mm_print_epi8(__m128i vPrintArray)
196+
inline void rpp_mm_print_epi8(__m128i *v)
175197
{
176-
char printArray[16];
177-
_mm_storeu_si128((__m128i *)printArray, vPrintArray);
198+
RppSIMD128 u;
199+
u.m128iVal = v[0];
178200
printf("\n");
179-
for (int ct = 0; ct < 16; ct++)
180-
{
181-
printf("%d ", printArray[ct]);
182-
}
201+
for (int i = 0; i < 16; ++i)
202+
printf("%d ", u.i8[i]);
183203
}
184204

185205
inline void rpp_storeu_si32(void *__p,
@@ -198,93 +218,80 @@ inline void rpp_storeu_si64(void *__p,
198218
((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
199219
}
200220

201-
inline void rpp_mm_print_epi32(__m128i vPrintArray)
221+
inline void rpp_mm_print_epi16(__m128i *v)
202222
{
203-
int printArray[4];
204-
_mm_storeu_si128((__m128i *)printArray, vPrintArray);
223+
RppSIMD128 u;
224+
u.m128iVal = v[0];
205225
printf("\n");
206-
for (int ct = 0; ct < 4; ct++)
207-
{
208-
printf("%d ", printArray[ct]);
209-
}
226+
for (int i = 0; i < 8; ++i)
227+
printf("%hd ", u.i16[i]);
210228
}
211229

212-
inline void rpp_mm_print_epi16(__m128i vPrintArray)
230+
inline void rpp_mm_print_epi32(__m128i *v)
213231
{
214-
unsigned short int printArray[8];
215-
_mm_storeu_si128((__m128i *)printArray, vPrintArray);
232+
RppSIMD128 u;
233+
u.m128iVal = v[0];
216234
printf("\n");
217-
for (int ct = 0; ct < 8; ct++)
218-
{
219-
printf("%hu ", printArray[ct]);
220-
}
235+
for (int i = 0; i < 4; ++i)
236+
printf("%d ", u.i32[i]);
221237
}
222238

223-
inline void rpp_mm_print_ps(__m128 vPrintArray)
239+
inline void rpp_mm_print_ps(__m128 *v)
224240
{
225-
float printArray[4];
226-
_mm_storeu_ps(printArray, vPrintArray);
241+
RppSIMD128 u;
242+
u.m128Val = v[0];
227243
printf("\n");
228-
for (int ct = 0; ct < 4; ct++)
229-
{
230-
printf("%0.6f ", printArray[ct]);
231-
}
244+
for (int i = 0; i < 4; ++i)
245+
printf("%0.6f ", u.f32[i]);
232246
}
233247

234-
inline void rpp_mm256_print_epi8(__m256i vPrintArray)
248+
249+
inline void rpp_mm256_print_epi8(__m256i *v)
235250
{
236-
unsigned char printArray[32];
237-
_mm256_storeu_si256((__m256i *)printArray, vPrintArray);
251+
RppSIMD256 u;
252+
u.m256iVal = v[0];
238253
printf("\n");
239-
for (int ct = 0; ct < 32; ct++)
240-
{
241-
printf("%d ", (unsigned char)printArray[ct]);
242-
}
254+
for (int i = 0; i < 32; ++i)
255+
printf("%u ", u.u8[i]);
243256
}
244257

245-
inline void rpp_mm256_print_epi32(__m256i vPrintArray)
258+
inline void rpp_mm256_print_epi16(__m256i *v)
246259
{
247-
int printArray[8];
248-
_mm256_storeu_si256((__m256i *)printArray, vPrintArray);
260+
RppSIMD256 u;
261+
u.m256iVal = v[0];
249262
printf("\n");
250-
for (int ct = 0; ct < 8; ct++)
251-
{
252-
printf("%d ", printArray[ct]);
253-
}
263+
for (int i = 0; i < 16; ++i)
264+
printf("%hd ", u.i16[i]);
254265
}
255266

256-
inline void rpp_mm256_print_epi16(__m256i vPrintArray)
267+
inline void rpp_mm256_print_epi32(__m256i *v)
257268
{
258-
unsigned short int printArray[8];
259-
_mm256_storeu_si256((__m256i *)printArray, vPrintArray);
269+
RppSIMD256 u;
270+
u.m256iVal = v[0];
260271
printf("\n");
261-
for (int ct = 0; ct < 16; ct++)
262-
{
263-
printf("%hu ", printArray[ct]);
264-
}
272+
for (int i = 0; i < 8; ++i)
273+
printf("%d ", u.i32[i]);
265274
}
266275

267-
inline void rpp_mm256_print_ps(__m256 vPrintArray)
276+
inline void rpp_mm256_print_ps(__m256 *v)
268277
{
269-
float printArray[8];
270-
_mm256_storeu_ps(printArray, vPrintArray);
278+
RppSIMD256 u;
279+
u.m256Val = v[0];
271280
printf("\n");
272-
for (int ct = 0; ct < 8; ct++)
273-
{
274-
printf("%0.6f ", printArray[ct]);
275-
}
281+
for (int i = 0; i < 8; ++i)
282+
printf("%0.6f ", u.f32[i]);
276283
}
277284

278-
inline __m256 rpp_pixel_check_0to1_avx(__m256 p)
285+
inline void rpp_pixel_check_0to1(__m256 *p, Rpp32s numVectors)
279286
{
280-
p = _mm256_min_ps(_mm256_max_ps(p, avx_p0), avx_p1);
281-
return p;
287+
for (int i = 0; i < numVectors; i++)
288+
p[i] = _mm256_min_ps(_mm256_max_ps(p[i], avx_p0), avx_p1);
282289
}
283290

284-
inline __m128 rpp_pixel_check_0to1_sse(__m128 p)
291+
inline void rpp_pixel_check_0to1(__m128 *p, Rpp32s numVectors)
285292
{
286-
p = _mm_min_ps(_mm_max_ps(p, xmm_p0), xmm_p1);
287-
return p;
293+
for (int i = 0; i < numVectors; i++)
294+
p[i] = _mm_min_ps(_mm_max_ps(p[i], xmm_p0), xmm_p1);
288295
}
289296

290297
inline void rpp_saturate64_0to1_avx(__m256 *p)

src/modules/tensor/cpu/kernel/blend.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,8 @@ RppStatus blend_f16_f16_host_tensor(Rpp16f *srcPtr1,
572572
p1[0] = _mm_fmadd_ps(_mm_sub_ps(p1[0], p2[0]), pMul, p2[0]); // alpha-blending adjustment
573573
p1[1] = _mm_fmadd_ps(_mm_sub_ps(p1[1], p2[1]), pMul, p2[1]); // alpha-blending adjustment
574574
p1[2] = _mm_fmadd_ps(_mm_sub_ps(p1[2], p2[2]), pMul, p2[2]); // alpha-blending adjustment
575+
//Boundary checks for f16
576+
rpp_pixel_check_0to1(p1, 3);
575577
rpp_simd_store(rpp_store12_f32pln3_to_f32pln3, dstPtrTemp_ps, dstPtrTemp_ps + 4, dstPtrTemp_ps + 8, p1); // simd stores
576578

577579
for(int cnt = 0; cnt < 4; cnt++)
@@ -656,6 +658,8 @@ RppStatus blend_f16_f16_host_tensor(Rpp16f *srcPtr1,
656658
p1[0] = _mm_fmadd_ps(_mm_sub_ps(p1[0], p2[0]), pMul, p2[0]); // alpha-blending adjustment
657659
p1[1] = _mm_fmadd_ps(_mm_sub_ps(p1[1], p2[1]), pMul, p2[1]); // alpha-blending adjustment
658660
p1[2] = _mm_fmadd_ps(_mm_sub_ps(p1[2], p2[2]), pMul, p2[2]); // alpha-blending adjustment
661+
//boundary checks for f16
662+
rpp_pixel_check_0to1(p1, 3);
659663
rpp_simd_store(rpp_store12_f32pln3_to_f32pkd3, dstPtrTemp_ps, p1); // simd stores
660664

661665
for(int cnt = 0; cnt < 12; cnt++)
@@ -731,6 +735,8 @@ RppStatus blend_f16_f16_host_tensor(Rpp16f *srcPtr1,
731735
rpp_simd_load(rpp_load4_f32_to_f32, srcPtr1Temp_ps, p1); // simd loads
732736
rpp_simd_load(rpp_load4_f32_to_f32, srcPtr2Temp_ps, p2); // simd loads
733737
p1[0] = _mm_fmadd_ps(_mm_sub_ps(p1[0], p2[0]), pMul, p2[0]); // alpha-blending adjustment
738+
//boundary checks for f16
739+
rpp_pixel_check_0to1(p1, 1);
734740
rpp_simd_store(rpp_store4_f32_to_f32, dstPtrTemp_ps, p1); // simd stores
735741

736742
for(int cnt = 0; cnt < 4; cnt++)

src/modules/tensor/cpu/kernel/brightness.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ inline void compute_brightness_48_host(__m128 *p, __m128 *pBrightnessParams)
5252

5353
inline void compute_brightness_24_host(__m256 *p, __m256 *pBrightnessParams)
5454
{
55-
p[0] = rpp_pixel_check_0to1_avx(_mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1])); // brightness adjustment
56-
p[1] = rpp_pixel_check_0to1_avx(_mm256_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1])); // brightness adjustment
57-
p[2] = rpp_pixel_check_0to1_avx(_mm256_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1])); // brightness adjustment
55+
p[0] = _mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
56+
p[1] = _mm256_fmadd_ps(p[1], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
57+
p[2] = _mm256_fmadd_ps(p[2], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
5858
}
5959

6060
inline void compute_brightness_24_host(__m128 *p, __m128 *pBrightnessParams)
@@ -90,7 +90,7 @@ inline void compute_brightness_12_host(__m128 *p, __m128 *pBrightnessParams)
9090

9191
inline void compute_brightness_8_host(__m256 *p, __m256 *pBrightnessParams)
9292
{
93-
p[0] = rpp_pixel_check_0to1_avx(_mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1])); // brightness adjustment
93+
p[0] = _mm256_fmadd_ps(p[0], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment
9494
}
9595

9696
inline void compute_brightness_8_host(__m128 *p, __m128 *pBrightnessParams)
@@ -395,6 +395,8 @@ RppStatus brightness_f32_f32_host_tensor(Rpp32f *srcPtr,
395395
__m256 p[3];
396396
rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp, p); // simd loads
397397
compute_brightness_24_host(p, pBrightnessParams); // brightness adjustment
398+
//Boundary check for f32
399+
rpp_pixel_check_0to1(p, 3);
398400
rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR, dstPtrTempG, dstPtrTempB, p); // simd stores
399401
#else
400402
__m128 p[3];
@@ -450,6 +452,8 @@ RppStatus brightness_f32_f32_host_tensor(Rpp32f *srcPtr,
450452
__m256 p[3];
451453
rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR, srcPtrTempG, srcPtrTempB, p); // simd loads
452454
compute_brightness_24_host(p, pBrightnessParams); // brightness adjustment
455+
//Boundary check for f32
456+
rpp_pixel_check_0to1(p, 3);
453457
rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp, p); // simd stores
454458
#else
455459
__m128 p[4];
@@ -506,6 +510,8 @@ RppStatus brightness_f32_f32_host_tensor(Rpp32f *srcPtr,
506510

507511
rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp, p); // simd loads
508512
compute_brightness_8_host(p, pBrightnessParams); // brightness adjustment
513+
//Boundary check for f32
514+
rpp_pixel_check_0to1(p, 1);
509515
rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp, p); // simd stores
510516
#else
511517
__m128 p[1];
@@ -621,6 +627,8 @@ RppStatus brightness_f16_f16_host_tensor(Rpp16f *srcPtr,
621627
__m256 p[3];
622628
rpp_simd_load(rpp_load24_f32pkd3_to_f32pln3_avx, srcPtrTemp_ps, p); // simd loads
623629
compute_brightness_24_host(p, pBrightnessParams); // brightness adjustment
630+
//Boundary check for f16
631+
rpp_pixel_check_0to1(p, 3);
624632
rpp_simd_store(rpp_store24_f32pln3_to_f32pln3_avx, dstPtrTempR_ps, dstPtrTempG_ps, dstPtrTempB_ps, p); // simd stores
625633
#else
626634
__m128 p[3];
@@ -692,6 +700,8 @@ RppStatus brightness_f16_f16_host_tensor(Rpp16f *srcPtr,
692700
__m256 p[3];
693701
rpp_simd_load(rpp_load24_f32pln3_to_f32pln3_avx, srcPtrTempR_ps, srcPtrTempG_ps, srcPtrTempB_ps, p); // simd loads
694702
compute_brightness_24_host(p, pBrightnessParams); // brightness adjustment
703+
//Boundary check for f16
704+
rpp_pixel_check_0to1(p, 3);
695705
rpp_simd_store(rpp_store24_f32pln3_to_f32pkd3_avx, dstPtrTemp_ps, p); // simd stores
696706
#else
697707
__m128 p[4];
@@ -756,6 +766,8 @@ RppStatus brightness_f16_f16_host_tensor(Rpp16f *srcPtr,
756766

757767
rpp_simd_load(rpp_load8_f32_to_f32_avx, srcPtrTemp_ps, p); // simd loads
758768
compute_brightness_8_host(p, pBrightnessParams); // brightness adjustment
769+
//Boundary check for f16
770+
rpp_pixel_check_0to1(p, 1);
759771
rpp_simd_store(rpp_store8_f32_to_f32_avx, dstPtrTemp_ps, p); // simd stores
760772
#else
761773
__m128 p[1];

0 commit comments

Comments
 (0)