Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IDWT 5x3 single-pass lifting and SSE2/AVX2 implementation #957

Merged
merged 6 commits into from Jun 26, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
34 changes: 34 additions & 0 deletions .travis.yml
Expand Up @@ -2,9 +2,12 @@ language: cpp

matrix:
include:
# OSX
- os: osx
compiler: clang
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_INCLUDE_IF_DEPLOY=1

# Test code style
- os: linux
compiler: clang-3.8
env: OPJ_CI_CC=clang-3.8 OPJ_CI_CXX=clang-3.8 OPJ_CI_CHECK_STYLE=1 OPJ_CI_SKIP_TESTS=1
Expand All @@ -16,12 +19,31 @@ matrix:
packages:
- clang-3.8
- flip

# Performance test with GCC
- os: linux
compiler: g++
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_INCLUDE_IF_DEPLOY=1 OPJ_CI_PERF_TESTS=1

# Test compilation with AVX2
- os: linux
compiler: clang-3.8
# skip tests since Travis doesn't have AVX2 compatible machines
env: OPJ_CI_CC=clang-3.8 OPJ_CI_CXX=clang-3.8 OPJ_CI_INSTRUCTION_SETS="-mavx2" OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_SKIP_TESTS=1
addons:
apt:
sources:
- llvm-toolchain-precise-3.8
- ubuntu-toolchain-r-test
packages:
- clang-3.8

# Test multi-threading
- os: linux
compiler: g++
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_NUM_THREADS=2

# Test 32-bit compilation
- os: linux
compiler: g++
env: OPJ_CI_ARCH=i386 OPJ_CI_BUILD_CONFIGURATION=Release
Expand All @@ -30,16 +52,22 @@ matrix:
packages:
- gcc-multilib
- g++-multilib

# Profile code (gcc -pg)
- os: linux
compiler: g++
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Debug OPJ_CI_PROFILE=1
addons:
apt:
packages:
- valgrind

# Test under ASAN
- os: linux
compiler: clang
env: OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Debug OPJ_CI_ASAN=1

# Test with Clang 3.8
- os: linux
compiler: clang-3.8
env: OPJ_CI_CC=clang-3.8 OPJ_CI_CXX=clang-3.8 OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release OPJ_CI_PERF_TESTS=1
Expand All @@ -50,6 +78,8 @@ matrix:
- ubuntu-toolchain-r-test
packages:
- clang-3.8

# Test with mingw 32 bit
- os: linux
compiler: x86_64-w64-mingw32-g++
env: OPJ_CI_CC=x86_64-w64-mingw32-gcc OPJ_CI_CXX=x86_64-w64-mingw32-g++ OPJ_CI_ARCH=i386 OPJ_CI_BUILD_CONFIGURATION=Release
Expand All @@ -63,6 +93,8 @@ matrix:
- g++-mingw-w64-i686
- gcc-multilib
- g++-multilib

# Test with mingw 64 bit
- os: linux
compiler: x86_64-w64-mingw32-g++
env: OPJ_CI_CC=x86_64-w64-mingw32-gcc OPJ_CI_CXX=x86_64-w64-mingw32-g++ OPJ_CI_ARCH=x86_64 OPJ_CI_BUILD_CONFIGURATION=Release
Expand All @@ -74,6 +106,8 @@ matrix:
- gcc-mingw-w64-x86-64
- gcc-mingw-w64
- g++-mingw-w64-x86-64

# Test with gcc 4.8
- os: linux
compiler: g++-4.8
env: OPJ_CI_CC=gcc-4.8 OPJ_CI_CXX=g++-4.8 OPJ_CI_ABI_CHECK=1
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Expand Up @@ -253,6 +253,7 @@ if(BUILD_JPIP_SERVER)
endif()
add_subdirectory(src/lib)
option(BUILD_LUTS_GENERATOR "Build utility to generate t1_luts.h" OFF)
# Developer-only switch, OFF by default. When enabled, src/lib/openjp2 adds the
# standalone bench_dwt executable.
option(BUILD_BENCH_DWT "Build bench_dwt utility (development benchmark)" OFF)

#-----------------------------------------------------------------------------
# Build Applications
Expand Down
10 changes: 10 additions & 0 deletions src/lib/openjp2/CMakeLists.txt
Expand Up @@ -183,3 +183,13 @@ endif(OPJ_USE_THREAD AND NOT Threads_FOUND)
if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
TARGET_LINK_LIBRARIES(${OPENJPEG_LIBRARY_NAME} ${CMAKE_THREAD_LIBS_INIT})
endif(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)

# Optional development benchmark for the wavelet transform. It is built
# directly from the listed sources rather than linked against the library
# target.
if(BUILD_BENCH_DWT)
  add_executable(bench_dwt
    bench_dwt.c
    dwt.c
    opj_malloc.c
    thread.c
  )

  # Link the math library on Unix-like platforms.
  if(UNIX)
    target_link_libraries(bench_dwt m)
  endif()

  # Same condition as the thread linkage of the main library above.
  if(OPJ_USE_THREAD AND Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
    target_link_libraries(bench_dwt ${CMAKE_THREAD_LIBS_INIT})
  endif()
endif()
241 changes: 241 additions & 0 deletions src/lib/openjp2/bench_dwt.c
@@ -0,0 +1,241 @@
/*
* The copyright in this software is being made available under the 2-clauses
* BSD License, included below. This software may be subject to other third
* party and contributor rights, including patent rights, and no such rights
* are granted under this license.
*
* Copyright (c) 2017, IntoPix SA <contact@intopix.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

#include "opj_includes.h"

#ifdef _WIN32
#include <windows.h>
#else
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/times.h>
#endif /* _WIN32 */

/**
 * Deterministic test pattern used to fill and later verify the tile buffer.
 *
 * @param i sample index
 * @return ((OPJ_INT32)i % 511) - 256, i.e. a value in [-256, 254] for every
 *         index that stays non-negative after the cast to OPJ_INT32.
 */
OPJ_INT32 getValue(OPJ_UINT32 i)
{
    const OPJ_INT32 wrapped = (OPJ_INT32)i % 511;

    return wrapped - 256;
}

/**
 * Initialise a tile component for the benchmark.
 *
 * Zeroes the structure, stores the bounds [x0,x1) x [y0,y1), allocates the
 * sample buffer and fills it with getValue(index), then allocates the
 * resolutions array and computes the bounds of every resolution level.
 *
 * On invalid bounds or allocation failure an error is printed to stderr and
 * the process exits with status 1 (this is a standalone benchmark tool).
 *
 * @param l_tilec         tile component to initialise (overwritten)
 * @param x0              left bound
 * @param y0              top bound
 * @param x1              right bound, must be >= x0
 * @param y1              bottom bound, must be >= y0
 * @param numresolutions  number of resolution levels to describe
 */
void init_tilec(opj_tcd_tilecomp_t * l_tilec,
                OPJ_INT32 x0,
                OPJ_INT32 y0,
                OPJ_INT32 x1,
                OPJ_INT32 y1,
                OPJ_UINT32 numresolutions)
{
    opj_tcd_resolution_t* l_res;
    OPJ_UINT32 resno, l_level_no;
    size_t i, nValues, width, height;

    /* A negative extent would wrap to a huge size_t below. */
    if (x1 < x0 || y1 < y0) {
        fprintf(stderr, "Invalid tile bounds\n");
        exit(1);
    }

    memset(l_tilec, 0, sizeof(*l_tilec));
    l_tilec->x0 = x0;
    l_tilec->y0 = y0;
    l_tilec->x1 = x1;
    l_tilec->y1 = y1;
    width = (size_t)(l_tilec->x1 - l_tilec->x0);
    height = (size_t)(l_tilec->y1 - l_tilec->y0);

    /* Guard width * height * sizeof(OPJ_INT32) against size_t overflow. */
    if (height != 0 &&
            width > ((size_t) -1) / sizeof(OPJ_INT32) / height) {
        fprintf(stderr, "Tile dimensions too large\n");
        exit(1);
    }
    nValues = width * height;

    l_tilec->data = opj_malloc(sizeof(OPJ_INT32) * nValues);
    /* Only a non-empty request is expected to yield a non-NULL pointer. */
    if (nValues != 0 && l_tilec->data == NULL) {
        fprintf(stderr, "Cannot allocate tile data\n");
        exit(1);
    }
    for (i = 0; i < nValues; i++) {
        l_tilec->data[i] = getValue((OPJ_UINT32)i);
    }

    l_tilec->numresolutions = numresolutions;
    l_tilec->resolutions = opj_calloc(l_tilec->numresolutions,
                                      sizeof(opj_tcd_resolution_t));
    if (l_tilec->numresolutions != 0 && l_tilec->resolutions == NULL) {
        fprintf(stderr, "Cannot allocate resolutions\n");
        opj_free(l_tilec->data);
        l_tilec->data = NULL;
        exit(1);
    }

    l_level_no = l_tilec->numresolutions;
    l_res = l_tilec->resolutions;

    /* Adapted from opj_tcd_init_tile() */
    for (resno = 0; resno < l_tilec->numresolutions; ++resno) {

        /* Level numbers count down: the first entry gets the highest level. */
        --l_level_no;

        /* border for each resolution level (global) */
        l_res->x0 = opj_int_ceildivpow2(l_tilec->x0, (OPJ_INT32)l_level_no);
        l_res->y0 = opj_int_ceildivpow2(l_tilec->y0, (OPJ_INT32)l_level_no);
        l_res->x1 = opj_int_ceildivpow2(l_tilec->x1, (OPJ_INT32)l_level_no);
        l_res->y1 = opj_int_ceildivpow2(l_tilec->y1, (OPJ_INT32)l_level_no);

        ++l_res;
    }
}

/**
 * Release the buffers owned by a tile component set up by init_tilec():
 * the resolutions array and the sample data. The structure itself is not
 * freed.
 *
 * @param l_tilec tile component whose buffers are released
 */
void free_tilec(opj_tcd_tilecomp_t * l_tilec)
{
    opj_free(l_tilec->resolutions);
    opj_free(l_tilec->data);
}

/**
 * Print the command-line synopsis to stdout and terminate the process with
 * exit status 1. Never returns.
 */
void usage(void)
{
    static const char synopsis[] =
        "bench_dwt [-size value] [-check] [-display] [-num_resolutions val]\n"
        " [-offset x y] [-num_threads val]\n";

    fputs(synopsis, stdout);
    exit(1);
}


OPJ_FLOAT64 opj_clock(void)
{
#ifdef _WIN32
/* _WIN32: use QueryPerformance (very accurate) */
LARGE_INTEGER freq, t ;
/* freq is the clock speed of the CPU */
QueryPerformanceFrequency(&freq) ;
/* cout << "freq = " << ((double) freq.QuadPart) << endl; */
/* t is the high resolution performance counter (see MSDN) */
QueryPerformanceCounter(& t) ;
return freq.QuadPart ? (t.QuadPart / (OPJ_FLOAT64) freq.QuadPart) : 0 ;
#else
/* Unix or Linux: use resource usage */
struct rusage t;
OPJ_FLOAT64 procTime;
/* (1) Get the rusage data structure at this moment (man getrusage) */
getrusage(0, &t);
/* (2) What is the elapsed time ? - CPU time = User time + System time */
/* (2a) Get the seconds */
procTime = (OPJ_FLOAT64)(t.ru_utime.tv_sec + t.ru_stime.tv_sec);
/* (2b) More precisely! Get the microseconds part ! */
return (procTime + (OPJ_FLOAT64)(t.ru_utime.tv_usec + t.ru_stime.tv_usec) *
1e-6) ;
#endif
}

/**
 * Print every sample of a tile component to stdout, one tile row per line,
 * preceded by a title line.
 */
static void display_tilec(const char* title, const opj_tcd_tilecomp_t* tilec)
{
    const OPJ_INT32 width = tilec->x1 - tilec->x0;
    const OPJ_INT32 height = tilec->y1 - tilec->y0;
    OPJ_INT32 x, y;
    size_t k = 0;

    printf("%s\n", title);
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            printf("%d ", tilec->data[k]);
            k ++;
        }
        printf("\n");
    }
}

/**
 * bench_dwt entry point.
 *
 * Options:
 *   -size value           tile width and height (default 16383)
 *   -check                run the forward transform after the inverse one and
 *                         verify that the original samples are restored
 *   -display              print the samples at each stage (implies -check)
 *   -num_resolutions val  number of resolution levels, 1..32 (default 6)
 *   -offset x y           tile origin (default 8191 8191)
 *   -num_threads val      value passed to opj_thread_pool_create (default 0)
 *
 * @return 0 on success, 1 on invalid arguments or if the check fails
 */
int main(int argc, char** argv)
{
    int num_threads = 0;
    int ret = 0;
    opj_tcd_tilecomp_t tilec;
    opj_thread_pool_t* tp;
    OPJ_INT32 i;
    OPJ_BOOL display = OPJ_FALSE;
    OPJ_BOOL check = OPJ_FALSE;
    OPJ_INT32 size = 16384 - 1;
    OPJ_FLOAT64 start, stop;
    /* Default offsets are derived from the default size, not from -size. */
    OPJ_UINT32 offset_x = (size + 1) / 2 - 1;
    OPJ_UINT32 offset_y = (size + 1) / 2 - 1;
    OPJ_UINT32 num_resolutions = 6;

    for (i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-display") == 0) {
            display = OPJ_TRUE;
            check = OPJ_TRUE;
        } else if (strcmp(argv[i], "-check") == 0) {
            check = OPJ_TRUE;
        } else if (strcmp(argv[i], "-size") == 0 && i + 1 < argc) {
            size = atoi(argv[i + 1]);
            /* A non-positive size would give an empty or negative extent. */
            if (size <= 0) {
                fprintf(stderr,
                        "Invalid value for size. Should be >= 1\n");
                exit(1);
            }
            i ++;
        } else if (strcmp(argv[i], "-num_threads") == 0 && i + 1 < argc) {
            num_threads = atoi(argv[i + 1]);
            i ++;
        } else if (strcmp(argv[i], "-num_resolutions") == 0 && i + 1 < argc) {
            num_resolutions = atoi(argv[i + 1]);
            if (num_resolutions == 0 || num_resolutions > 32) {
                fprintf(stderr,
                        "Invalid value for num_resolutions. Should be >= 1 and <= 32\n");
                exit(1);
            }
            i ++;
        } else if (strcmp(argv[i], "-offset") == 0 && i + 2 < argc) {
            offset_x = atoi(argv[i + 1]);
            offset_y = atoi(argv[i + 2]);
            i += 2;
        } else {
            usage();
        }
    }

    tp = opj_thread_pool_create(num_threads);

    init_tilec(&tilec, offset_x, offset_y, offset_x + size, offset_y + size,
               num_resolutions);

    if (display) {
        display_tilec("Before", &tilec);
    }

    /* Only the inverse transform is timed. */
    start = opj_clock();
    opj_dwt_decode(tp, &tilec, tilec.numresolutions);
    stop = opj_clock();
    printf("time for dwt_decode: %.03f s\n", stop - start);

    if (display || check) {
        if (display) {
            display_tilec("After IDWT", &tilec);
        }

        /* Forward transform: expected to restore the initial samples. */
        opj_dwt_encode(&tilec);
        if (display) {
            display_tilec("After FDWT", &tilec);
        }

        if (check) {
            size_t idx;
            size_t nValues = (size_t)(tilec.x1 - tilec.x0) *
                             (size_t)(tilec.y1 - tilec.y0);
            /* Compare every sample against the pattern used at init time. */
            for (idx = 0; idx < nValues; idx++) {
                if (tilec.data[idx] != getValue((OPJ_UINT32)idx)) {
                    printf("Difference found at idx = %u\n", (OPJ_UINT32)idx);
                    ret = 1;
                    break;
                }
            }
        }
    }

    free_tilec(&tilec);

    opj_thread_pool_destroy(tp);
    return ret;
}