More tests for decompression
Add separate testing for bulk and row-by-row decompression, so that the
errors in one don't mask the errors in the other.

Also add fuzzing for row-by-row decompression, for text columns as well.
akuzm committed Dec 20, 2023
1 parent 1797f8e commit 4f2f658
Showing 71 changed files with 1,204 additions and 678 deletions.
207 changes: 168 additions & 39 deletions .github/workflows/libfuzzer.yaml
@@ -9,17 +9,14 @@ name: Libfuzzer
- prerelease_test
- trigger/libfuzzer
pull_request:
paths: .github/workflows/libfuzzer.yaml
paths:
- .github/workflows/libfuzzer.yaml
- 'tsl/test/fuzzing/compression/**'

jobs:
fuzz:
strategy:
fail-fast: false
matrix:
case: [ { algo: gorilla, type: float8 }, { algo: deltadelta, type: int8 } ]

name: Fuzz decompression ${{ matrix.case.algo }} ${{ matrix.case.type }}
build:
runs-on: ubuntu-22.04
name: Build PostgreSQL and TimescaleDB
env:
PG_SRC_DIR: pgbuild
PG_INSTALL_DIR: postgresql
@@ -30,7 +27,7 @@ jobs:
# Don't add ddebs here because the ddebs mirror is always 503 Service Unavailable.
# If needed, install them before opening the core dump.
sudo apt-get update
sudo apt-get install clang lld llvm flex bison lcov systemd-coredump gdb libipc-run-perl \
sudo apt-get install 7zip clang lld llvm flex bison libipc-run-perl \
libtest-most-perl tree
- name: Checkout TimescaleDB
@@ -68,7 +65,7 @@ jobs:
CC=clang ./configure --prefix=$HOME/$PG_INSTALL_DIR --with-openssl \
--without-readline --without-zlib --without-libxml --enable-cassert \
--enable-debug CC=clang \
CFLAGS="-DTS_COMPRESSION_FUZZING=1 -fuse-ld=lld -ggdb3 -Og -fno-omit-frame-pointer"
CFLAGS="-DTS_COMPRESSION_FUZZING=1 -fuse-ld=lld -ggdb3 -O2 -fno-omit-frame-pointer"
make -j$(nproc)
- name: Install PostgreSQL
@@ -89,13 +86,68 @@ jobs:
export LIBFUZZER_PATH=$(dirname "$(find $(llvm-config --libdir) -name libclang_rt.fuzzer_no_main-x86_64.a | head -1)")
# Some pointers for the next time we have linking/undefined symbol problems:
# http://web.archive.org/web/20200926071757/https://github.com/google/sanitizers/issues/111
# http://web.archive.org/web/20231101091231/https://github.com/cms-sw/cmssw/issues/40680
cmake -B build -S . -DASSERTIONS=ON -DLINTER=OFF -DCMAKE_VERBOSE_MAKEFILE=1 \
-DWARNINGS_AS_ERRORS=1 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_COMPILER=clang \
-DWARNINGS_AS_ERRORS=1 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=clang \
-DCMAKE_C_FLAGS="-fsanitize=fuzzer-no-link -lstdc++ -L$LIBFUZZER_PATH -l:libclang_rt.fuzzer_no_main-x86_64.a -static-libsan" \
-DPG_PATH=$HOME/$PG_INSTALL_DIR
make -C build -j$(nproc) install
# Incredibly, the upload-artifact action can't preserve executable permissions:
# https://github.com/actions/upload-artifact/issues/38
# It's also extremely slow.
- name: Compress the installation directory
run: 7z a install.7z $HOME/$PG_INSTALL_DIR

- name: Save the installation directory
uses: actions/upload-artifact@v3
with:
name: fuzzing-install-dir
path: install.7z
if-no-files-found: error
retention-days: 1

fuzz:
needs: build
strategy:
fail-fast: false
matrix:
case: [
{ algo: gorilla , pgtype: float8, bulk: false, runs: 500000000 },
{ algo: deltadelta, pgtype: int8 , bulk: false, runs: 500000000 },
{ algo: gorilla , pgtype: float8, bulk: true , runs: 1000000000 },
{ algo: deltadelta, pgtype: int8 , bulk: true , runs: 1000000000 },
# array has a peculiar recv function that recompresses all input, so
# fuzzing it is much slower. The dictionary recv also uses it.
{ algo: array , pgtype: text , bulk: false, runs: 10000000 },
{ algo: dictionary, pgtype: text , bulk: false, runs: 100000000 },
]

name: Fuzz decompression ${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}
runs-on: ubuntu-22.04
env:
PG_SRC_DIR: pgbuild
PG_INSTALL_DIR: postgresql

steps:
- name: Install Linux dependencies
run: sudo apt install 7zip systemd-coredump gdb

- name: Checkout TimescaleDB
uses: actions/checkout@v3

- name: Download the installation directory
uses: actions/download-artifact@v3
with:
name: fuzzing-install-dir

- name: Unpack the installation directory
run: 7z x -o$HOME install.7z

- name: initdb
run: |
# Have to do this before initializing the corpus, or initdb will complain.
@@ -108,24 +160,37 @@ jobs:
initdb
echo "shared_preload_libraries = 'timescaledb'" >> $PGDATA/postgresql.conf
- name: Restore the cached fuzzing corpus
id: restore-corpus-cache
- name: Set configuration
id: config
run: |
set -x
echo "cache_prefix=${{ format('libfuzzer-corpus-2-{0}-{1}', matrix.case.algo, matrix.case.pgtype) }}" >> $GITHUB_OUTPUT
echo "name=${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}" >> $GITHUB_OUTPUT
- name: Restore the cached fuzzing corpus (bulk)
id: restore-corpus-cache-bulk
uses: actions/cache/restore@v3
with:
path: db/corpus
# If the initial corpus changes, probably it was updated by hand with
# some important examples, and it makes sense to start anew from it.
key: "libfuzzer-corpus-2-${{ matrix.case.algo }}-${{ matrix.case.type }}-\
${{ hashFiles(format('tsl/test/fuzzing/compression/{0}-{1}', matrix.case.algo, matrix.case.type)) }}"
path: db/corpus-bulk
key: "${{ steps.config.outputs.cache_prefix }}-bulk"

# We save the row-by-row corpus separately from the bulk corpus, so that
# they don't overwrite each other. Now we are going to combine them.
- name: Restore the cached fuzzing corpus (rowbyrow)
id: restore-corpus-cache-rowbyrow
uses: actions/cache/restore@v3
with:
path: db/corpus-rowbyrow
key: "${{ steps.config.outputs.cache_prefix }}-rowbyrow"

- name: Initialize the fuzzing corpus
# cache-hit is only true for exact key matches, and we use prefix matches.
if: steps.restore-corpus-cache.outputs.cache-matched-key == ''
run: |
# Copy the initial corpus files from the repository. The github actions
# cache doesn't follow symlinks.
mkdir -p db/corpus
find "tsl/test/fuzzing/compression/${{ matrix.case.algo }}-${{ matrix.case.type }}" -type f -exec cp -t db/corpus {} +
# Combine the cached corpus from rowbyrow and bulk fuzzing, and from repository.
mkdir -p db/corpus{,-rowbyrow,-bulk}
find "tsl/test/fuzzing/compression/${{ matrix.case.algo }}-${{ matrix.case.pgtype }}" -type f -exec cp -n -t db/corpus {} +
find "db/corpus-rowbyrow" -type f -exec cp -n -t db/corpus {} +
find "db/corpus-bulk" -type f -exec cp -n -t db/corpus {} +
ls db/corpus | wc -l
- name: Run libfuzzer for compression
run: |
@@ -135,25 +200,36 @@ jobs:
export PGPORT=5432
export PGDATABASE=postgres
export PATH=$HOME/$PG_INSTALL_DIR/bin:$PATH
pg_ctl -l postmaster.log start
pg_ctl -l postgres.log start
psql -c "create extension timescaledb;"
# Create the fuzzing function
# Create the fuzzing functions
export MODULE_NAME=$(basename $(find $HOME/$PG_INSTALL_DIR -name "timescaledb-tsl-*.so"))
psql -a -c "create or replace function fuzz(algo cstring, type regtype, runs int) returns int as '"$MODULE_NAME"', 'ts_fuzz_compression' language c;"
psql -a -c "create or replace function fuzz(algo cstring, pgtype regtype,
bulk bool, runs int)
returns int as '"$MODULE_NAME"', 'ts_fuzz_compression' language c;
create or replace function ts_read_compressed_data_directory(algo cstring,
pgtype regtype, path cstring, bulk bool)
returns table(path text, bytes int, rows int, sqlstate text, location text)
as '"$MODULE_NAME"', 'ts_read_compressed_data_directory' language c;
"
# Start more fuzzing processes in the background. We won't even monitor
# their progress, because the server will panic if they find an error.
for x in {2..$(nproc)}
do
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}', '${{ matrix.case.type }}', 100000000);" &
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}',
'${{ matrix.case.pgtype }}', '${{ matrix.case.bulk }}', ${{ matrix.case.runs }});" &
done
# Start the one fuzzing process that we will monitor, in foreground.
# The LLVM fuzzing driver calls exit(), so we expect to lose the connection.
ret=0
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}', '${{ matrix.case.type }}', 100000000);" || ret=$?
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}',
'${{ matrix.case.pgtype }}', '${{ matrix.case.bulk }}', ${{ matrix.case.runs }});" || ret=$?
if ! [ $ret -eq 2 ]
then
>&2 echo "Unexpected psql exit code $ret"
@@ -163,11 +239,38 @@ jobs:
# Check that the server is still alive.
psql -c "select 1"
ls db/corpus | wc -l
fn="ts_read_compressed_data_directory('${{ matrix.case.algo }}',
'${{ matrix.case.pgtype }}', 'corpus', '${{ matrix.case.bulk }}')"
# Show the statistics about fuzzing corpus
psql -c "select count(*), location, min(sqlstate), min(path)
from $fn
group by location order by count(*) desc
"
# Save interesting cases because the caches are not available for download from UI
mkdir -p interesting
psql -qtAX -c "select distinct on (location) 'db/' || path from $fn
order by location, bytes
" | xargs cp -t interesting
# Check that we don't have any internal errors
errors=$(psql -qtAX --set=ON_ERROR_STOP=1 -c "select count(*)
from $fn
where sqlstate = 'XX000'")
echo "Internal program errors: $errors"
[ $errors -eq 0 ] || exit 1
# Shouldn't have any WARNINGS in the log.
! grep -F "] WARNING: " postgres.log
- name: Collect the logs
if: always()
id: collectlogs
run: |
find . -name postmaster.log -exec cat {} + > postgres.log
# wait in case there are in-progress coredumps
sleep 10
if coredumpctl -q list >/dev/null; then echo "coredumps=true" >>$GITHUB_OUTPUT; fi
@@ -178,26 +281,52 @@ jobs:
if: always()
uses: actions/upload-artifact@v3
with:
name: PostgreSQL log for ${{ matrix.case.algo }} ${{ matrix.case.type }}
name: PostgreSQL log for ${{ steps.config.outputs.name }}
path: postgres.log

- name: Save fuzzer-generated crash cases
if: always()
uses: actions/upload-artifact@v3
with:
name: Crash cases for ${{ matrix.case.algo }} ${{ matrix.case.type }}
name: Crash cases for ${{ steps.config.outputs.name }}
path: db/crash-*

- name: Save interesting cases
if: always()
uses: actions/upload-artifact@v3
with:
name: Interesting cases for ${{ steps.config.outputs.name }}
path: interesting/

# We use separate restore/save actions, because the default action won't
# save the updated folder after the cache hit. We also can't overwrite the
# existing cache, so we add a unique suffix. The cache is matched by key
# prefix, not exact key, and picks the newest matching item, so this works.
# save the updated folder after the cache hit. We also want to save the
# cache after fuzzing errors, and the default action doesn't save after
# errors.
# We can't overwrite the existing cache, so we add a unique suffix. The
# cache is matched by key prefix, not exact key, and picks the newest
# matching item, so this works.
# The caches for rowbyrow and bulk fuzzing are saved separately, otherwise
# the slower job would always overwrite the cache from the faster one. We
# want to combine corpuses from bulk and rowbyrow fuzzing for better
# coverage.
# Note that the cache action cannot be restored on a path different from the
# one it was saved from. To make our lives more interesting, it is not
# directly documented anywhere, but we can deduce it from path influencing
# the version.
- name: Change corpus path to please the 'actions/cache' GitHub Action
if: always()
run: |
rm -rf db/corpus-{bulk,rowbyrow} ||:
mv -fT db/corpus{,-${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}}
- name: Save fuzzer corpus
if: always()
uses: actions/cache/save@v3
with:
path: db/corpus
key: "${{ format('{0}-{1}-{2}',
steps.restore-corpus-cache.outputs.cache-primary-key,
path: db/corpus-${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}
key: "${{ format('{0}-{1}-{2}-{3}',
steps.config.outputs.cache_prefix,
matrix.case.bulk && 'bulk' || 'rowbyrow',
github.run_id, github.run_attempt) }}"

- name: Stack trace
@@ -224,5 +353,5 @@ jobs:
if: always() && steps.collectlogs.outputs.coredumps == 'true'
uses: actions/upload-artifact@v3
with:
name: Coredumps for ${{ matrix.case.algo }} ${{ matrix.case.type }}
name: Coredumps for ${{ steps.config.outputs.name }}
path: coredumps
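
The workflow compiles with -DTS_COMPRESSION_FUZZING=1 and links libclang_rt.fuzzer_no_main-x86_64.a, so the fuzzing loop runs in-process inside a PostgreSQL backend and is started from SQL; as noted in the run step, the LLVM fuzzing driver calls exit() when it finishes, which is why psql is expected to lose its connection. As a rough, hedged sketch (not the actual ts_fuzz_compression implementation): libFuzzer's no-main build exposes LLVMFuzzerRunDriver, which an ordinary C function can call; decompress_one_input() below is a hypothetical stand-in for the algorithm- and mode-specific decompression.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Provided by libclang_rt.fuzzer_no_main: runs the fuzzing loop over the
     * corpus directory named in argv and calls exit() when it is done. */
    int LLVMFuzzerRunDriver(int *argc, char ***argv,
                            int (*callback)(const uint8_t *data, size_t size));

    /* Hypothetical stand-in for decompressing one fuzzer input either
     * row-by-row or in bulk; errors must be caught inside (e.g. with PG_TRY)
     * so that a merely corrupt input is not reported as a crash. */
    extern int decompress_one_input(const uint8_t *data, size_t size, bool bulk);

    static bool fuzz_bulk;

    static int
    fuzz_target(const uint8_t *data, size_t size)
    {
        (void) decompress_one_input(data, size, fuzz_bulk);
        return 0; /* 0 tells libFuzzer the input was processed normally */
    }

    /* What a SQL-callable wrapper could boil down to: a call like
     * fuzz('gorilla', 'float8', false, 500000000) would end up here with the
     * corresponding corpus directory, bulk flag, and run count. */
    static int
    run_compression_fuzzer(const char *corpus_dir, bool bulk, int runs)
    {
        char runs_arg[32];
        snprintf(runs_arg, sizeof(runs_arg), "-runs=%d", runs);
        char *argv[] = { "fuzzer", runs_arg, (char *) corpus_dir };
        char **argvp = argv;
        int argc = 3;

        fuzz_bulk = bulk;
        /* Does not return normally: the driver exits the process when done. */
        return LLVMFuzzerRunDriver(&argc, &argvp, fuzz_target);
    }
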
6 changes: 2 additions & 4 deletions src/adts/bit_array_impl.h
@@ -347,8 +347,6 @@ bit_array_append_bucket(BitArray *array, uint8 bits_used, uint64 bucket)
static uint64
bit_array_low_bits_mask(uint8 bits_used)
{
if (bits_used >= 64)
return PG_UINT64_MAX;
else
return (UINT64CONST(1) << bits_used) - UINT64CONST(1);
Assert(bits_used > 0);
return -1ULL >> (64 - bits_used);
}
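
A side note on the bit_array_low_bits_mask() rewrite above: shifting a 64-bit value by 64 is undefined behaviour in C, which is why the old code special-cased bits_used >= 64, while the new version instead requires bits_used >= 1 and computes the mask with a single right shift. A minimal standalone check (not part of the commit) that the two formulations agree over the whole valid range:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    low_bits_mask_old(unsigned bits_used)
    {
        if (bits_used >= 64)
            return UINT64_MAX;
        return (UINT64_C(1) << bits_used) - UINT64_C(1);
    }

    static uint64_t
    low_bits_mask_new(unsigned bits_used)
    {
        assert(bits_used > 0); /* -1ULL >> 64 would be undefined, just like 1 << 64 */
        return UINT64_MAX >> (64 - bits_used);
    }

    int
    main(void)
    {
        /* The branch-free form matches the old one for every bits_used in 1..64. */
        for (unsigned bits = 1; bits <= 64; bits++)
            assert(low_bits_mask_old(bits) == low_bits_mask_new(bits));
        return 0;
    }
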
2 changes: 2 additions & 0 deletions tsl/src/compression/CMakeLists.txt
@@ -2,6 +2,8 @@ set(SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/api.c
${CMAKE_CURRENT_SOURCE_DIR}/array.c
${CMAKE_CURRENT_SOURCE_DIR}/compression.c
${CMAKE_CURRENT_SOURCE_DIR}/compression_test.c
${CMAKE_CURRENT_SOURCE_DIR}/decompress_text_test_impl.c
${CMAKE_CURRENT_SOURCE_DIR}/create.c
${CMAKE_CURRENT_SOURCE_DIR}/datum_serialize.c
${CMAKE_CURRENT_SOURCE_DIR}/deltadelta.c
13 changes: 8 additions & 5 deletions tsl/src/compression/array.c
@@ -240,6 +240,7 @@ array_compression_serialization_size(ArrayCompressorSerializationInfo *info)
uint32
array_compression_serialization_num_elements(ArrayCompressorSerializationInfo *info)
{
CheckCompressedData(info->sizes != NULL);
return info->sizes->num_elements;
}

@@ -405,12 +406,12 @@ array_decompression_iterator_try_next_forward(DecompressionIterator *general_ite
.is_done = true,
};

Assert(iter->data_offset + datum_size.val <= iter->num_data_bytes);
CheckCompressedData(iter->data_offset + datum_size.val <= iter->num_data_bytes);

start_pointer = iter->data + iter->data_offset;
val = bytes_to_datum_and_advance(iter->deserializer, &start_pointer);
iter->data_offset += datum_size.val;
Assert(iter->data + iter->data_offset == start_pointer);
CheckCompressedData(iter->data + iter->data_offset == start_pointer);

return (DecompressResult){
.val = val,
@@ -602,7 +603,6 @@ array_compressed_data_send(StringInfo buffer, const char *_serialized_data, Size
Datum
array_compressed_recv(StringInfo buffer)
{
ArrayCompressorSerializationInfo *data;
uint8 has_nulls;
Oid element_type;

@@ -611,9 +611,12 @@ array_compressed_recv(StringInfo buffer)

element_type = binary_string_get_type(buffer);

data = array_compressed_data_recv(buffer, element_type);
ArrayCompressorSerializationInfo *info = array_compressed_data_recv(buffer, element_type);

PG_RETURN_POINTER(array_compressed_from_serialization_info(data, element_type));
CheckCompressedData(info->sizes != NULL);
CheckCompressedData(has_nulls == (info->nulls != NULL));

PG_RETURN_POINTER(array_compressed_from_serialization_info(info, element_type));
}

void
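
The recurring change in this file is replacing Assert with CheckCompressedData for conditions that depend on bytes arriving over the wire: an Assert is compiled out of non-assert builds and takes the whole backend down in assert builds, while corrupt compressed input has to be rejected with an ordinary error so the fuzzer can keep running (the workflow above only treats SQLSTATE XX000 internal errors as failures). A hedged sketch of what such a check might expand to, as an assumption for illustration rather than the actual TimescaleDB macro:

    #include <postgres.h>

    /* Illustrative only: report corrupt compressed input as a regular
     * data_corrupted error (SQLSTATE XX001) instead of asserting. */
    #define ExampleCheckCompressedData(X)                                     \
        do                                                                    \
        {                                                                     \
            if (unlikely(!(X)))                                               \
                ereport(ERROR,                                                \
                        (errcode(ERRCODE_DATA_CORRUPTED),                     \
                         errmsg("the compressed data is corrupt")));          \
        } while (0)

    /* Usage, mirroring the checks added above:
     * ExampleCheckCompressedData(info->sizes != NULL); */
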