More tests for decompression
Add separate testing for bulk and row-by-row decompression, so that the
errors in one don't mask the errors in the other.

Also add fuzzing for row-by-row decompression, for text columns as well.
akuzm committed Dec 20, 2023
1 parent 1797f8e commit 4f2f658
Showing 71 changed files with 1,204 additions and 678 deletions.
207 changes: 168 additions & 39 deletions .github/workflows/libfuzzer.yaml
@@ -9,17 +9,14 @@ name: Libfuzzer
- prerelease_test
- trigger/libfuzzer
pull_request:
paths: .github/workflows/libfuzzer.yaml
paths:
- .github/workflows/libfuzzer.yaml
- 'tsl/test/fuzzing/compression/**'

jobs:
fuzz:
strategy:
fail-fast: false
matrix:
case: [ { algo: gorilla, type: float8 }, { algo: deltadelta, type: int8 } ]

name: Fuzz decompression ${{ matrix.case.algo }} ${{ matrix.case.type }}
build:
runs-on: ubuntu-22.04
name: Build PostgreSQL and TimescaleDB
env:
PG_SRC_DIR: pgbuild
PG_INSTALL_DIR: postgresql
@@ -30,7 +27,7 @@ jobs:
# Don't add ddebs here because the ddebs mirror is always 503 Service Unavailable.
# If needed, install them before opening the core dump.
sudo apt-get update
sudo apt-get install clang lld llvm flex bison lcov systemd-coredump gdb libipc-run-perl \
sudo apt-get install 7zip clang lld llvm flex bison libipc-run-perl \
libtest-most-perl tree
- name: Checkout TimescaleDB
@@ -68,7 +65,7 @@ jobs:
CC=clang ./configure --prefix=$HOME/$PG_INSTALL_DIR --with-openssl \
--without-readline --without-zlib --without-libxml --enable-cassert \
--enable-debug CC=clang \
CFLAGS="-DTS_COMPRESSION_FUZZING=1 -fuse-ld=lld -ggdb3 -Og -fno-omit-frame-pointer"
CFLAGS="-DTS_COMPRESSION_FUZZING=1 -fuse-ld=lld -ggdb3 -O2 -fno-omit-frame-pointer"
make -j$(nproc)
- name: Install PostgreSQL
@@ -89,13 +86,68 @@ jobs:
export LIBFUZZER_PATH=$(dirname "$(find $(llvm-config --libdir) -name libclang_rt.fuzzer_no_main-x86_64.a | head -1)")
# Some pointers for the next time we have linking/undefined symbol problems:
# http://web.archive.org/web/20200926071757/https://github.com/google/sanitizers/issues/111
# http://web.archive.org/web/20231101091231/https://github.com/cms-sw/cmssw/issues/40680
cmake -B build -S . -DASSERTIONS=ON -DLINTER=OFF -DCMAKE_VERBOSE_MAKEFILE=1 \
-DWARNINGS_AS_ERRORS=1 -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_COMPILER=clang \
-DWARNINGS_AS_ERRORS=1 -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=clang \
-DCMAKE_C_FLAGS="-fsanitize=fuzzer-no-link -lstdc++ -L$LIBFUZZER_PATH -l:libclang_rt.fuzzer_no_main-x86_64.a -static-libsan" \
-DPG_PATH=$HOME/$PG_INSTALL_DIR
make -C build -j$(nproc) install
# Incredibly, the upload-artifact action can't preserve executable permissions:
# https://github.com/actions/upload-artifact/issues/38
# It's also extremely slow.
- name: Compress the installation directory
run: 7z a install.7z $HOME/$PG_INSTALL_DIR

- name: Save the installation directory
uses: actions/upload-artifact@v3
with:
name: fuzzing-install-dir
path: install.7z
if-no-files-found: error
retention-days: 1

fuzz:
needs: build
strategy:
fail-fast: false
matrix:
case: [
{ algo: gorilla , pgtype: float8, bulk: false, runs: 500000000 },
{ algo: deltadelta, pgtype: int8 , bulk: false, runs: 500000000 },
{ algo: gorilla , pgtype: float8, bulk: true , runs: 1000000000 },
{ algo: deltadelta, pgtype: int8 , bulk: true , runs: 1000000000 },
# array has a peculiar recv function that recompresses all input, so
# fuzzing it is much slower. The dictionary recv also uses it.
{ algo: array , pgtype: text , bulk: false, runs: 10000000 },
{ algo: dictionary, pgtype: text , bulk: false, runs: 100000000 },
]

name: Fuzz decompression ${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}
runs-on: ubuntu-22.04
env:
PG_SRC_DIR: pgbuild
PG_INSTALL_DIR: postgresql

steps:
- name: Install Linux dependencies
run: sudo apt install 7zip systemd-coredump gdb

- name: Checkout TimescaleDB
uses: actions/checkout@v3

- name: Download the installation directory
uses: actions/download-artifact@v3
with:
name: fuzzing-install-dir

- name: Unpack the installation directory
run: 7z x -o$HOME install.7z

- name: initdb
run: |
# Have to do this before initializing the corpus, or initdb will complain.
@@ -108,24 +160,37 @@ jobs:
initdb
echo "shared_preload_libraries = 'timescaledb'" >> $PGDATA/postgresql.conf
- name: Restore the cached fuzzing corpus
id: restore-corpus-cache
- name: Set configuration
id: config
run: |
set -x
echo "cache_prefix=${{ format('libfuzzer-corpus-2-{0}-{1}', matrix.case.algo, matrix.case.pgtype) }}" >> $GITHUB_OUTPUT
echo "name=${{ matrix.case.algo }} ${{ matrix.case.pgtype }} ${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}" >> $GITHUB_OUTPUT
- name: Restore the cached fuzzing corpus (bulk)
id: restore-corpus-cache-bulk
uses: actions/cache/restore@v3
with:
path: db/corpus
# If the initial corpus changes, probably it was updated by hand with
# some important examples, and it makes sense to start anew from it.
key: "libfuzzer-corpus-2-${{ matrix.case.algo }}-${{ matrix.case.type }}-\
${{ hashFiles(format('tsl/test/fuzzing/compression/{0}-{1}', matrix.case.algo, matrix.case.type)) }}"
path: db/corpus-bulk
key: "${{ steps.config.outputs.cache_prefix }}-bulk"

# We save the row-by-row corpus separately from the bulk corpus, so that
# they don't overwrite each other. Now we are going to combine them.
- name: Restore the cached fuzzing corpus (rowbyrow)
id: restore-corpus-cache-rowbyrow
uses: actions/cache/restore@v3
with:
path: db/corpus-rowbyrow
key: "${{ steps.config.outputs.cache_prefix }}-rowbyrow"

- name: Initialize the fuzzing corpus
# cache-hit is only true for exact key matches, and we use prefix matches.
if: steps.restore-corpus-cache.outputs.cache-matched-key == ''
run: |
# Copy the initial corpus files from the repository. The github actions
# cache doesn't follow symlinks.
mkdir -p db/corpus
find "tsl/test/fuzzing/compression/${{ matrix.case.algo }}-${{ matrix.case.type }}" -type f -exec cp -t db/corpus {} +
# Combine the cached corpus from rowbyrow and bulk fuzzing, and from repository.
mkdir -p db/corpus{,-rowbyrow,-bulk}
find "tsl/test/fuzzing/compression/${{ matrix.case.algo }}-${{ matrix.case.pgtype }}" -type f -exec cp -n -t db/corpus {} +
find "db/corpus-rowbyrow" -type f -exec cp -n -t db/corpus {} +
find "db/corpus-bulk" -type f -exec cp -n -t db/corpus {} +
ls db/corpus | wc -l
- name: Run libfuzzer for compression
run: |
@@ -135,25 +200,36 @@ jobs:
export PGPORT=5432
export PGDATABASE=postgres
export PATH=$HOME/$PG_INSTALL_DIR/bin:$PATH
pg_ctl -l postmaster.log start
pg_ctl -l postgres.log start
psql -c "create extension timescaledb;"
# Create the fuzzing function
# Create the fuzzing functions
export MODULE_NAME=$(basename $(find $HOME/$PG_INSTALL_DIR -name "timescaledb-tsl-*.so"))
psql -a -c "create or replace function fuzz(algo cstring, type regtype, runs int) returns int as '"$MODULE_NAME"', 'ts_fuzz_compression' language c;"
psql -a -c "create or replace function fuzz(algo cstring, pgtype regtype,
bulk bool, runs int)
returns int as '"$MODULE_NAME"', 'ts_fuzz_compression' language c;
create or replace function ts_read_compressed_data_directory(algo cstring,
pgtype regtype, path cstring, bulk bool)
returns table(path text, bytes int, rows int, sqlstate text, location text)
as '"$MODULE_NAME"', 'ts_read_compressed_data_directory' language c;
"
# Start more fuzzing processes in the background. We won't even monitor
# their progress, because the server will panic if they find an error.
for x in {2..$(nproc)}
do
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}', '${{ matrix.case.type }}', 100000000);" &
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}',
'${{ matrix.case.pgtype }}', '${{ matrix.case.bulk }}', ${{ matrix.case.runs }});" &
done
# Start the one fuzzing process that we will monitor, in foreground.
# The LLVM fuzzing driver calls exit(), so we expect to lose the connection.
ret=0
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}', '${{ matrix.case.type }}', 100000000);" || ret=$?
psql -v ON_ERROR_STOP=1 -c "select fuzz('${{ matrix.case.algo }}',
'${{ matrix.case.pgtype }}', '${{ matrix.case.bulk }}', ${{ matrix.case.runs }});" || ret=$?
if ! [ $ret -eq 2 ]
then
>&2 echo "Unexpected psql exit code $ret"
@@ -163,11 +239,38 @@ jobs:
# Check that the server is still alive.
psql -c "select 1"
ls db/corpus | wc -l
fn="ts_read_compressed_data_directory('${{ matrix.case.algo }}',
'${{ matrix.case.pgtype }}', 'corpus', '${{ matrix.case.bulk }}')"
# Show the statistics about fuzzing corpus
psql -c "select count(*), location, min(sqlstate), min(path)
from $fn
group by location order by count(*) desc
"
# Save interesting cases because the caches are not available for download from UI
mkdir -p interesting
psql -qtAX -c "select distinct on (location) 'db/' || path from $fn
order by location, bytes
" | xargs cp -t interesting
# Check that we don't have any internal errors
errors=$(psql -qtAX --set=ON_ERROR_STOP=1 -c "select count(*)
from $fn
where sqlstate = 'XX000'")
echo "Internal program errors: $errors"
[ $errors -eq 0 ] || exit 1
# Shouldn't have any WARNINGS in the log.
! grep -F "] WARNING: " postgres.log
- name: Collect the logs
if: always()
id: collectlogs
run: |
find . -name postmaster.log -exec cat {} + > postgres.log
# wait in case there are in-progress coredumps
sleep 10
if coredumpctl -q list >/dev/null; then echo "coredumps=true" >>$GITHUB_OUTPUT; fi
@@ -178,26 +281,52 @@ jobs:
if: always()
uses: actions/upload-artifact@v3
with:
name: PostgreSQL log for ${{ matrix.case.algo }} ${{ matrix.case.type }}
name: PostgreSQL log for ${{ steps.config.outputs.name }}
path: postgres.log

- name: Save fuzzer-generated crash cases
if: always()
uses: actions/upload-artifact@v3
with:
name: Crash cases for ${{ matrix.case.algo }} ${{ matrix.case.type }}
name: Crash cases for ${{ steps.config.outputs.name }}
path: db/crash-*

- name: Save interesting cases
if: always()
uses: actions/upload-artifact@v3
with:
name: Interesting cases for ${{ steps.config.outputs.name }}
path: interesting/

# We use separate restore/save actions, because the default action won't
# save the updated folder after the cache hit. We also can't overwrite the
# existing cache, so we add a unique suffix. The cache is matched by key
# prefix, not exact key, and picks the newest matching item, so this works.
# save the updated folder after the cache hit. We also want to save the
# cache after fuzzing errors, and the default action doesn't save after
# errors.
# We can't overwrite the existing cache, so we add a unique suffix. The
# cache is matched by key prefix, not exact key, and picks the newest
# matching item, so this works.
# The caches for rowbyrow and bulk fuzzing are saved separately, otherwise
# the slower job would always overwrite the cache from the faster one. We
# want to combine corpuses from bulk and rowbyrow fuzzing for better
# coverage.
# Note that the cache action cannot be restored on a path different from the
# one it was saved from. To make our lives more interesting, it is not
# directly documented anywhere, but we can deduce it from path influencing
# the version.
- name: Change corpus path to please the 'actions/cache' GitHub Action
if: always()
run: |
rm -rf db/corpus-{bulk,rowbyrow} ||:
mv -fT db/corpus{,-${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}}
- name: Save fuzzer corpus
if: always()
uses: actions/cache/save@v3
with:
path: db/corpus
key: "${{ format('{0}-{1}-{2}',
steps.restore-corpus-cache.outputs.cache-primary-key,
path: db/corpus-${{ matrix.case.bulk && 'bulk' || 'rowbyrow' }}
key: "${{ format('{0}-{1}-{2}-{3}',
steps.config.outputs.cache_prefix,
matrix.case.bulk && 'bulk' || 'rowbyrow',
github.run_id, github.run_attempt) }}"

- name: Stack trace
@@ -224,5 +353,5 @@ jobs:
if: always() && steps.collectlogs.outputs.coredumps == 'true'
uses: actions/upload-artifact@v3
with:
name: Coredumps for ${{ matrix.case.algo }} ${{ matrix.case.type }}
name: Coredumps for ${{ steps.config.outputs.name }}
path: coredumps
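
The workflow compiles with -DTS_COMPRESSION_FUZZING=1 and links libclang_rt.fuzzer_no_main-x86_64.a, so the fuzzing loop runs in-process inside a PostgreSQL backend and is started from SQL; as noted in the run step, the LLVM fuzzing driver calls exit() when it finishes, which is why psql is expected to lose its connection. As a rough, hedged sketch (not the actual ts_fuzz_compression implementation): libFuzzer's no-main build exposes LLVMFuzzerRunDriver, which an ordinary C function can call; decompress_one_input() below is a hypothetical stand-in for the algorithm- and mode-specific decompression.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Provided by libclang_rt.fuzzer_no_main: runs the fuzzing loop over the
     * corpus directory named in argv and calls exit() when it is done. */
    int LLVMFuzzerRunDriver(int *argc, char ***argv,
                            int (*callback)(const uint8_t *data, size_t size));

    /* Hypothetical stand-in for decompressing one fuzzer input either
     * row-by-row or in bulk; errors must be caught inside (e.g. with PG_TRY)
     * so that a merely corrupt input is not reported as a crash. */
    extern int decompress_one_input(const uint8_t *data, size_t size, bool bulk);

    static bool fuzz_bulk;

    static int
    fuzz_target(const uint8_t *data, size_t size)
    {
        (void) decompress_one_input(data, size, fuzz_bulk);
        return 0; /* 0 tells libFuzzer the input was processed normally */
    }

    /* What a SQL-callable wrapper could boil down to: a call like
     * fuzz('gorilla', 'float8', false, 500000000) would end up here with the
     * corresponding corpus directory, bulk flag, and run count. */
    static int
    run_compression_fuzzer(const char *corpus_dir, bool bulk, int runs)
    {
        char runs_arg[32];
        snprintf(runs_arg, sizeof(runs_arg), "-runs=%d", runs);
        char *argv[] = { "fuzzer", runs_arg, (char *) corpus_dir };
        char **argvp = argv;
        int argc = 3;

        fuzz_bulk = bulk;
        /* Does not return normally: the driver exits the process when done. */
        return LLVMFuzzerRunDriver(&argc, &argvp, fuzz_target);
    }
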
6 changes: 2 additions & 4 deletions src/adts/bit_array_impl.h
@@ -347,8 +347,6 @@ bit_array_append_bucket(BitArray *array, uint8 bits_used, uint64 bucket)
static uint64
bit_array_low_bits_mask(uint8 bits_used)
{
if (bits_used >= 64)
return PG_UINT64_MAX;
else
return (UINT64CONST(1) << bits_used) - UINT64CONST(1);
Assert(bits_used > 0);
return -1ULL >> (64 - bits_used);
}
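
A side note on the bit_array_low_bits_mask() rewrite above: shifting a 64-bit value by 64 is undefined behaviour in C, which is why the old code special-cased bits_used >= 64, while the new version instead requires bits_used >= 1 and computes the mask with a single right shift. A minimal standalone check (not part of the commit) that the two formulations agree over the whole valid range:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    low_bits_mask_old(unsigned bits_used)
    {
        if (bits_used >= 64)
            return UINT64_MAX;
        return (UINT64_C(1) << bits_used) - UINT64_C(1);
    }

    static uint64_t
    low_bits_mask_new(unsigned bits_used)
    {
        assert(bits_used > 0); /* -1ULL >> 64 would be undefined, just like 1 << 64 */
        return UINT64_MAX >> (64 - bits_used);
    }

    int
    main(void)
    {
        /* The branch-free form matches the old one for every bits_used in 1..64. */
        for (unsigned bits = 1; bits <= 64; bits++)
            assert(low_bits_mask_old(bits) == low_bits_mask_new(bits));
        return 0;
    }
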
2 changes: 2 additions & 0 deletions tsl/src/compression/CMakeLists.txt
@@ -2,6 +2,8 @@ set(SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/api.c
${CMAKE_CURRENT_SOURCE_DIR}/array.c
${CMAKE_CURRENT_SOURCE_DIR}/compression.c
${CMAKE_CURRENT_SOURCE_DIR}/compression_test.c
${CMAKE_CURRENT_SOURCE_DIR}/decompress_text_test_impl.c
${CMAKE_CURRENT_SOURCE_DIR}/create.c
${CMAKE_CURRENT_SOURCE_DIR}/datum_serialize.c
${CMAKE_CURRENT_SOURCE_DIR}/deltadelta.c
13 changes: 8 additions & 5 deletions tsl/src/compression/array.c
@@ -240,6 +240,7 @@ array_compression_serialization_size(ArrayCompressorSerializationInfo *info)
uint32
array_compression_serialization_num_elements(ArrayCompressorSerializationInfo *info)
{
CheckCompressedData(info->sizes != NULL);
return info->sizes->num_elements;
}

@@ -405,12 +406,12 @@ array_decompression_iterator_try_next_forward(DecompressionIterator *general_ite
.is_done = true,
};

Assert(iter->data_offset + datum_size.val <= iter->num_data_bytes);
CheckCompressedData(iter->data_offset + datum_size.val <= iter->num_data_bytes);

start_pointer = iter->data + iter->data_offset;
val = bytes_to_datum_and_advance(iter->deserializer, &start_pointer);
iter->data_offset += datum_size.val;
Assert(iter->data + iter->data_offset == start_pointer);
CheckCompressedData(iter->data + iter->data_offset == start_pointer);

return (DecompressResult){
.val = val,
@@ -602,7 +603,6 @@ array_compressed_data_send(StringInfo buffer, const char *_serialized_data, Size
Datum
array_compressed_recv(StringInfo buffer)
{
ArrayCompressorSerializationInfo *data;
uint8 has_nulls;
Oid element_type;

@@ -611,9 +611,12 @@ array_compressed_recv(StringInfo buffer)

element_type = binary_string_get_type(buffer);

data = array_compressed_data_recv(buffer, element_type);
ArrayCompressorSerializationInfo *info = array_compressed_data_recv(buffer, element_type);

PG_RETURN_POINTER(array_compressed_from_serialization_info(data, element_type));
CheckCompressedData(info->sizes != NULL);
CheckCompressedData(has_nulls == (info->nulls != NULL));

PG_RETURN_POINTER(array_compressed_from_serialization_info(info, element_type));
}

void
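
The recurring change in this file is replacing Assert with CheckCompressedData for conditions that depend on bytes arriving over the wire: an Assert is compiled out of non-assert builds and takes the whole backend down in assert builds, while corrupt compressed input has to be rejected with an ordinary error so the fuzzer can keep running (the workflow above only treats SQLSTATE XX000 internal errors as failures). A hedged sketch of what such a check might expand to, as an assumption for illustration rather than the actual TimescaleDB macro:

    #include <postgres.h>

    /* Illustrative only: report corrupt compressed input as a regular
     * data_corrupted error (SQLSTATE XX001) instead of asserting. */
    #define ExampleCheckCompressedData(X)                                     \
        do                                                                    \
        {                                                                     \
            if (unlikely(!(X)))                                               \
                ereport(ERROR,                                                \
                        (errcode(ERRCODE_DATA_CORRUPTED),                     \
                         errmsg("the compressed data is corrupt")));          \
        } while (0)

    /* Usage, mirroring the checks added above:
     * ExampleCheckCompressedData(info->sizes != NULL); */
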