From 9ae7287c33fa78d5e0cae5e73e16e802b48a5467 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 24 May 2026 21:11:35 +0100 Subject: [PATCH 1/2] sort: add a test reproducing failure during sort --merge sort -m takes: 3 lines, 96003 bytes and emits: 4 lines, 96004 bytes The output line lengths before the fix are: ``` a x 32000 b x 23809 b x 8191 c x 32000 ``` So it splits one of the lines into two (23809 + 8191 = 32000). In addition, the output becomes unsorted because the shorter 'b' fragment sorts before the longer 'b' fragment. The issue is that in `chunks.rs`, `sep_iter` is relative to `search_start`. But the returned value needs to be absolute position relative to the `buffer`. We end up with these particular numbers because - in merge.rs, initial chunk is created as `RecycledChunk::new(8 * 1024)` (8192 bytes) - `search_start = 8192`; newline is at absolute buffer index `32000` - `memchr_iter` returns `32000 - 8192 = 23808`, and newline adds + 1 byte --- tests/by-util/test_sort.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs index fe7c0c3403..e9dc9bfa6f 100644 --- a/tests/by-util/test_sort.rs +++ b/tests/by-util/test_sort.rs @@ -1102,6 +1102,34 @@ fn test_merge_interleaved() { .stdout_only_fixture("merge_ints_interleaved.expected"); } +#[test] +fn test_merge_preserves_long_lines() { + use std::fmt::Write; + + const N_ROWS: usize = 3; + const LINE_LEN: usize = 32_000; + const LINE_VALUES: [&str; N_ROWS] = ["a", "b", "c"]; + // Exercise merge reads where long lines span internal chunk boundaries. 
+ let input = LINE_VALUES.into_iter().fold( + String::with_capacity(N_ROWS * (LINE_LEN + 1)), + |mut acc, value| { + writeln!(acc, "{}", value.repeat(LINE_LEN)).unwrap(); + acc + }, + ); + + let (at, mut ucmd) = at_and_ucmd!(); + at.write("long-lines.txt", &input); + + let result = ucmd.arg("-m").arg("long-lines.txt").succeeds(); + result.no_stderr(); + + let stdout = result.stdout_move_bytes(); + assert_eq!(bytecount::count(&stdout, b'\n'), N_ROWS); + assert_eq!(stdout.len(), input.len()); + assert_eq!(stdout.as_slice(), input.as_bytes()); +} + #[test] fn test_merge_unique() { new_ucmd!() From 2a9589919f5b5bd9a1ee0fb03d0078e30ea1310c Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Sun, 24 May 2026 21:49:48 +0100 Subject: [PATCH 2/2] sort: fix incorrect sort ordering with long line inputs The issue is that in `chunks.rs`, `sep_iter` is relative to `search_start`. But the returned value needs to be absolute position relative to the `buffer`. --- src/uu/sort/src/chunks.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs index 62fb9fe4ea..a465deb440 100644 --- a/src/uu/sort/src/chunks.rs +++ b/src/uu/sort/src/chunks.rs @@ -380,14 +380,15 @@ fn read_to_buffer( } } + let search_start = newline_search_offset; let mut sep_iter = - memchr_iter(separator, &buffer[newline_search_offset..buffer.len()]).rev(); + memchr_iter(separator, &buffer[search_start..buffer.len()]).rev(); newline_search_offset = buffer.len(); if let Some(last_line_end) = sep_iter.next() { if found_newline || sep_iter.next().is_some() { // We read enough lines. // We want to include the separator here, because it shouldn't be carried over. - return Ok((last_line_end + 1, true)); + return Ok((search_start + last_line_end + 1, true)); } found_newline = true; }