diff --git a/src/uu/sort/src/chunks.rs b/src/uu/sort/src/chunks.rs
index 62fb9fe4ea..a465deb440 100644
--- a/src/uu/sort/src/chunks.rs
+++ b/src/uu/sort/src/chunks.rs
@@ -380,14 +380,15 @@ fn read_to_buffer(
                         }
                     }
 
+                    let search_start = newline_search_offset;
                     let mut sep_iter =
-                        memchr_iter(separator, &buffer[newline_search_offset..buffer.len()]).rev();
+                        memchr_iter(separator, &buffer[search_start..buffer.len()]).rev();
                     newline_search_offset = buffer.len();
                     if let Some(last_line_end) = sep_iter.next() {
                         if found_newline || sep_iter.next().is_some() {
                             // We read enough lines.
                             // We want to include the separator here, because it shouldn't be carried over.
-                            return Ok((last_line_end + 1, true));
+                            return Ok((search_start + last_line_end + 1, true));
                         }
                         found_newline = true;
                     }
diff --git a/tests/by-util/test_sort.rs b/tests/by-util/test_sort.rs
index fe7c0c3403..e9dc9bfa6f 100644
--- a/tests/by-util/test_sort.rs
+++ b/tests/by-util/test_sort.rs
@@ -1102,6 +1102,34 @@ fn test_merge_interleaved() {
         .stdout_only_fixture("merge_ints_interleaved.expected");
 }
 
+#[test]
+fn test_merge_preserves_long_lines() {
+    use std::fmt::Write;
+
+    const N_ROWS: usize = 3;
+    const LINE_LEN: usize = 32_000;
+    const LINE_VALUES: [&str; N_ROWS] = ["a", "b", "c"];
+    // Exercise merge reads where long lines span internal chunk boundaries.
+    let input = LINE_VALUES.into_iter().fold(
+        String::with_capacity(N_ROWS * (LINE_LEN + 1)),
+        |mut acc, value| {
+            writeln!(acc, "{}", value.repeat(LINE_LEN)).unwrap();
+            acc
+        },
+    );
+
+    let (at, mut ucmd) = at_and_ucmd!();
+    at.write("long-lines.txt", &input);
+
+    let result = ucmd.arg("-m").arg("long-lines.txt").succeeds();
+    result.no_stderr();
+
+    let stdout = result.stdout_move_bytes();
+    assert_eq!(bytecount::count(&stdout, b'\n'), N_ROWS);
+    assert_eq!(stdout.len(), input.len());
+    assert_eq!(stdout.as_slice(), input.as_bytes());
+}
+
 #[test]
 fn test_merge_unique() {
     new_ucmd!()