Start parsing the chunks file with serde

This implements a hand-written parser which scans through the `chunks` file line-by-line, and parses the various headers and line records with serde. The most complex part here is parsing the line records. If that complexity starts to be unreasonable, a hybrid approach is also possible in which the hand-written parser is used along with the simpler serde-based `header` parsers, and still falling back to the existing parser-combinator based parser for the line records.
codecov · Swatinem · Sep 3, 2024 · Sep 4, 2024 · Sep 4, 2024 · Nov 12, 2024
commit 452d71bc731731bf5ffb29eb81fa75be4a40d88a
diff --git a/core/benches/pyreport.rs b/core/benches/pyreport.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 
 use codecov_rs::{
-    parsers::pyreport::{chunks, report_json},
+    parsers::pyreport::{chunks, chunks_serde, report_json},
     test_utils::test_report::{TestReport, TestReportBuilder},
 };
 use criterion::{criterion_group, criterion_main, Criterion};
@@ -58,7 +58,7 @@ fn simple_chunks(c: &mut Criterion) {
     let chunks = &[
         // Header and one chunk with an empty line
         "{}\n<<<<< end_of_header >>>>>\n{}\n",
-        // No header, one chunk with a populated line and an  empty line
+        // No header, one chunk with a populated line and an empty line
         "{}\n[1, null, [[0, 1]]]\n",
         // No header, two chunks, the second having just one empty line
         "{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n",
@@ -116,3 +116,63 @@ fn parse_chunks_file(input: &str, files: HashMap<usize, i64>, sessions: HashMap<
         .parse_next(&mut chunks_stream)
         .unwrap();
 }
+
+#[divan::bench]
+fn simple_chunks_serde() {
+    let chunks: &[&[u8]] = &[
+        // Header and one chunk with an empty line
+        b"{}\n<<<<< end_of_header >>>>>\n{}\n",
+        // No header, one chunk with a populated line and an empty line
+        b"{}\n[1, null, [[0, 1]]]\n",
+        // No header, two chunks, the second having just one empty line
+        b"{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n",
+        // Header, two chunks, the second having multiple data lines and an empty line
+        b"{}\n<<<<< end_of_header >>>>>\n{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n[1, null, [[0, 1]]]\n[1, null, [[0, 1]]]\n",
+    ];
+
+    for input in chunks {
+        parse_chunks_file_serde(input)
+    }
+}
+
+// this is currently <300 ms on my machine
+#[divan::bench(sample_count = 10)]
+fn complex_chunks_serde(bencher: Bencher) {
+    // this is a ~96M `chunks` file
+    let chunks =
+        load_fixture("pyreport/large/worker-c71ddfd4cb1753c7a540e5248c2beaa079fc3341-chunks.txt");
+
+    bencher.bench(|| parse_chunks_file_serde(&chunks));
+}
+
+fn parse_chunks_file_serde(input: &[u8]) {
+    let mut parser = chunks_serde::Parser::new(input);
+    loop {
+        // TODO: these are just for debugging
+        let rest = parser.rest;
+        let expecting = parser.expecting;
+        let event = parser.next();
+        match event {
+            Ok(None) => break,
+            Ok(Some(_)) => {}
+            Err(err) => {
+                let rest = std::str::from_utf8(rest).unwrap();
+                let rest = rest.get(..32).unwrap_or(rest);
+                dbg!(rest, expecting);
+                panic!("{err}");
+            }
+        }
+    }
+}
+
+#[track_caller]
+fn load_fixture(path: &str) -> Vec<u8> {
+    let path = format!("./fixtures/{path}");
+    let contents = std::fs::read(path).unwrap();
+
+    if contents.starts_with(b"version https://git-lfs.github.com/spec/v1") {
+        panic!("Fixture has not been pulled from Git LFS");
+    }
+
+    contents
+}