Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start parsing the chunks file with serde #31

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Next Next commit
Start parsing the chunks file with serde
This implements a hand-written parser which scans through the `chunks` file line-by-line, and parses the various headers and line records with serde.

The most complex part here is parsing the line records.
If that complexity becomes unreasonable, a hybrid approach is also possible: the hand-written parser would be used together with the simpler serde-based `header` parsers, while still falling back to the existing parser-combinator-based parser for the line records.
  • Loading branch information
Swatinem committed Nov 19, 2024
commit 452d71bc731731bf5ffb29eb81fa75be4a40d88a
64 changes: 62 additions & 2 deletions core/benches/pyreport.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::collections::HashMap;

use codecov_rs::{
parsers::pyreport::{chunks, report_json},
parsers::pyreport::{chunks, chunks_serde, report_json},
test_utils::test_report::{TestReport, TestReportBuilder},
};
use criterion::{criterion_group, criterion_main, Criterion};
@@ -58,7 +58,7 @@ fn simple_chunks(c: &mut Criterion) {
let chunks = &[
// Header and one chunk with an empty line
"{}\n<<<<< end_of_header >>>>>\n{}\n",
// No header, one chunk with a populated line and an empty line
// No header, one chunk with a populated line and an empty line
"{}\n[1, null, [[0, 1]]]\n",
// No header, two chunks, the second having just one empty line
"{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n",
@@ -116,3 +116,63 @@ fn parse_chunks_file(input: &str, files: HashMap<usize, i64>, sessions: HashMap<
.parse_next(&mut chunks_stream)
.unwrap();
}

/// Benchmarks the serde-based chunks parser on a few tiny, hand-written inputs.
///
/// Each input is fed to `parse_chunks_file_serde` once, in the order listed.
#[divan::bench]
fn simple_chunks_serde() {
    // Header and one chunk with an empty line
    let header_single_chunk: &[u8] = b"{}\n<<<<< end_of_header >>>>>\n{}\n";
    // No header, one chunk with a populated line and an empty line
    let single_chunk: &[u8] = b"{}\n[1, null, [[0, 1]]]\n";
    // No header, two chunks, the second having just one empty line
    let two_chunks: &[u8] = b"{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n";
    // Header, two chunks, the second having multiple data lines and an empty line
    let header_two_chunks: &[u8] = b"{}\n<<<<< end_of_header >>>>>\n{}\n[1, null, [[0, 1]]]\n\n<<<<< end_of_chunk >>>>>\n{}\n[1, null, [[0, 1]]]\n[1, null, [[0, 1]]]\n";

    let inputs = [
        header_single_chunk,
        single_chunk,
        two_chunks,
        header_two_chunks,
    ];

    for &input in &inputs {
        parse_chunks_file_serde(input);
    }
}

/// Benchmarks the serde-based chunks parser on a large, real-world fixture.
///
/// The fixture is loaded once, outside the measured closure, so only the
/// parsing itself is sampled.
// this is currently <300 ms on my machine
#[divan::bench(sample_count = 10)]
fn complex_chunks_serde(bencher: Bencher) {
    // this is a ~96M `chunks` file
    let fixture =
        load_fixture("pyreport/large/worker-c71ddfd4cb1753c7a540e5248c2beaa079fc3341-chunks.txt");

    bencher.bench(|| {
        parse_chunks_file_serde(&fixture);
    });
}

/// Drives the serde-based `chunks` parser over `input` until `next()` yields
/// `Ok(None)`, discarding every successfully parsed event.
///
/// # Panics
///
/// Panics with the parser's error if any call to `next()` fails. Before
/// panicking, a short preview of the input the failing call started from and
/// the parser's `expecting` state are printed via `dbg!`.
fn parse_chunks_file_serde(input: &[u8]) {
    let mut parser = chunks_serde::Parser::new(input);
    loop {
        // TODO: these are just for debugging
        // Captured before calling `next()` so they reflect the state the
        // (possibly failing) call started from.
        let rest = parser.rest;
        let expecting = parser.expecting;
        match parser.next() {
            Ok(None) => break,
            Ok(Some(_)) => {}
            Err(err) => {
                // Truncate at the byte level and decode lossily:
                // - slicing a `str` at byte 32 fails when that offset is inside
                //   a multi-byte character, which previously fell back to
                //   dumping the entire remaining input;
                // - a strict UTF-8 conversion would panic on invalid bytes and
                //   hide the actual parser error.
                let rest = rest.get(..32).unwrap_or(rest);
                let rest = String::from_utf8_lossy(rest);
                dbg!(rest, expecting);
                panic!("{err}");
            }
        }
    }
}

/// Reads the fixture at `./fixtures/{path}` and returns its raw bytes.
///
/// # Panics
///
/// Panics (reporting the caller's location) if the file cannot be read, or if
/// its contents start with the Git LFS pointer header, meaning the real
/// fixture has not been pulled. Both messages include the resolved path.
#[track_caller]
fn load_fixture(path: &str) -> Vec<u8> {
    let full_path = format!("./fixtures/{path}");

    // A `match` rather than `unwrap_or_else`: a panic raised inside a closure
    // would not inherit `#[track_caller]` and would point at this helper.
    let contents = match std::fs::read(&full_path) {
        Ok(contents) => contents,
        Err(err) => panic!("Failed to read fixture `{full_path}`: {err}"),
    };

    if contents.starts_with(b"version https://git-lfs.github.com/spec/v1") {
        panic!("Fixture has not been pulled from Git LFS: `{full_path}`");
    }

    contents
}
Loading
Oops, something went wrong.