From 97fd5414d232212068d2b324b8208fef0c18e5ca Mon Sep 17 00:00:00 2001 From: Tim Branyen Date: Tue, 8 Nov 2022 10:35:15 -0800 Subject: [PATCH] Initial commit of Rust-based WASM HTML parser --- .gitignore | 1 + packages/diffhtml-rust-parser/.cargo/config | 1 + packages/diffhtml-rust-parser/Cargo.lock | 218 ++++++++++++++++++++ packages/diffhtml-rust-parser/Cargo.toml | 26 +++ packages/diffhtml-rust-parser/Makefile | 4 + packages/diffhtml-rust-parser/README.md | 18 ++ packages/diffhtml-rust-parser/package.json | 9 + packages/diffhtml-rust-parser/src/parser.rs | 97 +++++++++ 8 files changed, 374 insertions(+) create mode 100644 packages/diffhtml-rust-parser/.cargo/config create mode 100644 packages/diffhtml-rust-parser/Cargo.lock create mode 100644 packages/diffhtml-rust-parser/Cargo.toml create mode 100644 packages/diffhtml-rust-parser/Makefile create mode 100644 packages/diffhtml-rust-parser/README.md create mode 100644 packages/diffhtml-rust-parser/package.json create mode 100644 packages/diffhtml-rust-parser/src/parser.rs diff --git a/.gitignore b/.gitignore index 787571fc..3e9c80ca 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ node_modules dist docs coverage +target lerna-debug.log diff --git a/packages/diffhtml-rust-parser/.cargo/config b/packages/diffhtml-rust-parser/.cargo/config new file mode 100644 index 00000000..5cb05a5b --- /dev/null +++ b/packages/diffhtml-rust-parser/.cargo/config @@ -0,0 +1 @@ +build = { target = "wasm32-unknown-unknown" } diff --git a/packages/diffhtml-rust-parser/Cargo.lock b/packages/diffhtml-rust-parser/Cargo.lock new file mode 100644 index 00000000..106e38f6 --- /dev/null +++ b/packages/diffhtml-rust-parser/Cargo.lock @@ -0,0 +1,218 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "bumpalo" +version = "3.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "diffhtml-rust-parser" +version = "0.1.0" +dependencies = [ + "js-sys", + "serde", + "serde_derive", + "tl", + "wasm-bindgen", + "wee_alloc", +] + +[[package]] +name = "js-sys" +version = "0.3.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7fcc620a3bff7cdd7a365be3376c97191aeaccc2a603e600951e452615bf89" + +[[package]] +name = "log" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "memory_units" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8452105ba047068f40ff7093dd1d9da90898e63dd61736462e9cdda6a90ad3c3" + +[[package]] +name = "once_cell" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" + +[[package]] +name = "proc-macro2" +version = "1.0.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "serde" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.147" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "1.0.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a864042229133ada95abf3b54fdc62ef5ccabe9515b64717bcb9a1919e59445d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tl" +version = "0.7.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5e993a1c7c32fdf90a308cec4d457f507b2573acc909bd6e7a092321664fdb3" + +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + +[[package]] +name = "wasm-bindgen" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" +dependencies = [ + "cfg-if 1.0.0", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" +dependencies = [ + 
"bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" + +[[package]] +name = "wee_alloc" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb3b5a6b2bb17cb6ad44a2e68a43e8d2722c997da10e928665c72ec6c0a0b8e" +dependencies = [ + "cfg-if 0.1.10", + "libc", + "memory_units", + "winapi", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/packages/diffhtml-rust-parser/Cargo.toml 
b/packages/diffhtml-rust-parser/Cargo.toml new file mode 100644 index 00000000..37e7cad8 --- /dev/null +++ b/packages/diffhtml-rust-parser/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "diffhtml-rust-parser" +version = "0.1.0" +edition = "2021" + +[lib] +name = "parser" +path = "src/parser.rs" +test = false +bench = false +crate-type = ["cdylib"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tl = { version = "0.7.7", features = ["simd"] } +serde = { version = "1.0", features = ["derive"] } +serde_derive = "1.0.147" +wee_alloc = "0.4.5" +wasm-bindgen = "0.2.83" +js-sys = "0.3.60" + +[profile.release] +strip = "symbols" +lto = true +opt-level = 'z' diff --git a/packages/diffhtml-rust-parser/Makefile b/packages/diffhtml-rust-parser/Makefile new file mode 100644 index 00000000..c1095730 --- /dev/null +++ b/packages/diffhtml-rust-parser/Makefile @@ -0,0 +1,4 @@ +default: + cargo build --release + mkdir -p dist + wasm-bindgen ./target/wasm32-unknown-unknown/release/parser.wasm --out-dir dist diff --git a/packages/diffhtml-rust-parser/README.md b/packages/diffhtml-rust-parser/README.md new file mode 100644 index 00000000..9e94a384 --- /dev/null +++ b/packages/diffhtml-rust-parser/README.md @@ -0,0 +1,18 @@ +# Rust HTML Parser + +Allows using a Rust-based zero-copy HTML parser, instead of the regex-based +parser found in the JS source. This is an alternative that can be used during +a build, in the browser, when writing server-side code, and when implementing +diffHTML in other languages. + +## Build + +``` +make +``` + +## Test + +``` +node --experimental-wasm-modules . 
+```
diff --git a/packages/diffhtml-rust-parser/package.json b/packages/diffhtml-rust-parser/package.json
new file mode 100644
index 00000000..a38fdc78
--- /dev/null
+++ b/packages/diffhtml-rust-parser/package.json
@@ -0,0 +1,9 @@
+{
+  "name": "diffhtml-rust-parser",
+  "version": "1.0.0",
+  "description": "",
+  "main": "dist/parser.js",
+  "type": "module",
+  "author": "Tim Branyen (@tbranyen)",
+  "license": "MIT"
+}
diff --git a/packages/diffhtml-rust-parser/src/parser.rs b/packages/diffhtml-rust-parser/src/parser.rs
new file mode 100644
index 00000000..a139df53
--- /dev/null
+++ b/packages/diffhtml-rust-parser/src/parser.rs
@@ -0,0 +1,97 @@
+extern crate wee_alloc;
+
+use js_sys::Reflect;
+use tl::{Node, Parser};
+use wasm_bindgen::prelude::*;
+use js_sys::Array;
+use js_sys::Object;
+
+// Opt for a smaller allocator to save on resources.
+#[global_allocator]
+static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;
+
+#[derive(Clone, Debug)]
+#[wasm_bindgen(getter_with_clone)] // generated getters clone the non-Copy fields
+pub struct VTree { // One tree node returned to JS; camelCase presumably mirrors diffHTML's keys (confirm).
+  pub rawNodeName: String, // Node name as produced by the parser, or a "#..." marker.
+  pub nodeName: String, // Filled with the same string as rawNodeName by this file.
+  pub nodeValue: String, // Value string for the node; "" on the root fragment.
+  pub attributes: Object, // JS object: attribute name -> string value.
+  pub childNodes: Array, // JS array holding the child VTree values, in order.
+}
+
+// Find properties, attributes, and children and populate into a VTree struct
+// which is returned.
+//
+// `dom_node` is the tl node to convert and `parser` is the parser that owns
+// it, needed to resolve child handles. Elements get an empty `nodeValue` (the
+// convention `parse` uses for the root); text nodes carry their raw text.
+pub fn crawl_node(dom_node: &Node, parser: &Parser) -> VTree {
+  let attributes = Object::new();
+  let child_nodes = Array::new();
+
+  let (node_name, node_value) = if let Some(html_tag) = dom_node.as_tag() {
+    // Copy every attribute over; an attribute without a value becomes "".
+    for (key, value) in html_tag.attributes().iter() {
+      let val = value.as_deref().unwrap_or("");
+      // `attributes` is a plain object created above, so this cannot throw.
+      Reflect::set(&attributes, &JsValue::from_str(&key), &JsValue::from_str(val))
+        .expect("setting a property on a fresh plain object cannot fail");
+    }
+
+    (html_tag.name().as_utf8_str().to_string(), String::new())
+  } else {
+    // NOTE(review): anything that is neither a tag nor raw text (e.g. a
+    // comment) becomes an empty "#text" node; confirm what diffHTML expects.
+    let text = dom_node.as_raw().map(|raw| raw.as_utf8_str().to_string());
+    ("#text".to_string(), text.unwrap_or_default())
+  };
+
+  // Recurse into the direct children (if any) and collect them in order.
+  if let Some(children) = dom_node.children() {
+    for handle in children.top().iter() {
+      let child = handle.get(parser).expect("child handle belongs to parser");
+      child_nodes.push(&JsValue::from(crawl_node(child, parser)));
+    }
+  }
+
+  VTree {
+    rawNodeName: node_name.clone(),
+    nodeName: node_name,
+    nodeValue: node_value,
+    childNodes: child_nodes,
+    attributes,
+  }
+}
+
+// Expose a single parse function. This takes in a string of HTML-like markup,
+// parses it with the "tl" fast HTML parser, and then returns a struct tree
+// representation expected by diffHTML.
+//
+// NOTE(review): `#[wasm_bindgen]` already generates the JS export, so
+// `#[no_mangle]` and `extern "C"` look redundant; confirm before removing.
+#[no_mangle]
+#[wasm_bindgen]
+pub extern "C" fn parse(markup: &str) -> VTree {
+  // Experiment with "tl", a zero-copy HTML parser. We may need to replace this
+  // depending on its accuracy.
+  let dom = tl::parse(markup, tl::ParserOptions::default())
+    .expect("tl failed to parse the supplied markup");
+  let parser = dom.parser();
+  let child_nodes = Array::new();
+
+  // Use the DOM and map into the VTree structure diffHTML expects.
+  for handle in dom.children().iter() {
+    let node = handle.get(parser).expect("top-level handle belongs to parser");
+    child_nodes.push(&JsValue::from(crawl_node(node, parser)));
+  }
+
+  // Root node is always a document fragment.
+  VTree {
+    rawNodeName: "#document-fragment".to_string(),
+    nodeName: "#document-fragment".to_string(),
+    nodeValue: String::new(),
+    childNodes: child_nodes,
+    attributes: Object::new(),
+  }
+}