Skip to content

Commit

Permalink
Initial commit of Rust-based WASM HTML parser
Browse files Browse the repository at this point in the history
  • Loading branch information
tbranyen committed Nov 8, 2022
1 parent 169b79f commit 97fd541
Show file tree
Hide file tree
Showing 8 changed files with 374 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ node_modules
dist
docs
coverage
target
lerna-debug.log
1 change: 1 addition & 0 deletions packages/diffhtml-rust-parser/.cargo/config
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
build = { target = "wasm32-unknown-unknown" }
218 changes: 218 additions & 0 deletions packages/diffhtml-rust-parser/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions packages/diffhtml-rust-parser/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[package]
name = "diffhtml-rust-parser"
version = "0.1.0"
edition = "2021"

[lib]
name = "parser"
path = "src/parser.rs"
test = false
bench = false
crate-type = ["cdylib"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
tl = { version = "0.7.7", features = ["simd"] }
serde = { version = "1.0", features = ["derive"] }
serde_derive = "1.0.147"
wee_alloc = "0.4.5"
wasm-bindgen = "0.2.83"
js-sys = "0.3.60"

[profile.release]
strip = "symbols"
lto = true
opt-level = 'z'
4 changes: 4 additions & 0 deletions packages/diffhtml-rust-parser/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
default:
cargo build --release
mkdir -p dist
wasm-bindgen ./target/wasm32-unknown-unknown/release/parser.wasm --out-dir dist
18 changes: 18 additions & 0 deletions packages/diffhtml-rust-parser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Rust HTML Parser

Provides a Rust-based, zero-copy HTML parser as an alternative to the
regex-based parser found in the JS source. It can be used during a build, in
the browser, in server-side code, and when implementing diffHTML in other
languages.

## Build

```
make
```

## Test

```
node --experimental-wasm-modules .
```
9 changes: 9 additions & 0 deletions packages/diffhtml-rust-parser/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"name": "diffhtml-rust-parser",
"version": "1.0.0",
"description": "",
"main": "dist/parser.js",
"type": "module",
"author": "Tim Branyen (@tbranyen)",
"license": "MIT"
}
97 changes: 97 additions & 0 deletions packages/diffhtml-rust-parser/src/parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
extern crate wee_alloc;

use js_sys::Reflect;
use tl::{Node, Parser};
use wasm_bindgen::prelude::*;
use js_sys::Array;
use js_sys::Object;

// Opt for a smaller allocator to save on resources.
#[global_allocator]
static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;

// Virtual tree node handed back to JavaScript. The camelCase field names are
// intentional: wasm-bindgen exposes each public field under its Rust name, so
// these are the property names JS callers see.
//
// `getter_with_clone` makes the generated getters clone the field, which is
// required because none of the field types are `Copy`.
#[derive(Clone, Debug)]
#[wasm_bindgen(getter_with_clone)]
pub struct VTree {
// Node name as produced by the parser (or a synthetic name such as
// "#document-fragment" for the root).
pub rawNodeName: String,
// Node name; the constructors in this file populate it with the same string
// as `rawNodeName`.
pub nodeName: String,
// String value associated with the node.
pub nodeValue: String,
// Plain JS object whose properties are the node's attributes, keyed by
// attribute name with string values.
pub attributes: Object,
// JS array holding the child VTree nodes, in document order.
pub childNodes: Array,
}

// Find properties, attributes, and children and populate into a VTree struct
// which is returned.
pub fn crawl_node(dom_node: &Node, parser: &Parser) -> VTree {
let attributes = Object::new();
let child_nodes = Array::new();
let node_tag = dom_node.as_tag();
let mut node_name = "#text".to_string();

match node_tag {
Some(htmlTag) => {
// Set the node name
node_name = htmlTag.name().as_utf8_str().to_string();

// Add the attributes
for attr in htmlTag.attributes().iter() {
let key = attr.0.chars().as_str();
let mut val = match attr.1 {
None => { "" }
Some(ref cowStr) => {
let varChar = cowStr.chars();

varChar.as_str()
}
};

Reflect::set(&attributes, &JsValue::from_str(key), &JsValue::from_str(val));
}
}
None => {}
}

// Loop over the children and build them up into the child_nodes array.
for inner_children in dom_node.children().iter() {
for inner_node in inner_children.top().iter() {
let v_tree = crawl_node(inner_node.get(parser).unwrap(), parser).clone();
child_nodes.push(&JsValue::from(v_tree));
}
}

VTree {
rawNodeName: node_name.clone(),
nodeName: node_name.clone(),
nodeValue: node_name.clone(),
childNodes: child_nodes,
attributes,
}
}

// The one entry point exported to JavaScript. Accepts a string of HTML-like
// markup, runs it through the "tl" HTML parser, and returns the result as a
// VTree whose root is always a "#document-fragment" node wrapping the
// top-level nodes of the markup.
//
// Panics if "tl" rejects the input or if a top-level node handle cannot be
// resolved.
//
// NOTE(review): wasm-bindgen generates its own export shim, so `#[no_mangle]`
// and `extern "C"` are presumably unnecessary here — confirm before removing.
#[no_mangle]
#[wasm_bindgen]
pub extern "C" fn parse(markup: &str) -> VTree {
    // "tl" is a zero-copy parser being trialled here; it may be swapped out
    // later depending on how accurate it proves to be.
    let document = tl::parse(markup, tl::ParserOptions::default()).unwrap();
    let arena = document.parser();
    let top_level = Array::new();

    // Translate each top-level DOM node into the VTree structure diffHTML
    // expects and collect them, in order, as the fragment's children.
    document
        .children()
        .iter()
        .map(|handle| crawl_node(handle.get(arena).unwrap(), arena))
        .for_each(|subtree| {
            top_level.push(&JsValue::from(subtree));
        });

    // The root is a synthetic document fragment with no value or attributes.
    let fragment_name = "#document-fragment";

    VTree {
        rawNodeName: fragment_name.to_string(),
        nodeName: fragment_name.to_string(),
        nodeValue: "".to_string(),
        childNodes: top_level,
        attributes: Object::new(),
    }
}

0 comments on commit 97fd541

Please sign in to comment.