-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial commit of Rust-based WASM HTML parser
- Loading branch information
Showing
8 changed files
with
374 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,5 @@ node_modules | |
dist | ||
docs | ||
coverage | ||
target | ||
lerna-debug.log |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
build = { target = "wasm32-unknown-unknown" } |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
[package] | ||
name = "diffhtml-rust-parser" | ||
version = "0.1.0" | ||
edition = "2021" | ||
|
||
[lib] | ||
name = "parser" | ||
path = "src/parser.rs" | ||
test = false | ||
bench = false | ||
crate-type = ["cdylib"] | ||
|
||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||
|
||
[dependencies] | ||
tl = { version = "0.7.7", features = ["simd"] } | ||
serde = { version = "1.0", features = ["derive"] } | ||
serde_derive = "1.0.147" | ||
wee_alloc = "0.4.5" | ||
wasm-bindgen = "0.2.83" | ||
js-sys = "0.3.60" | ||
|
||
[profile.release] | ||
strip = "symbols" | ||
lto = true | ||
opt-level = 'z' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
default: | ||
cargo build --release | ||
mkdir -p dist | ||
wasm-bindgen ./target/wasm32-unknown-unknown/release/parser.wasm --out-dir dist |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Rust HTML Parser | ||
|
||
Provides a Rust-based, zero-copy HTML parser, compiled to WebAssembly, as an | ||
alternative to the regex-based parser found in the JS source. It can be used | ||
during a build, in the browser, in server-side code, and when implementing | ||
diffHTML in other languages. | ||
|
||
## Build | ||
|
||
``` | ||
make | ||
``` | ||
|
||
## Test | ||
|
||
``` | ||
node --experimental-wasm-modules . | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
{ | ||
"name": "diffhtml-rust-parser", | ||
"version": "1.0.0", | ||
"description": "", | ||
"main": "dist/parser.js", | ||
"type": "module", | ||
"author": "Tim Branyen (@tbranyen)", | ||
"license": "MIT" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
extern crate wee_alloc; | ||
|
||
use js_sys::Reflect; | ||
use tl::{Node, Parser}; | ||
use wasm_bindgen::prelude::*; | ||
use js_sys::Array; | ||
use js_sys::Object; | ||
|
||
// Opt for a smaller allocator to save on resources: this registers | ||
// `wee_alloc::WeeAlloc` as the global allocator for this library. | ||
// NOTE(review): wee_alloc appears to be unmaintained upstream -- confirm | ||
// whether it should be kept or replaced by the default allocator. | ||
#[global_allocator] | ||
static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT; | ||
|
||
#[derive(Clone, Debug)] | ||
#[wasm_bindgen(getter_with_clone)] | ||
pub struct VTree { | ||
pub rawNodeName: String, | ||
pub nodeName: String, | ||
pub nodeValue: String, | ||
pub attributes: Object, | ||
pub childNodes: Array, | ||
} | ||
|
||
// Find properties, attributes, and children and populate into a VTree struct | ||
// which is returned. | ||
pub fn crawl_node(dom_node: &Node, parser: &Parser) -> VTree { | ||
let attributes = Object::new(); | ||
let child_nodes = Array::new(); | ||
let node_tag = dom_node.as_tag(); | ||
let mut node_name = "#text".to_string(); | ||
|
||
match node_tag { | ||
Some(htmlTag) => { | ||
// Set the node name | ||
node_name = htmlTag.name().as_utf8_str().to_string(); | ||
|
||
// Add the attributes | ||
for attr in htmlTag.attributes().iter() { | ||
let key = attr.0.chars().as_str(); | ||
let mut val = match attr.1 { | ||
None => { "" } | ||
Some(ref cowStr) => { | ||
let varChar = cowStr.chars(); | ||
|
||
varChar.as_str() | ||
} | ||
}; | ||
|
||
Reflect::set(&attributes, &JsValue::from_str(key), &JsValue::from_str(val)); | ||
} | ||
} | ||
None => {} | ||
} | ||
|
||
// Loop over the children and build them up into the child_nodes array. | ||
for inner_children in dom_node.children().iter() { | ||
for inner_node in inner_children.top().iter() { | ||
let v_tree = crawl_node(inner_node.get(parser).unwrap(), parser).clone(); | ||
child_nodes.push(&JsValue::from(v_tree)); | ||
} | ||
} | ||
|
||
VTree { | ||
rawNodeName: node_name.clone(), | ||
nodeName: node_name.clone(), | ||
nodeValue: node_name.clone(), | ||
childNodes: child_nodes, | ||
attributes, | ||
} | ||
} | ||
|
||
// Expose a single parse function. This takes in a string of HTML-like markup, | ||
// parses it with the "tl" fast HTML parser, and then returns a struct tree | ||
// representation expected by diffHTML. | ||
#[no_mangle] | ||
#[wasm_bindgen] | ||
pub extern "C" fn parse(markup: &str) -> VTree { | ||
// Experiment with "tl" a zero-copy HTML parser. We may need to replace this | ||
// depending on its accuracy. | ||
let dom = tl::parse(markup, tl::ParserOptions::default()).unwrap(); | ||
let parser = dom.parser(); | ||
let child_nodes = Array::new(); | ||
|
||
// Use the DOM and map into the VTree structure diffHTML expects. | ||
for dom_node in dom.children().iter() { | ||
let v_tree = crawl_node(dom_node.get(parser).unwrap(), parser); | ||
child_nodes.push(&JsValue::from(v_tree)); | ||
} | ||
|
||
// Root node is always a document fragment. | ||
VTree { | ||
rawNodeName: "#document-fragment".to_string(), | ||
nodeName: "#document-fragment".to_string(), | ||
nodeValue: "".to_string(), | ||
childNodes: child_nodes, | ||
attributes: Object::new(), | ||
} | ||
} |