-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
333 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,5 @@ node_modules | |
dist | ||
docs | ||
coverage | ||
target | ||
lerna-debug.log |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
build = { target = "wasm32-unknown-unknown" } |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
[package] | ||
name = "diffhtml-rust-parser" | ||
version = "0.1.0" | ||
edition = "2021" | ||
|
||
[lib] | ||
name = "parser" | ||
path = "src/parser.rs" | ||
test = false | ||
bench = false | ||
crate-type = ["cdylib"] | ||
|
||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||
|
||
[dependencies] | ||
tl = { version = "0.7.7", features = ["simd"] } | ||
wasm-bindgen = "0.2.83" | ||
serde = { version = "1.0", features = ["derive"] } | ||
serde-wasm-bindgen = "0.4" | ||
#serde_derive = "1.0.147" | ||
#wee_alloc = "0.4.5" | ||
js-sys = "0.3.60" | ||
|
||
[profile.release] | ||
strip = "symbols" | ||
lto = true | ||
opt-level = 'z' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
default: | ||
RUSTFLAGS="-C target-feature=+simd128" cargo build --release | ||
mkdir -p dist | ||
wasm-bindgen ./target/wasm32-unknown-unknown/release/parser.wasm --out-dir dist |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Rust HTML Parser | ||
|
||
Allows using a Rust-based zero-copy HTML parser, instead of the regex-based | ||
parser found in the JS source. This is an alternative that can be used during | ||
a build, in the browser, when writing server-side code, and when implementing | ||
diffHTML in other languages. | ||
|
||
## Build | ||
|
||
``` | ||
make | ||
``` | ||
|
||
## Test | ||
|
||
``` | ||
node --experimental-wasm-modules . | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
{ | ||
"name": "diffhtml-rust-parser", | ||
"version": "1.0.0", | ||
"description": "", | ||
"main": "dist/parser.js", | ||
"type": "module", | ||
"author": "Tim Branyen (@tbranyen)", | ||
"license": "MIT" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
//extern crate wee_alloc; | ||
|
||
use tl::{Node, Parser}; | ||
use serde_wasm_bindgen::to_value; | ||
use wasm_bindgen::prelude::*; | ||
use wasm_bindgen::{intern}; | ||
use js_sys::{Reflect, JsString}; | ||
use js_sys::Array; | ||
use js_sys::Object; | ||
|
||
// Optional: opt for a smaller allocator to save on resources. Currently disabled. | ||
//#[global_allocator] | ||
//static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT; | ||
|
||
#[wasm_bindgen(getter_with_clone)] | ||
pub struct VTree { | ||
pub rawNodeName: String, | ||
pub nodeName: String, | ||
pub nodeValue: String, | ||
pub attributes: Object, | ||
pub childNodes: Array, | ||
} | ||
|
||
// Find properties, attributes, and children and populate into a VTree struct | ||
// which is returned. | ||
pub fn crawl_node(dom_node: &Node, parser: &Parser) -> VTree { | ||
let attributes = Object::new(); | ||
let child_nodes = Array::new(); | ||
let node_tag = dom_node.as_tag(); | ||
let mut node_name = "#text".to_string(); | ||
|
||
match node_tag { | ||
Some(html_tag) => { | ||
node_name = html_tag.name().as_utf8_str().to_string(); | ||
|
||
// Add the attributes | ||
for attr in html_tag.attributes().iter() { | ||
let key = attr.0.chars().as_str(); | ||
let value = match attr.1 { | ||
None => { "" } | ||
Some(ref cow_str) => { | ||
cow_str.chars().as_str() | ||
} | ||
}; | ||
|
||
// FIXME Look into performance improvements this is a very expensive line. | ||
Reflect::set(&attributes, &JsValue::from_str(key), &JsValue::from_str(value)); | ||
} | ||
} | ||
None => {} | ||
} | ||
|
||
// Loop over the children and build them up into the child_nodes array. | ||
for inner_children in dom_node.children().iter() { | ||
for inner_node in inner_children.top().iter() { | ||
let v_tree = crawl_node(inner_node.get(parser).unwrap(), parser); | ||
child_nodes.push(&JsValue::from(v_tree)); | ||
} | ||
} | ||
|
||
VTree { | ||
rawNodeName: node_name.clone(), | ||
nodeName: node_name.clone(), | ||
nodeValue: node_name.clone(), | ||
childNodes: child_nodes, | ||
attributes, | ||
} | ||
} | ||
|
||
// Expose a single parse function. This takes in a string of HTML-like markup, | ||
// parses it with the "tl" fast HTML parser, and then returns a struct tree | ||
// representation expected by diffHTML. | ||
#[no_mangle] | ||
#[wasm_bindgen] | ||
pub extern "C" fn parse(markup: &str) -> VTree { | ||
// Experiment with "tl" a zero-copy HTML parser. We may need to replace this | ||
// depending on its accuracy. | ||
let dom = tl::parse(markup, tl::ParserOptions::default()).unwrap(); | ||
let parser = dom.parser(); | ||
let child_nodes = Array::new(); | ||
let fragment_name = "#document-fragment".to_string(); | ||
|
||
// Use the DOM and map into the VTree structure diffHTML expects. | ||
for dom_node in dom.children().iter() { | ||
let v_tree = crawl_node(dom_node.get(parser).unwrap(), parser); | ||
child_nodes.push(&JsValue::from(v_tree)); | ||
} | ||
|
||
// Root node is always a document fragment. | ||
VTree { | ||
rawNodeName: fragment_name.clone(), | ||
nodeName: fragment_name.clone(), | ||
nodeValue: "".to_string(), | ||
childNodes: child_nodes, | ||
attributes: Object::new(), | ||
} | ||
} |