diff --git a/src/dom_utils.rs b/src/dom_utils.rs index dffb9fcb..99122d78 100644 --- a/src/dom_utils.rs +++ b/src/dom_utils.rs @@ -28,6 +28,9 @@ pub trait NodeHandleExt { /// Sets an attribute on the element. Must be an element. fn set_attribute(&self, name: &QualName, value: StrTendril); + /// Removes the first attribute with the given name from the element, if present. Must be an element; the `Handle` impl panics otherwise. + fn remove_attribute(&self, name: &QualName); + /// Returns true if the node is an element. fn is_element(&self) -> bool; @@ -225,6 +228,16 @@ impl NodeHandleExt for Handle { } } + fn remove_attribute(&self, name: &QualName) { + let mut attrs = match self.data { + NodeData::Element { ref attrs, .. } => attrs.borrow_mut(), + _ => panic!("not an element"), + }; + if let Some(i) = attrs.iter().position(|a| &a.name == name) { + attrs.remove(i); + } + } + fn is_element(&self) -> bool { matches!(&self.data, NodeData::Element { .. }) } diff --git a/src/main.rs b/src/main.rs index e31b71cd..700e8949 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,6 +19,7 @@ mod rcdom_with_line_numbers; mod represents; mod self_link; mod tag_omission; +mod variables; #[tokio::main] async fn main() -> io::Result<()> { @@ -60,11 +61,13 @@ async fn run_preprocess() -> io::Result<()> { let mut tag_omission = tag_omission::Processor::new(); let mut interface_index = interface_index::Processor::new(); let mut self_link = self_link::Processor::new(); + let mut variables = variables::Processor::new(&parsed); // We do exactly one pass to identify the changes that need to be made. dom_utils::scan_dom(&document, &mut |h| { boilerplate.visit(h); represents.visit(h); + variables.visit(h); annotate_attributes.visit(h); tag_omission.visit(h); interface_index.visit(h); @@ -76,6 +79,7 @@ async fn run_preprocess() -> io::Result<()> { // conflicts between them. 
boilerplate.apply().await?; represents.apply()?; + variables.apply()?; annotate_attributes.apply().await?; tag_omission.apply()?; interface_index.apply()?; diff --git a/src/rcdom_with_line_numbers.rs b/src/rcdom_with_line_numbers.rs index 11cdcce2..d60df560 100644 --- a/src/rcdom_with_line_numbers.rs +++ b/src/rcdom_with_line_numbers.rs @@ -7,14 +7,17 @@ use html5ever::{ tendril::StrTendril, tree_builder::{ElementFlags, NodeOrText, QuirksMode}, }; -use markup5ever_rcdom::{Handle, RcDom}; +use markup5ever_rcdom::{Handle, Node, RcDom}; use std::borrow::Cow; -use std::cell::Cell; +use std::cell::{Cell, RefCell}; +use std::collections::HashMap; use std::io; +use std::rc::Rc; pub struct RcDomWithLineNumbers { dom: RcDom, current_line: Cell, + node_line_map: RefCell>, } #[cfg(test)] @@ -48,6 +51,13 @@ impl RcDomWithLineNumbers { Ok(()) } } + + /// Returns the 1-based line number where the element represented by `handle` + /// was created, or `None` if `handle` was never recorded by `create_element`. + pub fn line_number_for(&self, handle: &Handle) -> Option { + let key = Rc::as_ptr(handle); + self.node_line_map.borrow().get(&key).cloned() + } } impl Default for RcDomWithLineNumbers { @@ -55,6 +65,7 @@ impl Default for RcDomWithLineNumbers { Self { dom: RcDom::default(), current_line: Cell::new(1), + node_line_map: RefCell::new(HashMap::new()), } } } @@ -81,6 +92,20 @@ impl TreeSink for RcDomWithLineNumbers { self } + // Override to record the current line number for each created element handle, keyed by the node's `Rc` pointer. + fn create_element( + &self, + name: QualName, + attrs: Vec, + flags: ElementFlags, + ) -> Self::Handle { + let h = self.dom.create_element(name, attrs, flags); + let key = Rc::as_ptr(&h); + let line = self.current_line.get(); + self.node_line_map.borrow_mut().insert(key, line); + h + } + // Delegate all other methods to RcDom. delegate! 
{ to self.dom { @@ -88,13 +113,6 @@ impl TreeSink for RcDomWithLineNumbers { fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> ExpandedName<'a>; - fn create_element( - &self, - name: QualName, - attrs: Vec, - flags: ElementFlags, - ) -> Self::Handle; - fn create_comment(&self, text: StrTendril) -> Self::Handle; fn create_pi(&self, target: StrTendril, data: StrTendril) -> Self::Handle; diff --git a/src/variables.rs b/src/variables.rs new file mode 100644 index 00000000..21804210 --- /dev/null +++ b/src/variables.rs @@ -0,0 +1,426 @@ +//! Converts custom attributes `algorithm=""` and `var-scope=""` to `data-` +//! equivalents, to preserve validity of the output document. +// +// TODO: error on `<var>`s outside of those scopes, unless the `<var>` has an +// `ignore=""` attribute. (This code is present but disabled until +// https://github.com/whatwg/html/pull/11392 is merged.) +// +// TODO: check for `<var>`s inside of these scopes that are only used once, and +// error when such lone `<var>`s are encountered. 
+ +use std::fmt::Write as _; +use std::io; + +use html5ever::Attribute; +use html5ever::{LocalName, QualName, local_name, ns}; +use markup5ever_rcdom::{Handle, NodeData}; + +use crate::dom_utils::NodeHandleExt; +use crate::rcdom_with_line_numbers::RcDomWithLineNumbers; + +pub struct Processor<'a> { + // Parser context (for line numbers) + parsed: &'a RcDomWithLineNumbers, + + // Rename targets: elements that start a variable scope whose attributes we will rewrite + scope_roots: Vec, + + // Offenses collected during visit (reported in apply()) + disallowed_data_algorithm: Vec, + disallowed_data_var_scope: Vec, + both_old_attrs: Vec, + var_out_of_scope_msgs: Vec, + + // Preorder traversal state: `stack` is the current chain of ancestor elements; `scope_flags` and `domintro_flags` run parallel to it, and each depth counts the `true` entries of its flag list + stack: Vec, + scope_flags: Vec, + scope_depth: usize, + domintro_flags: Vec, + domintro_depth: usize, + + // Edits to perform during apply() + vars_to_strip_ignore: Vec, +} + +impl<'a> Processor<'a> { + pub fn new(parsed: &'a RcDomWithLineNumbers) -> Self { + Self { + parsed, + scope_roots: vec![], + disallowed_data_algorithm: vec![], + disallowed_data_var_scope: vec![], + both_old_attrs: vec![], + var_out_of_scope_msgs: vec![], + stack: vec![], + scope_flags: vec![], + scope_depth: 0, + domintro_flags: vec![], + domintro_depth: 0, + vars_to_strip_ignore: vec![], + } + } + + pub fn visit(&mut self, node: &Handle) { + if !node.is_element() { + return; + } + + let old_algorithm = QualName::new(None, ns!(), LocalName::from("algorithm")); + let old_var_scope = QualName::new(None, ns!(), LocalName::from("var-scope")); + let data_algorithm = QualName::new(None, ns!(), LocalName::from("data-algorithm")); + let data_var_scope = QualName::new(None, ns!(), LocalName::from("data-var-scope")); + let ignore_attr = QualName::new(None, ns!(), LocalName::from("ignore")); + + // Maintain stack based on preorder and parent link: pop entries (and undo their depth counts) until the top is this node's parent + let parent = node.parent_node(); + while let Some(top) = self.stack.last() { + let is_parent = match &parent { + Some(p) => std::rc::Rc::ptr_eq(top, p), + None => 
false, + }; + if is_parent { + break; + } + self.stack.pop(); + if self.scope_flags.pop().unwrap_or(false) { + self.scope_depth -= 1; + } + if self.domintro_flags.pop().unwrap_or(false) { + self.domintro_depth -= 1; + } + } + + let starts_scope = node.has_attribute(&old_algorithm) || node.has_attribute(&old_var_scope); + if starts_scope { + self.scope_depth += 1; + } + let starts_domintro = node.has_class("domintro"); + if starts_domintro { + self.domintro_depth += 1; + } + self.stack.push(node.clone()); + self.scope_flags.push(starts_scope); + self.domintro_flags.push(starts_domintro); + + if starts_scope { + self.scope_roots.push(node.clone()); + } + + if node.has_attribute(&data_algorithm) { + self.disallowed_data_algorithm.push(node.clone()); + } + if node.has_attribute(&data_var_scope) { + self.disallowed_data_var_scope.push(node.clone()); + } + if node.has_attribute(&old_algorithm) && node.has_attribute(&old_var_scope) { + self.both_old_attrs.push(node.clone()); + } + + // Check semantics: a `<var>` is recorded as out of scope unless it has `ignore=""`, is inside a domintro, or is inside a scope + if node.is_html_element(&local_name!("var")) { + if node.has_attribute(&ignore_attr) { + // Ignore `<var>` with `ignore=""` attribute, and note it for later removal. + self.vars_to_strip_ignore.push(node.clone()); + } else if self.domintro_depth > 0 { + // Ignore `<var>` inside domintro sections. 
+ } else if self.scope_depth == 0 { + let text = node.text_content(); + let mut msg = String::new(); + if let Some(n) = self.parsed.line_number_for(node) { + let _ = write!(msg, "Line {}: ", n); + } + let _ = write!( + msg, + "\"{}\" outside algorithm=\"\"/var-scope=\"\" container.", + text.trim() + ); + self.var_out_of_scope_msgs.push(msg); + } + } + } + + pub fn apply(self) -> io::Result<()> { + if !self.disallowed_data_algorithm.is_empty() || !self.disallowed_data_var_scope.is_empty() + { + let mut msgs = Vec::new(); + for n in self.disallowed_data_algorithm { + let line = self + .parsed + .line_number_for(&n) + .map(|ln| format!("Line {}: ", ln)) + .unwrap_or_default(); + msgs.push(format!( + "{}data-algorithm=\"\" present in source. Use algorithm=\"\" instead.", + line + )); + } + for n in self.disallowed_data_var_scope { + let line = self + .parsed + .line_number_for(&n) + .map(|ln| format!("Line {}: ", ln)) + .unwrap_or_default(); + msgs.push(format!( + "{}data-var-scope=\"\" present in source. Use var-scope=\"\" instead.", + line + )); + } + return Err(io::Error::new(io::ErrorKind::InvalidData, msgs.join("\n"))); + } + if !self.both_old_attrs.is_empty() { + let mut msgs = Vec::new(); + for n in self.both_old_attrs { + let line = self + .parsed + .line_number_for(&n) + .map(|ln| format!("Line {}: ", ln)) + .unwrap_or_default(); + msgs.push(format!( + "{}both algorithm=\"\" and var-scope=\"\" present on the same element. Pick one.", + line + )); + } + return Err(io::Error::new(io::ErrorKind::InvalidData, msgs.join("\n"))); + } + + // Disabled until https://github.com/whatwg/html/pull/11392 is merged. 
+ // if !self.var_out_of_scope_msgs.is_empty() { + // return Err(io::Error::new( + // io::ErrorKind::InvalidData, + // self.var_out_of_scope_msgs.join("\n"), + // )); + // } + + let old_algorithm = QualName::new(None, ns!(), LocalName::from("algorithm")); + let new_algorithm = QualName::new(None, ns!(), LocalName::from("data-algorithm")); + let old_var_scope = QualName::new(None, ns!(), LocalName::from("var-scope")); + let new_var_scope = QualName::new(None, ns!(), LocalName::from("data-var-scope")); + let ignore_attr = QualName::new(None, ns!(), LocalName::from("ignore")); + + for node in self.scope_roots { + if let NodeData::Element { ref attrs, .. } = node.data { + let mut attrs = attrs.borrow_mut(); + rename_if_present(&mut attrs, &old_algorithm, &new_algorithm)?; + rename_if_present(&mut attrs, &old_var_scope, &new_var_scope)?; + } + } + for var_node in self.vars_to_strip_ignore { + var_node.remove_attribute(&ignore_attr); + } + Ok(()) + } +} + +fn rename_if_present( + attrs: &mut Vec, + old_name: &QualName, + new_name: &QualName, +) -> io::Result<()> { + if let Some((idx, value)) = attrs + .iter() + .enumerate() + .find_map(|(i, a)| (a.name == *old_name).then(|| (i, a.value.clone()))) + { + attrs.remove(idx); + attrs.push(Attribute { + name: new_name.clone(), + value, + }); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dom_utils; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_basic_conversion() { + let parsed = parse_document_async( + r##" +
+

Hi

+ +No change +"## + .as_bytes(), + ) + .await + .unwrap(); + let document = parsed.document().clone(); + + let mut proc = Processor::new(&parsed); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().unwrap(); + + assert_eq!( + serialize_for_test(&[document]), + r##"
+

Hi

+ +No change +"## + ); + } + + #[tokio::test] + async fn test_error_on_existing_data_attr() { + let parsed = parse_document_async( + r##" +
+
+"## + .as_bytes(), + ) + .await + .unwrap(); + let document = parsed.document().clone(); + + let mut proc = Processor::new(&parsed); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + let err = result.unwrap_err(); + assert!(err.to_string().contains("Line 2: ")); + assert!(err.to_string().contains("Line 3: ")); + } + + #[tokio::test] + async fn test_error_on_both() { + let parsed = parse_document_async( + r##" +
+"## + .as_bytes(), + ) + .await + .unwrap(); + let document = parsed.document().clone(); + + let mut proc = Processor::new(&parsed); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + let err = result.unwrap_err(); + assert!(err.to_string().contains("Line 2: ")); + } + + #[tokio::test] + async fn test_var_ignore_removes_attr_and_no_error() { + let parsed = parse_document_async( + r##" +

Outside scope foo

+"## + .as_bytes(), + ) + .await + .unwrap(); + let document = parsed.document().clone(); + + let mut proc = Processor::new(&parsed); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + // Should not error because var has ignore + proc.apply().unwrap(); + + assert_eq!( + serialize_for_test(&[document]), + r##"

Outside scope foo

+"## + ); + } + + // Disabled until https://github.com/whatwg/html/pull/11392 is merged. + // #[tokio::test] + // async fn test_var_outside_scope_errors() { + // let parsed = parse_document_async( + // r##" + //

Outside scope bar

+ // "## + // .as_bytes(), + // ) + // .await + // .unwrap(); + // let document = parsed.document().clone(); + + // let mut proc = Processor::new(&parsed); + // dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + // let result = proc.apply(); + // let err = result.unwrap_err(); + // assert!(err.to_string().contains("Line 2: ")); + // } + + #[tokio::test] + async fn test_var_inside_algorithm_ok() { + let parsed = parse_document_async( + r##" +

In scope n

+"## + .as_bytes(), + ) + .await + .unwrap(); + let document = parsed.document().clone(); + + let mut proc = Processor::new(&parsed); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().unwrap(); + + assert_eq!( + serialize_for_test(&[document]), + r##"

In scope n

+"## + ); + } + + #[tokio::test] + async fn test_var_inside_var_scope_ok() { + let parsed = parse_document_async( + r##" +

In scope x

+"## + .as_bytes(), + ) + .await + .unwrap(); + let document = parsed.document().clone(); + + let mut proc = Processor::new(&parsed); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().unwrap(); + + assert_eq!( + serialize_for_test(&[document]), + r##"

In scope x

+"## + ); + } + + #[tokio::test] + async fn test_var_inside_domintro_ok() { + let parsed = parse_document_async( + r##" +
+
variable = object.method([optionalArgument])
+ +

This is a note to authors describing the usage of an interface.

+
+"## + .as_bytes() + ).await.unwrap(); + let document = parsed.document().clone(); + let mut proc = Processor::new(&parsed); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + // No scope present, but domintro should suppress the error. + proc.apply().unwrap(); + + assert_eq!( + serialize_for_test(&[document]), + r##"
+
variable = object.method([optionalArgument])
+ +

This is a note to authors describing the usage of an interface.

+
+"## + ); + } +}