Added more lints #31

Merged (4 commits) on Feb 1, 2022
45 changes: 29 additions & 16 deletions generate_entities.py
@@ -8,50 +8,59 @@
match_arms = {}

for key, value in key_and_value:
assert key[0] == '&'
assert key[0] == "&"
key = key[1:]
first_char = key[0]
key = key[1:]

match_arms.setdefault(first_char, []).append((key, value))

with open("src/entities.rs", "w") as f:
f.write("""
f.write(
"""
// @generated
// this file is autogenerated by
// curl https://html.spec.whatwg.org/entities.json | python generate_entities.py

pub struct CharRef {
pub(crate) struct CharRef {
/// Name as it appears escaped in HTML
pub name: &'static str,
pub(crate) name: &'static str,
/// Unescaped character codepoints
pub characters: &'static str,
pub(crate) characters: &'static str,
}

pub fn try_read_character_reference<E>(first_char: char, try_read: impl FnMut(&str) -> Result<bool, E>) -> Result<Option<CharRef>, E> {
pub(crate) fn try_read_character_reference<E>(first_char: char, try_read: impl FnMut(&str) -> Result<bool, E>) -> Result<Option<CharRef>, E> {
match first_char {
""")
"""
)

for first_char, if_statements in sorted(match_arms.items()):
# Write each branch of the match stmt as its own function such that
# compilation is faster.
f.write("""
f.write(
"""
'%(first_char)s' => {
#[allow(non_snake_case)]
fn branch_%(first_char)s<E>(mut try_read: impl FnMut(&str) -> Result<bool, E>) -> Result<Option<CharRef>, E> {
for (other_chars, characters) in &[
""" % {"first_char": first_char})
"""
% {"first_char": first_char}
)

for other_chars, value in if_statements:
characters = ""
for c in value['codepoints']:
for c in value["codepoints"]:
characters += r"\u{" + hex(c)[2:] + r"}"

f.write("""
f.write(
"""
("%(other_chars)s", "%(characters)s"),
""" % {"other_chars": other_chars, "characters": characters})
"""
% {"other_chars": other_chars, "characters": characters}
)

f.write("""
f.write(
"""
] {
if try_read(other_chars)? {
return Ok(Some(CharRef { name: other_chars, characters }));
@@ -64,10 +73,14 @@
branch_%(first_char)s(try_read)

}
""" % {"first_char": first_char})
"""
% {"first_char": first_char}
)

f.write("""
f.write(
"""
_ => Ok(None)
}
}
""")
"""
)
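
For context on the template above: the script groups entities by their first character and emits one `branch_*` function per character, so the huge match compiles as many small functions instead of one giant one. Below is a hedged sketch of the shape of the generated `src/entities.rs`, with made-up illustrative entries for `&amp;`/`&amp` only; the real table is produced from entities.json.

```rust
// Sketch of the generated file's shape (abridged, illustrative data only).
pub(crate) struct CharRef {
    /// Name as it appears escaped in HTML
    pub(crate) name: &'static str,
    /// Unescaped character codepoints
    pub(crate) characters: &'static str,
}

pub(crate) fn try_read_character_reference<E>(
    first_char: char,
    try_read: impl FnMut(&str) -> Result<bool, E>,
) -> Result<Option<CharRef>, E> {
    match first_char {
        'a' => {
            // One small function per first character keeps each piece cheap to compile.
            #[allow(non_snake_case)]
            fn branch_a<E>(
                mut try_read: impl FnMut(&str) -> Result<bool, E>,
            ) -> Result<Option<CharRef>, E> {
                for (other_chars, characters) in &[("mp;", "\u{26}"), ("mp", "\u{26}")] {
                    if try_read(other_chars)? {
                        return Ok(Some(CharRef { name: other_chars, characters }));
                    }
                }
                Ok(None)
            }
            branch_a(try_read)
        }
        _ => Ok(None),
    }
}

fn main() {
    // Trivial driver: "read" by comparing against a fixed input string.
    let input = "mp;";
    let found = try_read_character_reference::<()>('a', |name| Ok(input.starts_with(name)))
        .unwrap()
        .unwrap();
    assert_eq!(found.name, "mp;");
    assert_eq!(found.characters, "\u{26}");
}
```
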
10 changes: 5 additions & 5 deletions src/arrayvec.rs
@@ -1,13 +1,13 @@
/// This is basically like the arrayvec crate, except crappier, only the subset I need and
/// therefore without unsafe Rust.

pub struct ArrayVec<T: Copy, const CAP: usize> {
#[derive(Debug)]
pub(crate) struct ArrayVec<T: Copy, const CAP: usize> {
content: [T; CAP],
len: usize,
}

impl<T: Copy, const CAP: usize> ArrayVec<T, CAP> {
pub fn new(filler_item: T) -> Self {
pub(crate) fn new(filler_item: T) -> Self {
// filler_item is there to avoid usage of MaybeUninit, and can literally be anything at
// all.
ArrayVec {
@@ -16,12 +16,12 @@ impl<T: Copy, const CAP: usize> ArrayVec<T, CAP> {
}
}

pub fn push(&mut self, item: T) {
pub(crate) fn push(&mut self, item: T) {
self.content[self.len] = item;
self.len += 1;
}

pub fn drain(&mut self) -> &[T] {
pub(crate) fn drain(&mut self) -> &[T] {
let rv = &self.content[..self.len];
self.len = 0;
rv
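
A quick usage sketch for the now pub(crate) `ArrayVec` above. It only compiles inside the crate (e.g. as a unit test), and the element type and values here are made up:

```rust
#[cfg(test)]
mod arrayvec_usage {
    use crate::arrayvec::ArrayVec;

    #[test]
    fn push_then_drain() {
        // The filler item only exists to avoid MaybeUninit; it is never observed.
        let mut buf: ArrayVec<u8, 3> = ArrayVec::new(0);
        buf.push(1);
        buf.push(2);
        assert_eq!(buf.drain(), &[1u8, 2][..]);
        // drain() resets the length, so the buffer can be refilled.
        buf.push(3);
        assert_eq!(buf.drain(), &[3u8][..]);
    }
}
```
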
11 changes: 6 additions & 5 deletions src/char_validator.rs
@@ -1,6 +1,7 @@
use crate::arrayvec::ArrayVec;
use crate::{Emitter, Error};

#[derive(Debug)]
pub(crate) struct CharValidator {
last_4_bytes: u32,
character_error: ArrayVec<Error, 3>,
@@ -16,19 +17,19 @@ impl Default for CharValidator {
}

impl CharValidator {
pub fn reset(&mut self) {
pub(crate) fn reset(&mut self) {
self.last_4_bytes = 0;
}

#[inline]
pub fn validate_bytes<E: Emitter>(&mut self, emitter: &mut E, next_bytes: &[u8]) {
pub(crate) fn validate_bytes<E: Emitter>(&mut self, emitter: &mut E, next_bytes: &[u8]) {
for &x in next_bytes {
self.validate_byte(emitter, x);
}
}

#[inline]
pub fn validate_byte<E: Emitter>(&mut self, emitter: &mut E, next_byte: u8) {
pub(crate) fn validate_byte<E: Emitter>(&mut self, emitter: &mut E, next_byte: u8) {
if next_byte < 128 {
// start of character (ascii)
self.last_4_bytes = 0;
@@ -45,13 +46,13 @@ impl CharValidator {
}
}

pub fn flush_character_error<E: Emitter>(&mut self, emitter: &mut E) {
pub(crate) fn flush_character_error<E: Emitter>(&mut self, emitter: &mut E) {
for e in self.character_error.drain() {
emitter.emit_error(*e);
}
}

pub fn set_character_error<E: Emitter>(&mut self, emitter: &mut E, error: Error) {
pub(crate) fn set_character_error<E: Emitter>(&mut self, emitter: &mut E, error: Error) {
if self.last_4_bytes == 0 {
emitter.emit_error(error);
} else {
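
Background for the `next_byte < 128` branch above: the validator classifies raw bytes rather than decoded chars, so the only UTF-8 facts it needs are the standard byte classes. A stand-alone illustration of those classes (general UTF-8 knowledge, not the crate's exact logic):

```rust
// Bytes below 0x80 are complete ASCII characters, 0b10xxxxxx bytes continue a
// multi-byte sequence, and 0b11xxxxxx bytes start one.
fn byte_class(b: u8) -> &'static str {
    if b < 0x80 {
        "ascii (single-byte character)"
    } else if b & 0b1100_0000 == 0b1000_0000 {
        "continuation byte"
    } else {
        "lead byte of a multi-byte character"
    }
}

fn main() {
    for b in "aé€".bytes() {
        println!("{b:#04x}: {}", byte_class(b));
    }
}
```
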
2 changes: 1 addition & 1 deletion src/emitter.rs
@@ -173,7 +173,7 @@ pub trait Emitter {
}

/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens.
#[derive(Default)]
#[derive(Debug, Default)]
pub struct DefaultEmitter {
current_characters: Vec<u8>,
current_token: Option<Token>,
8 changes: 4 additions & 4 deletions src/entities.rs
@@ -2,14 +2,14 @@
// this file is autogenerated by
// curl https://html.spec.whatwg.org/entities.json | python generate_entities.py

pub struct CharRef {
pub(crate) struct CharRef {
/// Name as it appears escaped in HTML
pub name: &'static str,
pub(crate) name: &'static str,
/// Unescaped character codepoints
pub characters: &'static str,
pub(crate) characters: &'static str,
}

pub fn try_read_character_reference<E>(
pub(crate) fn try_read_character_reference<E>(
first_char: char,
try_read: impl FnMut(&str) -> Result<bool, E>,
) -> Result<Option<CharRef>, E> {
12 changes: 11 additions & 1 deletion src/lib.rs
@@ -2,9 +2,19 @@
// This is an HTML parser. HTML can be untrusted input from the internet.
#![forbid(unsafe_code)]
#![doc = include_str!("../README.md")]
#![warn(clippy::all, clippy::pedantic)]
#![warn(
absolute_paths_not_starting_with_crate,
rustdoc::invalid_html_tags,
missing_copy_implementations,
missing_debug_implementations,
semicolon_in_expressions_from_macros,
unreachable_pub,
unused_extern_crates,
variant_size_differences
)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::missing_errors_doc)]
#![allow(clippy::option_option)]
#![allow(clippy::too_many_lines)]

mod arrayvec;
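
Most of the `pub` → `pub(crate)` churn elsewhere in this diff follows from the new `unreachable_pub` warning. A small stand-alone illustration (not taken from the crate) of what that lint flags:

```rust
#![warn(unreachable_pub)]

mod internal {
    // Declared `pub`, but the private `internal` module is never re-exported,
    // so this item cannot actually be reached from outside the crate.
    pub fn helper() {} // warning: unreachable `pub` item

    // Matching the declared visibility to the real reachability silences it.
    pub(crate) fn helper_crate_only() {}
}

fn main() {
    internal::helper();
    internal::helper_crate_only();
}
```
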
4 changes: 3 additions & 1 deletion src/machine.rs
@@ -8,7 +8,9 @@ use crate::{Emitter, Error, Reader, Tokenizer};
// Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that
// should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance
#[inline]
pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error> {
pub(crate) fn consume<R: Reader, E: Emitter>(
slf: &mut Tokenizer<R, E>,
) -> Result<ControlToken, R::Error> {
macro_rules! mutate_character_reference {
(* $mul:literal + $x:ident - $sub:literal) => {
match slf
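
The note above works because of module privacy: since `consume` is a free function in machine.rs rather than a method in tokenizer.rs, it can only reach the `pub(crate)` parts of `Tokenizer`, while fields like `to_reconsume` and the reader stay off-limits. A reduced illustration of that pattern (hypothetical field names, not the crate's real layout):

```rust
mod tokenizer {
    pub struct Tokenizer {
        // Hypothetical pub(crate) field: reachable from machine::consume.
        pub(crate) errors_seen: usize,
        // Private field (like the real to_reconsume): only this module sees it.
        to_reconsume: Option<u8>,
    }

    impl Tokenizer {
        pub fn new() -> Self {
            Tokenizer { errors_seen: 0, to_reconsume: None }
        }

        pub fn pending(&self) -> Option<u8> {
            self.to_reconsume
        }
    }
}

mod machine {
    use super::tokenizer::Tokenizer;

    pub fn consume(slf: &mut Tokenizer) {
        slf.errors_seen += 1;
        // slf.to_reconsume = Some(b'x'); // would not compile: the field is private
    }
}

fn main() {
    let mut t = tokenizer::Tokenizer::new();
    machine::consume(&mut t);
    assert_eq!(t.errors_seen, 1);
    assert_eq!(t.pending(), None);
}
```
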
7 changes: 4 additions & 3 deletions src/machine_helper.rs
@@ -1,10 +1,11 @@
use crate::utils::{trace_log, State};
use crate::Emitter;

#[derive(Debug)]
pub(crate) struct MachineHelper {
pub temporary_buffer: Vec<u8>,
pub character_reference_code: u32,
pub state: State,
pub(crate) temporary_buffer: Vec<u8>,
pub(crate) character_reference_code: u32,
pub(crate) state: State,
return_state: Option<State>,
}

2 changes: 2 additions & 0 deletions src/read_helper.rs
@@ -2,9 +2,11 @@ use crate::char_validator::CharValidator;
use crate::Emitter;
use crate::Reader;

#[derive(Debug)]
pub(crate) struct ReadHelper<R: Reader> {
reader: R,
last_character_was_cr: bool,
#[allow(clippy::option_option)]
to_reconsume: Option<Option<u8>>,
}

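
On the new `#[allow(clippy::option_option)]`: my reading of `to_reconsume: Option<Option<u8>>` (an assumption, the diff does not spell it out) is that the outer Option says whether a read result is stashed for reconsumption at all, while the inner Option is the reader's own "next byte or end of input" value being replayed. A toy sketch of that shape:

```rust
struct ReconsumeBuffer {
    // Outer None: nothing stashed. Outer Some(inner): replay `inner`, where the
    // inner Option is itself "next byte or end of input".
    to_reconsume: Option<Option<u8>>,
}

impl ReconsumeBuffer {
    fn unread(&mut self, last_result: Option<u8>) {
        self.to_reconsume = Some(last_result);
    }

    fn read(&mut self, input: &mut impl Iterator<Item = u8>) -> Option<u8> {
        if let Some(stashed) = self.to_reconsume.take() {
            return stashed; // may replay a byte, or a previously seen end of input
        }
        input.next()
    }
}

fn main() {
    let mut buf = ReconsumeBuffer { to_reconsume: None };
    let mut input = b"<x".iter().copied();

    let first = buf.read(&mut input);
    assert_eq!(first, Some(b'<'));
    buf.unread(first); // put it back
    assert_eq!(buf.read(&mut input), Some(b'<')); // replayed
    assert_eq!(buf.read(&mut input), Some(b'x'));
    assert_eq!(buf.read(&mut input), None); // real end of input

    buf.unread(None); // even the end-of-input result can be reconsumed
    assert_eq!(buf.read(&mut input), None);
}
```
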
5 changes: 4 additions & 1 deletion src/reader.rs
@@ -141,6 +141,7 @@ impl<'a, R: 'a + Reader> Readable<'a> for R {
///
/// assert_eq!(new_html, "<title>hello world</title>");
/// ```
#[derive(Debug)]
pub struct StringReader<'a> {
input: &'a [u8],
}
@@ -273,11 +274,12 @@ impl<'a> Readable<'a> for &'a [u8] {
/// }
/// _ => panic!("unexpected input"),
/// }
///
///
/// }
///
/// assert_eq!(new_html, "<title>hello world</title>");
/// ```
#[derive(Debug)]
pub struct IoReader<R: Read> {
buf: Box<[u8; BUF_SIZE]>,
buf_offset: usize,
@@ -389,6 +391,7 @@ fn fast_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
debug_assert!(needle.len() <= 16);
let mut needle_arr = [0; 16];
needle_arr[..needle.len()].copy_from_slice(needle);
#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
jetscii::Bytes::new(needle_arr, needle.len() as i32, |b| needle.contains(&b)).find(haystack)
}

1 change: 1 addition & 0 deletions src/testutils.rs
@@ -35,6 +35,7 @@ pub fn trace_log(msg: &str) {
}

/// A kind of reader that implements `read_until` very poorly. Only available in tests
#[derive(Debug)]
pub struct SlowReader<R: Reader>(pub R);

impl<R: Reader> Reader for SlowReader<R> {
2 changes: 2 additions & 0 deletions src/tokenizer.rs
@@ -8,6 +8,7 @@ use crate::utils::{ControlToken, State};
use crate::{DefaultEmitter, Emitter, Readable, Reader};

/// A HTML tokenizer. See crate-level docs for basic usage.
#[derive(Debug)]
pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> {
eof: bool,
pub(crate) validator: CharValidator,
@@ -98,6 +99,7 @@ impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> {
/// `Result<Token, _>`.
///
/// This is the return value of [`Tokenizer::infallible`].
#[derive(Debug)]
pub struct InfallibleTokenizer<R: Reader<Error = Infallible>, E: Emitter>(Tokenizer<R, E>);

impl<R: Reader<Error = Infallible>, E: Emitter> Tokenizer<R, E> {
9 changes: 5 additions & 4 deletions src/utils.rs
@@ -49,9 +49,10 @@ macro_rules! noncharacter_pat {

pub(crate) use noncharacter_pat;

// When integration tests are running, this enum is public and we get warnings about missing docs.
// However, it's not actually part of public API.
#[allow(missing_docs)]
// When integration tests are running, this enum is public and we get a missing_docs warning.
// When integration tests are not running, this enum is NOT public and we get an unreachable_pub
// warning.
#[allow(missing_docs, unreachable_pub)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum State {
Data,
@@ -135,7 +136,7 @@ pub enum State {
NumericCharacterReferenceEnd,
}

pub enum ControlToken {
pub(crate) enum ControlToken {
Eof,
Continue,
}