Added more lints #31

Merged (4 commits) on Feb 1, 2022
45 changes: 29 additions & 16 deletions generate_entities.py
@@ -8,50 +8,59 @@
match_arms = {}

for key, value in key_and_value:
assert key[0] == '&'
assert key[0] == "&"
key = key[1:]
first_char = key[0]
key = key[1:]

match_arms.setdefault(first_char, []).append((key, value))

with open("src/entities.rs", "w") as f:
f.write("""
f.write(
"""
// @generated
// this file is autogenerated by
// curl https://html.spec.whatwg.org/entities.json | python generate_entities.py

pub struct CharRef {
pub(crate) struct CharRef {
/// Name as it appears escaped in HTML
pub name: &'static str,
pub(crate) name: &'static str,
/// Unescaped character codepoints
pub characters: &'static str,
pub(crate) characters: &'static str,
}

pub fn try_read_character_reference<E>(first_char: char, try_read: impl FnMut(&str) -> Result<bool, E>) -> Result<Option<CharRef>, E> {
pub(crate) fn try_read_character_reference<E>(first_char: char, try_read: impl FnMut(&str) -> Result<bool, E>) -> Result<Option<CharRef>, E> {
match first_char {
""")
"""
)

for first_char, if_statements in sorted(match_arms.items()):
# Write each branch of the match stmt as its own function such that
# compilation is faster.
f.write("""
f.write(
"""
'%(first_char)s' => {
#[allow(non_snake_case)]
fn branch_%(first_char)s<E>(mut try_read: impl FnMut(&str) -> Result<bool, E>) -> Result<Option<CharRef>, E> {
for (other_chars, characters) in &[
""" % {"first_char": first_char})
"""
% {"first_char": first_char}
)

for other_chars, value in if_statements:
characters = ""
for c in value['codepoints']:
for c in value["codepoints"]:
characters += r"\u{" + hex(c)[2:] + r"}"

f.write("""
f.write(
"""
("%(other_chars)s", "%(characters)s"),
""" % {"other_chars": other_chars, "characters": characters})
"""
% {"other_chars": other_chars, "characters": characters}
)

f.write("""
f.write(
"""
] {
if try_read(other_chars)? {
return Ok(Some(CharRef { name: other_chars, characters }));
@@ -64,10 +73,14 @@
branch_%(first_char)s(try_read)

}
""" % {"first_char": first_char})
"""
% {"first_char": first_char}
)

f.write("""
f.write(
"""
_ => Ok(None)
}
}
""")
"""
)
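
For context on the template above: the script groups entities by their first character and emits one `branch_*` function per character, so the huge match compiles as many small functions instead of one giant one. Below is a hedged sketch of the shape of the generated `src/entities.rs`, with made-up illustrative entries for `&amp;`/`&amp` only; the real table is produced from entities.json.

```rust
// Sketch of the generated file's shape (abridged, illustrative data only).
pub(crate) struct CharRef {
    /// Name as it appears escaped in HTML
    pub(crate) name: &'static str,
    /// Unescaped character codepoints
    pub(crate) characters: &'static str,
}

pub(crate) fn try_read_character_reference<E>(
    first_char: char,
    try_read: impl FnMut(&str) -> Result<bool, E>,
) -> Result<Option<CharRef>, E> {
    match first_char {
        'a' => {
            // One small function per first character keeps each piece cheap to compile.
            #[allow(non_snake_case)]
            fn branch_a<E>(
                mut try_read: impl FnMut(&str) -> Result<bool, E>,
            ) -> Result<Option<CharRef>, E> {
                for (other_chars, characters) in &[("mp;", "\u{26}"), ("mp", "\u{26}")] {
                    if try_read(other_chars)? {
                        return Ok(Some(CharRef { name: other_chars, characters }));
                    }
                }
                Ok(None)
            }
            branch_a(try_read)
        }
        _ => Ok(None),
    }
}

fn main() {
    // Trivial driver: "read" by comparing against a fixed input string.
    let input = "mp;";
    let found = try_read_character_reference::<()>('a', |name| Ok(input.starts_with(name)))
        .unwrap()
        .unwrap();
    assert_eq!(found.name, "mp;");
    assert_eq!(found.characters, "\u{26}");
}
```
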
10 changes: 5 additions & 5 deletions src/arrayvec.rs
@@ -1,13 +1,13 @@
/// This is basically like the arrayvec crate, except crappier, only the subset I need and
/// therefore without unsafe Rust.

pub struct ArrayVec<T: Copy, const CAP: usize> {
#[derive(Debug)]
pub(crate) struct ArrayVec<T: Copy, const CAP: usize> {
content: [T; CAP],
len: usize,
}

impl<T: Copy, const CAP: usize> ArrayVec<T, CAP> {
pub fn new(filler_item: T) -> Self {
pub(crate) fn new(filler_item: T) -> Self {
// filler_item is there to avoid usage of MaybeUninit, and can literally be anything at
// all.
ArrayVec {
@@ -16,12 +16,12 @@ impl<T: Copy, const CAP: usize> ArrayVec<T, CAP> {
}
}

pub fn push(&mut self, item: T) {
pub(crate) fn push(&mut self, item: T) {
self.content[self.len] = item;
self.len += 1;
}

pub fn drain(&mut self) -> &[T] {
pub(crate) fn drain(&mut self) -> &[T] {
let rv = &self.content[..self.len];
self.len = 0;
rv
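
A quick usage sketch for the now pub(crate) `ArrayVec` above. It only compiles inside the crate (e.g. as a unit test), and the element type and values here are made up:

```rust
#[cfg(test)]
mod arrayvec_usage {
    use crate::arrayvec::ArrayVec;

    #[test]
    fn push_then_drain() {
        // The filler item only exists to avoid MaybeUninit; it is never observed.
        let mut buf: ArrayVec<u8, 3> = ArrayVec::new(0);
        buf.push(1);
        buf.push(2);
        assert_eq!(buf.drain(), &[1u8, 2][..]);
        // drain() resets the length, so the buffer can be refilled.
        buf.push(3);
        assert_eq!(buf.drain(), &[3u8][..]);
    }
}
```
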
11 changes: 6 additions & 5 deletions src/char_validator.rs
@@ -1,6 +1,7 @@
use crate::arrayvec::ArrayVec;
use crate::{Emitter, Error};

#[derive(Debug)]
pub(crate) struct CharValidator {
last_4_bytes: u32,
character_error: ArrayVec<Error, 3>,
@@ -16,19 +17,19 @@ impl Default for CharValidator {
}

impl CharValidator {
pub fn reset(&mut self) {
pub(crate) fn reset(&mut self) {
self.last_4_bytes = 0;
}

#[inline]
pub fn validate_bytes<E: Emitter>(&mut self, emitter: &mut E, next_bytes: &[u8]) {
pub(crate) fn validate_bytes<E: Emitter>(&mut self, emitter: &mut E, next_bytes: &[u8]) {
for &x in next_bytes {
self.validate_byte(emitter, x);
}
}

#[inline]
pub fn validate_byte<E: Emitter>(&mut self, emitter: &mut E, next_byte: u8) {
pub(crate) fn validate_byte<E: Emitter>(&mut self, emitter: &mut E, next_byte: u8) {
if next_byte < 128 {
// start of character (ascii)
self.last_4_bytes = 0;
@@ -45,13 +46,13 @@ impl CharValidator {
}
}

pub fn flush_character_error<E: Emitter>(&mut self, emitter: &mut E) {
pub(crate) fn flush_character_error<E: Emitter>(&mut self, emitter: &mut E) {
for e in self.character_error.drain() {
emitter.emit_error(*e);
}
}

pub fn set_character_error<E: Emitter>(&mut self, emitter: &mut E, error: Error) {
pub(crate) fn set_character_error<E: Emitter>(&mut self, emitter: &mut E, error: Error) {
if self.last_4_bytes == 0 {
emitter.emit_error(error);
} else {
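
Background for the `next_byte < 128` branch above: the validator classifies raw bytes rather than decoded chars, so the only UTF-8 facts it needs are the standard byte classes. A stand-alone illustration of those classes (general UTF-8 knowledge, not the crate's exact logic):

```rust
// Bytes below 0x80 are complete ASCII characters, 0b10xxxxxx bytes continue a
// multi-byte sequence, and 0b11xxxxxx bytes start one.
fn byte_class(b: u8) -> &'static str {
    if b < 0x80 {
        "ascii (single-byte character)"
    } else if b & 0b1100_0000 == 0b1000_0000 {
        "continuation byte"
    } else {
        "lead byte of a multi-byte character"
    }
}

fn main() {
    for b in "aé€".bytes() {
        println!("{b:#04x}: {}", byte_class(b));
    }
}
```
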
2 changes: 1 addition & 1 deletion src/emitter.rs
@@ -173,7 +173,7 @@ pub trait Emitter {
}

/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens.
#[derive(Default)]
#[derive(Debug, Default)]
pub struct DefaultEmitter {
current_characters: Vec<u8>,
current_token: Option<Token>,
8 changes: 4 additions & 4 deletions src/entities.rs
@@ -2,14 +2,14 @@
// this file is autogenerated by
// curl https://html.spec.whatwg.org/entities.json | python generate_entities.py

pub struct CharRef {
pub(crate) struct CharRef {
/// Name as it appears escaped in HTML
pub name: &'static str,
pub(crate) name: &'static str,
/// Unescaped character codepoints
pub characters: &'static str,
pub(crate) characters: &'static str,
}

pub fn try_read_character_reference<E>(
pub(crate) fn try_read_character_reference<E>(
first_char: char,
try_read: impl FnMut(&str) -> Result<bool, E>,
) -> Result<Option<CharRef>, E> {
12 changes: 11 additions & 1 deletion src/lib.rs
@@ -2,9 +2,19 @@
// This is an HTML parser. HTML can be untrusted input from the internet.
#![forbid(unsafe_code)]
#![doc = include_str!("../README.md")]
#![warn(clippy::all, clippy::pedantic)]
#![warn(
absolute_paths_not_starting_with_crate,
rustdoc::invalid_html_tags,
missing_copy_implementations,
missing_debug_implementations,
semicolon_in_expressions_from_macros,
unreachable_pub,
unused_extern_crates,
variant_size_differences
)]
#![allow(clippy::module_name_repetitions)]
#![allow(clippy::missing_errors_doc)]
#![allow(clippy::option_option)]
#![allow(clippy::too_many_lines)]

mod arrayvec;
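
Most of the `pub` → `pub(crate)` churn elsewhere in this diff follows from the new `unreachable_pub` warning. A small stand-alone illustration (not taken from the crate) of what that lint flags:

```rust
#![warn(unreachable_pub)]

mod internal {
    // Declared `pub`, but the private `internal` module is never re-exported,
    // so this item cannot actually be reached from outside the crate.
    pub fn helper() {} // warning: unreachable `pub` item

    // Matching the declared visibility to the real reachability silences it.
    pub(crate) fn helper_crate_only() {}
}

fn main() {
    internal::helper();
    internal::helper_crate_only();
}
```
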
4 changes: 3 additions & 1 deletion src/machine.rs
@@ -8,7 +8,9 @@ use crate::{Emitter, Error, Reader, Tokenizer};
// Note: This is not implemented as a method on Tokenizer because there's fields on Tokenizer that
// should not be available in this method, such as Tokenizer.to_reconsume or the Reader instance
#[inline]
pub fn consume<R: Reader, E: Emitter>(slf: &mut Tokenizer<R, E>) -> Result<ControlToken, R::Error> {
pub(crate) fn consume<R: Reader, E: Emitter>(
slf: &mut Tokenizer<R, E>,
) -> Result<ControlToken, R::Error> {
macro_rules! mutate_character_reference {
(* $mul:literal + $x:ident - $sub:literal) => {
match slf
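
The note above works because of module privacy: since `consume` is a free function in machine.rs rather than a method in tokenizer.rs, it can only reach the `pub(crate)` parts of `Tokenizer`, while fields like `to_reconsume` and the reader stay off-limits. A reduced illustration of that pattern (hypothetical field names, not the crate's real layout):

```rust
mod tokenizer {
    pub struct Tokenizer {
        // Hypothetical pub(crate) field: reachable from machine::consume.
        pub(crate) errors_seen: usize,
        // Private field (like the real to_reconsume): only this module sees it.
        to_reconsume: Option<u8>,
    }

    impl Tokenizer {
        pub fn new() -> Self {
            Tokenizer { errors_seen: 0, to_reconsume: None }
        }

        pub fn pending(&self) -> Option<u8> {
            self.to_reconsume
        }
    }
}

mod machine {
    use super::tokenizer::Tokenizer;

    pub fn consume(slf: &mut Tokenizer) {
        slf.errors_seen += 1;
        // slf.to_reconsume = Some(b'x'); // would not compile: the field is private
    }
}

fn main() {
    let mut t = tokenizer::Tokenizer::new();
    machine::consume(&mut t);
    assert_eq!(t.errors_seen, 1);
    assert_eq!(t.pending(), None);
}
```
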
7 changes: 4 additions & 3 deletions src/machine_helper.rs
@@ -1,10 +1,11 @@
use crate::utils::{trace_log, State};
use crate::Emitter;

#[derive(Debug)]
pub(crate) struct MachineHelper {
pub temporary_buffer: Vec<u8>,
pub character_reference_code: u32,
pub state: State,
pub(crate) temporary_buffer: Vec<u8>,
pub(crate) character_reference_code: u32,
pub(crate) state: State,
return_state: Option<State>,
}

2 changes: 2 additions & 0 deletions src/read_helper.rs
@@ -2,9 +2,11 @@ use crate::char_validator::CharValidator;
use crate::Emitter;
use crate::Reader;

#[derive(Debug)]
pub(crate) struct ReadHelper<R: Reader> {
reader: R,
last_character_was_cr: bool,
#[allow(clippy::option_option)]
to_reconsume: Option<Option<u8>>,
}

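
On the new `#[allow(clippy::option_option)]`: my reading of `to_reconsume: Option<Option<u8>>` (an assumption, the diff does not spell it out) is that the outer Option says whether a read result is stashed for reconsumption at all, while the inner Option is the reader's own "next byte or end of input" value being replayed. A toy sketch of that shape:

```rust
struct ReconsumeBuffer {
    // Outer None: nothing stashed. Outer Some(inner): replay `inner`, where the
    // inner Option is itself "next byte or end of input".
    to_reconsume: Option<Option<u8>>,
}

impl ReconsumeBuffer {
    fn unread(&mut self, last_result: Option<u8>) {
        self.to_reconsume = Some(last_result);
    }

    fn read(&mut self, input: &mut impl Iterator<Item = u8>) -> Option<u8> {
        if let Some(stashed) = self.to_reconsume.take() {
            return stashed; // may replay a byte, or a previously seen end of input
        }
        input.next()
    }
}

fn main() {
    let mut buf = ReconsumeBuffer { to_reconsume: None };
    let mut input = b"<x".iter().copied();

    let first = buf.read(&mut input);
    assert_eq!(first, Some(b'<'));
    buf.unread(first); // put it back
    assert_eq!(buf.read(&mut input), Some(b'<')); // replayed
    assert_eq!(buf.read(&mut input), Some(b'x'));
    assert_eq!(buf.read(&mut input), None); // real end of input

    buf.unread(None); // even the end-of-input result can be reconsumed
    assert_eq!(buf.read(&mut input), None);
}
```
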
5 changes: 4 additions & 1 deletion src/reader.rs
@@ -141,6 +141,7 @@ impl<'a, R: 'a + Reader> Readable<'a> for R {
///
/// assert_eq!(new_html, "<title>hello world</title>");
/// ```
#[derive(Debug)]
pub struct StringReader<'a> {
input: &'a [u8],
}
@@ -273,11 +274,12 @@ impl<'a> Readable<'a> for &'a [u8] {
/// }
/// _ => panic!("unexpected input"),
/// }
///
///
/// }
///
/// assert_eq!(new_html, "<title>hello world</title>");
/// ```
#[derive(Debug)]
pub struct IoReader<R: Read> {
buf: Box<[u8; BUF_SIZE]>,
buf_offset: usize,
@@ -389,6 +391,7 @@ fn fast_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
debug_assert!(needle.len() <= 16);
let mut needle_arr = [0; 16];
needle_arr[..needle.len()].copy_from_slice(needle);
#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
jetscii::Bytes::new(needle_arr, needle.len() as i32, |b| needle.contains(&b)).find(haystack)
}

1 change: 1 addition & 0 deletions src/testutils.rs
@@ -35,6 +35,7 @@ pub fn trace_log(msg: &str) {
}

/// A kind of reader that implements `read_until` very poorly. Only available in tests
#[derive(Debug)]
pub struct SlowReader<R: Reader>(pub R);

impl<R: Reader> Reader for SlowReader<R> {
2 changes: 2 additions & 0 deletions src/tokenizer.rs
@@ -8,6 +8,7 @@ use crate::utils::{ControlToken, State};
use crate::{DefaultEmitter, Emitter, Readable, Reader};

/// A HTML tokenizer. See crate-level docs for basic usage.
#[derive(Debug)]
pub struct Tokenizer<R: Reader, E: Emitter = DefaultEmitter> {
eof: bool,
pub(crate) validator: CharValidator,
@@ -98,6 +99,7 @@ impl<R: Reader, E: Emitter> Iterator for Tokenizer<R, E> {
/// `Result<Token, _>`.
///
/// This is the return value of [`Tokenizer::infallible`].
#[derive(Debug)]
pub struct InfallibleTokenizer<R: Reader<Error = Infallible>, E: Emitter>(Tokenizer<R, E>);

impl<R: Reader<Error = Infallible>, E: Emitter> Tokenizer<R, E> {
9 changes: 5 additions & 4 deletions src/utils.rs
@@ -49,9 +49,10 @@ macro_rules! noncharacter_pat {

pub(crate) use noncharacter_pat;

// When integration tests are running, this enum is public and we get warnings about missing docs.
// However, it's not actually part of public API.
#[allow(missing_docs)]
// When integration tests are running, this enum is public and we get a missing_docs warning.
// When integration tests are not running, this enum is NOT public and we get an unreachable_pub
// warning.
#[allow(missing_docs, unreachable_pub)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum State {
Data,
@@ -135,7 +136,7 @@ pub enum State {
NumericCharacterReferenceEnd,
}

pub enum ControlToken {
pub(crate) enum ControlToken {
Eof,
Continue,
}