Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add dump-external-links command #168

Merged
merged 8 commits into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 55 additions & 2 deletions src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
use std::path::PathBuf;
use std::sync::Arc;

use crate::html::{Href, Link, UsedLink};
use bumpalo::collections::String as BumpString;
use bumpalo::Bump;

pub trait LinkCollector<P: Send>: Send {
use crate::html::{push_and_canonicalize, try_percent_decode, Href, Link, UsedLink};
use crate::urls::is_external_link;

pub trait LinkCollector<P>: Send {
fn new() -> Self;
fn ingest(&mut self, link: Link<'_, P>);
fn merge(&mut self, other: Self);
Expand Down Expand Up @@ -65,12 +69,60 @@
LinkState::Defined => (),
LinkState::Undefined(links) => match other {
LinkState::Defined => *self = LinkState::Defined,
LinkState::Undefined(links2) => links.extend(links2.into_iter()),

Check warning on line 72 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

explicit call to `.into_iter()` in function argument accepting `IntoIterator`

warning: explicit call to `.into_iter()` in function argument accepting `IntoIterator` --> src/collector.rs:72:62 | 72 | LinkState::Undefined(links2) => links.extend(links2.into_iter()), | ^^^^^^^^^^^^^^^^^^ help: consider removing the `.into_iter()`: `links2` | note: this parameter accepts any `IntoIterator`, so you don't need to call `.into_iter()` --> /rustc/79e9716c980570bfd1f666e3b16ac583f0168962/library/core/src/iter/traits/collect.rs:371:18 = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#useless_conversion = note: `#[warn(clippy::useless_conversion)]` on by default
},
}
}
}

/// Collector adapter that forwards only site-local links to the wrapped
/// collector; external links are filtered out during `ingest`.
pub struct LocalLinksOnly<C> {
    pub collector: C,
    // Bump arena used for temporary string allocations while canonicalizing
    // hrefs (see `canonicalize_local_link`).
    arena: Bump,
}

pub fn canonicalize_local_link<'a, P>(arena: &Bump, mut link: Link<'a, P>) -> Option<Link<'a, P>> {
if let Link::Uses(ref mut used_link) = link {
if is_external_link(&used_link.href.0.as_bytes()) {

Check warning on line 85 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

this expression creates a reference which is immediately dereferenced by the compiler

warning: this expression creates a reference which is immediately dereferenced by the compiler --> src/collector.rs:85:29 | 85 | if is_external_link(&used_link.href.0.as_bytes()) { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: change this to: `used_link.href.0.as_bytes()` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow = note: `#[warn(clippy::needless_borrow)]` on by default
return None;
}

let qs_start = used_link
.href
.0
.find(&['?', '#'][..])
.unwrap_or_else(|| used_link.href.0.len());

Check warning on line 93 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

unnecessary closure used to substitute value for `Option::None`

warning: unnecessary closure used to substitute value for `Option::None` --> src/collector.rs:89:24 | 89 | let qs_start = used_link | ________________________^ 90 | | .href 91 | | .0 92 | | .find(&['?', '#'][..]) 93 | | .unwrap_or_else(|| used_link.href.0.len()); | |______________----------------------------------------^ | | | help: use `unwrap_or(..)` instead: `unwrap_or(used_link.href.0.len())` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unnecessary_lazy_evaluations = note: `#[warn(clippy::unnecessary_lazy_evaluations)]` on by default

// try calling canonicalize
let path = used_link.path.to_str().unwrap_or("");
let mut href = BumpString::from_str_in(path, &arena);

Check warning on line 97 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

this expression creates a reference which is immediately dereferenced by the compiler

warning: this expression creates a reference which is immediately dereferenced by the compiler --> src/collector.rs:97:54 | 97 | let mut href = BumpString::from_str_in(path, &arena); | ^^^^^^ help: change this to: `arena` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow
push_and_canonicalize(
&mut href,
&try_percent_decode(&used_link.href.0[..qs_start]),
);
}

Some(link)
}

impl<P, C: LinkCollector<P>> LinkCollector<P> for LocalLinksOnly<C> {
    /// Create a wrapper around a fresh inner collector and a fresh arena.
    fn new() -> Self {
        LocalLinksOnly {
            collector: C::new(),
            arena: Bump::new(),
        }
    }

    /// Forward the link to the inner collector, unless canonicalization
    /// determined it is external (in which case it is dropped).
    fn ingest(&mut self, link: Link<'_, P>) {
        let maybe_local = canonicalize_local_link(&self.arena, link);
        if let Some(local_link) = maybe_local {
            self.collector.ingest(local_link);
        }
    }

    /// Merging only combines the inner collectors; each wrapper keeps its own
    /// arena.
    fn merge(&mut self, other: Self) {
        let LocalLinksOnly { collector, .. } = other;
        self.collector.merge(collector);
    }
}

/// Link collector used for actual link checking. Keeps track of broken links only.
pub struct BrokenLinkCollector<P> {
links: BTreeMap<String, LinkState<P>>,
Expand All @@ -89,6 +141,7 @@
match link {
Link::Uses(used_link) => {
self.used_link_count += 1;

self.links
.entry(used_link.href.0.to_owned())
.and_modify(|state| state.add_usage(&used_link))
Expand Down
52 changes: 48 additions & 4 deletions src/html/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,18 @@ use bumpalo::collections::Vec as BumpVec;
use html5gum::{IoReader, Tokenizer};

use crate::paragraph::ParagraphWalker;
use crate::urls::is_external_link;

#[cfg(test)]
use pretty_assertions::assert_eq;

#[inline]
fn push_and_canonicalize(base: &mut BumpString, path: &str) {
if path.starts_with('/') {
pub fn push_and_canonicalize(base: &mut BumpString, path: &str) {
if is_external_link(path.as_bytes()) {
base.clear();
base.push_str(path);
return;
} else if path.starts_with('/') {
base.clear();
} else if path.is_empty() {
if base.ends_with('/') {
Expand Down Expand Up @@ -113,10 +118,42 @@ mod test_push_and_canonicalize {
push_and_canonicalize(&mut base, path);
assert_eq!(base, "foo/index.html/baz.html");
}

#[test]
fn external_scheme_index() {
    // An absolute external URL replaces the base path entirely.
    let mut buf = String::from("index.html");
    push_and_canonicalize(&mut buf, "http://foo.com");
    assert_eq!(buf, "http://foo.com");
}

#[test]
fn external_scheme_empty_base() {
    // Even with an empty base, an external URL is kept verbatim.
    let mut buf = String::from("");
    push_and_canonicalize(&mut buf, "http://foo.com");
    assert_eq!(buf, "http://foo.com");
}

#[test]
fn external_scheme_relative() {
    // Protocol-relative URLs ("//host") also replace the base entirely.
    let mut buf = String::from("bar.html");
    push_and_canonicalize(&mut buf, "//foo.com");
    assert_eq!(buf, "//foo.com");
}

#[test]
fn external_scheme_subdir() {
    // A base inside a subdirectory is likewise discarded for external URLs.
    let mut buf = String::from("foo/bar.html");
    push_and_canonicalize(&mut buf, "http://foo.com");
    assert_eq!(buf, "http://foo.com");
}
}

#[inline]
fn try_percent_decode(input: &str) -> Cow<'_, str> {
pub fn try_percent_decode(input: &str) -> Cow<'_, str> {
percent_encoding::percent_decode_str(input)
.decode_utf8()
.unwrap_or(Cow::Borrowed(input))
Expand Down Expand Up @@ -379,6 +416,9 @@ fn test_html_parsing_malformed_script() {

#[test]
fn test_document_links() {
use bumpalo::Bump;

use crate::collector::canonicalize_local_link;
use crate::paragraph::ParagraphHasher;

let doc = Document::new(
Expand Down Expand Up @@ -435,8 +475,12 @@ fn test_document_links() {
})
};

let arena = Bump::new();

assert_eq!(
&links.collect::<Vec<_>>(),
&links
.filter_map(|x| canonicalize_local_link(&arena, x))
.collect::<Vec<_>>(),
&[
used_link("platforms/ruby"),
used_link("platforms/perl"),
Expand Down
54 changes: 0 additions & 54 deletions src/html/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,42 +16,6 @@ fn try_normalize_href_value(input: &str) -> &str {
input.trim()
}

#[inline]
fn is_bad_schema(url: &[u8]) -> bool {
    // Protocol-relative URLs ("//host/path") point off-site.
    if url.starts_with(b"//") {
        return true;
    }

    // Otherwise the URL is external iff the text before the first ':' is a
    // valid URL scheme: an ASCII letter followed by letters, digits, '+', '-'
    // or '.' (RFC 2396, Appendix A).
    let first_char = match url.first() {
        Some(c) => c,
        None => return false, // empty URL has no scheme
    };
    if !first_char.is_ascii_alphabetic() {
        return false;
    }

    url[1..]
        .iter()
        .find_map(|&byte| match byte {
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'+' | b'-' | b'.' => None,
            b':' => Some(true),  // complete scheme delimiter reached
            _ => Some(false),    // invalid character before any ':'
        })
        .unwrap_or(false) // string ended without ':' — no scheme
}

#[derive(Default)]
pub struct ParserBuffers {
current_tag_name: Vec<u8>,
Expand Down Expand Up @@ -91,10 +55,6 @@ where
std::str::from_utf8(&self.buffers.current_attribute_value).unwrap(),
);

if is_bad_schema(value.as_bytes()) {
return;
}

self.link_buf.push(Link::Uses(UsedLink {
href: self.document.join(self.arena, self.check_anchors, value),
path: self.document.path.clone(),
Expand All @@ -113,10 +73,6 @@ where
.filter_map(|candidate: &str| candidate.split_whitespace().next())
.filter(|value| !value.is_empty())
{
if is_bad_schema(value.as_bytes()) {
continue;
}

self.link_buf.push(Link::Uses(UsedLink {
href: self.document.join(self.arena, self.check_anchors, value),
path: self.document.path.clone(),
Expand Down Expand Up @@ -276,13 +232,3 @@ where
// No-op callbacks — presumably html5gum Emitter hooks (impl header not
// visible here): doctype metadata and quirks mode are irrelevant for link
// extraction, so these events are deliberately ignored.
fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
fn set_force_quirks(&mut self) {}
}

#[test]
fn test_is_bad_schema() {
    // External: protocol-relative or scheme-qualified.
    for url in [&b"//"[..], b"http:", b"http:/"] {
        assert!(is_bad_schema(url));
    }
    // Local: empty, bare word, or path-like without a scheme delimiter.
    for url in [&b""[..], b"http", b"http/"] {
        assert!(!is_bad_schema(url));
    }
}
51 changes: 45 additions & 6 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ mod collector;
mod html;
mod markdown;
mod paragraph;
mod urls;

use std::cmp;
use std::collections::{BTreeMap, BTreeSet};
Expand All @@ -16,10 +17,12 @@ use jwalk::WalkDirGeneric;
use markdown::DocumentSource;
use rayon::prelude::*;

use collector::{BrokenLinkCollector, LinkCollector, UsedLinkCollector};
use collector::{BrokenLinkCollector, LinkCollector, LocalLinksOnly, UsedLinkCollector};
use html::{DefinedLink, Document, DocumentBuffers, Link};
use paragraph::{DebugParagraphWalker, NoopParagraphWalker, ParagraphHasher, ParagraphWalker};

use crate::urls::is_external_link;

static MARKDOWN_FILES: &[&str] = &["md", "mdx"];
static HTML_FILES: &[&str] = &["htm", "html"];

Expand Down Expand Up @@ -84,6 +87,11 @@ enum Subcommand {
base_path: PathBuf,
sources_path: PathBuf,
},

/// Dump out a list and count of _external_ links. hyperlink does not check external links,
/// but this subcommand can be used to get a summary of the external links that exist in your
/// site.
DumpExternalLinks { base_path: PathBuf },
}

fn main() -> Result<(), Error> {
Expand Down Expand Up @@ -115,6 +123,9 @@ fn main() -> Result<(), Error> {
}) => {
return match_all_paragraphs(base_path, sources_path);
}
Some(Subcommand::DumpExternalLinks { base_path }) => {
return dump_external_links(base_path);
}
None => {}
}

Expand Down Expand Up @@ -150,9 +161,10 @@ where
{
println!("Reading files");

let html_result = extract_html_links::<BrokenLinkCollector<_>, P>(&base_path, check_anchors)?;
let html_result =
extract_html_links::<LocalLinksOnly<BrokenLinkCollector<_>>, P>(&base_path, check_anchors)?;

let used_links_len = html_result.collector.used_links_count();
let used_links_len = html_result.collector.collector.used_links_count();
println!(
"Checking {} links from {} files ({} documents)",
used_links_len, html_result.file_count, html_result.documents_count,
Expand All @@ -163,6 +175,7 @@ where
let mut bad_anchors_count = 0;

let mut broken_links = html_result
.collector
.collector
.get_broken_links(check_anchors)
.peekable();
Expand Down Expand Up @@ -343,6 +356,31 @@ fn dump_paragraphs(path: PathBuf) -> Result<(), Error> {
Ok(())
}

/// Print every external link found under `base_path`, one per line.
///
/// hyperlink does not check external links; this subcommand merely enumerates
/// them so the user can summarize them or feed them into another tool.
///
/// # Errors
///
/// Propagates any error from `extract_html_links` (file traversal/parsing).
fn dump_external_links(base_path: PathBuf) -> Result<(), Error> {
    println!("Reading files");
    let html_result =
        extract_html_links::<UsedLinkCollector<_>, NoopParagraphWalker>(&base_path, true)?;

    println!(
        "Checking {} links from {} files ({} documents)",
        html_result.collector.used_links.len(),
        html_result.file_count,
        html_result.documents_count,
    );

    // Plain iteration — the previous `.peekable()` was never peeked.
    for used_link in &html_result.collector.used_links {
        if is_external_link(used_link.href.as_bytes()) {
            println!("{}", used_link.href);
        }
    }

    // NOTE(review): intentionally leaks the results instead of dropping them —
    // presumably to skip destructor work on exit; confirm before removing.
    mem::forget(html_result);

    Ok(())
}

struct HtmlResult<C> {
collector: C,
documents_count: usize,
Expand Down Expand Up @@ -491,8 +529,9 @@ fn extract_markdown_paragraphs<P: ParagraphWalker>(

fn match_all_paragraphs(base_path: PathBuf, sources_path: PathBuf) -> Result<(), Error> {
println!("Reading files");
let html_result =
extract_html_links::<UsedLinkCollector<_>, ParagraphHasher>(&base_path, true)?;
let html_result = extract_html_links::<LocalLinksOnly<UsedLinkCollector<_>>, ParagraphHasher>(
&base_path, true,
)?;

println!("Reading source files");
let paragraps_to_sourcefile = extract_markdown_paragraphs::<ParagraphHasher>(&sources_path)?;
Expand All @@ -505,7 +544,7 @@ fn match_all_paragraphs(base_path: PathBuf, sources_path: PathBuf) -> Result<(),
let mut link_single_source = 0;
// We only care about HTML's used links because paragraph matching is exclusively for error
// messages that point to the broken link.
for link in &html_result.collector.used_links {
for link in &html_result.collector.collector.used_links {
total_links += 1;

let paragraph = match link.paragraph {
Expand Down
Loading
Loading