Skip to content

Commit

Permalink
add dump-external links command (#168)
Browse files Browse the repository at this point in the history
Co-authored-by: Michael Clayton <mclayton@redhat.com>
  • Loading branch information
untitaker and mwcz authored Nov 29, 2023
1 parent bc8ae66 commit e2a7fae
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 66 deletions.
57 changes: 55 additions & 2 deletions src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@ use std::collections::BTreeMap;
use std::path::PathBuf;
use std::sync::Arc;

use crate::html::{Href, Link, UsedLink};
use bumpalo::collections::String as BumpString;
use bumpalo::Bump;

pub trait LinkCollector<P: Send>: Send {
use crate::html::{push_and_canonicalize, try_percent_decode, Href, Link, UsedLink};
use crate::urls::is_external_link;

pub trait LinkCollector<P>: Send {
fn new() -> Self;
fn ingest(&mut self, link: Link<'_, P>);
fn merge(&mut self, other: Self);
Expand Down Expand Up @@ -71,6 +75,54 @@ impl<P: Copy> LinkState<P> {
}
}

/// Adapter around another `LinkCollector` that drops external links and
/// canonicalizes local ones before forwarding them to the inner collector.
pub struct LocalLinksOnly<C> {
    /// The wrapped collector; receives only local (same-site) links.
    pub collector: C,
    // Bump arena used as scratch space when canonicalizing hrefs in `ingest`.
    arena: Bump,
}

pub fn canonicalize_local_link<'a, P>(arena: &Bump, mut link: Link<'a, P>) -> Option<Link<'a, P>> {
if let Link::Uses(ref mut used_link) = link {
if is_external_link(&used_link.href.0.as_bytes()) {

Check warning on line 85 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

this expression creates a reference which is immediately dereferenced by the compiler

warning: this expression creates a reference which is immediately dereferenced by the compiler --> src/collector.rs:85:29 | 85 | if is_external_link(&used_link.href.0.as_bytes()) { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: change this to: `used_link.href.0.as_bytes()` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow = note: `#[warn(clippy::needless_borrow)]` on by default
return None;
}

let qs_start = used_link
.href
.0
.find(&['?', '#'][..])
.unwrap_or_else(|| used_link.href.0.len());

Check warning on line 93 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

unnecessary closure used to substitute value for `Option::None`

warning: unnecessary closure used to substitute value for `Option::None` --> src/collector.rs:89:24 | 89 | let qs_start = used_link | ________________________^ 90 | | .href 91 | | .0 92 | | .find(&['?', '#'][..]) 93 | | .unwrap_or_else(|| used_link.href.0.len()); | |______________----------------------------------------^ | | | help: use `unwrap_or(..)` instead: `unwrap_or(used_link.href.0.len())` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unnecessary_lazy_evaluations = note: `#[warn(clippy::unnecessary_lazy_evaluations)]` on by default

// try calling canonicalize
let path = used_link.path.to_str().unwrap_or("");
let mut href = BumpString::from_str_in(path, &arena);

Check warning on line 97 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

this expression creates a reference which is immediately dereferenced by the compiler

warning: this expression creates a reference which is immediately dereferenced by the compiler --> src/collector.rs:97:54 | 97 | let mut href = BumpString::from_str_in(path, &arena); | ^^^^^^ help: change this to: `arena` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow
push_and_canonicalize(
&mut href,
&try_percent_decode(&used_link.href.0[..qs_start]),
);
}

Some(link)
}

impl<P, C: LinkCollector<P>> LinkCollector<P> for LocalLinksOnly<C> {
    // Construct the wrapper with a fresh inner collector and an empty arena.
    fn new() -> Self {
        LocalLinksOnly {
            collector: C::new(),
            arena: Bump::new(),
        }
    }

    // Forward only links that survive canonicalization; external links
    // (where `canonicalize_local_link` returns `None`) are dropped.
    fn ingest(&mut self, link: Link<'_, P>) {
        if let Some(link) = canonicalize_local_link(&self.arena, link) {
            self.collector.ingest(link);
        }
    }

    // Merging only combines the inner collectors; each wrapper keeps its own
    // arena, which is just scratch space.
    fn merge(&mut self, other: Self) {
        self.collector.merge(other.collector);
    }
}

/// Link collector used for actual link checking. Keeps track of broken links only.
pub struct BrokenLinkCollector<P> {
links: BTreeMap<String, LinkState<P>>,
Expand All @@ -89,6 +141,7 @@ impl<P: Send + Copy> LinkCollector<P> for BrokenLinkCollector<P> {
match link {
Link::Uses(used_link) => {
self.used_link_count += 1;

self.links
.entry(used_link.href.0.to_owned())
.and_modify(|state| state.add_usage(&used_link))
Expand Down
52 changes: 48 additions & 4 deletions src/html/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,18 @@ use bumpalo::collections::Vec as BumpVec;
use html5gum::{IoReader, Tokenizer};

use crate::paragraph::ParagraphWalker;
use crate::urls::is_external_link;

#[cfg(test)]
use pretty_assertions::assert_eq;

#[inline]
fn push_and_canonicalize(base: &mut BumpString, path: &str) {
if path.starts_with('/') {
pub fn push_and_canonicalize(base: &mut BumpString, path: &str) {
if is_external_link(path.as_bytes()) {
base.clear();
base.push_str(path);
return;
} else if path.starts_with('/') {
base.clear();
} else if path.is_empty() {
if base.ends_with('/') {
Expand Down Expand Up @@ -113,10 +118,42 @@ mod test_push_and_canonicalize {
push_and_canonicalize(&mut base, path);
assert_eq!(base, "foo/index.html/baz.html");
}

// An absolute external URL replaces the base path entirely.
#[test]
fn external_scheme_index() {
    let mut buf = String::from("index.html");
    push_and_canonicalize(&mut buf, "http://foo.com");
    assert_eq!(buf, "http://foo.com");
}

// An external URL against an empty base still comes through unchanged.
#[test]
fn external_scheme_empty_base() {
    let mut buf = String::new();
    push_and_canonicalize(&mut buf, "http://foo.com");
    assert_eq!(buf, "http://foo.com");
}

// Protocol-relative URLs (`//host`) count as external and replace the base.
#[test]
fn external_scheme_relative() {
    let mut buf = String::from("bar.html");
    push_and_canonicalize(&mut buf, "//foo.com");
    assert_eq!(buf, "//foo.com");
}

// An external URL wins even when the base sits inside a subdirectory.
#[test]
fn external_scheme_subdir() {
    let mut buf = String::from("foo/bar.html");
    push_and_canonicalize(&mut buf, "http://foo.com");
    assert_eq!(buf, "http://foo.com");
}
}

#[inline]
fn try_percent_decode(input: &str) -> Cow<'_, str> {
pub fn try_percent_decode(input: &str) -> Cow<'_, str> {
percent_encoding::percent_decode_str(input)
.decode_utf8()
.unwrap_or(Cow::Borrowed(input))
Expand Down Expand Up @@ -379,6 +416,9 @@ fn test_html_parsing_malformed_script() {

#[test]
fn test_document_links() {
use bumpalo::Bump;

use crate::collector::canonicalize_local_link;
use crate::paragraph::ParagraphHasher;

let doc = Document::new(
Expand Down Expand Up @@ -435,8 +475,12 @@ fn test_document_links() {
})
};

let arena = Bump::new();

assert_eq!(
&links.collect::<Vec<_>>(),
&links
.filter_map(|x| canonicalize_local_link(&arena, x))
.collect::<Vec<_>>(),
&[
used_link("platforms/ruby"),
used_link("platforms/perl"),
Expand Down
54 changes: 0 additions & 54 deletions src/html/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,42 +16,6 @@ fn try_normalize_href_value(input: &str) -> &str {
input.trim()
}

/// Returns `true` if `url` starts with a URL scheme (e.g. `http:`) or is a
/// protocol-relative URL (`//host`), i.e. if it should be treated as external.
///
/// Scheme characters follow RFC 2396, Appendix A: an ASCII letter first, then
/// letters, digits, `+`, `-` or `.` up to the terminating `:`.
#[inline]
fn is_bad_schema(url: &[u8]) -> bool {
    // Protocol-relative URLs are external regardless of scheme syntax.
    if url.starts_with(b"//") {
        return true;
    }

    // Empty input, or a first byte that cannot open a scheme, means "local".
    let first = match url.first() {
        Some(&byte) => byte,
        None => return false,
    };
    if !first.is_ascii_alphabetic() {
        return false;
    }

    // Scan until the ':' that terminates a scheme; any byte outside the
    // scheme alphabet means this is not a schemed URL after all.
    for &byte in &url[1..] {
        match byte {
            b':' => return true,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'+' | b'-' | b'.' => {}
            _ => return false,
        }
    }

    // Ran out of input before any ':' — a plain relative path like "http".
    false
}

#[derive(Default)]
pub struct ParserBuffers {
current_tag_name: Vec<u8>,
Expand Down Expand Up @@ -91,10 +55,6 @@ where
std::str::from_utf8(&self.buffers.current_attribute_value).unwrap(),
);

if is_bad_schema(value.as_bytes()) {
return;
}

self.link_buf.push(Link::Uses(UsedLink {
href: self.document.join(self.arena, self.check_anchors, value),
path: self.document.path.clone(),
Expand All @@ -113,10 +73,6 @@ where
.filter_map(|candidate: &str| candidate.split_whitespace().next())
.filter(|value| !value.is_empty())
{
if is_bad_schema(value.as_bytes()) {
continue;
}

self.link_buf.push(Link::Uses(UsedLink {
href: self.document.join(self.arena, self.check_anchors, value),
path: self.document.path.clone(),
Expand Down Expand Up @@ -276,13 +232,3 @@ where
fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
fn set_force_quirks(&mut self) {}
}

#[test]
fn test_is_bad_schema() {
    // Inputs that carry a scheme or are protocol-relative.
    for url in [&b"//"[..], b"http:", b"http:/"] {
        assert!(is_bad_schema(url));
    }
    // Inputs that are empty or plain relative paths.
    for url in [&b""[..], b"http", b"http/"] {
        assert!(!is_bad_schema(url));
    }
}
51 changes: 45 additions & 6 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ mod collector;
mod html;
mod markdown;
mod paragraph;
mod urls;

use std::cmp;
use std::collections::{BTreeMap, BTreeSet};
Expand All @@ -16,10 +17,12 @@ use jwalk::WalkDirGeneric;
use markdown::DocumentSource;
use rayon::prelude::*;

use collector::{BrokenLinkCollector, LinkCollector, UsedLinkCollector};
use collector::{BrokenLinkCollector, LinkCollector, LocalLinksOnly, UsedLinkCollector};
use html::{DefinedLink, Document, DocumentBuffers, Link};
use paragraph::{DebugParagraphWalker, NoopParagraphWalker, ParagraphHasher, ParagraphWalker};

use crate::urls::is_external_link;

static MARKDOWN_FILES: &[&str] = &["md", "mdx"];
static HTML_FILES: &[&str] = &["htm", "html"];

Expand Down Expand Up @@ -84,6 +87,11 @@ enum Subcommand {
base_path: PathBuf,
sources_path: PathBuf,
},

/// Dump out a list and count of _external_ links. hyperlink does not check external links,
/// but this subcommand can be used to get a summary of the external links that exist in your
/// site.
DumpExternalLinks { base_path: PathBuf },
}

fn main() -> Result<(), Error> {
Expand Down Expand Up @@ -115,6 +123,9 @@ fn main() -> Result<(), Error> {
}) => {
return match_all_paragraphs(base_path, sources_path);
}
Some(Subcommand::DumpExternalLinks { base_path }) => {
return dump_external_links(base_path);
}
None => {}
}

Expand Down Expand Up @@ -150,9 +161,10 @@ where
{
println!("Reading files");

let html_result = extract_html_links::<BrokenLinkCollector<_>, P>(&base_path, check_anchors)?;
let html_result =
extract_html_links::<LocalLinksOnly<BrokenLinkCollector<_>>, P>(&base_path, check_anchors)?;

let used_links_len = html_result.collector.used_links_count();
let used_links_len = html_result.collector.collector.used_links_count();
println!(
"Checking {} links from {} files ({} documents)",
used_links_len, html_result.file_count, html_result.documents_count,
Expand All @@ -163,6 +175,7 @@ where
let mut bad_anchors_count = 0;

let mut broken_links = html_result
.collector
.collector
.get_broken_links(check_anchors)
.peekable();
Expand Down Expand Up @@ -343,6 +356,31 @@ fn dump_paragraphs(path: PathBuf) -> Result<(), Error> {
Ok(())
}

/// Extracts every used link under `base_path` and prints the external ones
/// (one href per line) to stdout. hyperlink does not *check* external links;
/// this subcommand only summarizes them.
fn dump_external_links(base_path: PathBuf) -> Result<(), Error> {
    println!("Reading files");
    let html_result =
        extract_html_links::<UsedLinkCollector<_>, NoopParagraphWalker>(&base_path, true)?;

    println!(
        "Checking {} links from {} files ({} documents)",
        html_result.collector.used_links.len(),
        html_result.file_count,
        html_result.documents_count,
    );

    // Fixed: the original built a `Peekable` iterator but never peeked it —
    // a plain iteration is equivalent and simpler.
    for used_link in &html_result.collector.used_links {
        if is_external_link(used_link.href.as_bytes()) {
            println!("{}", used_link.href);
        }
    }

    // Skip dropping the (potentially large) result — the process is about to
    // exit anyway. NOTE(review): presumably a deliberate exit-speed
    // optimization; confirm it matches the other subcommands' behavior.
    mem::forget(html_result);

    Ok(())
}

struct HtmlResult<C> {
collector: C,
documents_count: usize,
Expand Down Expand Up @@ -491,8 +529,9 @@ fn extract_markdown_paragraphs<P: ParagraphWalker>(

fn match_all_paragraphs(base_path: PathBuf, sources_path: PathBuf) -> Result<(), Error> {
println!("Reading files");
let html_result =
extract_html_links::<UsedLinkCollector<_>, ParagraphHasher>(&base_path, true)?;
let html_result = extract_html_links::<LocalLinksOnly<UsedLinkCollector<_>>, ParagraphHasher>(
&base_path, true,
)?;

println!("Reading source files");
let paragraps_to_sourcefile = extract_markdown_paragraphs::<ParagraphHasher>(&sources_path)?;
Expand All @@ -505,7 +544,7 @@ fn match_all_paragraphs(base_path: PathBuf, sources_path: PathBuf) -> Result<(),
let mut link_single_source = 0;
// We only care about HTML's used links because paragraph matching is exclusively for error
// messages that point to the broken link.
for link in &html_result.collector.used_links {
for link in &html_result.collector.collector.used_links {
total_links += 1;

let paragraph = match link.paragraph {
Expand Down
Loading

0 comments on commit e2a7fae

Please sign in to comment.