Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add dump-external-links command #168

Merged
merged 8 commits into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 55 additions & 2 deletions src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
use std::path::PathBuf;
use std::sync::Arc;

use crate::html::{Href, Link, UsedLink};
use bumpalo::collections::String as BumpString;
use bumpalo::Bump;

pub trait LinkCollector<P: Send>: Send {
use crate::html::{push_and_canonicalize, try_percent_decode, Href, Link, UsedLink};
use crate::urls::is_external_link;

pub trait LinkCollector<P>: Send {
fn new() -> Self;
fn ingest(&mut self, link: Link<'_, P>);
fn merge(&mut self, other: Self);
Expand Down Expand Up @@ -65,12 +69,60 @@
LinkState::Defined => (),
LinkState::Undefined(links) => match other {
LinkState::Defined => *self = LinkState::Defined,
LinkState::Undefined(links2) => links.extend(links2.into_iter()),

Check warning on line 72 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

explicit call to `.into_iter()` in function argument accepting `IntoIterator`

warning: explicit call to `.into_iter()` in function argument accepting `IntoIterator` --> src/collector.rs:72:62 | 72 | LinkState::Undefined(links2) => links.extend(links2.into_iter()), | ^^^^^^^^^^^^^^^^^^ help: consider removing the `.into_iter()`: `links2` | note: this parameter accepts any `IntoIterator`, so you don't need to call `.into_iter()` --> /rustc/79e9716c980570bfd1f666e3b16ac583f0168962/library/core/src/iter/traits/collect.rs:371:18 = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#useless_conversion = note: `#[warn(clippy::useless_conversion)]` on by default
},
}
}
}

/// Collector adapter that forwards only site-local links to the wrapped
/// collector; external links are filtered out during `ingest`.
pub struct LocalLinksOnly<C> {
    pub collector: C,
    // Bump arena used for temporary string allocations while canonicalizing
    // hrefs (see `canonicalize_local_link`).
    arena: Bump,
}

pub fn canonicalize_local_link<'a, P>(arena: &Bump, mut link: Link<'a, P>) -> Option<Link<'a, P>> {
if let Link::Uses(ref mut used_link) = link {
if is_external_link(&used_link.href.0.as_bytes()) {

Check warning on line 85 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

this expression creates a reference which is immediately dereferenced by the compiler

warning: this expression creates a reference which is immediately dereferenced by the compiler --> src/collector.rs:85:29 | 85 | if is_external_link(&used_link.href.0.as_bytes()) { | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: change this to: `used_link.href.0.as_bytes()` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow = note: `#[warn(clippy::needless_borrow)]` on by default
return None;
}

let qs_start = used_link
.href
.0
.find(&['?', '#'][..])
.unwrap_or_else(|| used_link.href.0.len());

Check warning on line 93 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

unnecessary closure used to substitute value for `Option::None`

warning: unnecessary closure used to substitute value for `Option::None` --> src/collector.rs:89:24 | 89 | let qs_start = used_link | ________________________^ 90 | | .href 91 | | .0 92 | | .find(&['?', '#'][..]) 93 | | .unwrap_or_else(|| used_link.href.0.len()); | |______________----------------------------------------^ | | | help: use `unwrap_or(..)` instead: `unwrap_or(used_link.href.0.len())` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#unnecessary_lazy_evaluations = note: `#[warn(clippy::unnecessary_lazy_evaluations)]` on by default

// try calling canonicalize
let path = used_link.path.to_str().unwrap_or("");
let mut href = BumpString::from_str_in(path, &arena);

Check warning on line 97 in src/collector.rs

View workflow job for this annotation

GitHub Actions / clippy

this expression creates a reference which is immediately dereferenced by the compiler

warning: this expression creates a reference which is immediately dereferenced by the compiler --> src/collector.rs:97:54 | 97 | let mut href = BumpString::from_str_in(path, &arena); | ^^^^^^ help: change this to: `arena` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow
push_and_canonicalize(
&mut href,
&try_percent_decode(&used_link.href.0[..qs_start]),
);
}

Some(link)
}

impl<P, C: LinkCollector<P>> LinkCollector<P> for LocalLinksOnly<C> {
    /// Create a wrapper around a fresh inner collector and a fresh arena.
    fn new() -> Self {
        LocalLinksOnly {
            collector: C::new(),
            arena: Bump::new(),
        }
    }

    /// Forward the link to the inner collector, unless canonicalization
    /// determined it is external (in which case it is dropped).
    fn ingest(&mut self, link: Link<'_, P>) {
        let maybe_local = canonicalize_local_link(&self.arena, link);
        if let Some(local_link) = maybe_local {
            self.collector.ingest(local_link);
        }
    }

    /// Merging only combines the inner collectors; each wrapper keeps its own
    /// arena.
    fn merge(&mut self, other: Self) {
        let LocalLinksOnly { collector, .. } = other;
        self.collector.merge(collector);
    }
}

/// Link collector used for actual link checking. Keeps track of broken links only.
pub struct BrokenLinkCollector<P> {
links: BTreeMap<String, LinkState<P>>,
Expand All @@ -89,6 +141,7 @@
match link {
Link::Uses(used_link) => {
self.used_link_count += 1;

self.links
.entry(used_link.href.0.to_owned())
.and_modify(|state| state.add_usage(&used_link))
Expand Down
52 changes: 48 additions & 4 deletions src/html/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,18 @@ use bumpalo::collections::Vec as BumpVec;
use html5gum::{IoReader, Tokenizer};

use crate::paragraph::ParagraphWalker;
use crate::urls::is_external_link;

#[cfg(test)]
use pretty_assertions::assert_eq;

#[inline]
fn push_and_canonicalize(base: &mut BumpString, path: &str) {
if path.starts_with('/') {
pub fn push_and_canonicalize(base: &mut BumpString, path: &str) {
if is_external_link(path.as_bytes()) {
base.clear();
base.push_str(path);
return;
} else if path.starts_with('/') {
base.clear();
} else if path.is_empty() {
if base.ends_with('/') {
Expand Down Expand Up @@ -113,10 +118,42 @@ mod test_push_and_canonicalize {
push_and_canonicalize(&mut base, path);
assert_eq!(base, "foo/index.html/baz.html");
}

#[test]
fn external_scheme_index() {
    // An absolute external URL replaces the base path entirely.
    let mut buf = String::from("index.html");
    push_and_canonicalize(&mut buf, "http://foo.com");
    assert_eq!(buf, "http://foo.com");
}

#[test]
fn external_scheme_empty_base() {
    // Even with an empty base, an external URL is kept verbatim.
    let mut buf = String::from("");
    push_and_canonicalize(&mut buf, "http://foo.com");
    assert_eq!(buf, "http://foo.com");
}

#[test]
fn external_scheme_relative() {
    // Protocol-relative URLs ("//host") also replace the base entirely.
    let mut buf = String::from("bar.html");
    push_and_canonicalize(&mut buf, "//foo.com");
    assert_eq!(buf, "//foo.com");
}

#[test]
fn external_scheme_subdir() {
    // A base inside a subdirectory is likewise discarded for external URLs.
    let mut buf = String::from("foo/bar.html");
    push_and_canonicalize(&mut buf, "http://foo.com");
    assert_eq!(buf, "http://foo.com");
}
}

#[inline]
fn try_percent_decode(input: &str) -> Cow<'_, str> {
pub fn try_percent_decode(input: &str) -> Cow<'_, str> {
percent_encoding::percent_decode_str(input)
.decode_utf8()
.unwrap_or(Cow::Borrowed(input))
Expand Down Expand Up @@ -379,6 +416,9 @@ fn test_html_parsing_malformed_script() {

#[test]
fn test_document_links() {
use bumpalo::Bump;

use crate::collector::canonicalize_local_link;
use crate::paragraph::ParagraphHasher;

let doc = Document::new(
Expand Down Expand Up @@ -435,8 +475,12 @@ fn test_document_links() {
})
};

let arena = Bump::new();

assert_eq!(
&links.collect::<Vec<_>>(),
&links
.filter_map(|x| canonicalize_local_link(&arena, x))
.collect::<Vec<_>>(),
&[
used_link("platforms/ruby"),
used_link("platforms/perl"),
Expand Down
54 changes: 0 additions & 54 deletions src/html/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,42 +16,6 @@ fn try_normalize_href_value(input: &str) -> &str {
input.trim()
}

#[inline]
fn is_bad_schema(url: &[u8]) -> bool {
    // Protocol-relative URLs ("//host/path") point off-site.
    if url.starts_with(b"//") {
        return true;
    }

    // Otherwise the URL is external iff the text before the first ':' is a
    // valid URL scheme: an ASCII letter followed by letters, digits, '+', '-'
    // or '.' (RFC 2396, Appendix A).
    let first_char = match url.first() {
        Some(c) => c,
        None => return false, // empty URL has no scheme
    };
    if !first_char.is_ascii_alphabetic() {
        return false;
    }

    url[1..]
        .iter()
        .find_map(|&byte| match byte {
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'+' | b'-' | b'.' => None,
            b':' => Some(true),  // complete scheme delimiter reached
            _ => Some(false),    // invalid character before any ':'
        })
        .unwrap_or(false) // string ended without ':' — no scheme
}

#[derive(Default)]
pub struct ParserBuffers {
current_tag_name: Vec<u8>,
Expand Down Expand Up @@ -91,10 +55,6 @@ where
std::str::from_utf8(&self.buffers.current_attribute_value).unwrap(),
);

if is_bad_schema(value.as_bytes()) {
return;
}

self.link_buf.push(Link::Uses(UsedLink {
href: self.document.join(self.arena, self.check_anchors, value),
path: self.document.path.clone(),
Expand All @@ -113,10 +73,6 @@ where
.filter_map(|candidate: &str| candidate.split_whitespace().next())
.filter(|value| !value.is_empty())
{
if is_bad_schema(value.as_bytes()) {
continue;
}

self.link_buf.push(Link::Uses(UsedLink {
href: self.document.join(self.arena, self.check_anchors, value),
path: self.document.path.clone(),
Expand Down Expand Up @@ -276,13 +232,3 @@ where
// No-op callbacks — presumably html5gum Emitter hooks (impl header not
// visible here): doctype metadata and quirks mode are irrelevant for link
// extraction, so these events are deliberately ignored.
fn set_doctype_system_identifier(&mut self, _: &[u8]) {}
fn set_force_quirks(&mut self) {}
}

#[test]
fn test_is_bad_schema() {
    // External: protocol-relative or scheme-qualified.
    for url in [&b"//"[..], b"http:", b"http:/"] {
        assert!(is_bad_schema(url));
    }
    // Local: empty, bare word, or path-like without a scheme delimiter.
    for url in [&b""[..], b"http", b"http/"] {
        assert!(!is_bad_schema(url));
    }
}
51 changes: 45 additions & 6 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ mod collector;
mod html;
mod markdown;
mod paragraph;
mod urls;

use std::cmp;
use std::collections::{BTreeMap, BTreeSet};
Expand All @@ -16,10 +17,12 @@ use jwalk::WalkDirGeneric;
use markdown::DocumentSource;
use rayon::prelude::*;

use collector::{BrokenLinkCollector, LinkCollector, UsedLinkCollector};
use collector::{BrokenLinkCollector, LinkCollector, LocalLinksOnly, UsedLinkCollector};
use html::{DefinedLink, Document, DocumentBuffers, Link};
use paragraph::{DebugParagraphWalker, NoopParagraphWalker, ParagraphHasher, ParagraphWalker};

use crate::urls::is_external_link;

static MARKDOWN_FILES: &[&str] = &["md", "mdx"];
static HTML_FILES: &[&str] = &["htm", "html"];

Expand Down Expand Up @@ -84,6 +87,11 @@ enum Subcommand {
base_path: PathBuf,
sources_path: PathBuf,
},

/// Dump out a list and count of _external_ links. hyperlink does not check external links,
/// but this subcommand can be used to get a summary of the external links that exist in your
/// site.
DumpExternalLinks { base_path: PathBuf },
}

fn main() -> Result<(), Error> {
Expand Down Expand Up @@ -115,6 +123,9 @@ fn main() -> Result<(), Error> {
}) => {
return match_all_paragraphs(base_path, sources_path);
}
Some(Subcommand::DumpExternalLinks { base_path }) => {
return dump_external_links(base_path);
}
None => {}
}

Expand Down Expand Up @@ -150,9 +161,10 @@ where
{
println!("Reading files");

let html_result = extract_html_links::<BrokenLinkCollector<_>, P>(&base_path, check_anchors)?;
let html_result =
extract_html_links::<LocalLinksOnly<BrokenLinkCollector<_>>, P>(&base_path, check_anchors)?;

let used_links_len = html_result.collector.used_links_count();
let used_links_len = html_result.collector.collector.used_links_count();
println!(
"Checking {} links from {} files ({} documents)",
used_links_len, html_result.file_count, html_result.documents_count,
Expand All @@ -163,6 +175,7 @@ where
let mut bad_anchors_count = 0;

let mut broken_links = html_result
.collector
.collector
.get_broken_links(check_anchors)
.peekable();
Expand Down Expand Up @@ -343,6 +356,31 @@ fn dump_paragraphs(path: PathBuf) -> Result<(), Error> {
Ok(())
}

/// Print every external link found under `base_path`, one per line.
///
/// hyperlink does not check external links; this subcommand merely enumerates
/// them so the user can summarize them or feed them into another tool.
///
/// # Errors
///
/// Propagates any error from `extract_html_links` (file traversal/parsing).
fn dump_external_links(base_path: PathBuf) -> Result<(), Error> {
    println!("Reading files");
    let html_result =
        extract_html_links::<UsedLinkCollector<_>, NoopParagraphWalker>(&base_path, true)?;

    println!(
        "Checking {} links from {} files ({} documents)",
        html_result.collector.used_links.len(),
        html_result.file_count,
        html_result.documents_count,
    );

    // Plain iteration — the previous `.peekable()` was never peeked.
    for used_link in &html_result.collector.used_links {
        if is_external_link(used_link.href.as_bytes()) {
            println!("{}", used_link.href);
        }
    }

    // NOTE(review): intentionally leaks the results instead of dropping them —
    // presumably to skip destructor work on exit; confirm before removing.
    mem::forget(html_result);

    Ok(())
}

struct HtmlResult<C> {
collector: C,
documents_count: usize,
Expand Down Expand Up @@ -491,8 +529,9 @@ fn extract_markdown_paragraphs<P: ParagraphWalker>(

fn match_all_paragraphs(base_path: PathBuf, sources_path: PathBuf) -> Result<(), Error> {
println!("Reading files");
let html_result =
extract_html_links::<UsedLinkCollector<_>, ParagraphHasher>(&base_path, true)?;
let html_result = extract_html_links::<LocalLinksOnly<UsedLinkCollector<_>>, ParagraphHasher>(
&base_path, true,
)?;

println!("Reading source files");
let paragraps_to_sourcefile = extract_markdown_paragraphs::<ParagraphHasher>(&sources_path)?;
Expand All @@ -505,7 +544,7 @@ fn match_all_paragraphs(base_path: PathBuf, sources_path: PathBuf) -> Result<(),
let mut link_single_source = 0;
// We only care about HTML's used links because paragraph matching is exclusively for error
// messages that point to the broken link.
for link in &html_result.collector.used_links {
for link in &html_result.collector.collector.used_links {
total_links += 1;

let paragraph = match link.paragraph {
Expand Down
Loading
Loading