Skip to content
This repository has been archived by the owner on Jun 27, 2022. It is now read-only.

Commit

Permalink
Added sitemap generator to CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris Watson committed Jun 30, 2019
1 parent 611a009 commit 3c12c03
Show file tree
Hide file tree
Showing 29 changed files with 6,914 additions and 380 deletions.
363 changes: 328 additions & 35 deletions docs/Arachnid.html

Large diffs are not rendered by default.

551 changes: 408 additions & 143 deletions docs/Arachnid/Agent.html

Large diffs are not rendered by default.

267 changes: 266 additions & 1 deletion docs/Arachnid/Agent/Actions.html

Large diffs are not rendered by default.

267 changes: 266 additions & 1 deletion docs/Arachnid/Agent/Actions/Action.html

Large diffs are not rendered by default.

267 changes: 266 additions & 1 deletion docs/Arachnid/Agent/Actions/Paused.html

Large diffs are not rendered by default.

267 changes: 266 additions & 1 deletion docs/Arachnid/Agent/Actions/RuntimeError.html

Large diffs are not rendered by default.

267 changes: 266 additions & 1 deletion docs/Arachnid/Agent/Actions/SkipLink.html

Large diffs are not rendered by default.

267 changes: 266 additions & 1 deletion docs/Arachnid/Agent/Actions/SkipResource.html

Large diffs are not rendered by default.

281 changes: 273 additions & 8 deletions docs/Arachnid/Agent/Queue.html

Large diffs are not rendered by default.

273 changes: 269 additions & 4 deletions docs/Arachnid/AuthCredential.html

Large diffs are not rendered by default.

281 changes: 273 additions & 8 deletions docs/Arachnid/AuthStore.html

Large diffs are not rendered by default.

287 changes: 276 additions & 11 deletions docs/Arachnid/CookieJar.html

Large diffs are not rendered by default.

267 changes: 266 additions & 1 deletion docs/Arachnid/Document.html

Large diffs are not rendered by default.

287 changes: 276 additions & 11 deletions docs/Arachnid/Document/HTML.html

Large diffs are not rendered by default.

285 changes: 275 additions & 10 deletions docs/Arachnid/Document/HTML/Tag.html

Large diffs are not rendered by default.

351 changes: 308 additions & 43 deletions docs/Arachnid/Resource.html

Large diffs are not rendered by default.

325 changes: 295 additions & 30 deletions docs/Arachnid/Resource/ContentTypes.html

Large diffs are not rendered by default.

271 changes: 268 additions & 3 deletions docs/Arachnid/Resource/Cookies.html

Large diffs are not rendered by default.

303 changes: 284 additions & 19 deletions docs/Arachnid/Resource/HTML.html

Large diffs are not rendered by default.

285 changes: 275 additions & 10 deletions docs/Arachnid/Resource/StatusCodes.html

Large diffs are not rendered by default.

281 changes: 273 additions & 8 deletions docs/Arachnid/Rules.html

Large diffs are not rendered by default.

293 changes: 279 additions & 14 deletions docs/Arachnid/SessionCache.html

Large diffs are not rendered by default.

275 changes: 270 additions & 5 deletions docs/URI.html

Large diffs are not rendered by default.

273 changes: 269 additions & 4 deletions docs/index.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/index.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/search-index.js

Large diffs are not rendered by default.

58 changes: 55 additions & 3 deletions src/arachnid/cli.cr
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,76 @@ module Arachnid

sub "summarize" do
desc "Scan a site (or sites) and generate a JSON report"
usage "arachnid summarize [sites] [options] ..."
usage <<-USAGE
arachnid summarize [sites] [options]
Examples:
# Scan a site and count the number of pages, outputting the result to STDOUT
arachnid summarize https://crystal-lang.org
# Scan a site and count the number of internal links, outputting the result to STDOUT
arachnid summarize https://crystal-lang.org -l
# Scan a site and count the number of internal and external links, saving the result to a file
arachnid summarize https://crystal-lang.org -l -L -o report.json
# Scan a site and list all pages that returned a 404 or 500 status code
arachnid summarize https://crystal-lang.org -c 404 500
USAGE

option "-l", "--ilinks", type: Bool, desc: "generate a map of pages to internal links"
option "-L", "--elinks", type: Bool, desc: "generate a map of pages to external links"
option "-c CODES", "--codes=CODES", type: Array(Int32), desc: "generate a map of status codes to pages \
that responded with that code"
option "-n", "--limit NUM", type: Int32, desc: "maximum number of pages to scan"
option "-o FILE", "--output=FILE", type: String, desc: "file to write the report to", default: "arachnid.json"
option "-o FILE", "--output=FILE", type: String, desc: "file to write the report to (if undefined \
output will be printed to STDOUT"

run do |opts, args|
count = Arachnid::Cli::Count.new
if args.empty?
STDERR.puts "At least one site is required"
else
count = Arachnid::Cli::Count.new
count.run(opts, args)
end
end
end

# Clim DSL: registers the `arachnid sitemap` subcommand, which crawls a
# single site and writes an XML or JSON sitemap.
# Actual crawling/serialization is delegated to Arachnid::Cli::Sitemap#run.
sub "sitemap" do
desc "generate a sitemap for a site in XML or JSON format"
usage <<-USAGE
arachnid sitemap [url] [--xml | --json] [options]
Examples:
# Generate a XML sitemap for crystal-lang.org
arachnid sitemap https://crystal-lang.org --xml
# Generate a XML sitemap with a custom filename
arachnid sitemap https://crystal-lang.org --xml -o ~/Desktop/crystal-lang.org.xml
# Generate a JSON sitemap instead (not really useful as an actual sitemap)
arachnid sitemap https://crystal-lang.org --json
USAGE


# Output format flags — at least one of --xml / --json is required (validated in `run` below).
option "--xml", type: Bool, desc: "generate the sitemap in XML format"
option "--json", type: Bool, desc: "generate the sitemap in JSON format"
# When -o is omitted, Sitemap#run derives the filename from the URL's hostname.
option "-o FILE", "--output=FILE", type: String, desc: "filename to write the report to. \
default is the hostname + .json or .xml"

run do |opts, args|
# Exactly one positional site URL and at least one format flag are required.
if args.size != 1
raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}"
elsif !opts.json && !opts.xml
raise "you must select either xml or json"
else
# NOTE(review): if both --xml and --json are given, XML takes precedence
# (Sitemap#run checks opts.xml first) — confirm that is intended.
sitemap = Arachnid::Cli::Sitemap.new
sitemap.run(opts, args)
end
end
end
end
end
end
Expand Down
8 changes: 6 additions & 2 deletions src/arachnid/cli/count.cr
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,12 @@ module Arachnid
report["external_links"] = external_links if external_links
report["codes"] = codes if codes

File.write(outfile, report.to_json, mode: "w+")
puts "Report saved to #{outfile}"
if outfile
File.write(outfile.to_s, report.to_json, mode: "w+")
puts "Report saved to #{outfile}"
else
pp report
end
end
end
end
Expand Down
90 changes: 90 additions & 0 deletions src/arachnid/cli/sitemap.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
require "./action"
require "../arachnid"
require "termspinner"
require "json"
# `XML.build` is used below; require it explicitly rather than relying on a
# transitive require from ../arachnid.
require "xml"

module Arachnid
  class Cli < Clim
    # CLI action backing `arachnid sitemap`: crawls a single site and emits a
    # sitemap in XML (sitemaps.org urlset format) or JSON.
    class Sitemap < Cli::Action

      # Date parts used for a <lastmod> entry (month/day zero-padded strings).
      alias LastMod = NamedTuple(year: String, month: String, day: String)
      # One sitemap entry for a crawled HTML page.
      alias PageMap = NamedTuple(url: String, page: String, changefreq: String, priority: String, lastmod: LastMod)

      # Crawls the site given in `args[0]` (restricted to URLs matching that
      # host via `visit_urls_like`), collects every HTML page into `map`, then
      # serializes and writes the sitemap.
      #
      # - `opts.xml` / `opts.json` select the output format (xml wins if both).
      # - `opts.output` overrides the default filename of hostname + extension.
      def run(opts, args)
        url = URI.parse(args[0])
        date = Time.now
        spinner = Spinner::Spinner.new("Wait...")

        spider = Arachnid::Agent.new
        # Only follow links whose URL contains the start URL as a substring.
        spider.visit_urls_like(Regex.new(url.to_s))

        map = {
          domain: url.to_s,
          lastmod: {
            year: date.year.to_s, month: date.month.to_s, day: date.day.to_s
          },
          filetype: "html",

          pages: [] of PageMap
        }

        spinner.start("Crawling...")

        spider.every_html_page do |page|
          spinner.message = "Crawling... Current page #{page.url}"
          # Prefer the server-reported modification time; fall back to "now"
          # when the header is absent or unparsable times would raise.
          last_mod = page.headers["Last-Modified"]?
          last_mod = last_mod ? Time.parse_utc(last_mod, "%a, %d %b %Y %H:%M:%S GMT") : Time.now

          item = {
            url: page.url.to_s,
            page: page.title.to_s,
            changefreq: "never",
            priority: "0.5",
            lastmod: {
              year: last_mod.year.to_s,
              month: last_mod.month.to_s.rjust(2, '0'),
              day: last_mod.day.to_s.rjust(2, '0')
            }
          }

          map[:pages] << item
        end

        # Blocks until the crawl finishes.
        spider.start_at(url)
        spinner.stop("Finished scanning!\n")

        if opts.xml
          filename = (opts.output ? opts.output.to_s : url.hostname.to_s + ".xml")
          sitemap = gen_xml_sitemap(map)
        else
          filename = (opts.output ? opts.output.to_s : url.hostname.to_s + ".json")
          sitemap = gen_json_sitemap(map)
        end

        # NOTE(review): the file is written relative to this source file's
        # directory (__DIR__), not the current working directory — confirm
        # that is intended for an installed CLI binary.
        File.write(File.expand_path(filename, __DIR__), sitemap.to_s, mode: "w+")
        # Fixed: the success message previously contained a broken
        # interpolation and never showed the actual path.
        puts "Wrote sitemap to #{filename}"
      end

      # Builds a sitemaps.org <urlset> document from the collected `map`.
      # Returns the XML as a String.
      def gen_xml_sitemap(map)
        XML.build(indent: "  ", encoding: "UTF-8") do |xml|
          xml.element("urlset", xmlns: "http://www.sitemaps.org/schemas/sitemap/0.9") do
            map[:pages].each do |page|
              xml.element("url") do
                lastmod = page[:lastmod]

                xml.element("loc") { xml.text page[:url] }
                xml.element("lastmod") { xml.text "#{lastmod[:year]}-#{lastmod[:month]}-#{lastmod[:day]}" }
                xml.element("changefreq") { xml.text page[:changefreq] }
                xml.element("priority") { xml.text page[:priority] }
              end
            end
          end
        end
      end

      # Serializes the collected `map` as pretty-printed JSON.
      def gen_json_sitemap(map)
        map.to_pretty_json
      end
    end
  end
end

0 comments on commit 3c12c03

Please sign in to comment.