#!/bin/bash
set -e -f -u -o pipefail
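
# Check one or more websites for broken links with linkchecker, keeping a
# per-site SQLite history so that only persistently bad URLs get reported.
# Quiet enough to run from cron, but prints more detail when run interactively.
#
# Hypothetical usage (the script name is assumed from the "linkchecker-cron"
# prefix used for the temporary directory and databases below):
#   linkchecker-cron https://www.example.org/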
create_database() {
  local database="$1"
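  # The schema matches the rows that linkchecker's SQL output inserts into the
  # linksdb table, plus a timestamp column used below to filter results by date.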
sqlite3 "${database}" <<END_OF_CREATE
CREATE TABLE linksdb (
urlname VARCHAR(256) NOT NULL,
parentname VARCHAR(256),
baseref VARCHAR(256),
valid INT,
result VARCHAR(256),
warning VARCHAR(512),
info VARCHAR(512),
url VARCHAR(256),
line INT,
col INT,
name VARCHAR(256),
checktime INT,
dltime INT,
size INT,
cached INT,
level INT NOT NULL,
modified VARCHAR(256),
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
);
END_OF_CREATE
}
run_linkchecker() {
local url="$1" hostname="$2"
# - Cut down on output so this can be run from cron, but write to a file that
# can be displayed on failure or when run interactively.
# - Check external links but do not recurse on external sites.
# - Exclude domains that definitely won't work; list was determined through
# debugging and perserverence.
# - stderr will frequently have a warning about SSL certificates.
# - To figure out problems run with '--debug=checking' and look for the URLs
# that keep getting checked at the end.
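  #   A one-off debugging invocation (a sketch reusing the flags below) might be:
  #     linkchecker --check-extern --debug=checking "${url}"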
  linkchecker \
    --no-status \
    --quiet \
    --file-output=text \
    --file-output=sql \
    --check-extern \
    --no-follow-url="!${hostname}" \
    --ignore-url=//en.wikipedia.org \
    --ignore-url=//pool.ntp.org \
    --ignore-url=//www.amazon.co.uk \
    --ignore-url=//www.example.org \
    "${url}" > stdout 2> stderr
}
report_persistently_bad_urls() {
local database="$1" date_threshold="$2" count_threshold="$3" output
  sqlite3 "${database}" << END_OF_QUERY
CREATE TEMP TABLE url_counts
  AS SELECT urlname, COUNT(urlname) AS counts
  FROM linksdb
  WHERE timestamp > '${date_threshold}'
  GROUP BY urlname;
SELECT urlname, counts
  FROM url_counts
  WHERE counts > ${count_threshold};
END_OF_QUERY
}
check_one_website() {
local url="$1" hostname exit_status=0
hostname="$(sed -e 's,https\?://,,' -e 's,/$,,' <<<"${url}")"
local database="${HOME}/tmp/linkchecker-cron.${hostname}.sqlite3"
if [[ ! -f "${database}" ]]; then
create_datebase "${database}"
fi
mkdir -p "${hostname}"
cd "${hostname}"
run_linkchecker "${url}" "${hostname}" || exit_status="$?"
if [[ "${exit_status}" != 0 ]]; then
sqlite3 "${database}" < linkchecker-out.sql
local output date_threshold
date_threshold="$(date --date '2 days ago' +%Y-%m-%d)"
# TODO: raise the count threshold?
output="$(report_persistently_bad_urls "${database}" "${date_threshold}" 2)"
if [[ -n "${output}" ]]; then
printf "Bad URLs for %s since %s\\n" \
"${url}" "${date_threshold}"
awk -F '|' '{print $2, $1}' <<<"${output}"
fi
fi
return "${exit_status}"
}
main() {
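  # When stdin is not a terminal (e.g. when run from cron), sleep for a random
  # delay of up to ten minutes, presumably to avoid every scheduled run starting
  # at exactly the same time.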
  if [[ ! -t 0 ]]; then
    sleep $((RANDOM % 600))
  fi
  local destdir url exit_status=0
  destdir="$(mktemp -d -t linkchecker-cron.XXXXXXXXXX)"
  for url in "$@"; do
    cd "${destdir}"
    check_one_website "${url}" || exit_status="$?"
  done
  if [[ -t 0 ]]; then
    printf "Output in %s\\n" "${destdir}"
  else
    rm -rf "${destdir}"
  fi
  return "${exit_status}"
}
main "$@"