-
Notifications
You must be signed in to change notification settings - Fork 10
/
walk.sh
executable file
·71 lines (57 loc) · 2.26 KB
/
walk.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/bin/bash
#
# Usage: walk.sh [outputdir] [skiplist]
#   outputdir  directory that receives every result file (default: out/).
#              It is DELETED and recreated at the start of each run.
#   skiplist   optional file that is searched, line by line, for URLs that
#              must not be fetched.
#
# Everything written to stdout/stderr is also copied to
# <outputdir>/script-output.txt.

set -e
# NOTE(review): with tracing enabled, every curl command line (including the
# -H "$AUTH_HEADER" argument, which presumably carries credentials) is echoed
# to stderr and therefore ends up in the log file below -- confirm that this
# is acceptable.
set -x

outputdir=${1:-out/}
skiplist=${2:-}

# Start from an empty output directory. These are two separate statements so
# that a failing rm aborts the script under `set -e` instead of being ignored
# as the left-hand side of an && list; -p allows nested output paths.
rm -rf -- "${outputdir}/"
mkdir -p -- "${outputdir}"

# Send stdout and stderr through tee so they reach both the terminal and the
# log file. TEEPID is remembered so the EXIT handler can wait for tee.
LOGFILE=$outputdir/script-output.txt
exec 1> >(exec -a 'LOGGING TEE' tee "$LOGFILE") 2>&1
TEEPID=$!

# Absolute directory of this script; `&&` makes a failed cd fatal rather than
# silently reporting the caller's working directory.
scriptdir=$(cd -- "$(dirname -- "$0")" && pwd)

# Shared settings. set_auth_header is presumably defined by common-vars.sh and
# is expected to populate AUTH_HEADER -- verify against that file.
. "${scriptdir}/common-vars.sh"
set_auth_header

# curl --write-out template used for the per-URL requests.
timingarg="\nTotal request time: %{time_total} seconds for url: %{url_effective}\n"
# EXIT handler: release this shell's stdout/stderr and then block until the
# logging tee process has gone away, so its output is complete before the
# script returns.
# Globals:
#   TEEPID (read) - PID of the tee process; nothing is waited for when it is
#                   empty or unset.
cleanup() {
    # close FDs to ensure tee finishes: stdout and stderr are re-pointed at
    # whatever stdin refers to, dropping the descriptors they held before.
    exec 1>&0 2>&1
    if [ -n "$TEEPID" ]; then
        # `kill -0` delivers no signal; it only reports whether the PID still
        # exists. Unlike `ps --pid` it does not depend on procps-specific
        # options, so the wait also works where that flag is unsupported.
        while kill -0 "$TEEPID" > /dev/null 2>&1
        do
            sleep 1
        done
    fi
}
# Run the cleanup handler on every exit path.
trap 'cleanup' EXIT

# Seed the work list with the starting URL(s). printf with a quoted argument
# keeps the value free from pathname expansion and from echo's option parsing.
printf '%s\n' "$START_URL" | sort -u > "${outputdir}/to-visit.txt"

# for information only
# NOTE(review): `url` is not assigned anywhere before this point in this
# script; unless common-vars.sh sets it, only ${BASE} is recorded here.
# Possibly ${START_URL} was intended -- confirm.
printf '%s\n' "${BASE}${url}" > "${outputdir}/url.txt"

# LOOPS counts completed crawl passes; POTENTIALLY_GOT_MORE=1 requests a pass.
LOOPS=0
POTENTIALLY_GOT_MORE=1
# Crawl loop. Each pass fetches every URL in to-visit.txt that is not already
# listed in visited.txt, errors.txt or the skip list, stores the pretty-printed
# JSON reply in ${outputdir}, and appends every "@odata.id" value found in the
# reply to to-visit.txt. Passes repeat until one fetches nothing new.
while [ "$POTENTIALLY_GOT_MORE" -eq 1 ]
do
    POTENTIALLY_GOT_MORE=0

    # Snapshot the work list first: to-visit.txt is appended to during the
    # pass. Entries are split on whitespace exactly like the former
    # `for url in $(cat ...)`, but without pathname expansion, so URLs that
    # contain ? * or [ are never rewritten by files in the current directory.
    mapfile -t pending < <(tr -s '[:space:]' '\n' < "${outputdir}/to-visit.txt")

    for url in "${pending[@]}"
    do
        [ -n "$url" ] || continue

        # Search only the lists that exist so far; visited.txt and errors.txt
        # are created lazily and grep reports an error for missing files.
        known_lists=()
        [ ! -e "${outputdir}/visited.txt" ] || known_lists+=("${outputdir}/visited.txt")
        [ ! -e "${outputdir}/errors.txt" ] || known_lists+=("${outputdir}/errors.txt")
        [ -z "$skiplist" ] || known_lists+=("$skiplist")

        # -x -F compares the URL literally against whole lines, so regex
        # metacharacters in it (such as ".") cannot match other URLs.
        if [ "${#known_lists[@]}" -gt 0 ] && grep -qxF -- "$url" "${known_lists[@]}"; then
            continue
        fi

        # File name for the reply: every character of the URL other than
        # ASCII letters, digits and "." becomes "_".
        OUTFILE="${outputdir}/$(printf '%s' "$url" | perl -p -e 's/[^a-zA-Z0-9.]/_/g;').json"

        # $CURLCMD is deliberately unquoted; it presumably holds the curl
        # binary plus extra options -- verify in common-vars.sh.
        if ! $CURLCMD --fail -H "$AUTH_HEADER" --silent -L -w"$timingarg" "${BASE}${url}" -o "${OUTFILE}-RAW" ; then
            printf '%s\n' "$url" >> "${outputdir}/errors.txt"
            continue
        fi

        # Pretty-print the reply. cat (rather than an input redirect) is kept
        # on purpose: if the -RAW file is missing, the pipeline still succeeds
        # and only leaves an empty ${OUTFILE} instead of aborting under set -e.
        cat "${OUTFILE}-RAW" | jq . > "${OUTFILE}"
        rm -f -- "${OUTFILE}-RAW" || :

        # Collect every "@odata.id" in the reply, strip "/#..." and "#..."
        # fragments, drop JSONSchema links and queue the rest. grep -v exits
        # non-zero when nothing is left, hence the trailing `|| :`.
        jq -r 'recurse (.[]?) | objects | select(has("@odata.id")) | .["@odata.id"]' "${OUTFILE}" \
            | perl -p -e 's/(\/#.*)//; s/(#.*)//' \
            | grep -v JSONSchema >> "${outputdir}/to-visit.txt" || :

        POTENTIALLY_GOT_MORE=1
        printf '%s\n' "$url" >> "${outputdir}/visited.txt"
    done

    # De-duplicate both lists. visited.txt may not exist yet when nothing has
    # been fetched successfully; it is created empty in that case.
    for list in to-visit visited
    do
        touch "${outputdir}/${list}.txt"
        sort -u "${outputdir}/${list}.txt" > "${outputdir}/${list}-new.txt"
        mv -- "${outputdir}/${list}-new.txt" "${outputdir}/${list}.txt"
    done

    LOOPS=$(( LOOPS + 1 ))
done
# Final step: request every visited URL again with one single curl invocation,
# time the whole run, and save the replies (headers included, -i) to
# entire-tree.txt.
timingarg="\nTotal PIPELINED request time (for this subrequest): %{time_total} seconds for url: %{url_effective}\n"

# Build the argument list as an array: one element per line of visited.txt,
# each prefixed with ${BASE}. This avoids pathname expansion on the URLs and
# avoids pasting ${BASE} into perl source code, where a quote would break it.
all_urls=()
if [ -r "${outputdir}/visited.txt" ]; then
    while IFS= read -r visited_path || [ -n "$visited_path" ]
    do
        all_urls+=("${BASE}${visited_path}")
    done < "${outputdir}/visited.txt"
fi

# $CURLCMD is deliberately unquoted; it presumably holds the curl binary plus
# extra options -- verify in common-vars.sh.
time $CURLCMD --fail -i -H "$AUTH_HEADER" -w"$timingarg" -s "${all_urls[@]}" | tee "${outputdir}/entire-tree.txt"
echo "Took $LOOPS loops to collect the URL list"