#!/bin/bash
#"$PATTERN" "$OUTFILE_PREFIX" # $LOG_FILE
PATTERN=$1;
OUTFILE_PREFIX=$2
F=$3
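# Derive a YYYYMMDD-HH timestamp from the log file name; for example,
# pagecounts-20101015-120000.gz becomes 20101015-12 (the file name here
# is hypothetical, following the dammit.lt archive naming scheme).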
TIMESTAMP=$(echo "$LOG_FILE" | sed -e 's/pagecounts-\([0-9]*\)-\([0-9][0-9]\)[0-9]*\.gz/\1-\2/')
OUTPUT_FILE=/mnt/working/results/${OUTFILE_PREFIX}-${TIMESTAMP}.out
INPUT_FILE=/mnt/downloads/$LOG_FILE
function get_input_file {
    # see https://github.com/datawrangling/trendingtopics
    # This version downloads the file from:
    local remote="http://dammit.lt/wikistats/archive/2010/10/$LOG_FILE"
    # and then caches it on a local EBS volume.
    # An alternative approach is to mount the EBS volume snapshot
    # published at http://aws.amazon.com/datasets/4182. In practice the
    # snapshot-restoration process (from S3) is extremely slow for a
    # volume as massive as this one. It is much faster, but more
    # expensive, to simply download the files. Ideally the individual
    # files would have been accessible from S3 directly.
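    # A rough sketch of that snapshot route, for comparison (the
    # snapshot, volume, instance, and device IDs below are
    # placeholders, not real values):
    #   ec2-create-volume --snapshot snap-XXXXXXXX -z us-east-1a
    #   ec2-attach-volume vol-XXXXXXXX -i i-XXXXXXXX -d /dev/sdf
    #   mount /dev/sdf /mnt/downloads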
    # Download (or resume) into $INPUT_FILE. With -C - curl infers the
    # resume offset from the size of any partial file already on disk,
    # so an interrupted download picks up where it left off.
    curl -C - -s -o "$INPUT_FILE" "$remote"
}
function filter_and_transform {
    # Select matching lines from the input and emit a JSON document
    # containing an array of matches. Function args: $1=timestamp,
    # $2=pattern. Fields of each pagecounts input line:
    # $1=wiki project, $2=page URL, $3=hits/hr, $4=bytes/hr
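    # For example, a (hypothetical) input line
    #   en Main_Page 42 12345
    # matching the pattern would be emitted as ["Main_Page",42,12345].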
    awk -v ts="$1" -v pattern="$2" '
        BEGIN { printf("{\"ts\":\"%s\", \"pattern\":\"%s\", \"matches\":[\n", ts, pattern); sep = "" }
        $0 ~ pattern { printf("%s[\"%s\",%d,%d]", sep, $2, $3, $4); sep = ",\n" }
        END { print "\n]}" }'
}
function main {
    # skip work that has already been completed
    [[ -e ${OUTPUT_FILE}.done ]] && exit 1
    # get the input file if we don't have it yet; 1 hour of logs per file
    [[ -e ${INPUT_FILE}.done ]] || { get_input_file && touch "${INPUT_FILE}.done"; }
    # filter it against the supplied pattern, transform it to JSON, and save it
    zcat "$INPUT_FILE" | filter_and_transform "$TIMESTAMP" "$PATTERN" \
        > "$OUTPUT_FILE" && touch "${OUTPUT_FILE}.done"
}
main
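
# A hypothetical invocation (the script name, pattern, and log file name
# are examples only, not values taken from the repository):
#   ./filter_pagecounts.sh '^en Main_Page' mainpage pagecounts-20101015-120000.gz
# would write /mnt/working/results/mainpage-20101015-12.out containing
# something like:
#   {"ts":"20101015-12", "pattern":"^en Main_Page", "matches":[
#   ["Main_Page",42,12345]
#   ]}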