Skip to content

Commit

Permalink
Merge 2c63297 into 266cd56
Browse files Browse the repository at this point in the history
  • Loading branch information
adam-miller committed Mar 17, 2017
2 parents 266cd56 + 2c63297 commit e733710
Showing 1 changed file with 29 additions and 1 deletion.
30 changes: 29 additions & 1 deletion warcio/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import json
import sys
import socket
import errno

from warcio.recordloader import ArchiveLoadFailed
from warcio.archiveiterator import ArchiveIterator
Expand All @@ -13,6 +15,8 @@
import tempfile
import shutil

# Restore the default SIGPIPE disposition so that piping output into a
# command that exits early (e.g. ``head``) terminates this process quietly
# instead of surfacing a broken-pipe error.
try:
    from signal import signal, SIGPIPE, SIG_DFL
except ImportError:
    # SIGPIPE is not defined on every platform (e.g. Windows); there is
    # nothing to reset in that case, and the module must still import.
    pass
else:
    signal(SIGPIPE, SIG_DFL)

# ============================================================================
def main(args=None):
Expand All @@ -33,10 +37,18 @@ def main(args=None):
recompress.add_argument('output')
recompress.set_defaults(func=Recompressor())

extract = subparsers.add_parser('extract', help='WARC/ARC Record Extractor')
extract.add_argument('-u', '--uri', help='Record Target URI')
extract.add_argument('-a', '--all_records', dest='print_all', action='store_true', help='Keep searching the full WARC for any matching records (default)')
extract.add_argument('-1', '--first_record_only', dest='print_all', action='store_false', help='Return only the first matching WARC record')
extract.add_argument('-r', '--records', default='response,revisit', help='WARC record types to match. Ex. request,response,revisit,metadata,warcinfo')
extract.add_argument('inputs', nargs='+')
extract.set_defaults(print_all=True)
extract.set_defaults(func=extract_record)

cmd = parser.parse_args(args=args)
cmd.func(cmd)


# ============================================================================
def indexer(cmd):
fields = cmd.fields.split(',')
Expand Down Expand Up @@ -93,6 +105,22 @@ def decompress_and_recompress(self, stream, output):
tout.seek(0)
self.load_and_write(tout, output)

# ============================================================================
def extract_record(cmd):
    """Write matching WARC/ARC records from the input files to stdout.

    Records are written uncompressed (``gzip=False``) through a
    ``WARCWriter`` wrapping the binary stdout stream.

    :param cmd: parsed command-line namespace providing:
        ``records`` -- comma-separated record types to match,
        ``uri`` -- target URI to match, or ``None`` to match any URI,
        ``inputs`` -- list of archive filenames to scan, in order,
        ``print_all`` -- when false, stop after the first matching record.
    """
    wanted_types = cmd.records.split(',')

    # Python 3 exposes the byte stream as ``sys.stdout.buffer``; on
    # Python 2 ``sys.stdout`` has no ``buffer`` attribute and is itself
    # a byte stream.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    writer = WARCWriter(filebuf=out, gzip=False)

    for filename in cmd.inputs:
        with open(filename, 'rb') as fh:
            for record in ArchiveIterator(fh, no_record_parse=True, arc2warc=True):
                if record.rec_type not in wanted_types:
                    continue

                if record.format == 'arc':
                    rec_uri = record.rec_headers.get_header('uri')
                elif record.format in ('warc', 'arc2warc'):
                    rec_uri = record.rec_headers.get_header('WARC-Target-URI')
                else:
                    # Unknown format: treat as "no URI" rather than reusing
                    # the URI of the previous record (or failing on an
                    # unbound name for the very first record).
                    rec_uri = None

                # NOTE(review): a record without any URI passes the URI
                # filter even when --uri is given; this keeps the existing
                # matching rule -- confirm that it is intended.
                if cmd.uri is not None and rec_uri is not None and cmd.uri != rec_uri:
                    continue

                writer.write_record(record)
                if not cmd.print_all:
                    # Stop entirely: a ``break`` here would only leave the
                    # current file and keep emitting one match per input.
                    return

# ============================================================================
if __name__ == "__main__": #pragma: no cover
Expand Down

0 comments on commit e733710

Please sign in to comment.