Skip to content

Commit

Permalink
include offset in output of "warcio index ..." (#4)
Browse files (browse the repository at this point in the history)
* include offset in output of "warcio index ..."

* make offset a configurable field like the others

* test the offset field to get coverage back up
  • Loading branch information
nlevitt authored and ikreymer committed Mar 17, 2017
1 parent 266cd56 commit 0fa527d
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 26 deletions.
38 changes: 19 additions & 19 deletions test/test_cli.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,31 +16,31 @@ def test_index():
files = ['example.warc.gz', 'example.warc', 'example.arc.gz', 'example.arc']
files = [get_test_file(filename) for filename in files]

args = ['index', '-f', 'warc-type,warc-target-uri,warc-filename']
args = ['index', '-f', 'offset,warc-type,warc-target-uri,warc-filename']
args.extend(files)

expected = """\
{"warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"warc-type": "response", "warc-target-uri": "http://example.com/"}
{"warc-type": "request", "warc-target-uri": "http://example.com/"}
{"warc-type": "revisit", "warc-target-uri": "http://example.com/"}
{"warc-type": "request", "warc-target-uri": "http://example.com/"}
{"warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"warc-type": "response", "warc-target-uri": "http://example.com/"}
{"warc-type": "request", "warc-target-uri": "http://example.com/"}
{"warc-type": "revisit", "warc-target-uri": "http://example.com/"}
{"warc-type": "request", "warc-target-uri": "http://example.com/"}
{"warc-type": "warcinfo", "warc-filename": "live-web-example.arc.gz"}
{"warc-type": "response", "warc-target-uri": "http://example.com/"}
{"warc-type": "warcinfo", "warc-filename": "live-web-example.arc.gz"}
{"warc-type": "response", "warc-target-uri": "http://example.com/"}
expected = b"""\
{"offset": 0, "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"offset": 353, "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"offset": 784, "warc-type": "response", "warc-target-uri": "http://example.com/"}
{"offset": 2012, "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"offset": 2538, "warc-type": "revisit", "warc-target-uri": "http://example.com/"}
{"offset": 3123, "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"offset": 0, "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"offset": 488, "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"}
{"offset": 1197, "warc-type": "response", "warc-target-uri": "http://example.com/"}
{"offset": 2566, "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"offset": 3370, "warc-type": "revisit", "warc-target-uri": "http://example.com/"}
{"offset": 4316, "warc-type": "request", "warc-target-uri": "http://example.com/"}
{"offset": 0, "warc-type": "warcinfo", "warc-filename": "live-web-example.arc.gz"}
{"offset": 171, "warc-type": "response", "warc-target-uri": "http://example.com/"}
{"offset": 0, "warc-type": "warcinfo", "warc-filename": "live-web-example.arc.gz"}
{"offset": 151, "warc-type": "response", "warc-target-uri": "http://example.com/"}
"""

with patch_stdout() as buff:
res = main(args=args)
assert buff.getvalue().decode('utf-8') == expected
assert buff.getvalue() == expected


def test_recompress():
Expand Down
15 changes: 8 additions & 7 deletions warcio/cli.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,7 +24,7 @@ def main(args=None):

index = subparsers.add_parser('index', help='WARC/ARC Indexer')
index.add_argument('inputs', nargs='+')
index.add_argument('-f', '--fields', default='warc-type,warc-target-uri')
index.add_argument('-f', '--fields', default='offset,warc-type,warc-target-uri')
index.add_argument('-o', '--output')
index.set_defaults(func=indexer)

Expand All @@ -44,14 +44,15 @@ def indexer(cmd):
with open_or_default(cmd.output, 'wt', sys.stdout) as out:
for filename in cmd.inputs:
with open(filename, 'rb') as fh:
for record in ArchiveIterator(fh,
no_record_parse=True,
arc2warc=True):

it = ArchiveIterator(fh, no_record_parse=True, arc2warc=True)
for record in it:
index = OrderedDict()
for field in fields:
value = record.rec_headers.get_header(field)
if value:
if field == 'offset':
value = it.offset
else:
value = record.rec_headers.get_header(field)
if value is not None:
index[field] = value

out.write(json.dumps(index) + '\n')
Expand Down

0 comments on commit 0fa527d

Please sign in to comment.