Skip to content

Commit

Permalink
Merge 48ef9db into 20ccc99
Browse files Browse the repository at this point in the history
  • Loading branch information
nlevitt committed Sep 25, 2017
2 parents 20ccc99 + 48ef9db commit 0c280e0
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 0 deletions.
41 changes: 41 additions & 0 deletions test/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,47 @@ def test_recompress_bad_file():
with pytest.raises(ArchiveLoadFailed):
main(args=['recompress', temp.name, temp2.name])

def test_extract_warcinfo():
    """Check ``extract`` on the warcinfo record at offset 0 of example.warc.gz.

    Three modes are exercised: the whole record (no flag), ``--headers``
    and ``--payload``. The whole-record output is exactly the WARC headers
    followed by the warc-fields body.
    """
    warc_headers = b'WARC/1.0\r\nWARC-Date: 2017-03-06T04:03:53Z\r\nWARC-Record-ID: <urn:uuid:e9a0cecc-0221-11e7-adb1-0242ac120008>\r\nWARC-Filename: temp-20170306040353.warc.gz\r\nWARC-Type: warcinfo\r\nContent-Type: application/warc-fields\r\nContent-Length: 249\r\n\r\n'
    warc_fields = b'software: Webrecorder Platform v3.7\r\nformat: WARC File Format 1.0\r\ncreator: temp-MJFXHZ4S\r\nisPartOf: Temporary%20Collection\r\njson-metadata: {"title": "Temporary Collection", "size": 2865, "created_at": 1488772924, "type": "collection", "desc": ""}\r\n'

    # (extra command-line flags, bytes expected on stdout)
    scenarios = (
        ([], warc_headers + warc_fields),
        (['--headers'], warc_headers),
        (['--payload'], warc_fields),
    )

    for flags, expected in scenarios:
        with patch_stdout() as captured:
            main(args=['extract'] + flags + [get_test_file('example.warc.gz'), '0'])
            assert captured.getvalue() == expected

# Checks `extract` on the response record at offset 784 of example.warc.gz in
# three modes: whole record (no flag), --headers only, and --payload only.
def test_extract_warc_response():
with patch_stdout() as buff:
res = main(args=['extract', get_test_file('example.warc.gz'), '784'])
# Whole record: WARC headers, HTTP headers, then the body as stored. The
# expected HTTP headers declare "Content-Encoding: gzip" and the body starts
# with the gzip magic bytes \x1f\x8b, i.e. no content decoding is applied here.
# NOTE(review): in this view the bytes literal below is split across two
# physical lines; confirm against the repository that it is one literal and
# which byte sits at the break.
assert buff.getvalue() == b'WARC/1.0\r\nWARC-Target-URI: http://example.com/\r\nWARC-Date: 2017-03-06T04:02:06Z\r\nWARC-Type: response\r\nWARC-Record-ID: <urn:uuid:a9c51e3e-0221-11e7-bf66-0242ac120005>\r\nWARC-IP-Address: 93.184.216.34\r\nWARC-Block-Digest: sha1:DR5MBP7OD3OPA7RFKWJUD4CTNUQUGFC5\r\nWARC-Payload-Digest: sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK\r\nContent-Type: application/http; msgtype=response\r\nContent-Length: 975\r\n\r\nHTTP/1.1 200 OK\r\nContent-Encoding: gzip\r\nAccept-Ranges: bytes\r\nCache-Control: max-age=604800\r\nContent-Type: text/html\r\nDate: Mon, 06 Mar 2017 04:02:06 GMT\r\nEtag: "359670651+gzip"\r\nExpires: Mon, 13 Mar 2017 04:02:06 GMT\r\nLast-Modified: Fri, 09 Aug 2013 23:54:35 GMT\r\nServer: ECS (iad/182A)\r\nVary: Accept-Encoding\r\nX-Cache: HIT\r\nContent-Length: 606\r\nConnection: close\r\n\r\n\x1f\x8b\x08\x00;\x81\x05R\x00\x03\x8dTA\xaf\xd30\x0c\xbe\xefW\x98r\x01i]\xf7\x80\x07S\xd7V @\xe2\x02\x1c\xe0\xc21k\xdc\xd5Z\x93\x94$\xed6\xa1\xf7\xdfq\xdb\xbd\xae\xe5\xed@+\xb5\x8e\x1d\x7f\xfel\xc7I\x9eI\x93\xfbs\x8dPzUe\x8b\xe4\xf1\x87Bf\x0b\xe0\'\xf1\xe4+\xcc>\x9f\x84\xaa+\x84OF\t\xd2I4h\x17\xc3\x16\x85^@^\n\xeb\xd0\xa7A\xe3\x8bp\x13@\x94M\x8c\xa5\xf7u\x88\xbf\x1bj\xd3\xe0\xa3\xd1\x1e\xb5\x0f\xbb\xb0\x01\xe4\xc3*\r<\x9e|\xd4\x85\xdf\x8eP\xb7\x90\xb4P\x98\x06-\xe1\xb16\xd6O\xfc\x8f$}\x99Jl)\xc7\xb0_,\x814y\x12U\xe8rQazw\x85r\xfe\xcc\xc9t\x0c.\x81s\xe7\x82\xc1\xb63\xf2\x0c\x7fz\xb1_\x8a\xfc\xb0\xb7\xa6\xd12\xccMel\x0c\xcf\x8b5\xbf\xaf\xb6\xe3\x16%\xec\x9et\x0c\xeb\xab\xaa\x16R\x92\xde\xcft\x053\r\x0b\xa1\xa8:\xc7\x10|\xafQ\xc3\x0f\xa1]\xb0\x84\xe0\x0bV-z\xca\x05|\xc3\x06Y3*\x96\xf0\xc1r\x06Kp\xbc5th\xa9\xb8"\xf6\xc2C\xff\x95\xd4NH\xf7\xe9\xc7\xf0v\xbd\xaeOOy\xde\xa3\x02\xd1xs\x83\xee\xfd\xcc\xe1V\xee\xc5$\xfe\xceX\x896\xb4BR\xe3b\xb8C\xb5\x9dP\x12qE\xfa\xb0\xe4\x7fK\x8e<\xca\t\xc1G\xb8\xd7\x9b7\x9b\xcd\x04\xb1\xebE(17Vx2\xccU\x1b\x8dS\xd0\xf7\n%\tx\xa1\xc4)\xbcd\xf9\xae\xcb\xf2\xe5\xb4e\xf3\x0e\xfeO&\x0f\xa34/\xe4\xa4\x98\xf3\x8a\xcd\xfa~\xc3
\xf6Oi\xd6s\xebX\xef\xb1dW\x12\xc37\x89\xfa#\x9au\xf2"\x89\x86y\\$]j<\x9eL\xf2r\x90\xcb\xbb\'\xa3\xc9\xaa\xc1Vg?Kr {=\xb0\x84\xce\x8b]E\xae\xe4^x\x03;\x84\xc6\xb1X\x18\x0bTU\x8d\xf3]\xd5[\x04\x1c\x10\x1d\xcf\x0f{\xe7\x8d\xe2\x01s+\xf8e\x1a\xce\xf9\xdc9\x81g\xe4\xe1\xe0]\xd0\xf5\xd5\xebH\xbe4\x8d\x87\xda\x12#\xe7\x86KA\xba\xef\'\xf0Z\xb8\x03\xa7\xde\x07\xad\xd1*r\x8e\r\xab$\xaaG\xd6\t\xdf\x17\x16\x8b4\xe8n\x8d8\x8a\x8e\xc7\xe3\x8a\x84\x16+c\xf7\xd1\x10\xcfE\x97hA\xf6\xd5X\xe4\xf0\x8c\xa7\xfa\x18\xab\x15\x83\x89\xac\x07L\xa2\xbeRIt\xa9[4\\o\x7f\x01\x08\x95\xaa\x8b\xf6\x04\x00\x00'

with patch_stdout() as buff:
res = main(args=['extract', '--headers', get_test_file('example.warc.gz'), '784'])
# --headers: the WARC headers followed by the HTTP headers, with no body.
assert buff.getvalue() == b'WARC/1.0\r\nWARC-Target-URI: http://example.com/\r\nWARC-Date: 2017-03-06T04:02:06Z\r\nWARC-Type: response\r\nWARC-Record-ID: <urn:uuid:a9c51e3e-0221-11e7-bf66-0242ac120005>\r\nWARC-IP-Address: 93.184.216.34\r\nWARC-Block-Digest: sha1:DR5MBP7OD3OPA7RFKWJUD4CTNUQUGFC5\r\nWARC-Payload-Digest: sha1:G7HRM7BGOKSKMSXZAHMUQTTV53QOFSMK\r\nContent-Type: application/http; msgtype=response\r\nContent-Length: 975\r\n\r\nHTTP/1.1 200 OK\r\nContent-Encoding: gzip\r\nAccept-Ranges: bytes\r\nCache-Control: max-age=604800\r\nContent-Type: text/html\r\nDate: Mon, 06 Mar 2017 04:02:06 GMT\r\nEtag: "359670651+gzip"\r\nExpires: Mon, 13 Mar 2017 04:02:06 GMT\r\nLast-Modified: Fri, 09 Aug 2013 23:54:35 GMT\r\nServer: ECS (iad/182A)\r\nVary: Accept-Encoding\r\nX-Cache: HIT\r\nContent-Length: 606\r\nConnection: close\r\n\r\n'

with patch_stdout() as buff:
res = main(args=['extract', '--payload', get_test_file('example.warc.gz'), '784'])
# --payload: the expected output is the readable HTML page, not the gzip
# bytes asserted in the whole-record case above.
assert buff.getvalue() == b'<!doctype html>\n<html>\n<head>\n <title>Example Domain</title>\n\n <meta charset="utf-8" />\n <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n <meta name="viewport" content="width=device-width, initial-scale=1" />\n <style type="text/css">\n body {\n background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n \n }\n div {\n width: 600px;\n margin: 5em auto;\n padding: 50px;\n background-color: #fff;\n border-radius: 1em;\n }\n a:link, a:visited {\n color: #38488f;\n text-decoration: none;\n }\n @media (max-width: 700px) {\n body {\n background-color: #fff;\n }\n div {\n width: auto;\n margin: 0 auto;\n border-radius: 0;\n padding: 1em;\n }\n }\n </style> \n</head>\n\n<body>\n<div>\n <h1>Example Domain</h1>\n <p>This domain is established to be used for illustrative examples in documents. You may use this\n domain in examples without prior coordination or asking for permission.</p>\n <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>\n</body>\n</html>\n'

# @pytest.mark.xfail
# warcio doesn't support ARC output yet, and @xfail tests have some bad
# interaction with patch_stdout(), thus the failing tests are commented out
def test_extract_arc():
    """Check ``extract --payload`` on the record at offset 151 of example.arc."""
    expected_html = b'<!doctype html>\n<html>\n<head>\n <title>Example Domain</title>\n\n <meta charset="utf-8" />\n <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n <meta name="viewport" content="width=device-width, initial-scale=1" />\n <style type="text/css">\n body {\n background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n \n }\n div {\n width: 600px;\n margin: 5em auto;\n padding: 50px;\n background-color: #fff;\n border-radius: 1em;\n }\n a:link, a:visited {\n color: #38488f;\n text-decoration: none;\n }\n @media (max-width: 700px) {\n body {\n background-color: #fff;\n }\n div {\n width: auto;\n margin: 0 auto;\n border-radius: 0;\n padding: 1em;\n }\n }\n </style> \n</head>\n\n<body>\n<div>\n <h1>Example Domain</h1>\n <p>This domain is established to be used for illustrative examples in documents. You may use this\n domain in examples without prior coordination or asking for permission.</p>\n <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>\n</body>\n</html>\n'

    with patch_stdout() as captured:
        main(args=['extract', '--payload', get_test_file('example.arc'), '151'])
        assert captured.getvalue() == expected_html

# with patch_stdout() as buff:
# res = main(args=['extract', '--headers', get_test_file('example.arc'), '151'])
# assert buff.getvalue() == b'http://example.com/ 93.184.216.119 20140216050221 text/html 1591\nHTTP/1.1 200 OK\r\nAccept-Ranges: bytes\r\nCache-Control: max-age=604800\r\nContent-Type: text/html\r\nDate: Sun, 16 Feb 2014 05:02:20 GMT\r\nEtag: "359670651"\r\nExpires: Sun, 23 Feb 2014 05:02:20 GMT\r\nLast-Modified: Fri, 09 Aug 2013 23:54:35 GMT\r\nServer: ECS (sjc/4FCE)\r\nX-Cache: HIT\r\nx-ec-custom-error: 1\r\nContent-Length: 1270\r\n\r\n'

# with patch_stdout() as buff:
# res = main(args=['extract', get_test_file('example.arc'), '151'])
# assert buff.getvalue() == b'http://example.com/ 93.184.216.119 20140216050221 text/html 1591\nHTTP/1.1 200 OK\r\nAccept-Ranges: bytes\r\nCache-Control: max-age=604800\r\nContent-Type: text/html\r\nDate: Sun, 16 Feb 2014 05:02:20 GMT\r\nEtag: "359670651"\r\nExpires: Sun, 23 Feb 2014 05:02:20 GMT\r\nLast-Modified: Fri, 09 Aug 2013 23:54:35 GMT\r\nServer: ECS (sjc/4FCE)\r\nX-Cache: HIT\r\nx-ec-custom-error: 1\r\nContent-Length: 1270\r\n\r\n<!doctype html>\n<html>\n<head>\n <title>Example Domain</title>\n\n <meta charset="utf-8" />\n <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n <meta name="viewport" content="width=device-width, initial-scale=1" />\n <style type="text/css">\n body {\n background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n \n }\n div {\n width: 600px;\n margin: 5em auto;\n padding: 50px;\n background-color: #fff;\n border-radius: 1em;\n }\n a:link, a:visited {\n color: #38488f;\n text-decoration: none;\n }\n @media (max-width: 700px) {\n body {\n background-color: #fff;\n }\n div {\n width: auto;\n margin: 0 auto;\n border-radius: 0;\n padding: 1em;\n }\n }\n </style> \n</head>\n\n<body>\n<div>\n <h1>Example Domain</h1>\n <p>This domain is established to be used for illustrative examples in documents. You may use this\n domain in examples without prior coordination or asking for permission.</p>\n <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>\n</body>\n</html>\n'

@contextmanager
def patch_stdout():
Expand Down
37 changes: 37 additions & 0 deletions warcio/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import tempfile
import shutil
import sys


# ============================================================================
Expand All @@ -33,9 +34,45 @@ def main(args=None):
recompress.add_argument('output')
recompress.set_defaults(func=Recompressor())

extract = subparsers.add_parser('extract', help='Extract WARC/ARC Record')
extract.add_argument('filename')
extract.add_argument('offset')
group = extract.add_mutually_exclusive_group()
group.add_argument('--payload', action='store_true', help='output only record payload (after content and transfer decoding, if applicable)')
group.add_argument('--headers', action='store_true', help='output only record headers (and http headers, if applicable)')

extract.set_defaults(func=extract_record)

cmd = parser.parse_args(args=args)
cmd.func(cmd)

# ============================================================================
def extract_record(cmd):
    """Write the record found at ``cmd.offset`` in ``cmd.filename`` to stdout.

    What is written depends on the parsed command-line flags:

    * ``cmd.payload`` set: only the record's content stream.
    * ``cmd.headers`` set: the record headers, followed by the HTTP headers
      when the record has them.
    * neither set: the headers as above, then the rest of the raw stream.

    :param cmd: parsed arguments providing ``filename``, ``offset``,
        ``payload`` and ``headers``.
    :raises StopIteration: if the iterator yields no record at the offset.
    """
    chunk_size = 65536

    with open(cmd.filename, 'rb') as fh:
        fh.seek(int(cmd.offset))
        record = next(iter(ArchiveIterator(fh)))

        # Bytes must go to the binary layer of stdout when there is one
        # (``sys.stdout.buffer``); otherwise stdout itself is used.
        stdout_raw = getattr(sys.stdout, 'buffer', sys.stdout)

        # All copying happens inside the ``with`` block because the record's
        # streams are read while ``fh`` is still open.
        if cmd.payload:
            shutil.copyfileobj(record.content_stream(), stdout_raw, chunk_size)
            return

        stdout_raw.write(record.rec_headers.to_bytes())
        if record.http_headers:
            stdout_raw.write(record.http_headers.to_bytes())
        if not cmd.headers:
            shutil.copyfileobj(record.raw_stream, stdout_raw, chunk_size)

# ============================================================================
def get_version():
Expand Down

0 comments on commit 0c280e0

Please sign in to comment.