Skip to content

Commit

Permalink
Fixes related to reading record and writing same record back out:
Browse files Browse the repository at this point in the history
- Fix issue #104 where utf-8 canonicalization caused record to be written with incorrect content-length
- Fix issue #57 where the protocol is incorrectly set as the statusline
- fill payload_length when reading warc records
- always support tell() on LimitReader and BufferedReader
  • Loading branch information
ikreymer committed Feb 20, 2020
1 parent 49113b4 commit 68cdcd6
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 9 deletions.
61 changes: 61 additions & 0 deletions test/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -782,3 +782,64 @@ def validate_response(record):

validate_warcinfo(records[0])

def test_utf8_rewrite_content_adjust(self):
    """Round-trip a record whose HTTP headers contain non-ASCII text.

    A response record is parsed from raw bytes (its Content-Length is the
    UTF-8 byte length of the HTTP block, 226), written back out with an
    uncompressed ``BufferWARCWriter``, and the output is compared against
    the expected ``RESPONSE_RECORD_UNICODE_HEADERS`` text.  The written
    record is then re-read and must report a length of 268.

    NOTE(review): the growth from 226 to 268 is presumably caused by the
    writer re-encoding the non-ASCII ``filename`` header value -- confirm
    against RESPONSE_RECORD_UNICODE_HEADERS.
    """
    # The continuation lines below must stay at column 0: any leading
    # whitespace would become part of the payload and change its length.
    UTF8_PAYLOAD = '\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename="испытание.txt"\r\n\
Custom-Header: somevalue\r\n\
Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
\r\n\
some\n\
text'

    # Content-Length counts bytes, not characters.
    content_length = len(UTF8_PAYLOAD.encode('utf-8'))

    UTF8_RECORD = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: {0}\r\n\
\r\n\
{1}\r\n\
\r\n\
'.format(content_length, UTF8_PAYLOAD)

    assert content_length == 226

    record = ArcWarcRecordLoader().parse_record_stream(BytesIO(UTF8_RECORD.encode('utf-8')))

    writer = BufferWARCWriter(gzip=False)
    writer.write_record(record)

    raw_buff = writer.get_contents()
    assert raw_buff.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS

    # Count the records so the length check cannot pass vacuously when the
    # iterator yields nothing; a distinct loop name avoids rebinding `record`.
    num_records = 0
    for reread in ArchiveIterator(writer.get_stream()):
        num_records += 1
        assert reread.length == 268

    assert num_records == 1

def test_identity(self):
    """ read(write(record)) should yield record

    A request record is created and written with a gzip-enabled
    ``BufferWARCWriter``; reading the buffer back must yield exactly one
    record whose type, WARC headers, content type, length, HTTP headers
    and payload all match the record that was written.
    """
    payload = b'foobar'
    writer = BufferWARCWriter(gzip=True)
    httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
    warcHeaders = {'Foo': 'Bar'}
    record = writer.create_warc_record('http://example.com/', 'request',
                                       payload=BytesIO(payload),
                                       warc_headers_dict=warcHeaders, http_headers=httpHeaders)

    writer.write_record(record)

    # Count records so the comparisons cannot pass vacuously on empty output.
    # The assertions stay inside the loop so each record's raw_stream is read
    # while it is the iterator's current record.
    num_records = 0
    for new_rec in ArchiveIterator(writer.get_stream()):
        num_records += 1
        assert new_rec.rec_type == record.rec_type
        assert new_rec.rec_headers == record.rec_headers
        assert new_rec.content_type == record.content_type
        assert new_rec.length == record.length
        assert new_rec.http_headers == record.http_headers
        assert new_rec.raw_stream.read() == payload

    assert num_records == 1

3 changes: 3 additions & 0 deletions warcio/bufferedreaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,9 @@ def readline(self, length=None):

return linebuff

def tell(self):
    """Return the reader position as tracked in ``self.num_read``.

    NOTE(review): presumably the number of bytes handed out so far --
    confirm against the code that updates ``num_read``.
    """
    position = self.num_read
    return position

def empty(self):
if not self.buff or self.buff.tell() >= self.buff_size:
# if reading all members, attempt to get next member automatically
Expand Down
9 changes: 4 additions & 5 deletions warcio/limitreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@ class LimitReader(object):
def __init__(self, stream, limit):
self.stream = stream
self.limit = limit

if hasattr(stream, 'tell'):
self.tell = self._tell
self._orig_limit = limit

def _update(self, buff):
length = len(buff)
Expand Down Expand Up @@ -43,8 +41,9 @@ def readline(self, length=None):
def close(self):
    """Close the wrapped stream by delegating to its own ``close()``."""
    wrapped = self.stream
    wrapped.close()

def _tell(self):
return self.stream.tell()
def tell(self):
    """Return the offset within the limited region.

    The position is computed locally as the difference between the
    stored ``_orig_limit`` and the current ``limit``, so it does not
    depend on the wrapped stream providing ``tell()``.

    NOTE(review): assumes ``self.limit`` is decremented as data is read --
    confirm in ``_update``.
    """
    consumed = self._orig_limit - self.limit
    return consumed

@staticmethod
def wrap_stream(stream, content_length):
Expand Down
4 changes: 2 additions & 2 deletions warcio/recordbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self, warc_version=None, header_filter=None):
self.header_filter = header_filter

def create_warcinfo_record(self, filename, info):
warc_headers = StatusAndHeaders(self.warc_version, [])
warc_headers = StatusAndHeaders('', [], protocol=self.warc_version)
warc_headers.add_header('WARC-Type', 'warcinfo')
warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
if filename:
Expand Down Expand Up @@ -121,7 +121,7 @@ def create_warc_record(self, uri, record_type,
return record

def _init_warc_headers(self, uri, record_type, warc_headers_dict):
warc_headers = StatusAndHeaders(self.warc_version, list(warc_headers_dict.items()))
warc_headers = StatusAndHeaders('', list(warc_headers_dict.items()), protocol=self.warc_version)
warc_headers.replace_header('WARC-Type', record_type)
if not warc_headers.get_header('WARC-Record-ID'):
warc_headers.add_header('WARC-Record-ID', self._make_warc_id())
Expand Down
8 changes: 6 additions & 2 deletions warcio/recordloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class ArcWarcRecord(object):
def __init__(self, *args, **kwargs):
(self.format, self.rec_type, self.rec_headers, self.raw_stream,
self.http_headers, self.content_type, self.length) = args
self.payload_length = -1
self.payload_length = kwargs.get('payload_length', -1)
self.digest_checker = kwargs.get('digest_checker')

def content_stream(self):
Expand Down Expand Up @@ -135,10 +135,14 @@ def parse_record_stream(self, stream,
length=length)

http_headers = None
payload_length = -1

# load http headers if parsing
if not no_record_parse:
start = stream.tell()
http_headers = self.load_http_headers(rec_type, uri, stream, length)
if length and http_headers:
payload_length = length - (stream.tell() - start)

# generate validate http headers (eg. for replay)
if not http_headers and ensure_http_headers:
Expand All @@ -149,7 +153,7 @@ def parse_record_stream(self, stream,

return ArcWarcRecord(the_format, rec_type,
rec_headers, stream, http_headers,
content_type, length, digest_checker=digest_checker)
content_type, length, payload_length=payload_length, digest_checker=digest_checker)

def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None):
payload_digest = rec_headers.get_header('WARC-Payload-Digest')
Expand Down

0 comments on commit 68cdcd6

Please sign in to comment.