Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

record.content_stream().read() alters the record and causes a write out to fail #114

Open
thomaspreece opened this issue Apr 24, 2020 · 1 comment

Comments

@thomaspreece
Copy link
Contributor

(Using code from #57)
Calling record.content_stream().read() before writing a record alters the record in such a way that the file written out is incorrect and mangled.

import pytest

from io import BytesIO
from tempfile import NamedTemporaryFile

from warcio.archiveiterator import ArchiveIterator
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders

def test_identity_correct():
    """Round trip: a record written to a file and parsed back matches the original."""
    body = b'foobar'
    with NamedTemporaryFile() as tmp:
        warc_writer = WARCWriter(tmp, gzip=False)
        expected = warc_writer.create_warc_record(
            'http://example.com/', 'request',
            payload=BytesIO(body),
            warc_headers_dict={'Foo': 'Bar'},
            http_headers=StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True))
        warc_writer.write_record(expected)

        # Rewind, then parse the first record back out of the same file.
        tmp.seek(0)
        actual = next(ArchiveIterator(tmp))
        assert actual.rec_type == expected.rec_type
        assert actual.rec_headers == expected.rec_headers
        assert actual.content_type == expected.content_type
        assert actual.length == expected.length
        assert actual.http_headers == expected.http_headers
        assert actual.raw_stream.read() == body

def test_identity_fail ():
    """ read(write(record)) should yield record, even when content_stream() was read before the write """
    with NamedTemporaryFile () as fd:
        payload = b'foobar'
        writer = WARCWriter (fd, gzip=False)
        httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
        warcHeaders = {'Foo': 'Bar'}
        record = writer.create_warc_record ('http://example.com/', 'request',
                payload=BytesIO(payload),
                warc_headers_dict=warcHeaders, http_headers=httpHeaders)
        record.content_stream().read()  # unlike test_identity_correct, the content is read before the record is written
        writer.write_record (record)

        fd.seek (0)  # rewind so the record just written can be parsed back
        rut = next (ArchiveIterator (fd))  # record under test: first record read back from the file
        golden = record  # the in-memory record serves as the reference
        assert rut.rec_type == golden.rec_type
        assert rut.rec_headers == golden.rec_headers
        assert rut.content_type == golden.content_type
        assert rut.length == golden.length
        assert rut.http_headers == golden.http_headers
        assert rut.raw_stream.read() == payload

test_identity_correct()  # baseline: record written without touching content_stream()
print("Write Worked")
test_identity_fail()
print("Write 2 Worked")  # not reached in the reported run: the call above raises AssertionError

Output:

Write Worked
Traceback (most recent call last):
  File "./test2.py", line 57, in <module>
    test_identity_fail()
  File "./test2.py", line 53, in test_identity_fail
    assert rut.raw_stream.read() == payload
AssertionError
@wumpus
Copy link
Collaborator

wumpus commented Apr 24, 2020

It's not obvious, but this is the same as #64 -- and we should fix it while keeping warcio's ability to do streaming.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants