Skip to content

Commit

Permalink
writer: support create_warc_record() with payload and no length corre…
Browse files Browse the repository at this point in the history
…ctly auto-computes length

ArcWarcRecordLoader.load_http_headers() determines if there are headers to load;
it can be used to load http headers from a stream if needed, and is used by create_warc_record()
tests: add tests for dns response and resource records, fixes #15
bump version to 1.3.1
  • Loading branch information
ikreymer committed May 9, 2017
1 parent 595f651 commit 14757df
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 16 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from setuptools.command.test import test as TestCommand
import glob

__version__ = '1.3'
__version__ = '1.3.1'


class PyTest(TestCommand):
Expand Down
67 changes: 67 additions & 0 deletions test/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,45 @@ def _make_warc_date(cls):
'


# Serialized WARC 'response' record for the dns:google.com sample; paired with
# sample_response_dns() through the @sample_record decorator.
# NOTE: the literal relies on trailing-backslash line continuations, so every
# line break inside the record is an explicit \r\n or \n escape.
DNS_RESPONSE_RECORD = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: dns:google.com\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:2AAVJYKKIWK5CF6EWE7PH63EMNLO44TH\r\n\
WARC-Block-Digest: sha1:2AAVJYKKIWK5CF6EWE7PH63EMNLO44TH\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 147\r\n\
\r\n\
20170509000739\n\
google.com. 185 IN A 209.148.113.239\n\
google.com. 185 IN A 209.148.113.238\n\
google.com. 185 IN A 209.148.113.250\n\
\r\n\r\n\
'

# Serialized WARC 'resource' record for the dns:google.com sample; paired with
# sample_resource_dns() through the @sample_record decorator.
# NOTE: the literal relies on trailing-backslash line continuations, so every
# line break inside the record is an explicit \r\n or \n escape.
DNS_RESOURCE_RECORD = '\
WARC/1.0\r\n\
WARC-Type: resource\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: dns:google.com\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:2AAVJYKKIWK5CF6EWE7PH63EMNLO44TH\r\n\
WARC-Block-Digest: sha1:2AAVJYKKIWK5CF6EWE7PH63EMNLO44TH\r\n\
Content-Type: application/warc-record\r\n\
Content-Length: 147\r\n\
\r\n\
20170509000739\n\
google.com. 185 IN A 209.148.113.239\n\
google.com. 185 IN A 209.148.113.238\n\
google.com. 185 IN A 209.148.113.250\n\
\r\n\r\n\
'




# ============================================================================
# Decorator Setup
# ============================================================================
Expand Down Expand Up @@ -279,6 +318,34 @@ def sample_response_2(writer):
http_headers=http_headers)


# ============================================================================
@sample_record('response_dns', DNS_RESPONSE_RECORD)
def sample_response_dns(writer):
    """Create a 'response' record for dns:google.com from a raw payload.

    Only the uri, the record type and a payload stream are passed to the
    writer; no length and no http headers are supplied.
    """
    # The payload lines stay at column 0: they are part of the bytes literal.
    dns_answer = b'''\
20170509000739
google.com. 185 IN A 209.148.113.239
google.com. 185 IN A 209.148.113.238
google.com. 185 IN A 209.148.113.250
'''
    stream = BytesIO(dns_answer)

    return writer.create_warc_record('dns:google.com', 'response',
                                     payload=stream)


# ============================================================================
@sample_record('resource_dns', DNS_RESOURCE_RECORD)
def sample_resource_dns(writer):
    """Create a 'resource' record for dns:google.com from a raw payload.

    Only the uri, the record type and a payload stream are passed to the
    writer; no length and no http headers are supplied.
    """
    # The payload lines stay at column 0: they are part of the bytes literal.
    dns_answer = b'''\
20170509000739
google.com. 185 IN A 209.148.113.239
google.com. 185 IN A 209.148.113.238
google.com. 185 IN A 209.148.113.250
'''
    stream = BytesIO(dns_answer)

    return writer.create_warc_record('dns:google.com', 'resource',
                                     payload=stream)


# ============================================================================
@sample_record('request_1', REQUEST_RECORD)
def sample_request(writer):
Expand Down
27 changes: 17 additions & 10 deletions warcio/recordloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,9 @@ def parse_record_stream(self, stream,

http_headers = None

# record has http headers
# checking if length != 0 instead of length > 0
# since length == None is also accepted
if (not no_record_parse and
length != 0 and
(rec_type in self.HTTP_RECORDS) and
uri.startswith(self.HTTP_SCHEMES)):

http_headers = self._load_http_headers(rec_type, stream)
# load http headers if parsing
if not no_record_parse:
http_headers = self.load_http_headers(rec_type, uri, stream, length)

# generate valid http headers (e.g. for replay)
if not http_headers and ensure_http_headers:
Expand All @@ -148,7 +142,20 @@ def parse_record_stream(self, stream,
rec_headers, stream, http_headers,
content_type, length)

def _load_http_headers(self, rec_type, stream):
def load_http_headers(self, rec_type, uri, stream, length):
# skip parsing only if length == 0
# try parsing if length is unknown (length is None) or length > 0
if length == 0:
return None

# only certain record types can have http headers
if rec_type not in self.HTTP_RECORDS:
return None

# only http:/https: uris can have http headers
if not uri.startswith(self.HTTP_SCHEMES):
return None

# request record: parse request
if rec_type == 'request':
return self.http_req_parser.parse(stream)
Expand Down
12 changes: 7 additions & 5 deletions warcio/warcwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from warcio.statusandheaders import StatusAndHeadersParser, StatusAndHeaders

from warcio.recordloader import ArcWarcRecord
from warcio.recordloader import ArcWarcRecord, ArcWarcRecordLoader


# ============================================================================
Expand Down Expand Up @@ -170,15 +170,17 @@ def create_revisit_record(self, uri, digest, refers_to_uri, refers_to_date,

def create_warc_record(self, uri, record_type,
payload=None,
length=0,
length=None,
warc_content_type='',
warc_headers_dict={},
warc_headers=None,
http_headers=None):

if payload and not http_headers and record_type in ('response', 'request'):
http_headers = self.parser.parse(payload)
length -= payload.tell()
if payload and not http_headers:
loader = ArcWarcRecordLoader()
http_headers = loader.load_http_headers(record_type, uri, payload, length)
if http_headers and length is not None:
length -= payload.tell()

if not payload:
payload = BytesIO()
Expand Down

0 comments on commit 14757df

Please sign in to comment.