# Skip to content
# Branch: master
# Find file Copy path
# Find file Copy path
# Fetching contributors…
# Cannot retrieve contributors at this time
# 372 lines (279 sloc) 13.3 KB
from warcio.statusandheaders import StatusAndHeaders
from warcio.statusandheaders import StatusAndHeadersParser
from warcio.statusandheaders import StatusAndHeadersParserException
from warcio.exceptions import ArchiveLoadFailed
from warcio.limitreader import LimitReader
from warcio.digestverifyingreader import DigestVerifyingReader, DigestChecker
from warcio.bufferedreaders import BufferedReader, ChunkedDataReader
from warcio.timeutils import timestamp_to_iso_date
from six.moves import zip
import logging
logger = logging.getLogger(__name__)
class ArcWarcRecord(object):
    """Value object describing one parsed ARC/WARC record.

    The seven positional arguments are, in order: ``format``, ``rec_type``,
    ``rec_headers``, ``raw_stream``, ``http_headers``, ``content_type`` and
    ``length``.  The only keyword argument read is ``digest_checker``.
    """

    def __init__(self, *args, **kwargs):
        # Exactly seven positional values are required; any other count
        # fails the tuple unpacking with a ValueError.
        (self.format, self.rec_type, self.rec_headers, self.raw_stream,
         self.http_headers, self.content_type, self.length) = args

        # -1 is the initial placeholder; nothing in this class updates it.
        self.payload_length = -1
        self.digest_checker = kwargs.get('digest_checker')

    def content_stream(self):
        """Return a stream over the record content.

        Without HTTP headers the raw stream is returned untouched.  Otherwise
        the ``content-encoding`` header (lower-cased, and ignored when
        ``BufferedReader`` does not list it as a supported decompressor) and
        the ``transfer-encoding`` header decide which reader, if any, wraps
        the raw stream.
        """
        headers = self.http_headers
        if not headers:
            return self.raw_stream

        decomp = headers.get_header('content-encoding')
        if decomp:
            decomp = decomp.lower()
            if decomp not in BufferedReader.get_supported_decompressors():
                decomp = None

        is_chunked = headers.get_header('transfer-encoding') == 'chunked'
        if is_chunked:
            # Chunked bodies are wrapped even when no decompressor applies.
            return ChunkedDataReader(self.raw_stream, decomp_type=decomp)

        if decomp:
            return BufferedReader(self.raw_stream, decomp_type=decomp)

        return self.raw_stream
class ArcWarcRecordLoader(object):
    """Parse one ARC or WARC record from a stream into an ArcWarcRecord.

    The loader owns one parser for WARC header blocks, one for ARC header
    lines (optionally converting them to WARC-style headers), and two HTTP
    parsers: one for response status lines and one for request lines.
    """

    WARC_TYPES = ['WARC/1.1', 'WARC/1.0', 'WARC/0.17', 'WARC/0.18']

    HTTP_TYPES = ['HTTP/1.0', 'HTTP/1.1']

    # Request-line prefixes accepted by ``http_req_parser``.
    HTTP_VERBS = ['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'TRACE',
                  'OPTIONS', 'CONNECT', 'PATCH']

    # Record types whose block may start with HTTP headers.
    HTTP_RECORDS = ('response', 'request', 'revisit')

    NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
    HTTP_SCHEMES = ('http:', 'https:')

    def __init__(self, verify_http=True, arc2warc=True):
        """Create the header parsers.

        :param bool verify_http: forwarded as the second argument of both
            HTTP ``StatusAndHeadersParser`` instances
        :param bool arc2warc: when true, ARC header lines are parsed with
            ``ARC2WARCHeadersParser``; otherwise with ``ARCHeadersParser``
        """
        if arc2warc:
            self.arc_parser = ARC2WARCHeadersParser()
        else:
            self.arc_parser = ARCHeadersParser()

        self.warc_parser = StatusAndHeadersParser(self.WARC_TYPES)
        self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
        self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)

    def parse_record_stream(self, stream,
                            statusline=None,
                            known_format=None,
                            no_record_parse=False,
                            ensure_http_headers=False,
                            check_digests=False):
        """ Parse file-like stream and return an ArcWarcRecord
        encapsulating the record headers, http headers (if any),
        and a stream limited to the remainder of the record.

        Pass statusline and known_format to _detect_type_load_headers()
        to facilitate parsing.

        :param stream: file-like object positioned at the start of a record
        :param statusline: already-read first line of the record, if any
        :param known_format: 'warc' or 'arc' to parse as that format only
        :param bool no_record_parse: when true, HTTP headers are not parsed
        :param bool ensure_http_headers: when true and no HTTP headers were
            parsed, default headers are generated
        :param check_digests: passed to ``DigestChecker``; when truthy the
            stream is also wrapped for digest verification
        :raises ArchiveLoadFailed: if the record headers cannot be parsed
        """
        (the_format, rec_headers) = self._detect_type_load_headers(
            stream, statusline, known_format)

        if the_format == 'arc':
            uri = rec_headers.get_header('uri')
            length = rec_headers.get_header('length')
            content_type = rec_headers.get_header('content-type')
            sub_len = rec_headers.total_len
            if uri and uri.startswith('filedesc://'):
                rec_type = 'arc_header'
            else:
                rec_type = 'response'

        elif the_format in ('warc', 'arc2warc'):
            rec_type = rec_headers.get_header('WARC-Type')
            uri = self._ensure_target_uri_format(rec_headers)
            length = rec_headers.get_header('Content-Length')
            content_type = rec_headers.get_header('Content-Type')
            if the_format == 'warc':
                sub_len = 0
            else:
                # ARC record converted to WARC headers: subtract the extra
                # header lines already consumed and report it as 'warc'.
                sub_len = rec_headers.total_len
                the_format = 'warc'

        is_err = False

        try:
            if length is not None:
                length = int(length) - sub_len
                if length < 0:
                    is_err = True

        except (ValueError, TypeError):
            is_err = True

        # err condition: an unparsable or negative length is treated as empty
        if is_err:
            length = 0

        is_verifying = False
        digest_checker = DigestChecker(check_digests)

        # limit stream to the length for all valid records
        if length is not None and length >= 0:
            stream = LimitReader.wrap_stream(stream, length)
            if check_digests:
                stream, is_verifying = self.wrap_digest_verifying_stream(
                    stream, rec_type, rec_headers, digest_checker,
                    length=length)

        http_headers = None

        # load http headers if parsing
        if not no_record_parse:
            http_headers = self.load_http_headers(rec_type, uri, stream, length)

        # generate valid http headers (eg. for replay)
        if not http_headers and ensure_http_headers:
            http_headers = self.default_http_headers(length, content_type)

        if is_verifying:
            # NOTE(review): begin_payload() is assumed to be the
            # DigestVerifyingReader hook marking where the payload starts
            # (after any HTTP headers) -- confirm against
            # warcio.digestverifyingreader.
            stream.begin_payload()

        return ArcWarcRecord(the_format, rec_type,
                             rec_headers, stream, http_headers,
                             content_type, length, digest_checker=digest_checker)

    def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None):
        """Wrap ``stream`` in a DigestVerifyingReader when digests are present.

        :return: ``(stream, True)`` with the wrapped stream when the record
            has a WARC-Payload-Digest or WARC-Block-Digest header, otherwise
            ``(stream, False)`` with the stream unchanged
        """
        payload_digest = rec_headers.get_header('WARC-Payload-Digest')
        block_digest = rec_headers.get_header('WARC-Block-Digest')
        segment_number = rec_headers.get_header('WARC-Segment-Number')

        if not payload_digest and not block_digest:
            return stream, False

        # NOTE(review): keyword names are assumed to match the
        # DigestVerifyingReader constructor -- confirm.
        stream = DigestVerifyingReader(stream, length, digest_checker,
                                       record_type=rec_type,
                                       payload_digest=payload_digest,
                                       block_digest=block_digest,
                                       segment_number=segment_number)

        return stream, True

    def load_http_headers(self, rec_type, uri, stream, length):
        """Parse HTTP headers from the start of ``stream`` when applicable.

        :return: the parsed headers, or None when the record is empty, is not
            an HTTP-bearing record type, has no http:/https: target URI, or
            is a revisit whose block is empty
        """
        # only if length == 0 don't parse
        # try parsing if length is unknown (length is None) or length > 0
        if length == 0:
            return None

        # only certain record types can have http headers
        if rec_type not in self.HTTP_RECORDS:
            return None

        # only http:/https: uris can have http headers; a record without a
        # target URI is treated the same way instead of raising on None
        if not uri or not uri.startswith(self.HTTP_SCHEMES):
            return None

        # request record: parse request
        if rec_type == 'request':
            return self.http_req_parser.parse(stream)

        elif rec_type == 'revisit':
            try:
                return self.http_parser.parse(stream)
            except EOFError:
                # empty revisit with no http headers, is ok!
                return None

        # response record or non-empty revisit: parse HTTP status and headers!
        else:
            return self.http_parser.parse(stream)

    def default_http_headers(self, length, content_type=None):
        """Build a synthetic '200 OK' HTTP/1.0 header block.

        Content-Type is added when ``content_type`` is truthy, and
        Content-Length when ``length`` is a non-negative number.
        """
        headers = []
        if content_type:
            headers.append(('Content-Type', content_type))

        if length is not None and length >= 0:
            headers.append(('Content-Length', str(length)))

        return StatusAndHeaders('200 OK', headers=headers, protocol='HTTP/1.0')

    def _detect_type_load_headers(self, stream,
                                  statusline=None, known_format=None):
        """ If known_format is specified ('warc' or 'arc'),
        parse only as that format.

        Otherwise, try parsing record as WARC, then try parsing as ARC.
        if neither one succeeds, we're out of luck.

        :return: ``(format, rec_headers)`` where format is 'warc' or the
            value of ``self.arc_parser.get_rec_type()``
        :raises ArchiveLoadFailed: if the first line matches no allowed format
        """
        if known_format != 'arc':
            # try as warc first
            try:
                rec_headers = self.warc_parser.parse(stream, statusline)
                return 'warc', rec_headers
            except StatusAndHeadersParserException as se:
                if known_format == 'warc':
                    msg = 'Invalid WARC record, first line: '
                    raise ArchiveLoadFailed(msg + str(se.statusline))

                # reuse the line the WARC parser already consumed
                statusline = se.statusline

        # now try as arc
        try:
            rec_headers = self.arc_parser.parse(stream, statusline)
            return self.arc_parser.get_rec_type(), rec_headers
        except StatusAndHeadersParserException as se:
            if known_format == 'arc':
                msg = 'Invalid ARC record, first line: '
            else:
                msg = 'Unknown archive format, first line: '
            raise ArchiveLoadFailed(msg + str(se.statusline))

    def _ensure_target_uri_format(self, rec_headers):
        """Checks the value for the WARC-Target-URI header field to see if it starts
        with '<' and ends with '>' (Wget 1.19 bug) and if '<' and '>' are present,
        corrects and updates the field returning the corrected value for the field
        otherwise just returns the fields value. Also checks for the presence of
        spaces and percent-encodes them if present, for more reliable parsing

        :param StatusAndHeaders rec_headers: The parsed WARC headers
        :return: The value for the WARC-Target-URI field
        :rtype: str | None
        """
        uri = rec_headers.get_header('WARC-Target-URI')
        if uri is not None and uri.startswith('<') and uri.endswith('>'):
            uri = uri[1:-1]
            rec_headers.replace_header('WARC-Target-URI', uri)

        if uri is not None and " " in uri:
            logger.warning("Replacing spaces in invalid WARC-Target-URI: {}".format(uri))
            uri = uri.replace(" ", "%20")
            rec_headers.replace_header('WARC-Target-URI', uri)

        return uri
class ARCHeadersParser(object):
    """Parse the single space-separated header line of an ARC record."""

    # ARC 1.0 headers
    ARC_HEADERS = ["uri", "ip-address", "archive-date",
                   "content-type", "length"]

    def __init__(self):
        self.headernames = self.get_header_names()

    def get_rec_type(self):
        """Return the format tag reported for records parsed by this class."""
        return 'arc'

    def parse(self, stream, headerline=None):
        """Parse one ARC header line into a StatusAndHeaders object.

        :param stream: file-like object; read for the header line when
            ``headerline`` is None, and for the two extra lines that follow
            a ``filedesc://`` header
        :param headerline: the already-read header line, if any
        :return: StatusAndHeaders with an empty statusline, the parsed
            headers, and ``total_len`` set to the combined length of the
            decoded extra lines consumed (0 for ordinary records)
        :raises EOFError: if the header line is empty
        :raises StatusAndHeadersParserException: if the line does not split
            into the expected number of fields
        """
        total_read = 0

        if headerline is None:
            headerline = stream.readline()

        headerline = StatusAndHeadersParser.decode_header(headerline)

        header_len = len(headerline)

        if header_len == 0:
            raise EOFError()

        headerline = headerline.rstrip()

        headernames = self.headernames

        # if arc header, consume next two lines
        if headerline.startswith('filedesc://'):
            version = StatusAndHeadersParser.decode_header(stream.readline())  # skip version
            spec = StatusAndHeadersParser.decode_header(stream.readline())  # skip header spec, use preset one
            total_read += len(version)
            total_read += len(spec)

        # split from the right so any spaces in the first field (the uri)
        # stay inside that field
        parts = headerline.rsplit(' ', len(headernames)-1)

        if len(parts) != len(headernames):
            msg = 'Wrong # of headers, expected arc headers {0}, Found {1}'
            msg = msg.format(headernames, parts)
            raise StatusAndHeadersParserException(msg, parts)

        protocol, headers = self._get_protocol_and_headers(headerline, parts)

        return StatusAndHeaders(statusline='',
                                headers=headers,
                                protocol=protocol,
                                total_len=total_read)

    @classmethod
    def get_header_names(cls):
        """Return the ordered field names expected in a header line."""
        return cls.ARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """Pair each field name with its value; return ``(protocol, headers)``."""
        headers = list(zip(self.headernames, parts))
        return ('ARC/1.0', headers)
class ARC2WARCHeadersParser(ARCHeadersParser):
    """Parse an ARC header line and expose it as WARC-style headers."""

    # Headers for converting ARC -> WARC Header
    # (same positions as the ARC fields: uri, ip-address, archive-date,
    # content-type, length)
    ARC_TO_WARC_HEADERS = ["WARC-Target-URI",
                           "WARC-IP-Address",
                           "WARC-Date",
                           "Content-Type",
                           "Content-Length"]

    def get_rec_type(self):
        """Return the format tag reported for records parsed by this class."""
        return 'arc2warc'

    @classmethod
    def get_header_names(cls):
        """Return the WARC header names, in ARC field order."""
        return cls.ARC_TO_WARC_HEADERS

    def _get_protocol_and_headers(self, headerline, parts):
        """Build WARC headers from the split ARC fields.

        A ``filedesc://`` line becomes a 'warcinfo' record whose target URI
        is emitted as WARC-Filename with the scheme prefix removed; any other
        line becomes a 'response' record whose content type (``parts[3]``,
        modified in place) is replaced with the HTTP response media type.

        :return: ``('WARC/1.0', headers)``
        """
        headers = []

        if headerline.startswith('filedesc://'):
            rec_type = 'warcinfo'
        else:
            rec_type = 'response'
            parts[3] = 'application/http;msgtype=response'

        headers.append(('WARC-Type', rec_type))
        headers.append(('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()))

        for name, value in zip(self.headernames, parts):
            if name == 'WARC-Date':
                value = timestamp_to_iso_date(value)

            if rec_type == 'warcinfo' and name == 'WARC-Target-URI':
                name = 'WARC-Filename'
                value = value[len('filedesc://'):]

            headers.append((name, value))

        return ('WARC/1.0', headers)
# You can’t perform that action at this time.