Skip to content

Commit

Permalink
encoding fixes: set default encoding for to_native_str() to utf-8
Browse files Browse the repository at this point in the history
encode WARC headers as utf-8, http headers as iso-8859-1
when decoding WARC headers, attempt utf-8 first, fall back to iso-8859-1
see #7 and #6 for more details
  • Loading branch information
ikreymer committed Apr 14, 2017
1 parent cdedc00 commit a3a8960
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 25 deletions.
14 changes: 3 additions & 11 deletions warcio/recordloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from warcio.statusandheaders import StatusAndHeadersParserException

from warcio.limitreader import LimitReader
from warcio.utils import to_native_str

from warcio.bufferedreaders import BufferedReader, ChunkedDataReader

Expand Down Expand Up @@ -220,14 +219,7 @@ def get_rec_type(self):
def parse(self, stream, headerline=None):
total_read = 0

def readline():
return to_native_str(stream.readline())

# if headerline passed in, use that
if headerline is None:
headerline = readline()
else:
headerline = to_native_str(headerline)
headerline = StatusAndHeadersParser.read_decoded_line(stream, headerline)

header_len = len(headerline)

Expand All @@ -240,8 +232,8 @@ def readline():

# if arc header, consume next two lines
if headerline.startswith('filedesc://'):
version = readline() # skip version
spec = readline() # skip header spec, use preset one
version = StatusAndHeadersParser.read_decoded_line(stream) # skip version
spec = StatusAndHeadersParser.read_decoded_line(stream) # skip header spec, use preset one
total_read += len(version)
total_read += len(spec)

Expand Down
31 changes: 19 additions & 12 deletions warcio/statusandheaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ def to_str(self, filter_func=None):

return string

def to_bytes(self, filter_func=None):
return self.to_str(filter_func).encode('iso-8859-1') + b'\r\n'
def to_bytes(self, filter_func=None, encoding='utf-8'):
return self.to_str(filter_func).encode(encoding) + b'\r\n'


#=================================================================
Expand All @@ -172,14 +172,8 @@ def parse(self, stream, full_statusline=None):
support continuation headers starting with space or tab
"""

def readline():
return to_native_str(stream.readline())

# status line w newlines intact
if full_statusline is None:
full_statusline = readline()
else:
full_statusline = to_native_str(full_statusline)
full_statusline = self.read_decoded_line(stream, full_statusline)

statusline, total_read = _strip_count(full_statusline, 0)

Expand All @@ -205,7 +199,7 @@ def readline():
else:
protocol_status = statusline.split(' ', 1)

line, total_read = _strip_count(readline(), total_read)
line, total_read = _strip_count(self.read_decoded_line(stream), total_read)
while line:
result = line.split(':', 1)
if len(result) == 2:
Expand All @@ -215,14 +209,14 @@ def readline():
name = result[0]
value = None

next_line, total_read = _strip_count(readline(),
next_line, total_read = _strip_count(self.read_decoded_line(stream),
total_read)

# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
if value is not None:
value += next_line
next_line, total_read = _strip_count(readline(),
next_line, total_read = _strip_count(self.read_decoded_line(stream),
total_read)

if value is not None:
Expand Down Expand Up @@ -260,6 +254,19 @@ def make_warc_id(id_=None):
return '<urn:uuid:{0}>'.format(id_)


@staticmethod
def read_decoded_line(stream, line=None):
    """
    Return one header line as a native string.

    If ``line`` is None, the next line is obtained from
    ``stream.readline()``; otherwise the supplied ``line`` is used and
    ``stream`` is not read. Decoding is attempted as UTF-8 first and,
    if that raises a Unicode error, retried as ISO-8859-1.
    """
    if line is None:
        line = stream.readline()

    try:
        # attempt to decode as utf-8 first
        return to_native_str(line, 'utf-8')
    except UnicodeError:
        # if that fails, default to ISO-8859-1. Only encode/decode
        # failures trigger the fallback, so KeyboardInterrupt, SystemExit
        # and unrelated errors propagate instead of being swallowed.
        # NOTE(review): assumes to_native_str surfaces codec failures as
        # UnicodeError subclasses -- confirm against warcio.utils.
        return to_native_str(line, 'iso-8859-1')


#=================================================================
class StatusAndHeadersParserException(Exception):
"""
Expand Down
2 changes: 1 addition & 1 deletion warcio/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


# #===========================================================================
def to_native_str(value, encoding='iso-8859-1', func=lambda x: x):
def to_native_str(value, encoding='utf-8', func=lambda x: x):
if isinstance(value, str):
return value

Expand Down
2 changes: 1 addition & 1 deletion warcio/warcwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict):
return warc_headers

def _set_header_buff(self, record):
headers_buff = record.http_headers.to_bytes(self.header_filter)
headers_buff = record.http_headers.to_bytes(self.header_filter, 'iso-8859-1')
record.http_headers.headers_buff = headers_buff

def _write_warc_record(self, out, record, adjust_cl=True):
Expand Down

0 comments on commit a3a8960

Please sign in to comment.