diff --git a/README b/README index dfdd6f3..6c0c074 100644 --- a/README +++ b/README @@ -1,5 +1,7 @@ In Python, read the .80 file format, for 80legs web crawl results. +The URL and data are UTF-8 decoded. + From http://80legs.pbworks.com/Results: For people interested in deserializing in other languages, the file format this creates and reads is: @@ -7,6 +9,3 @@ From http://80legs.pbworks.com/Results: * The last 4 items (<URLSIZE><URL><DATASIZE><DATA>) repeat for each url/data pair. * <classID>, <versionID>, <URLSIZE>, and <DATASIZE> are encoded 32-bit integers. * The url is encoded using UTF-8. - -ISSUES: - * I don't Unicode decode either the URL or the data. diff --git a/eightyformat.py b/eightyformat.py index 30de667..e685940 100755 --- a/eightyformat.py +++ b/eightyformat.py @@ -27,6 +27,7 @@ def read(file): (DATASIZE,) = struct.unpack("i", l) # print DATASIZE data = str(file.read(DATASIZE)) + data = data.decode("utf-8") yield (url, data) # print data # print data.decode("utf-8")