From a9485eb024af131acf4fb0b5960c008f5610faba Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 30 Mar 2019 14:25:24 -0700 Subject: [PATCH] digest checking improvements: (#67) - tests: fix incorrect digests for example.warc, example.warc.gz - ensure each example file is checked to ensure no invalid digests (and no checks for arc files) - move all digest checking to test_check_digest_examples.py headers buffer cleanup: move headers buffer computation directly into StatusAndHeaders via compute_headers_buffer() instead of in warcwriter --- test/data/example.warc | Bin 5120 -> 5120 bytes test/data/example.warc.gz | Bin 3649 -> 3816 bytes test/test_check_digest_examples.py | 70 +++++++++++++++++++++++++++++ test/test_cli.py | 52 +++------------------ warcio/statusandheaders.py | 7 +++ warcio/warcwriter.py | 12 +++-- 6 files changed, 88 insertions(+), 53 deletions(-) create mode 100644 test/test_check_digest_examples.py diff --git a/test/data/example.warc b/test/data/example.warc index 4bc3089a5de83170fa6711bb7ebfacd5545d6dd1..3d0f5ed822eddc34153575f50d672960ea0fd65f 100644 GIT binary patch delta 49 zcmZqBXwcZe!KV;z>gVeb92n*u<`w2)?C$2~9%=4n9_-~8Y3}ap?(RQ%4XfSeG`?A^ E0A_^_y8r+H delta 49 zcmZqBXwcZe!KdI77UUfiY?jGnL?B?%i>K$ns=HeOdW1velP&TW%_x&HdRS}moGXn<|!x#H%lK}zYq8OM7qvX(NGVyeGB{eK-s3OJP396 zddxtS(Ow+CBL@Q7FC^T_sYc~vDJ7b#%dI4$hF*LDuvM0K_@v?!f){8?_x~-)6@h`TuN+KMVG)nHaq;&a!t6zIX==Z|41^!&}3>;Iy+k{IYC{J{_`t;DY&?TkrsEU3Kl|EzE8zQsZxo1A2)INSb>fDr=TuQzn#G_8j zNO$f}^BkYw>{EP0GqK>Jql3@rynq!~8~FtI1^c!!%a*B6!lH2gKd(_HBb4M@HrWoi zdC(G;kz+8kM&N^|hu?@4da|#C;Y@3S{3%j?v!Kaypnr2ZLRs;BP-imAp`=hp|& zS9$d|+`q#vP)QhvX8@@LYQboY8C4GLJj9e_ep${>wBVu0(NcWz=Y8th$sZy%FJ|EE z@=WrM9XEr%6T_r)PH{TJxXIO1#D(zYaKtd=`AsQx(Uu(w#iow??X~Au2Cs)UJOmki z>ienscZN+r{codV*9e25`aM@z@*NHn&}+9H##%H22X?rGJqAqOgmjxIC5QIh*=xOf zwy7lrq8!`WrbUm05nnM+pD?QSDyrOEnTV}EUOWl98rElsCxGNLVrg~b&{ zjFuSUYrV8!tq(c)U9Ktq&0!V=hU}BJl#(I6t8vU-focp5uCl&M&q$rH8H2zkP`l8V;Y4b3;{YZvB*5V7gFM}} z8r5cSQb}>d&U#J$L*#UenVdy#$GHn@U>{T&i(OWC%#B^E`-XhO(?oWzBLVkf#JV4! zYIS}>$CRQSmb)VtqOK1bSHSQ`Ob1U^iEhiCNh8z%zAX5!B}qo8j$YJf20b^74MAhJ|`M|v@h<0 zuRKuR0FL~mP{tk6eHmjIVB5QP#0n|Gd)2n$k_fGtIQbQ^;Y!|s@%?|x=|9Yhrxz;z zi&=sH(=4lL3b&S5<-+>7^usJ*9j(8tqjl9nXEInn-1B%ZnV?|&O6vI@J`=%IoTNgn zLnJ)XY5x7nQI44I-Kn#1@5OILRV2vIC@#Cwr9HsbT;bj(@A7*%`OPi#*?Dn}+wSX4 z)aRmuh2lzO>F~0_B`Xtm&Y5nE(QM^^n+33V!#3a0O4&f`@^`c@ZS~QXtfO^myIvi_ z)uWl7XRXW8M$iBNuzP4IdgoM&PR(Utx&F~Z{o^zCvH9Q38eccdlIrgK#o1&EqQItq zGb>dPR_ncPmeq-fpkh(nT2p#x&v&!BH_QTlptW$%UliGm@|4tYc6uos?%BJg*CKrmQn-!iqbc<$Rtkv23mgrD+!gZ delta 1674 zcmV;526g%99l;z5ABzY8000000t2xMU;zOGvugpa0RaQE8UxD&e*Yxtw!tiwK=NNtjP3Q!1vl{QP`FrAo^#zLri=X!g zu1c+HV^BMilwZAY?BS7@C0t%ZtzjgVU{vfzV zow|*H{;1dObUU{<2~e0M$)6p#Nok#cqeR`#SShT$Q0lanf2n^BHF;PBs4w0wS$H0p zx}C(`&K%ch(ColiPv#w$rUi~B%u4F8sL1Ts&!@?Af6=b!ZadQ{v1o1Me^;UZW&2R2 zMy`-dwMCB)eLLHDRse4!YE%sb0V*#odVxhl#;Ff3U{-!)c`oT7WN-f8&Ic><;J9Jirt)q3wOZsY)=`iZo_d3_3eM=+4tlq9 z-9+M_-MP`KY%mH)Ke8%-6|Qst2LrGxwFzKpnr1iSe;;@J1m=G-M^PF=ldf+bxWpQDfF z{?3I>lwLcyhiq$OzF*aFibf^*4i~twmK13cwVTOEy zJmL=?f6mt&YXea1i8ow#dnwDMD&TxXz{kB3WWVPHVIlK|K~5oZT?{d1eMs@3FiYR1 zId&03!Kdzm`7Ml!Lb0YHVCePm!0)*@?0J57epSi*8yxYHHJX5COEDR((>Q$?Z{o~f zjyKcgVl`PrLHs3HkLH`tH=B-mOOKVL#jo6He}9%H{j2FBUd74k{vnzrcgrkW2i|fq zStaqAtK=iE*h^f3>Dp*y>WTHFDxL3CDsAMC-aT3N)9(X>a}jc1@|x(WS9bOVO}b#tjpt zf8G+{g+tr2tyz}c8gJOlL0}QT;>IGE1s5d&)Bf(#e0B4kWh8<3C=$um?oKRjWspnFT7wF z_E8w%cde2Ovwx_$YDJv**!vg%hBg`|f5yclX9WQh#m%%Ki1uZR_JHe=z$aetj9#^U zD+0#XEAWQX!odSKdTU#2s*VJ7_Rp!<);&CVCtx}wzUoZt1sWN{g>kajx{en!&26>>26R-<+M+5)9ED>A#{ z@Vz#Q*LA+)IKYIhQ4ku(j4*~nGFoHLFOyhE7E?-xKMt1)lc6K`iy4QFa}|#w+={X) zFqFEC_NT6d^H0Z=4GH!43{skKf7;{8=xKl8QY%>-gUXRqfAP{A_vNt{!A7fNFMaRD z>s>2uGm5Gzaw%M{Rn(X|htg)wZ(V$*KM3wnWp2YE8TY!KZs+DYg$Sl;`ez4js;%R2 zl&IU8NQspf3YFC&^RJ;I_sam)`P&5zM}eu^$)MX=;2I5@9T@A$yyJ^$e}SV3a~(M> zD>DD}^Ktr=EZbGI+s;)+ELz+6-&Lajvc0b|BUVUe%A&`IzMXA6%Yn5K70Lzz2U%!~ zUSQFXN#=udnCp+cC`8l`DV)rO%$3u2a}F~PrT|}n-%EG|MxPmbW~VUmQ1Jm@ZgUs) zd&96dfRo#4I=>5`$X^j|Xi&cDQ@Gq1U1HQ9gfxuDaT1P(uoP=y^1bRnkb}dm^TEm@ zIBwXsseIdht(HZLbsXfSr{18fg0tvs2ff?5Y9g`E?p!OW8;l&XkGu+Ch3mrq!2s+^ UZUT5QO|zTv55XhU_9O!U0M!vN^#A|> diff --git a/test/test_check_digest_examples.py b/test/test_check_digest_examples.py new file mode 100644 index 00000000..a559e895 --- /dev/null +++ b/test/test_check_digest_examples.py @@ -0,0 +1,70 @@ +from warcio.cli import main +from warcio import ArchiveIterator +from warcio.warcwriter import BufferWARCWriter + +from . import get_test_file +import os + +SKIP = ['example-trunc.warc', + 'example-iana.org-chunked.warc', + 'example-wrong-chunks.warc.gz', + 'example-bad-non-chunked.warc.gz', + 'example-digest.warc' + ] + + +def pytest_generate_tests(metafunc): + if 'test_filename' in metafunc.fixturenames: + files = [filename for filename in os.listdir(get_test_file('.')) + if filename not in SKIP and filename.endswith(('.warc', '.warc.gz', '.arc', '.arc.gz'))] + + metafunc.parametrize('test_filename', files) + + +class TestExamplesDigest(object): + def check_helper(self, args, expected_exit_value, capsys): + exit_value = None + try: + main(args=args) + except SystemExit as e: + exit_value = e.code + finally: + assert exit_value == expected_exit_value + + return capsys.readouterr().out + + def test_check_invalid(self, capsys): + filenames = [get_test_file('example-digest.warc')] + + args = ['check'] + filenames + value = self.check_helper(args, 1, capsys) + assert value.count('payload digest failed') == 1 + assert value.count('WARC-Record-ID') == 1 + + args = ['check', '-v'] + filenames + value = self.check_helper(args, 1, capsys) + assert value.count('payload digest failed') == 1 + assert value.count('digest pass') == 3 + assert value.count('WARC-Record-ID') == 4 + + def test_check_valid(self, capsys): + filenames = [get_test_file('example.warc'), get_test_file('example.warc.gz')] + + args = ['check'] + filenames + expected = '' + assert self.check_helper(args, 0, capsys) == expected + + args = ['check', '-v'] + filenames + value = self.check_helper(args, 0, capsys) + # two digests per file (payload and block) + assert value.count('digest pass') == 4 + assert value.count('WARC-Record-ID') == 12 + + def test_check_no_invalid_files(self, test_filename, capsys): + args = ['check', '-v', get_test_file(test_filename)] + value = self.check_helper(args, 0, capsys) + assert value.count('digest failed') == 0 + + # if ARC file, no digests to check, so no passing results + if test_filename.endswith(('.arc', '.arc.gz')): + assert value.count('digest pass') == 0 diff --git a/test/test_cli.py b/test/test_cli.py index dc643ec4..a9bf18ed 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -24,9 +24,9 @@ def test_index(): {"length": "353", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"} {"length": "431", "offset": "353", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"} {"length": "1228", "offset": "784", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"} -{"length": "526", "offset": "2012", "warc-type": "request", "warc-target-uri": "http://example.com/"} -{"length": "585", "offset": "2538", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"} -{"length": "526", "offset": "3123", "warc-type": "request", "warc-target-uri": "http://example.com/"} +{"length": "609", "offset": "2012", "warc-type": "request", "warc-target-uri": "http://example.com/"} +{"length": "586", "offset": "2621", "warc-type": "revisit", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"} +{"length": "609", "offset": "3207", "warc-type": "request", "warc-target-uri": "http://example.com/"} {"length": "484", "offset": "0", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"} {"length": "705", "offset": "488", "warc-type": "warcinfo", "warc-filename": "temp-20170306040353.warc.gz"} {"length": "1365", "offset": "1197", "warc-type": "response", "warc-target-uri": "http://example.com/", "http:content-type": "text/html"} @@ -54,55 +54,15 @@ def test_index_2(): {"offset": "0", "length": "353", "warc-type": "warcinfo", "filename": "example.warc.gz"} {"offset": "353", "length": "431", "warc-type": "warcinfo", "filename": "example.warc.gz"} {"offset": "784", "length": "1228", "http:status": "200", "warc-type": "response", "filename": "example.warc.gz"} -{"offset": "2012", "length": "526", "warc-type": "request", "filename": "example.warc.gz"} -{"offset": "2538", "length": "585", "http:status": "200", "warc-type": "revisit", "filename": "example.warc.gz"} -{"offset": "3123", "length": "526", "warc-type": "request", "filename": "example.warc.gz"} +{"offset": "2012", "length": "609", "warc-type": "request", "filename": "example.warc.gz"} +{"offset": "2621", "length": "586", "http:status": "200", "warc-type": "revisit", "filename": "example.warc.gz"} +{"offset": "3207", "length": "609", "warc-type": "request", "filename": "example.warc.gz"} """ with patch_stdout() as buff: res = main(args=args) assert buff.getvalue() == expected -def check_helper(args, expected_exit_value): - with patch_stdout() as buff: - exit_value = None - try: - main(args=args) - except SystemExit as e: - exit_value = e.code - finally: - assert exit_value == expected_exit_value - - return buff.getvalue() - - -def test_check_valid(): - filenames = [get_test_file('example.warc'), get_test_file('example.warc.gz')] - - args = ['check'] + filenames - expected = b'' - assert check_helper(args, 0) == expected - - args = ['check', '-v'] + filenames - value = check_helper(args, 0) - assert value.count(b'digest pass') == 2 - assert value.count(b'WARC-Record-ID') == 12 - - -def test_check_invalid(): - filenames = [get_test_file('example-digest.warc')] - - args = ['check'] + filenames - value = check_helper(args, 1) - assert value.count(b'payload digest failed') == 1 - assert value.count(b'WARC-Record-ID') == 1 - - args = ['check', '-v'] + filenames - value = check_helper(args, 1) - assert value.count(b'payload digest failed') == 1 - assert value.count(b'digest pass') == 3 - assert value.count(b'WARC-Record-ID') == 4 - def test_recompress_non_chunked(): with named_temp() as temp: diff --git a/warcio/statusandheaders.py b/warcio/statusandheaders.py index e0241361..7817e325 100644 --- a/warcio/statusandheaders.py +++ b/warcio/statusandheaders.py @@ -110,6 +110,13 @@ def add_range(self, start, part_len, total_len): self.replace_header('Accept-Ranges', 'bytes') return self + def compute_headers_buffer(self, header_filter=None): + """ + Set buffer representing headers + """ + # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info) + self.headers_buff = self.to_ascii_bytes(header_filter) + def __repr__(self): return "StatusAndHeaders(protocol = '{0}', statusline = '{1}', \ headers = {2})".format(self.protocol, self.statusline, self.headers) diff --git a/warcio/warcwriter.py b/warcio/warcwriter.py index f8e6d504..0189016e 100644 --- a/warcio/warcwriter.py +++ b/warcio/warcwriter.py @@ -93,7 +93,10 @@ def ensure_digest(self, record, block=True, payload=True): pos = 0 temp_file = self._create_temp_file() - if block_digester and record.http_headers and record.http_headers.headers_buff: + if block_digester and record.http_headers: + if not record.http_headers.headers_buff: + record.http_headers.compute_headers_buffer(self.header_filter) + block_digester.update(record.http_headers.headers_buff) for buf in self._iter_stream(record.raw_stream): @@ -239,17 +242,12 @@ def _init_warc_headers(self, uri, record_type, warc_headers_dict): return warc_headers - def _set_header_buff(self, record): - # HTTP headers %-encoded as ascii (see to_ascii_bytes for more info) - headers_buff = record.http_headers.to_ascii_bytes(self.header_filter) - record.http_headers.headers_buff = headers_buff - def _write_warc_record(self, out, record): if self.gzip: out = GzippingWrapper(out) if record.http_headers: - self._set_header_buff(record) + record.http_headers.compute_headers_buffer(self.header_filter) # Content-Length is None/unknown # Fix record by: buffering and recomputing all digests and length