-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_12_compact.py
90 lines (85 loc) · 3.31 KB
/
test_12_compact.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gzip
import os
import subprocess
import sys
from pickle import load
from tempfile import TemporaryDirectory
from unittest import TestCase

from test_params import DISKS_12
class Test12Compact(TestCase):
    """End-to-end checks of the ClueWebCompactor script with ``--twelve``.

    Each test runs the script that sits next to this file (its path is
    derived from ``__file__``) in a child process, passing ``--twelve``,
    the ``DISKS_12`` paths and an output path as arguments and the
    document ids on stdin.  The output file is then unpickled and the
    three objects it contains are sanity-checked.

    The checks use ``self.assert*`` rather than bare ``assert`` so that
    they, and the compactor invocation itself, still run under
    ``python -O``.
    """

    # Document ids fed to the compactor on stdin, one per line.
    _IDS = (
        'clueweb12-0013wb-88-00000',
        'clueweb12-0013wb-88-00410',
        'clueweb12-0013wb-88-00966',
    )

    # Output file names to exercise; a name ending in '.gz' is read back
    # through gzip, anything else as a plain file.
    _FILENAMES = ('pickle', 'pickle.gz')

    def _compact(self, workdir, ids, pickle_path):
        """Run the compactor on ``ids`` and require a zero exit status.

        Args:
            workdir: Directory in which the temporary stdin file is written.
            ids: Document ids to feed to the script, one per line.
            pickle_path: Output path passed as the script's last argument.
        """
        script = __file__.replace('test_12_compact', 'ClueWebCompactor')
        stdin_path = os.path.join(workdir, 'stdin')
        with open(stdin_path, 'w') as ids_file:
            for doc_id in ids:
                ids_file.write(doc_id)
                ids_file.write('\n')
        # An argument list (no shell) keeps paths containing spaces or
        # shell metacharacters intact, and sys.executable guarantees the
        # script runs under the same interpreter as this test.
        command = [sys.executable, script, '--twelve']
        command.extend(DISKS_12)
        command.append(pickle_path)
        with open(stdin_path, 'rb') as ids_file:
            status = subprocess.call(command, stdin=ids_file)
        self.assertEqual(0, status)

    def _check_pickle(self, pickle_path, expected):
        """Unpickle ``pickle_path`` and verify it holds ``expected`` records.

        The file must contain a 3-tuple ``(bodies, https, warcs)``.  Each
        must have ``expected`` entries, every entry must be non-empty, and
        the ``None`` key of every ``https`` / ``warcs`` value must start
        with ``b'HTTP'`` / ``b'WARC'`` respectively.
        """
        opener = gzip.open if pickle_path.endswith('.gz') else open
        # The pickle was written by our own script moments ago, so it is
        # trusted input.
        with opener(pickle_path, 'rb') as pickled:
            bodies, https, warcs = load(pickled)

        self.assertEqual(expected, len(bodies))
        for body in bodies:
            self.assertGreater(len(body), 0)

        self.assertEqual(expected, len(https))
        for http in https.values():
            self.assertGreater(len(http), 0)
            # NOTE(review): the None key presumably holds the status line
            # of the HTTP headers -- confirm against ClueWebCompactor.
            self.assertTrue(http[None].startswith(b'HTTP'))

        self.assertEqual(expected, len(warcs))
        for warc in warcs.values():
            self.assertGreater(len(warc), 0)
            # NOTE(review): likewise presumably the WARC version line.
            self.assertTrue(warc[None].startswith(b'WARC'))

    def test_(self):
        """Compact all ids in one run, for plain and gzipped output."""
        for filename in self._FILENAMES:
            with TemporaryDirectory() as workdir:
                pickle_path = os.path.join(workdir, filename)
                self._compact(workdir, self._IDS, pickle_path)
                self._check_pickle(pickle_path, len(self._IDS))

    def test_update(self):
        """Re-run the compactor on the same output with a growing id list.

        The output path is reused across the runs, so the second and third
        runs start with the file left behind by the previous one; after
        each run the file must hold exactly as many records as ids given.
        """
        for filename in self._FILENAMES:
            with TemporaryDirectory() as workdir:
                pickle_path = os.path.join(workdir, filename)
                for count in [1, 2, 3]:
                    # Equivalent to piping the id list through `head -count`.
                    self._compact(workdir, self._IDS[:count], pickle_path)
                    self._check_pickle(pickle_path, count)