-
Notifications
You must be signed in to change notification settings - Fork 0
/
ClueWeb12.py
32 lines (28 loc) · 1.09 KB
/
ClueWeb12.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import ClueWeb
import os
class ClueWeb12(ClueWeb.Collection):
    """Collection filled from the ``ClueWeb12_NN`` part directories of a disk.

    Segments are stored with item assignment, keyed by the name of their
    directory entry (presumably ``ClueWeb.Collection`` is mapping-like --
    confirm against its definition).
    """

    def read(self, disk_path):
        """Load every segment found under ``disk_path`` into this collection.

        Part directories named ``ClueWeb12_00`` .. ``ClueWeb12_99`` are probed
        directly below ``disk_path``; parts that do not exist are skipped.
        Each entry of an existing part directory is read as a ``Segment`` and
        stored under the entry's name.

        Args:
            disk_path: Directory containing the ``ClueWeb12_NN`` parts. A
                trailing path separator is allowed, and a filesystem or drive
                root (``/``, ``E:\\``) is handled correctly.

        Returns:
            This collection, so calls can be chained.
        """
        for part_index in range(100):
            part_name = 'ClueWeb12_%s' % str(part_index).zfill(2)
            # os.path.join copes with a trailing separator on its own.
            # Stripping it by hand would turn a root such as "/" or "E:\"
            # into "" or "E:", making the joined path relative.
            part_path = os.path.join(disk_path, part_name)
            # Only the directory listing is guarded: a missing part is
            # expected, whereas a FileNotFoundError raised while loading a
            # segment should not silently drop the rest of the part.
            try:
                segment_names = os.listdir(part_path)
            except FileNotFoundError:
                continue
            for segment_name in segment_names:
                segment_path = os.path.join(part_path, segment_name)
                self[segment_name] = Segment().read(segment_path)
        return self
class Segment(ClueWeb.Segment):
    """Segment whose members are the numbered ``.warc.gz`` files of a directory."""

    def read(self, segment_path):
        """Append the WARC files of the segment directory to this segment.

        The segment name is the last component of ``segment_path`` (trailing
        separators ignored). Files named ``<name>-00.warc.gz`` up to
        ``<name>-99.warc.gz`` are wrapped in ``ClueWeb.File`` and appended in
        order; the scan stops at the first ``FileNotFoundError``.

        NOTE(review): this relies on ``ClueWeb.File`` (or ``append``) raising
        ``FileNotFoundError`` for a missing file -- confirm against ClueWeb.

        Args:
            segment_path: Path of the segment directory.

        Returns:
            This segment, so calls can be chained.
        """
        separator = os.sep
        directory = segment_path.rstrip(separator)
        # Text after the last separator, or the whole string if there is none.
        name = directory.rpartition(separator)[2]
        for index in range(100):
            suffix = str(index).zfill(2)
            member_path = os.path.join(
                directory, '%s-%s.warc.gz' % (name, suffix))
            try:
                self.append(ClueWeb.File(member_path))
            except FileNotFoundError:
                # Numbering is treated as contiguous: first gap ends the scan.
                break
        return self