In [1]:
import bz2
import sys
import re
import xml.etree.ElementTree as ET


In [9]:
def read_bz2_streaming_lines(path, block_size=256*1024):
    """Lazily yield the lines of a bz2-compressed file as ``bytes``.

    The file is read and decompressed ``block_size`` compressed bytes at a
    time, so the whole file is never held in memory.

    Args:
        path: Path of the bz2 file to read.
        block_size: Number of compressed bytes to read from disk per step.

    Yields:
        Each line as ``bytes``, split on ``b"\\n"`` with the newline removed
        (a preceding ``b"\\r"`` is not stripped). If the stream ends cleanly,
        a final line without a trailing newline is yielded as well.

    Notes:
        Only the first bz2 stream in the file is decoded; any bytes after it
        are ignored. If the file ends before the stream is complete,
        iteration stops silently and the trailing partial line is dropped.
    """
    bz2dec = bz2.BZ2Decompressor()
    # Bytes after the last newline seen so far. This must survive across
    # reads: a line regularly straddles two decompressed chunks, and throwing
    # the remainder away would corrupt that line.
    pending = b""
    with open(path, "rb") as infile:
        while not bz2dec.eof:
            compressed_data = infile.read(block_size)
            if not compressed_data:
                # File exhausted before the end-of-stream marker (truncated).
                break
            data = bz2dec.decompress(compressed_data)
            if not data:
                # The decompressor is still buffering an incomplete block.
                continue
            pending += data
            # Everything before the last newline is a complete line; whatever
            # follows it is carried over to the next iteration.
            *complete_lines, pending = pending.split(b"\n")
            yield from complete_lines
    # Only trust the unterminated tail when the stream finished cleanly.
    if bz2dec.eof and pending:
        yield pending


def read_bz2_from_offset(path, offset, block_size=256*1024):
    """Decompress the single bz2 stream that starts at ``offset`` in ``path``.

    Reading stops as soon as the decompressor reports the end of that stream,
    so bytes belonging to whatever follows it in the file are never returned.

    Args:
        path: Path of the file containing the bz2 stream.
        offset: Byte offset of the start of the stream; anything accepted by
            ``int()`` (e.g. an ``int`` or a numeric ``str``/``bytes``).
        block_size: Number of compressed bytes to read from disk per step.

    Returns:
        The decompressed contents of the stream as ``bytes``.

    Raises:
        EOFError: The file ended before the stream was complete.
        OSError: The data at ``offset`` is not a valid bz2 stream (raised by
            the decompressor).
    """
    bz2dec = bz2.BZ2Decompressor()
    # Collect pieces and join once instead of repeatedly growing a bytes object.
    chunks = []
    with open(path, "rb") as infile:
        infile.seek(int(offset))
        # Checking `eof` avoids reading one extra block after the stream has
        # ended just to provoke an EOFError from the decompressor.
        while not bz2dec.eof:
            compressed_data = infile.read(block_size)
            if not compressed_data:
                raise EOFError("Failed to read a complete stream")
            chunks.append(bz2dec.decompress(compressed_data))
    return b"".join(chunks)


class WikiDumpReader:
    """Look up the wikitext of individual pages in a multistream wiki dump.

    The index file is parsed into a ``title -> byte offset`` table; a page is
    then fetched by decompressing only the bz2 stream that starts at the
    offset recorded for it.

    Call ``setup()`` once before ``read_page_by_title()``.
    """

    def __init__(self, dump_file_path, index_file_path):
        """Store the file locations; no I/O happens here.

        Args:
            dump_file_path: Path of the bz2 dump file pages are read from.
            index_file_path: Path of the bz2-compressed index file.
        """
        self.dump_file_path = dump_file_path
        self.index_file_path = index_file_path

    def setup(self):
        """Build the title index. Reads the entire index file."""
        self.construct_index_dict()

    def construct_index_dict(self):
        """Populate ``self.index_dict`` from the index file.

        Each usable index line has the form ``offset:page_id:title``. The
        resulting dict maps the title (raw ``bytes``, as found in the file)
        to the integer offset. Lines with fewer than three fields, or whose
        offset or page id is not an integer, are skipped. If a title occurs
        more than once, the last occurrence wins.
        """
        self.index_dict = {}
        for line in read_bz2_streaming_lines(self.index_file_path):
            # maxsplit=2: titles may themselves contain ':' (for example
            # "Star Trek: The Next Generation"), so only the first two colons
            # are field separators.
            fields = line.split(b':', 2)
            if len(fields) < 3:
                continue
            try:
                offset = int(fields[0])
                int(fields[1])  # page id: validated, but not stored
            except ValueError:
                # Malformed line; skip it rather than abort the whole index.
                continue
            self.index_dict[fields[2]] = offset

    def read_page(self, offset, page_id, title):
        """Return the wikitext of one page from the stream at ``offset``.

        Args:
            offset: Byte offset in the dump file of the bz2 stream to read.
            page_id: If not ``None``, only a page whose ``<id>`` equals this
                integer matches.
            title: If not ``None``, only a page whose ``<title>`` equals this
                string matches.

        Returns:
            The text of the first matching page's first ``<revision>``
            ``<text>`` element, or ``None`` if no page matches, the matching
            page has no such element, or the element is empty.
        """
        raw = read_bz2_from_offset(self.dump_file_path, offset)
        # The stream holds sibling <page> elements with no enclosing element,
        # so wrap them in a synthetic root to get a well-formed document.
        root = ET.fromstring('<root>' + raw.decode('utf-8') + '</root>')
        for page in root.findall("page"):
            if title is not None:
                title_el = page.find("title")
                if title_el is None or title != title_el.text:
                    continue
            if page_id is not None:
                id_el = page.find("id")
                if id_el is None or id_el.text is None or page_id != int(id_el.text):
                    continue
            revision = page.find("revision")
            wikitext = revision.find("text") if revision is not None else None
            return wikitext.text if wikitext is not None else None
        return None

    def read_page_by_title(self, title):
        """Return the wikitext of the page with exactly this title.

        Args:
            title: Page title as ``str``; it is UTF-8 encoded for the lookup.

        Raises:
            KeyError: The title is not in the index.
            AttributeError: ``setup()`` has not been called yet.
        """
        return self.read_page(self.index_dict[title.encode('utf-8')], None, title)





In [10]:
# Dump format reference: https://meta.wikimedia.org/wiki/Data_dumps/Dump_format

# Locations of the multistream dump and its companion index file.
DUMP_FILE_PATH = '/home/tcteo/mnt/tcteo-data/enwiki/enwiki-20240701-pages-articles-multistream.xml.bz2'
INDEX_FILE_PATH = '/home/tcteo/mnt/tcteo-data/enwiki/enwiki-20240701-pages-articles-multistream-index.txt.bz2'

wdr = WikiDumpReader(dump_file_path=DUMP_FILE_PATH, index_file_path=INDEX_FILE_PATH)
# Builds the in-memory title -> offset table by reading the whole index file.
wdr.setup()


In [16]:
# Fetch the raw wikitext of a single article by its exact title and print it.
article_title = 'Machine learning'
s = wdr.read_page_by_title(article_title)
print(s)

{{Short description|Study of algorithms that improve automatically through experience}}
{{For|the journal|Machine Learning (journal){{!}}''Machine Learning'' (journal)}}
{{Redirect|Statistical learning|statistical learning in linguistics|statistical learning in language acquisition}}
{{Machine learning bar}}
{{Artificial intelligence|Major goals}}

'''Machine learning''' ('''ML''') is a [[field of study]] in [[artificial intelligence]] concerned with the development and study of [[Computational statistics|statistical algorithms]] that can learn from [[data]] and [[generalize]] to unseen data and thus perform [[Task (computing)|tasks]] without explicit [[Machine code|instructions]].{{refn|The definition "without being explicitly programmed" is often attributed to [[Arthur Samuel (computer scientist)|Arthur Samuel]], who coined the term "machine learning" in 1959, but the phrase is not found verbatim in this publication, and may be a [[paraphrase]] that appeared later. Confer "Paraphrasi