Final code along with check

In [3]:
import bz2
import os
import re
import time

# Regular expression pattern for XML tag extraction, with raw string to avoid escape sequence issues
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')

In [5]:
class NextFile:
    """
    Class to manage the creation of sequential files for output.
    """
    filesPerDir = 100  # Max files per directory

    def __init__(self, path_name):
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        # Move to next file; create directory if needed
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        os.makedirs(dirname, exist_ok=True)
        return self._filepath()

    def _dirname(self):
        # Generate subdirectory name based on current indices
        char1 = self.dir_index % 26
        char2 = (self.dir_index // 26) % 26
        return os.path.join(self.path_name, f'{chr(ord("A") + char2)}{chr(ord("A") + char1)}')

    def _filepath(self):
        # Generate filepath for the next output file
        return f'{self._dirname()}/wiki_{self.file_index:02d}'


class OutputSplitter:
    """
    Class to manage output files, ensuring each file does not exceed a given size.
    """
    def __init__(self, nextFile, max_file_size=0, compress=True):
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(self.nextFile.next())

    def reserve(self, size):
        # Close and open a new file if current file exceeds max size
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(self.nextFile.next())

    def write_article(self, article_id, title, text):
        # Wrap each article with <doc> tags and write to file
        doc = f'<doc id="{article_id}" title="{title}">\n{text}\n</doc>\n\n'
        self.write(doc)

    def write(self, data):
        self.reserve(len(data))
        # Write data with compression if enabled
        self.file.write(data.encode('utf-8') if self.compress else data)

    def close(self):
        self.file.close()

    def open(self, filename):
        # Open file with or without compression
        return bz2.open(filename + '.bz2', 'wb') if self.compress else open(filename, 'w', encoding='utf-8')

In [7]:
def process_pages(input_file, output_directory, max_file_size=100 * 1024, compress=True):
    """
    Processes pages in the input XML file and outputs each article with <doc> tags.
    """
    next_file = NextFile(output_directory)
    output = OutputSplitter(next_file, max_file_size, compress)

    article_count = 0
    start_time = time.time()
    last_milestone = 0  # Tracks the last milestone printed

    with open(input_file, 'r', encoding='utf-8') as file:
        page = []
        title = ""
        for line in file:
            if '<' not in line:
                if page:
                    page.append(line)
                continue

            m = tagRE.search(line)
            if not m:
                continue
            tag = m.group(2)
            if tag == 'page':
                page = []
                title = ""
            elif tag == 'title':
                title = m.group(3) or ""
            elif tag == 'text':
                line = line[m.start(3):m.end(3)]
                page.append(line)
                if m.lastindex == 4:
                    output.write_article(article_count, title, " ".join(page))
                    page = []
                    article_count += 1
            elif tag == '/text' and page:
                output.write_article(article_count, title, " ".join(page))
                page = []
                article_count += 1
            elif tag == '/page':
                output.write_article(article_count, title, " ".join(page))
                page = []
                article_count += 1

            # Log progress every 100,000 articles
            if article_count >= last_milestone + 100000:
                elapsed_time = time.time() - start_time
                print(f"Processed {article_count} articles in {elapsed_time:.2f} seconds")
                last_milestone = article_count

    # Final log for total articles processed
    elapsed_time = time.time() - start_time
    print(f"Completed processing {article_count} articles in {elapsed_time:.2f} seconds")
    output.close()

In [9]:
def extract_bz2_to_txt(input_directory, output_directory):
    """
    Extracts all .bz2 files in input_directory to .txt files in output_directory.
    """
    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Walk through all subdirectories and extract .bz2 files
    for subdir, _, files in os.walk(input_directory):
        for file in files:
            if file.endswith('.bz2'):
                bz2_file_path = os.path.join(subdir, file)

                # Maintain subdirectory structure in output directory
                relative_path = os.path.relpath(subdir, input_directory)
                output_subdir = os.path.join(output_directory, relative_path)
                os.makedirs(output_subdir, exist_ok=True)

                txt_file_path = os.path.join(output_subdir, file.replace('.bz2', '.txt'))

                # Extract content
                with bz2.open(bz2_file_path, 'rt', encoding='utf-8') as bz2_file:
                    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
                        txt_file.write(bz2_file.read())

                print(f"Extracted {bz2_file_path} to {txt_file_path}")

In [11]:
# Paths for input and output directories
input_file = r"C:\Users\Hi\My Works\My Py Scripts\Git Repos\29_Tamil Wiki\tawiki-20241101-pages-articles-multistream.xml"
output_directory = r"C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles"
input_directory = "C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles"
output_directory_txt = "C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles"

In [13]:
# Run the page processing and extraction functions
process_pages(input_file, output_directory, max_file_size=10 * 1024 * 1024, compress=True)
extract_bz2_to_txt(input_directory, output_directory_txt)

Processed 100000 articles in 17.87 seconds
Processed 200000 articles in 28.06 seconds
Processed 300000 articles in 43.39 seconds
Processed 400000 articles in 58.05 seconds
Processed 500000 articles in 76.21 seconds
Completed processing 569055 articles in 88.00 seconds
Extracted C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles\AA\wiki_00.bz2 to C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles\AA\wiki_00.txt
Extracted C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles\AA\wiki_01.bz2 to C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles\AA\wiki_01.txt
Extracted C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles\AA\wiki_02.bz2 to C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles\AA\wiki_02.txt
Extracted C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles\AA\wiki_03.bz2 to C:/Users/Hi/My Works/My Py Scripts/Git Repos/2

In [14]:
def count_articles_in_directory(directory):
    # Regex pattern to match <doc> tags
    doc_tag_pattern = re.compile(r'<doc id="[^"]*" title="[^"]*">')
    
    total_articles = 0  # Initialize article counter
    
    # Walk through all subdirectories and files in the specified directory
    for subdir, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(subdir, file)
                
                # Open each text file and count <doc> tags
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    total_articles += len(doc_tag_pattern.findall(content))
    
    print(f"Total articles found: {total_articles}")

In [17]:
# Run the article count function
count_articles_in_directory(output_directory_txt)

Total articles found: 569055
