In [1]:
import bz2
import logging
import os
import re
import time

from io import StringIO
from multiprocessing import Queue, Process, cpu_count
from timeit import default_timer

# Adjust the regex pattern as raw strings to avoid escape sequence issues
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')

In [2]:
class NextFile:
    filesPerDir = 100

    def __init__(self, path_name):
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return self._filepath()

    def _dirname(self):
        char1 = self.dir_index % 26
        char2 = int(self.dir_index / 26) % 26
        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def _filepath(self):
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)


class OutputSplitter:
    def __init__(self, nextFile, max_file_size=0, compress=True):
        """
        :param nextFile: a NextFile object from which to obtain filenames.
        :param max_file_size: the maximum size of each file.
        :param compress: whether to write data with bzip compression.
        """
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(self.nextFile.next())

    def reserve(self, size):
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(self.nextFile.next())

    def write(self, data):
        self.reserve(len(data))
        # If compressing, write as bytes; otherwise, write as string
        if self.compress:
            self.file.write(data.encode('utf-8'))  # Encode to bytes for bz2
        else:
            self.file.write(data)

    def close(self):
        self.file.close()

    def open(self, filename):
        if self.compress:
            return bz2.BZ2File(filename + '.bz2', 'wb')  # Open in binary mode for bz2
        else:
            return open(filename, 'w', encoding='utf-8')  # Open as text if not compressing

In [3]:
def load_templates(file):
    templates = {}
    page = []
    inText = False
    for line in file:
        if '<' not in line:
            if inText:
                page.append(line)
            continue
        m = tagRE.search(line)
        if not m:
            continue
        tag = m.group(2)
        if tag == 'page':
            page = []
        elif tag == 'title':
            title = m.group(3)
        elif tag == 'text':
            inText = True
            line = line[m.start(3):m.end(3)]
            page.append(line)
            if m.lastindex == 4:
                inText = False
        elif tag == '/text':
            if m.group(1):
                page.append(m.group(1))
            inText = False
        elif inText:
            page.append(line)
        elif tag == '/page':
            templates[title] = page
            page = []
    return templates


def process_pages(input_file, output_directory, max_file_size=100*1024, compress=True):
    next_file = NextFile(output_directory)
    output = OutputSplitter(next_file, max_file_size, compress)

    article_count = 0  # Initialize article counter
    start_time = time.time()  # Record start time for progress tracking

    # Open the XML file directly without bz2 compression
    with open(input_file, 'r', encoding='utf-8') as file:
        page = []
        for line in file:
            if '<' not in line:
                if page:
                    page.append(line)
                continue
            m = tagRE.search(line)
            if not m:
                continue
            tag = m.group(2)
            if tag == 'page':
                page = []
            elif tag == 'title':
                title = m.group(3)
            elif tag == 'text':
                line = line[m.start(3):m.end(3)]
                page.append(line)
                if m.lastindex == 4:
                    output.write(" ".join(page))
                    page = []
                    article_count += 1  # Increment article counter
            elif tag == '/text' and page:
                output.write(" ".join(page))
                page = []
                article_count += 1  # Increment article counter
            elif tag == '/page':
                page = []
                article_count += 1  # Increment article counter

            # Log progress every 100000 articles
            if article_count % 100000 == 0:
                elapsed_time = time.time() - start_time
                print(f"Processed {article_count} articles in {elapsed_time:.2f} seconds")

    # Final log for total articles processed
    print(f"Completed processing {article_count} articles in {time.time() - start_time:.2f} seconds")
    output.close()

In [4]:
# Set the paths to your input and output
input_file = r"C:\Users\Hi\My Works\My Py Scripts\Git Repos\29_Tamil Wiki\tawiki-20241101-pages-articles-multistream.xml"
output_directory = r"C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles"

# Process the Wikipedia dump file
process_pages(input_file, output_directory, max_file_size=10*1024*1024, compress=True)

Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
Processed 0 articles in 0.00 seconds
P