Extract and check all the Tamil articles in Wikipedia

In [5]:
import bz2
import logging
import os
import re
import time

from io import StringIO
from multiprocessing import Queue, Process, cpu_count
from timeit import default_timer

# Adjust the regex pattern as raw strings to avoid escape sequence issues
tagRE = re.compile(r'(.*?)<(/?\w+)[^>]*>(?:([^<]*)(<.*?>)?)?')

In [7]:
class NextFile:
    filesPerDir = 100

    def __init__(self, path_name):
        self.path_name = path_name
        self.dir_index = -1
        self.file_index = -1

    def next(self):
        self.file_index = (self.file_index + 1) % NextFile.filesPerDir
        if self.file_index == 0:
            self.dir_index += 1
        dirname = self._dirname()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return self._filepath()

    def _dirname(self):
        char1 = self.dir_index % 26
        char2 = int(self.dir_index / 26) % 26
        return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1))

    def _filepath(self):
        return '%s/wiki_%02d' % (self._dirname(), self.file_index)

class OutputSplitter:
    def __init__(self, nextFile, max_file_size=0, compress=True):
        self.nextFile = nextFile
        self.compress = compress
        self.max_file_size = max_file_size
        self.file = self.open(self.nextFile.next())

    def reserve(self, size):
        if self.file.tell() + size > self.max_file_size:
            self.close()
            self.file = self.open(self.nextFile.next())

    def write_article(self, article_id, title, text):
        # Wrap each article with <doc> tags
        doc = f'<doc id="{article_id}" title="{title}">\n{text}\n</doc>\n\n'
        self.write(doc)

    def write(self, data):
        self.reserve(len(data))
        if self.compress:
            self.file.write(data.encode('utf-8'))  # Encode to bytes for bz2
        else:
            self.file.write(data)

    def close(self):
        self.file.close()

    def open(self, filename):
        if self.compress:
            return bz2.BZ2File(filename + '.bz2', 'wb')
        else:
            return open(filename, 'w', encoding='utf-8')

In [15]:
def process_pages(input_file, output_directory, max_file_size=100*1024, compress=True):
    next_file = NextFile(output_directory)
    output = OutputSplitter(next_file, max_file_size, compress)

    article_count = 0
    start_time = time.time()
    last_milestone = 0  # Track the last milestone at which we printed

    with open(input_file, 'r', encoding='utf-8') as file:
        page = []
        title = ""
        for line in file:
            if '<' not in line:
                if page:
                    page.append(line)
                continue
            m = tagRE.search(line)
            if not m:
                continue
            tag = m.group(2)
            if tag == 'page':
                page = []
                title = ""
            elif tag == 'title':
                title = m.group(3) or ""
            elif tag == 'text':
                line = line[m.start(3):m.end(3)]
                page.append(line)
                if m.lastindex == 4:
                    output.write_article(article_count, title, " ".join(page))
                    page = []
                    article_count += 1
            elif tag == '/text' and page:
                output.write_article(article_count, title, " ".join(page))
                page = []
                article_count += 1
            elif tag == '/page':
                output.write_article(article_count, title, " ".join(page))
                page = []
                article_count += 1

            # Log progress every 100,000 articles
            if article_count >= last_milestone + 100000:
                elapsed_time = time.time() - start_time
                print(f"Processed {article_count} articles in {elapsed_time:.2f} seconds")
                last_milestone = article_count  # Update the last milestone

    # Final log for total articles processed
    elapsed_time = time.time() - start_time
    print(f"Completed processing {article_count} articles in {elapsed_time:.2f} seconds")
    output.close()

In [17]:
# Set paths for input and output
input_file = r"C:\Users\Hi\My Works\My Py Scripts\Git Repos\29_Tamil Wiki\tawiki-20241101-pages-articles-multistream.xml"
output_directory = r"C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles"

# Process the Wikipedia dump file
process_pages(input_file, output_directory, max_file_size=10*1024*1024, compress=True)

Processed 100000 articles in 17.57 seconds
Processed 200000 articles in 27.57 seconds
Processed 300000 articles in 43.20 seconds
Processed 400000 articles in 58.08 seconds
Processed 500000 articles in 75.68 seconds
Completed processing 569055 articles in 86.86 seconds


In [18]:
def extract_bz2_to_txt(input_directory, output_directory):
    """
    Extracts all .bz2 files in the input_directory and subdirectories,
    saving the contents as .txt files in the output_directory.
    
    Parameters:
    - input_directory: The root directory containing .bz2 files in subdirectories.
    - output_directory: The directory to save extracted .txt files.
    """
    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)
    
    # Walk through all subdirectories and files
    for subdir, _, files in os.walk(input_directory):
        for file in files:
            if file.endswith('.bz2'):
                # Construct full file path
                bz2_file_path = os.path.join(subdir, file)
                
                # Determine the output .txt file path
                # Use subdirectory structure in output directory to mirror input
                relative_path = os.path.relpath(subdir, input_directory)
                output_subdir = os.path.join(output_directory, relative_path)
                os.makedirs(output_subdir, exist_ok=True)
                
                # Create the output .txt file path
                txt_file_path = os.path.join(output_subdir, file.replace('.bz2', '.txt'))
                
                # Read from .bz2 and write to .txt
                with bz2.open(bz2_file_path, 'rt', encoding='utf-8') as bz2_file:
                    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
                        txt_file.write(bz2_file.read())
                
                print(f"Extracted {bz2_file_path} to {txt_file_path}")

In [19]:
# Define paths for input and output directories
input_directory = "C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles"
output_directory = "C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles"

# Run the extraction
extract_bz2_to_txt(input_directory, output_directory)

Extracted C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles\AA\wiki_00.bz2 to C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles\AA\wiki_00.txt
Extracted C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles\AA\wiki_01.bz2 to C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles\AA\wiki_01.txt
Extracted C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles\AA\wiki_02.bz2 to C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles\AA\wiki_02.txt
Extracted C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles\AA\wiki_03.bz2 to C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles\AA\wiki_03.txt
Extracted C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/extracted_articles\AA\wiki_04.bz2 to C:/Users/Hi/My Works/My Py Scripts/Git Repos/29_Tamil Wiki/txt_articles\AA\wiki_04.txt
Extracted C:/Users/Hi/My Works/My P