In [67]:
import json
import subprocess
from pathlib import Path

In [68]:

# ---------------- CONFIG ----------------
# Input dump: read line by line, each line parsed as one JSON record.
INPUT_JSONL = "data/tawiki_pages.jsonl"
# Directory created below for batched Markdown output.
OUTPUT_DIR = "data/markdown_batches"
# Number of pages per batch file.
BATCH_SIZE = 1000
# ---------------------------------------

# Create the output directory (and any missing parents); a no-op if it exists.
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)

In [69]:
import re

def convert_formatting(text):
    """
    Translate MediaWiki quote-based emphasis into Markdown emphasis.

    The rules are applied from the longest quote run to the shortest so that
    a five-quote run is not consumed as a three-quote run plus a two-quote
    run. `.` does not match newlines here, so emphasis never spans lines.

    Args:
        text: A string that may contain '' / ''' / ''''' markup.

    Returns:
        The string with * / ** / *** emphasis in place of the quote runs.
    """
    emphasis_rules = (
        (r"'''''(.*?)'''''", r'***\1***'),  # bold+italic ''''' → ***
        (r"'''(.*?)'''", r'**\1**'),        # bold ''' → **
        (r"''(.*?)''", r'*\1*'),            # italic '' → *
    )
    for wiki_pattern, markdown_replacement in emphasis_rules:
        text = re.sub(wiki_pattern, markdown_replacement, text)
    return text

def wikitext_to_markdown(wiki_text):
    """
    Convert MediaWiki wikitext (including Tamil text) into Markdown format.

    Processing happens in two phases:

    1. Whole-text regex passes strip HTML comments, <ref> tags, <nowiki>
       spans, templates, category tags and file/image embeds; unwrap external
       and internal links to their labels; drop table HTML attributes; turn
       <br> into newlines; and cut everything from the first metadata section
       heading (See also, References, External links, ...) onwards.
    2. A line-by-line pass converts tables (via convert_wikitable_to_markdown),
       headings, horizontal rules and bullet/numbered lists, and applies
       bold/italic conversion (via convert_formatting) to headings, list items
       and ordinary lines.

    Args:
        wiki_text: Raw wikitext of one page. None or "" yields "".

    Returns:
        The Markdown rendering as a single newline-joined string. Ordinary
        text lines end with two spaces (Markdown hard line break).
    """
    if not wiki_text:
        return ""

    text = wiki_text

    # 1. Remove HTML comments, <ref> references, and <nowiki> sections.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Self-closing refs must go first: the paired pattern below would
    # otherwise accept `<ref name="x" />` as an opening tag and delete all
    # article text up to the next unrelated `</ref>`.
    text = re.sub(r'<ref[^>]*/>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<ref[^>]*?>.*?</ref>', '', text, flags=re.DOTALL|re.IGNORECASE)
    text = re.sub(r'<nowiki>.*?</nowiki>', '', text, flags=re.DOTALL|re.IGNORECASE)

    # 2. Remove templates {{...}}. Each pass removes the innermost templates,
    #    so repeat until nothing changes to unwind nesting.
    while True:
        new_text = re.sub(r'\{\{[^{}]*\}\}', '', text)
        if new_text == text:
            break
        text = new_text

    # 3. Remove category tags
    text = re.sub(r'\[\[Category:[^\]]+\]\]', '', text)

    # 4. Remove images/files with all parameters.
    # MediaWiki image syntax: [[File:name.jpg|thumb|300px|right|caption]]
    def remove_images(chunk):
        # The body of the embed may contain plain text, complete [[links]]
        # (very common inside captions) and single-bracket [spans]. Without
        # the [[...]] alternative an embed whose caption contains a link is
        # not removed here and later leaves a dangling "...]]" fragment.
        pattern = (
            r'\[\[:?(?:File|Image|படிமம்|கோப்பு):'
            r'(?:[^\[\]]|\[\[[^\[\]]*\]\]|\[[^\[\]]*\])*\]\]'
        )
        return re.sub(pattern, '', chunk, flags=re.IGNORECASE)

    # Apply image removal a few times to handle embeds nested in embeds.
    for _ in range(3):
        new_text = remove_images(text)
        if new_text == text:
            break
        text = new_text

    # Remove leftover image parameter fragments. A fragment is only treated
    # as a leftover when it is directly followed by a single pipe (as in
    # "thumb|right|caption"); matching the bare words would delete ordinary
    # prose such as "left", "right" or "none". The (?!\|) guard keeps table
    # cell separators ("||") intact.
    text = re.sub(
        r'\b(?:thumb|thumbnail|frame|frameless|border|left|right|center|none|\d+px)\|(?!\|)',
        '', text
    )

    # Clean up leftover brackets and parentheses from image removal.
    # Broader clean-ups (bare "]]" at line edges, runs of pipes) are
    # deliberately not applied: they removed valid link closers and table pipes.
    text = re.sub(r'^\s*\)\s*\]\]', '', text, flags=re.MULTILINE)  # ) ]] at line start
    text = re.sub(r'\(\s*\)', '', text)  # empty parentheses

    # 5. Convert external links [http://url label] → label (or drop if no label)
    def ext_link_repl(m):
        return m.group(2) if m.group(2) else ''
    text = re.sub(
        r'\[(?:https?://|ftp://)([^\s\]]+)(?:\s+([^\]]+))?\]',
        ext_link_repl, text
    )

    # 6. Convert internal links [[Page|Label]] → Label or [[Page]] → Page
    def int_link_repl(m):
        page = m.group(1)
        label = m.group(2)
        # Drop namespace links like File: or Category: entirely.
        if ':' in page:
            prefix = page.lower().split(':', 1)[0]
            if prefix in ('file', 'image', 'category', 'help', 'wikipedia', 'படிமம்', 'கோப்பு'):
                return ''
        return label if label else page
    text = re.sub(
        r'\[\[([^|\]]+)(?:\|([^]]+))?\]\]',
        int_link_repl, text
    )

    # 7. Remove HTML/CSS attributes from tables (colspan, rowspan, style, etc.)
    text = re.sub(r'\s*(?:colspan|rowspan|style|class|align|valign|bgcolor|width|height)\s*=\s*"[^"]*"', '', text, flags=re.IGNORECASE)
    text = re.sub(r"\s*(?:colspan|rowspan|style|class|align|valign|bgcolor|width|height)\s*=\s*'[^']*'", '', text, flags=re.IGNORECASE)

    # 8. Bold/italic conversion is done during line processing, not here:
    #    converting ''' to ** first would make bold lines look like list items.

    # 9. Replace <br> with newline (for tables or paragraphs)
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)

    # 10. Remove Wikipedia metadata sections (See also, References, External
    #     links, Bibliography, Further reading). These typically appear at the
    #     end of articles, so with DOTALL each pattern cuts from the heading
    #     to the end of the text.
    section_patterns = [
        r'==\s*(?:இவற்றையும் பார்க்கவும்|See also|மேலும் காண்க)\s*==.*',
        r'==\s*(?:மேற்கோள்கள்|References|குறிப்புகள்|சான்றுகள்)\s*==.*',
        r'==\s*(?:வெளி இணைப்புகள்|External links|புற இணைப்புகள்)\s*==.*',
        r'==\s*(?:நூற்பட்டியல்|Bibliography|நூல்கள்)\s*==.*',
        r'==\s*(?:மேலும் படிக்க|Further reading)\s*==.*',
    ]
    for pattern in section_patterns:
        text = re.sub(pattern, '', text, flags=re.DOTALL|re.MULTILINE|re.IGNORECASE)

    # Category tags still in bracket form.
    text = re.sub(r'\[\[பகுப்பு:.*?\]\]', '', text, flags=re.DOTALL|re.MULTILINE|re.IGNORECASE)
    # Bare "Category:" remnants: remove to the end of that line only. DOTALL
    # must NOT be set here, otherwise `.*$` runs to the end of the whole text
    # and silently drops everything after the first occurrence.
    text = re.sub(r'பகுப்பு:.*$', '', text, flags=re.MULTILINE|re.IGNORECASE)

    # 11. Final cleanup - remove extra spaces and normalize whitespace
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # max 2 consecutive newlines
    text = re.sub(r'[ \t]+', ' ', text)  # normalize spaces

    # Headings that are dropped if they survived step 10 (exact, case-sensitive
    # match on the converted heading text). Built once, outside the line loop.
    skip_headings = (
        'இவற்றையும் பார்க்கவும்', 'See also', 'மேலும் காண்க',
        'மேற்கோள்கள்', 'References', 'குறிப்புகள்', 'சான்றுகள்',
        'வெளி இணைப்புகள்', 'External links', 'புற இணைப்புகள்',
        'நூற்பட்டியல்', 'Bibliography', 'நூல்கள்',
        'மேலும் படிக்க', 'Further reading', 'பின்வருவனவற்றையும் பார்க்கவும்', 'உசாத்துணை',
    )

    # Now split into lines and process headings, lists, and tables
    lines = text.splitlines()
    result_lines = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Skip lines that consist only of leftover punctuation.
        if line and line in (']]', ')', ') ]]', '|', '||'):
            i += 1
            continue

        # A. Tables: collect lines from {| to |} and convert them as a unit.
        if line.startswith('{|'):
            table_lines = [line]
            i += 1
            # Collect until end of table (or end of text if never closed).
            while i < len(lines) and not lines[i].strip().startswith('|}'):
                table_lines.append(lines[i].strip())
                i += 1
            if i < len(lines):
                table_lines.append(lines[i].strip())  # include '|}'
            md_table = convert_wikitable_to_markdown(table_lines)
            result_lines.extend(md_table.splitlines())
            i += 1
            continue

        # B. Headings: e.g. == Heading == → "## Heading".
        #    Only balanced marker runs count; depth is capped at 6.
        m = re.match(r'^(=+)\s*(.*?)\s*(=+)$', line)
        if m and len(m.group(1)) == len(m.group(3)):
            level = min(len(m.group(1)), 6)
            heading_text = convert_formatting(m.group(2).strip())

            if heading_text not in skip_headings:
                result_lines.append('#' * level + ' ' + heading_text)
            i += 1
            continue

        # C. Horizontal rule: lines of ---- → ---
        if re.match(r'^-+$', line):
            result_lines.append('---')
            i += 1
            continue

        # D. Lists: the first marker picks bullet vs. numbered; the number of
        #    markers sets the nesting depth (two spaces per level).
        m_list = re.match(r'^([*#]+)\s*(.*)', line)
        if m_list:
            markers, content = m_list.group(1), m_list.group(2).strip()
            content = convert_formatting(content)

            indent = '  ' * (len(markers) - 1)
            if markers[0] == '*':
                result_lines.append(f"{indent}- {content}")
            else:  # markers[0] == '#'
                result_lines.append(f"{indent}1. {content}")
            i += 1
            continue

        # E. Regular line
        if not line:
            result_lines.append("")
        else:
            # Two trailing spaces force a Markdown line break.
            result_lines.append(convert_formatting(line) + "  ")
        i += 1

    return "\n".join(result_lines)

def convert_wikitable_to_markdown(table_lines):
    """
    Convert a list of lines in a MediaWiki table ({| ... |}) into Markdown table syntax.

    Args:
        table_lines: The table's lines, normally starting with the "{|" opener
            and ending with the "|}" closer. The list is not modified.

    Returns:
        A Markdown table as a newline-joined string ending with a blank line,
        preceded by a bold caption line when a caption was found. Returns ""
        when no header or data cells could be extracted.

    Notes:
        - The first header row ("!" cells) becomes the Markdown header; if
          there is none, the first data row is promoted to header.
        - Empty cells are dropped, so colspan/rowspan layouts are flattened.
        - Rows are padded with empty cells to the widest row.
    """
    headers = []
    rows = []
    caption = None

    # Work on a copy so the caller's list is never mutated.
    lines = list(table_lines)

    # Strip the opener and closer up front. If they stay in the text, a header
    # row that directly follows "{|" (the first "|-" is optional) is thrown
    # away with the opener, and the closer is split into a bogus "}" cell.
    if lines and lines[0].startswith('{|'):
        lines = lines[1:]
    if lines and lines[-1].startswith('|}'):
        lines = lines[:-1]

    # Caption: first line after the opener, introduced by "|+".
    # Slice off exactly the two marker characters; lstrip('|+') would also eat
    # a caption that itself begins with '+' or '|'.
    if lines and lines[0].startswith('|+'):
        caption = lines[0][2:].strip()
        lines = lines[1:]

    # Join all lines and then re-split properly to handle multi-line cells
    full_text = '\n'.join(lines)

    # Remove HTML attributes from table cells (including colspan which we'll ignore)
    full_text = re.sub(r'\s*(?:colspan|rowspan|style|class|align|valign|bgcolor|width|height)\s*=\s*"[^"]*"\s*\|', ' | ', full_text, flags=re.IGNORECASE)
    full_text = re.sub(r'\s*(?:colspan|rowspan|style|class|align|valign|bgcolor|width|height)\s*=\s*"[^"]*"', '', full_text, flags=re.IGNORECASE)

    # Split on row separator lines. Anything after "|-" on the same line is
    # row attributes (e.g. `|- id="x"` or `|- bgcolor=#eee`), so the whole line
    # is consumed; otherwise the attributes end up at the start of the row
    # text and the row is not recognised.
    table_rows = re.split(r'^\|-[^\n]*$', full_text, flags=re.MULTILINE)

    def clean_cells(raw_cells, marker):
        """Strip cell markers/attributes, flatten newlines, drop empty cells."""
        cleaned = []
        for cell in raw_cells:
            cell = cell.lstrip(marker).strip()
            # Whatever precedes the last "|" in a cell is attribute text.
            if '|' in cell:
                cell = cell.split('|')[-1].strip()
            # A Markdown table row must stay on one line.
            cell = re.sub(r'\s*\n\s*', ' ', cell)
            if cell:
                cleaned.append(convert_formatting(cell))
        return cleaned

    for row_text in table_rows:
        row_text = row_text.strip()
        if not row_text or row_text.startswith('{|') or row_text.startswith('|}'):
            continue

        # Header row (starts with !): cells split by !! inline or by a new "!" line.
        if row_text.startswith('!'):
            cleaned_cells = clean_cells(re.split(r'!!|\n!', row_text), '!')
            # Only the first header row is used.
            if not headers and cleaned_cells:
                headers = cleaned_cells

        # Data row (starts with |): cells split by || inline or by a new "|" line.
        elif row_text.startswith('|'):
            cleaned_cells = clean_cells(re.split(r'\|\||\n\|', row_text), '|')

            # A lone cell before any header or data row is treated as a caption.
            if len(cleaned_cells) == 1 and not headers and not rows:
                caption = cleaned_cells[0]
            elif cleaned_cells:
                rows.append(cleaned_cells)

    # If no explicit header but rows exist, use first row as header
    if not headers and rows:
        headers = rows.pop(0)
    if not headers:
        return ""

    # Widen the header to the widest row so every row fits.
    max_cols = max(len(headers), max((len(row) for row in rows), default=0))
    if len(headers) < max_cols:
        headers = headers + [''] * (max_cols - len(headers))

    # Build markdown table
    md = []
    if caption:
        md.append(f"**{caption}**\n")  # Caption as bold heading
    md.append("| " + " | ".join(headers) + " |")
    md.append("| " + " | ".join("---" for _ in headers) + " |")
    for row in rows:
        # Pad short rows; the slice guards against over-long rows.
        cells = (row + [''] * (len(headers) - len(row)))[:len(headers)]
        md.append("| " + " | ".join(cells) + " |")
    md.append("")  # blank line after table
    return "\n".join(md)

In [72]:
index = 16

# Read the index-th (0-based) entry from the JSONL file.
with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i == index:
            page = json.loads(line)
            break
    else:
        # Without this, a too-large index would leave `page` undefined on a
        # fresh kernel, or silently reuse the value from an earlier run.
        raise IndexError(f"{INPUT_JSONL} has no record at index {index}")

wikitext = page['wikitext']
wikitext

'{{Infobox country\n| conventional_long_name                = இந்தியக் குடியரசு<br/>Republic of India\n| common_name                = இந்தியா\n| native_name                = {{transl|hi|ISO|பாரத் கணராச்சியம்}}<br /><small>(பார்க்க [[அதிகாரப்பூர்வமான மொழிகளில் இந்தியாவின் பெயர்கள்|ஏனைய பெயர்கள்]])</small>\n| image_flag                = Flag of India.svg\n| alt_flag                = கிடைமட்ட மூவர்ணக் கொடி, மேலிருந்து கீழாக, ஆழமான குங்குமப்பூ, வெள்ளை, பச்சை கிடைமட்டப் பட்டைகள். வெள்ளைப் பட்டையின் நடுவில் 24 குறுக்குக் கம்பிகளுடன் நீலச் சக்கரம் உள்ளது.\n| image_coat                = Emblem of India.svg\n| symbol_width                = 60px\n| alt_coat                = இடது, வலது மற்றும் பார்வையாளரை நோக்கி மூன்று சிங்கங்கள், 24-குறுக்குக் கம்பிகள் கொண்ட சக்கரம் மற்றும் யானை ஆகியவற்றைக் கொண்ட பட்டையின் மேல் பாய்ந்து செல்லும் குதிரை. அடியில் உள்ள குறிக்கோள்: "சத்யமேவ செயதே".\n| symbol_type                = [[இந்திய தேசிய இலச்சினை|தேசிய இலச்சினை]]\n| other_symbol                = "[[வந்தே மாதர

In [73]:
# Convert the sample page selected above.
md = wikitext_to_markdown(wikitext)

# save to test.md
with open("test.md", "w", encoding="utf-8") as f:
    f.write(md)

# save the raw wikitext
# Explicit UTF-8: the platform default encoding may be unable to encode Tamil text.
with open("raw_wiki.txt", "w", encoding="utf-8") as rf:
    rf.write(wikitext)

In [None]:
import pandas as pd
from tqdm import tqdm

# Process all pages from JSONL and create parquet with text column
data = []

print(f"Processing {INPUT_JSONL}...")

with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # A stray blank line should not abort the whole run.
        if not line.strip():
            continue

        record = json.loads(line)
        # `or ''` also covers keys that are present but null in the JSON.
        # Distinct names keep the `wikitext` variable of the sample-inspection
        # cell from being overwritten by this loop.
        page_title = record.get('title') or ''
        page_wikitext = record.get('wikitext') or ''

        # Convert wikitext to markdown
        markdown_content = wikitext_to_markdown(page_wikitext)

        # Format as markdown with title as heading
        text = f"# {page_title}\n\n{markdown_content}"

        data.append({'text': text})

# Create DataFrame and save as parquet.
# Naming the column keeps the schema stable even when no records were read.
df = pd.DataFrame(data, columns=['text'])
output_file = "data/tawiki_markdown.parquet"
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_file, index=False)

print(f"\nSaved {len(df)} pages to {output_file}")
print(f"DataFrame shape: {df.shape}")