In [1]:
import json
import subprocess
from pathlib import Path

In [2]:

# ---------------- CONFIG ----------------
# Source dump: a JSON Lines file, parsed one line at a time further down.
INPUT_JSONL = "data/tawiki_pages.jsonl"
# Directory that will receive the generated Markdown batch files.
OUTPUT_DIR = "data/markdown_batches"
# Number of pages per batch file.
BATCH_SIZE = 1000
# ---------------------------------------

# Create the output directory (and any missing parents); a no-op if it already exists.
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)

In [6]:
index = 10

# Read the index-th (0-based) entry from the JSONL file. The file is streamed
# line by line, so only the requested line is ever parsed.
with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i == index:
            page = json.loads(line)
            break
    else:
        # The loop ran off the end of the file without reaching `index`.
        # Fail loudly here: otherwise `page` would be undefined, or worse,
        # silently keep a stale value from an earlier run of this cell.
        raise IndexError(
            f"{INPUT_JSONL} has no entry at index {index}"
        )


In [8]:
# Pull the 'wikitext' field out of the page record parsed from the JSONL line.
wikitext = page['wikitext']

In [9]:
# Show the raw wikitext string for inspection (the cell's last expression is displayed).
wikitext

'{{Infobox Person\n  | name = கிறித்தோபர் கொலம்பசு<br />Christopher Columbus\n  | image = Colomb.jpeg\n  | image_size = \n  | occupation___do_we_want_this? = [[List of maritime explorers|Maritime explorer]] for the [[Crown of Castile]]\n  | title = பெருங்கடல் ஆட்மிரல்\n  | caption = கொலம்பசின் இறந்தபின்னரான உருவப் படம் (வரைந்தவர்: ரிடொல்ல்ஃபோ கேர்லாண்டையோ\n  | birth_date = ஆகத்து-அக்டோபர் [[1451]]\n  | birth_place = [[ஜெனோவா]], [[இத்தாலி]]\n  | death_date = {{death date|1506|5|20|mf=y}}\n  | death_place = வல்லடோலிட், [[எசுப்பானியா]]\n  | names in other languages = [[இலத்தீன்]]: Christophorus Columbus; [[இத்தாலிய மொழி|இத்தாலியம்]]: \'\'\'Cristoforo Colombo\'\'\'; [[போர்த்துக்கேய மொழி|போர்த்துக்கேயம்]]: \'\'\'Cristóvão Colombo\'\'\', formerly \'\'Christovam Colom\'\'; [[எசுப்பானியம்]]: \'\'\'Cristóbal Colón\'\'\'; [[காட்டலான் மொழி|காட்டலான்]]: \'\'\'Cristòfor Colom\'\'\'\n  | resting_place=[[செவீயா பெருங்கோவில்]]\n  | nationality = [[இத்தாலி|ஜெனோவியர்(சர்ச்சைக்குரியது)]]\n  | other_names

In [10]:
import re

# Stand-in for the Markdown '*' emphasis character. Emphasis is converted on the
# whole text (before <br> splitting), but a real '*' at the start of a line would
# later be mistaken for a MediaWiki list marker, so the '*' is only swapped in
# once block-level parsing is finished.
_EMPHASIS_MARK = '\x00'

# Lower-cased namespace prefixes whose links are dropped instead of being
# rendered as text. 'படிமம்' and 'பகுப்பு' are believed to be Tamil Wikipedia's
# localized File and Category namespace names; verify against the dump's siteinfo.
_DROPPED_NAMESPACES = (
    'file', 'image', 'படிமம்',
    'category', 'பகுப்பு',
    'help', 'wikipedia',
)

# Start of a media link, e.g. "[[File:" or "[[:Image:".
_MEDIA_LINK_OPEN_RE = re.compile(r'\[\[:?(?:File|Image|படிமம்):', re.IGNORECASE)
# Whole media link when it contains no nested brackets (fallback for broken markup).
_MEDIA_LINK_FLAT_RE = re.compile(r'\[\[:?(?:File|Image|படிமம்):[^\]]+\]\]', re.IGNORECASE)


def _strip_media_links(text):
    """Remove [[File:...]] / [[Image:...]] links, including captions that contain nested [[links]]."""
    pieces = []
    pos = 0
    while True:
        opener = _MEDIA_LINK_OPEN_RE.search(text, pos)
        if opener is None:
            pieces.append(text[pos:])
            break
        pieces.append(text[pos:opener.start()])

        # Walk forward counting "[[" / "]]" pairs until the opener is balanced.
        depth = 0
        i = opener.start()
        end = None
        while i < len(text) - 1:
            pair = text[i:i + 2]
            if pair == '[[':
                depth += 1
                i += 2
            elif pair == ']]':
                depth -= 1
                i += 2
                if depth == 0:
                    end = i
                    break
            else:
                i += 1

        if end is None:
            # Unbalanced brackets: fall back to the simple non-nested pattern
            # for the remainder so well-formed links there are still removed.
            pieces.append(_MEDIA_LINK_FLAT_RE.sub('', text[opener.start():]))
            break
        pos = end
    return ''.join(pieces)


def _mark_emphasis(text):
    """Rewrite MediaWiki quote emphasis using _EMPHASIS_MARK in place of '*'."""
    mark = _EMPHASIS_MARK
    # Longest quote run first so ''''' is not consumed as ''' + ''.
    text = re.sub(r"'''''(.*?)'''''", lambda m: mark * 3 + m.group(1) + mark * 3, text)  # bold+italic
    text = re.sub(r"'''(.*?)'''", lambda m: mark * 2 + m.group(1) + mark * 2, text)      # bold
    text = re.sub(r"''(.*?)''", lambda m: mark + m.group(1) + mark, text)                # italic
    return text


def wikitext_to_markdown(wiki_text):
    """
    Convert MediaWiki wikitext (including Tamil text) into Markdown format.

    Strips comments, <ref> references, <nowiki> sections, templates, category
    links and media links; unwraps external and internal links to their label;
    and translates headings, lists, bold/italic, horizontal rules and tables
    into Markdown syntax. Tables are delegated to convert_wikitable_to_markdown.

    Args:
        wiki_text: The raw wikitext of one page, as a string.

    Returns:
        The Markdown text, one output line per non-empty input line
        (blank lines are dropped), joined with "\n".
    """
    # Drop any stray occurrences of the internal placeholder character.
    text = wiki_text.replace(_EMPHASIS_MARK, '')

    # 1. Remove HTML comments, <ref> references, and <nowiki> sections.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Self-closing refs must go first: otherwise '<ref name="x"/>' is taken as an
    # opening tag and everything up to the next '</ref>' is deleted with it.
    text = re.sub(r'<ref[^>]*/>', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<ref[^>]*?>.*?</ref>', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<nowiki>.*?</nowiki>', '', text, flags=re.DOTALL | re.IGNORECASE)

    # 2. Remove templates {{...}}; each pass strips the innermost level, so
    #    repeat until nothing changes to handle nesting.
    while True:
        new_text = re.sub(r'\{\{[^{}]*\}\}', '', text)
        if new_text == text:
            break
        text = new_text

    # 3. Remove category tags and images/files.
    text = re.sub(r'\[\[(?:Category|பகுப்பு):[^\]]+\]\]', '', text)  # drop [[Category:...]]
    text = _strip_media_links(text)                                   # drop [[File:...]], [[:Image:...]]

    # 4. Convert external links [http://url label] → label (or drop if no label).
    def ext_link_repl(m):
        return m.group(2) if m.group(2) else ''
    text = re.sub(
        r'\[(?:https?://|ftp://)([^\s\]]+)(?:\s+([^\]]+))?\]',
        ext_link_repl, text
    )

    # 5. Convert internal links [[Page|Label]] → Label or [[Page]] → Page.
    def int_link_repl(m):
        page = m.group(1)
        label = m.group(2)
        # Skip namespace links like File: or Category: that survived step 3.
        if ':' in page:
            prefix = page.lower().split(':', 1)[0]
            if prefix in _DROPPED_NAMESPACES:
                return ''
        return label if label else page
    text = re.sub(
        r'\[\[([^|\]]+)(?:\|([^]]+))?\]\]',
        int_link_repl, text
    )

    # 6. Convert bold/italic markup. The '*' characters are represented by a
    #    placeholder until the end so that a line starting with emphasis is not
    #    misread as a '*' list item below.
    text = _mark_emphasis(text)

    # 7. Replace <br> with newline (for tables or paragraphs).
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)

    # Now split into lines and process headings, lists, and tables.
    lines = text.splitlines()
    result_lines = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # A. Tables: collect lines from {| to |} and convert them.
        if line.startswith('{|'):
            table_lines = [line]
            i += 1
            # Collect until end of table (or end of text if the table is unclosed).
            while i < len(lines) and not lines[i].strip().startswith('|}'):
                table_lines.append(lines[i].strip())
                i += 1
            if i < len(lines):
                table_lines.append(lines[i].strip())  # include '|}'
            md_table = convert_wikitable_to_markdown(table_lines)
            result_lines.extend(md_table.splitlines())
            i += 1
            continue

        # B. Headings: e.g. == Heading == → "## Heading" (capped at level 6).
        m = re.match(r'^(=+)\s*(.*?)\s*(=+)$', line)
        if m and len(m.group(1)) == len(m.group(3)):
            level = min(len(m.group(1)), 6)
            heading_text = m.group(2).strip()
            result_lines.append('#' * level + ' ' + heading_text)
            i += 1
            continue

        # C. Horizontal rule: MediaWiki needs four or more hyphens; a lone '-'
        #    is ordinary text.
        if re.match(r'^-{4,}$', line):
            result_lines.append('---')
            i += 1
            continue

        # D. Lists: nesting depth is the number of markers; the kind of the item
        #    is given by the LAST marker ('*#' is an ordered item inside a bullet).
        m_list = re.match(r'^([*#]+)\s*(.*)', line)
        if m_list:
            markers, content = m_list.group(1), m_list.group(2).strip()
            indent = '  ' * (len(markers) - 1)
            if markers[-1] == '*':
                result_lines.append(f"{indent}- {content}")
            else:  # markers[-1] == '#'
                result_lines.append(f"{indent}1. {content}")
            i += 1
            continue

        # E. Regular line (non-empty).
        if line:
            result_lines.append(line)
        i += 1

    # Block structure is settled; it is now safe to emit real '*' characters.
    return "\n".join(result_lines).replace(_EMPHASIS_MARK, '*')

def _clean_table_cell(cell):
    """Strip a leading 'attr="value" |' prefix from a wikitable cell and escape remaining pipes."""
    attrs, sep, content = cell.partition('|')
    if sep and '=' in attrs:
        # 'style="..." | text' → 'text'; the part before the pipe is HTML attributes.
        cell = content
    # A raw '|' would be read as a column separator in a Markdown table.
    return cell.strip().replace('|', '\\|')


def convert_wikitable_to_markdown(table_lines):
    """
    Convert a list of lines in a MediaWiki table ({| ... |}) into Markdown table syntax.

    A table row is everything between two '|-' separators, so cells may be
    written on one line ('| a || b') or one per line ('| a' / '| b'). Lines
    that start with neither '|' nor '!' continue the previous cell.

    Args:
        table_lines: The table's lines, from the '{|' line through the '|}'
            line. The list is not modified.

    Returns:
        The Markdown table as a string ending with a newline, preceded by the
        caption in italics if the table has a non-empty '|+' caption. The
        header is the first row made only of '!' cells, or the first row if
        there is none. Returns "" when the table has no cells at all.
    """
    caption = None
    assembled = []   # one [cells, only_header_cells] entry per table row
    current = None   # entry of the row being filled; None right after a '|-'

    for raw_line in table_lines:
        line = raw_line.strip()
        if not line or line.startswith('{|') or line.startswith('|}'):
            continue
        if line.startswith('|-'):
            # Row separator: the next cell line starts a new row.
            current = None
        elif line.startswith('|+'):
            if caption is None:
                caption = _clean_table_cell(line[2:])
        elif line.startswith('!') or line.startswith('|'):
            if current is None:
                current = [[], True]
                assembled.append(current)
            if line.startswith('!'):
                # Header cells may be separated by '!!' or '||'.
                parts = re.split(r'!!|\|\|', line[1:])
            else:
                parts = re.split(r'\|\|', line[1:])
                current[1] = False
            current[0].extend(_clean_table_cell(part) for part in parts)
        elif current is not None and current[0]:
            # Continuation of the previous cell (e.g. text after a line break).
            extra = line.replace('|', '\\|')
            current[0][-1] = (current[0][-1] + ' ' + extra).strip()

    if not assembled:
        return ""

    # Header: first row consisting solely of '!' cells, else the first row.
    header_pos = next(
        (pos for pos, (_, only_headers) in enumerate(assembled) if only_headers),
        0,
    )
    headers = assembled.pop(header_pos)[0]
    rows = [cells for cells, _ in assembled]

    # Give every row, header included, the same number of cells so no data
    # row is wider than the header line.
    width = max([len(headers)] + [len(row) for row in rows])

    def padded(cells):
        return cells + [''] * (width - len(cells))

    # Build markdown table
    md = []
    if caption:
        md.append(f"*{caption}*")
    md.append("| " + " | ".join(padded(headers)) + " |")
    md.append("| " + " | ".join("---" for _ in range(width)) + " |")
    for row in rows:
        md.append("| " + " | ".join(padded(row)) + " |")
    md.append("")  # blank line after table
    return "\n".join(md)


In [12]:
md = wikitext_to_markdown(wikitext)

# Save the converted page to test.md (in the current working directory) so the
# result can be checked in a Markdown viewer.
with open("test.md", "w", encoding="utf-8") as f:
    f.write(md)