In [106]:
import json
import subprocess
from pathlib import Path
import sys

# Evict any cached copy of the module so the import that follows re-executes it
sys.modules.pop('preprocessing.utils', None)

from preprocessing.utils import *

In [107]:

# ---------------- CONFIG ----------------
# Source dump; the notebook reads it one JSON record per line.
INPUT_JSONL = "data/tawiki_pages.jsonl"
# Folder that is created below for batch output.
OUTPUT_DIR = "data/markdown_batches"
# Number of pages per batch file.
BATCH_SIZE = 1000
# ---------------------------------------

# Create the output folder (and any missing parents); a no-op if it already exists.
Path(OUTPUT_DIR).mkdir(exist_ok=True, parents=True)

In [None]:
index = 1090  # zero-based line number of the page to inspect

# Scan the JSONL file and decode only the record at position `index`.
with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i == index:
            page = json.loads(line)
            break
    else:
        # The loop ended without a match: fail loudly instead of silently reusing
        # a `page` left in the kernel by an earlier run (or hitting a NameError).
        raise IndexError(f"{INPUT_JSONL} has no entry at index {index}")

wikitext = page['wikitext']
wikitext

"{{தகவற்சட்டம் நபர்\n|name = அகிலன்\n|image = akilan.jpg\n|imagesize  = 200px\n|caption = \n|birth_name = பி. வி. அகிலாண்டம்\n|birth_date ={{birth date|df=yes|1922|6|27}}\n| birth_place = [[பெருங்களூர் ஊராட்சி|பெருங்களூர்]], [[புதுக்கோட்டை சமஸ்தானம்|புதுக்கோட்டை]], [[பிரித்தானிய இந்தியாவின் மாகாணங்களும், ஆட்சிப் பகுதிகளும்|இந்தியா]]\n|death_date = {{Death date and age|1988|1|31|1922|3|27}}<ref>{{Citation |title=ஓர் எழுத்தாளரின் திரைப் பயணம்! |date=2020-01-31 |url=https://www.hindutamil.in/news/supplements/hindu-talkies/537449-a-writer-s-screen-trip.html |website=Hindu Tamil Thisai |language=ta |accessdate=2024-06-22}}</ref>\n|death_place =\n|death_cause =\n|resting_place =\n|resting_place_coordinates =\n|residence = \n|nationality =\n|other_names =\n|known_for = புதின, சிறுகதை எழுத்தாளர்\n|notableworks = சித்திரப்பாவை, வேங்கையின் மைந்தன், பாவை விளக்கு\n|education =\n|employer =\n| occupation = எழுத்தாளர்\n| title =\n| religion=\n| spouse=\n|children=\n|parents=\n|speciality=\n|relative

In [None]:
md = wikitext_to_markdown(wikitext)

# Dump the converted markdown (test.md) and the untouched wikitext (raw_wiki.txt)
# next to each other so the two can be compared by eye.
for out_path, content in (("test.md", md), ("raw_wiki.txt", wikitext)):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.write(content)

In [113]:
import pandas as pd
from tqdm import tqdm

# Convert every page of the JSONL dump to markdown and store the results in a
# parquet file with a single "text" column.
MIN_WORDS = 30  # pages whose markdown has fewer whitespace-separated words are dropped
data = []

print(f"Processing {INPUT_JSONL}...")

with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue

        record = json.loads(line)
        # `or ''` also covers keys that are present but hold null.
        title = record.get('title') or ''
        # Deliberately not named `wikitext`: that name holds the page picked in the
        # inspection cell and must not be overwritten by this loop.
        page_wikitext = record.get('wikitext') or ''

        # Convert wikitext to markdown
        markdown_content = wikitext_to_markdown(page_wikitext)

        # Skip pages whose markdown is shorter than MIN_WORDS words.
        if len(markdown_content.split()) < MIN_WORDS:
            continue

        # Prepend the page title as a level-1 heading.
        data.append({'text': f"# {title}\n\n{markdown_content}"})

# Build the frame once from the collected records; pinning the column keeps the
# "text" schema even if no page survived the filter.
df = pd.DataFrame(data, columns=['text'])
output_file = "data/tawiki_markdown.parquet"
df.to_parquet(output_file, index=False)

print(f"\nSaved {len(df)} pages to {output_file}")
print(f"DataFrame shape: {df.shape}")

Processing data/tawiki_pages.jsonl...


300569it [10:31, 476.18it/s] 



Saved 171015 pages to data/tawiki_markdown.parquet
DataFrame shape: (171015, 1)
