In [212]:
import docx
from glob import glob
import re
import os
import unicodedata
from firecrawl import FirecrawlApp

In [206]:
def read_docx(path: str) -> str:
    """Return the text of every paragraph in a .docx file, one per line.

    Args:
        path: Location of the Word document; handed to ``docx.Document``.

    Returns:
        The ``text`` of each entry in ``doc.paragraphs``, in order, each
        followed by a newline. An empty string when there are no paragraphs.
    """
    document = docx.Document(path)
    return "".join(f"{para.text}\n" for para in document.paragraphs)

def generate_tokens(text: str) -> list[str]:
    """Split text into word, punctuation and whitespace tokens.

    A delimiter is either a single character that is not a word character,
    not a combining mark in U+0300..U+036F and not whitespace, or a run of
    whitespace. The pattern wraps the delimiter in a capturing group, so
    ``re.split`` returns the delimiters too and joining the tokens gives
    back the original text.

    Args:
        text: The text to tokenize.

    Returns:
        The non-empty pieces produced by the split, in order.
    """
    splitter = re.compile(r'([^\w\u0300-\u036f\s]|\s+)')
    # re.split yields empty strings between adjacent delimiters and at the
    # edges; filter(None, ...) drops them.
    return list(filter(None, splitter.split(text)))

def join_tokens(tokens: list[str]) -> str:
    """Concatenate tokens into one string, with nothing inserted between them."""
    return str.join('', tokens)

def remove_diacritics(word: str) -> str:
    """Strip combining marks from a string.

    The input is decomposed with NFKD, then every character whose
    ``unicodedata.combining`` class is non-zero is dropped. Because NFKD is
    a compatibility decomposition, compatibility characters (ligatures,
    non-breaking space, ...) are also replaced by their plain equivalents.

    Args:
        word: Any string.

    Returns:
        The decomposed string without combining characters.
    """
    decomposed = unicodedata.normalize('NFKD', word)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_chars)

def add_diacritics(word: str, mappings: dict[str, str]) -> str:
    """Restore diacritics on a word, keeping the word's own casing.

    The lowercased word is looked up in ``mappings``. When there is no
    entry the word is returned unchanged. Previously such words still went
    through the casing branches, so an empty string raised ``IndexError``
    and uncased or mixed-case words such as ``"123"`` or ``"iPhone"`` raised
    ``ValueError`` even though nothing had to be restored.

    Args:
        word: The token to restore.
        mappings: Lowercase plain word -> lowercase word with diacritics.

    Returns:
        The mapped word in upper case, lower case, or with its first
        character upper-cased, following the casing of ``word``; or ``word``
        itself when it has no mapping.

    Raises:
        ValueError: The word has a mapping but its casing is none of
            all-upper, all-lower or leading-capital, so the casing cannot
            be transferred onto the mapped form.
    """
    restored = mappings.get(word.lower())
    if restored is None:
        # Nothing to restore: leave the token exactly as it was.
        return word

    if word.isupper():
        return restored.upper()
    if word.islower():
        return restored.lower()
    # Slices instead of [0] so an empty word or mapping cannot raise IndexError.
    if word[:1].isupper():
        return restored[:1].upper() + restored[1:]
    raise ValueError(f"Cannot add diacritics to word: {word} with mapping: {restored}")
    
def make_mappings(tokens: list[str]) -> dict[str, str]:
    """Build a plain-word -> diacritic-word lookup from a token list.

    A token contributes an entry when stripping it with
    ``remove_diacritics`` changes it and it is not pure whitespace. Both
    key (stripped form) and value (original form) are lowercased. When
    several tokens share a stripped form, the last one in ``tokens`` wins.

    Args:
        tokens: Tokens as produced by ``generate_tokens``.

    Returns:
        Mapping from lowercased stripped token to lowercased original token.
    """
    stripped_pairs = ((tok, remove_diacritics(tok)) for tok in tokens)
    return {
        plain.lower(): tok.lower()
        for tok, plain in stripped_pairs
        if tok != plain and not tok.isspace()
    }

def reconstruct_tokens(tokens: list[str], mappings: dict[str, str]) -> list[str]:
    """Replace every token that is a key of ``mappings`` via ``add_diacritics``.

    Tokens that are not keys are copied through unchanged.

    NOTE(review): the membership test uses the token as-is, while
    ``make_mappings`` lowercases its keys, so capitalized or upper-case
    tokens never reach ``add_diacritics`` here. Confirm whether a
    case-insensitive lookup was intended.

    Args:
        tokens: Tokens to process.
        mappings: Plain word -> word with diacritics.

    Returns:
        A new list with the same length and order as ``tokens``.
    """
    return [
        add_diacritics(tok, mappings) if tok in mappings else tok
        for tok in tokens
    ]

def verify(text: str, reconstructed_text: str, tokens: list[str], reconstructed_tokens: list[str]) -> bool:
    if text != reconstructed_text:

        #do length check on tokens
        if len(tokens) != len(reconstructed_tokens):
            print(f"length mismatch in {path}")
            print(f"tokens: {len(tokens)}, reconstructed_tokens: {len(reconstructed_tokens)}")

        #do length check
        if len(text) != len(reconstructed_text):
            print(f"length mismatch in {path}")
            print(f"text: {len(text)}, reconstructed_text: {len(reconstructed_text)}")

        for orig, recon in zip(tokens, reconstructed_tokens):
            if orig != recon:
                print(f"original: {orig}, reconstructed: {recon}")
        return False
    else:
        print(f"verification successful for {path}")
        return True

def test_txt_file(path: str) -> bool:
    text = open(path, "r").read()
    tokens = generate_tokens(text)
    mappings = make_mappings(tokens)
    reconstructed_tokens = reconstruct_tokens(tokens, mappings)
    reconstructed_text = join_tokens(reconstructed_tokens)
    return verify(text, reconstructed_text, tokens, reconstructed_tokens)

In [None]:
# Run the round-trip check on every .txt file under txts/. test_txt_file
# prints its report per file; the boolean it returns is not used here.
paths = glob("txts/*.txt")
for path in paths:
    test_txt_file(path)


In [188]:
# Pages to scrape. The order is significant: the scraping cell names each
# output file after the URL's position in this list (scrape/{i}.txt), so
# entries must not be reordered or removed once files have been written.
_SITE = "https://chitrapurmath.net"

_ABOUT_SLUGS = (
    "about-introduction",
    "about-introduction-math",
    "about-introduction-journey",
    "about-parampara",
)

# News story ids, in scrape order.
# NOTE(review): four ids ("7?", "39?", "8?", "32?") carry a trailing "?".
# They are kept exactly as they were; confirm whether that is intentional.
_NEWS_STORY_IDS = """
66 168 48 213 229 253 90 13 172 164
271 1 88 74 68 277 15 161 107 207
266 5 7? 69 226 255 250 160 58 202
205 45 126 151 264 247 22 189 18 78
92 42 221 243 79 169 265 153 39? 8?
281 216 278 245 23 251 103 2 81 47
102 156 263 19 118 159 75 20 56 136
283 30 46 187 201 246 76 52 222 70
10 252 241 269 87 71 4 44 208 32?
57 210 3 223 163 154 254 273 270 174
197 59 166 17
""".split()

urls = (
    [f"{_SITE}/site/{slug}" for slug in _ABOUT_SLUGS]
    + [f"{_SITE}/gauravam?id={n}" for n in range(16)]
    + [f"{_SITE}/news/default/story?id={story_id}" for story_id in _NEWS_STORY_IDS]
)

In [218]:
# Write the URL list to urls.txt, one URL per line.
with open("urls.txt", "w") as f:
    for url in urls:
        print(url, file=f)

In [None]:

# The API key comes from the environment rather than from the notebook, so
# it does not end up in version control or shared copies. The key that was
# previously written here in clear text should be revoked and replaced.
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY")
if not FIRECRAWL_API_KEY:
    raise RuntimeError("Set the FIRECRAWL_API_KEY environment variable before running this cell")

app = FirecrawlApp(api_key=FIRECRAWL_API_KEY)

# Make sure the output directory exists; open() does not create it.
os.makedirs("scrape", exist_ok=True)

# Each page is saved as scrape/{i}.txt, where i is the URL's position in
# `urls`. Pages whose file already exists are not fetched again, so the
# cell can be re-run after an interruption.
for i, url in enumerate(urls):
    save_path = f"scrape/{i}.txt"
    if os.path.exists(save_path):
        print(f"skipping {url} because it already exists")
        continue
    print(f"scraping {url}")
    scrape = app.scrape_url(
        url,
        params={'formats': ['markdown']}
    )
    # UTF-8 explicitly: the pages contain diacritics that a platform default
    # encoding may not be able to represent.
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(scrape['markdown'])

In [None]:
# Run the round-trip check on every scraped page under scrape/.
# test_txt_file prints its report per file; the boolean it returns is not
# used here.
paths = glob("scrape/*.txt")
for path in paths:
    test_txt_file(path)

In [None]:
# Collect the diacritic mappings of every scraped page into one dictionary.
# Files are merged in the order glob returns them; when two files map the
# same plain word, the file processed later wins.
paths = glob("scrape/*.txt")
all_mappings = {}
for path in paths:
    print(path)
    # Context manager closes the handle; UTF-8 so the diacritics are decoded
    # the same way on every platform.
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = generate_tokens(text)
    mappings = make_mappings(tokens)
    # Spot check: show how "zia" is mapped in each file that contains it.
    if "zia" in mappings:
        print(mappings["zia"])
    all_mappings.update(mappings)

# NOTE(review): the export below is disabled, but a later cell reads
# all_mappings.txt. Presumably the file was produced by running this once;
# confirm before re-enabling, since it would overwrite that file.
# print(all_mappings)
# with open("all_mappings.txt", "w") as f:
#     for k, v in all_mappings.items():
#         f.write(f"{k},{v}\n")

In [None]:
# Show every plain-form key that ends in "shram".
[key for key in all_mappings if key.endswith("shram")]

In [None]:
# Load "key,value" lines from all_mappings.txt, keeping only entries whose
# key is made entirely of characters between space (0x20) and 'z' (0x7A).
all_mappings = {}
with open("all_mappings.txt", "r", encoding="utf-8") as f:
    seen_keys = set()
    for line in f:
        entry = line.strip()
        if not entry:
            # Blank lines (e.g. a trailing newline) carry no mapping.
            continue
        # Split on the first comma only, so a value containing a comma does
        # not break the unpacking.
        k, v = entry.split(",", 1)
        # Drop every character outside 0x20..0x7A; if that changes the key,
        # the key contained such a character and the whole entry is skipped.
        normalized_k = ''.join(c for c in k if 0x20 <= ord(c) <= 0x7A)
        if normalized_k != k:
            continue
        seen_keys.add(normalized_k)
        all_mappings[normalized_k] = v

# Re-key through NFKD + ASCII. Every key kept above is already plain ASCII,
# so this pass leaves the keys unchanged and both dictionaries should have
# the same size; the two prints below make that visible.
all_mappings_new = {}
for k, v in all_mappings.items():
    all_mappings_new[unicodedata.normalize('NFKD', k).encode('ascii', 'ignore').decode('ascii')] = v

print(len(all_mappings))
print(len(all_mappings_new))


In [232]:
# Save the filtered mappings as "key,value" lines. The values contain
# diacritics, so the file is written as UTF-8 explicitly; a platform
# default encoding may be unable to encode them.
with open("all_mappings_new.txt", "w", encoding="utf-8") as f:
    for k, v in all_mappings_new.items():
        f.write(f"{k},{v}\n")