The glossary data is available here: https://www.swami-krishnananda.org/glossary/Glossary_Sanskrit_Terms.pdf

In [None]:
!pip -q install pdfplumber pandas

In [None]:
import re, json
import pdfplumber
import pandas as pd
from pathlib import Path

# 1) Location of the glossary PDF.
# This is the path to use when the PDF has been uploaded to /content:
pdf_path = "/content/Glossary_Sanskrit_Terms.pdf"

# Alternative: point at the copy available in this environment instead.
# pdf_path = "/mnt/data/Glossary_Sanskrit_Terms.pdf"

# Directory that receives the exported files; it is created up front
# (including missing parents) and an already-existing directory is accepted.
out_dir = Path("/content") / "exports"
out_dir.mkdir(parents=True, exist_ok=True)

# 2) Extraction begins at page 3 of the document.
# Pages are addressed by 0-based index, so page 3 is index 2.
start_page_index = 2

# 3) Entry regex: a glossary entry is a line of the form "Term: Definition...".
# A term starts with an ASCII letter and may continue with letters, digits,
# spaces, hyphens, parentheses, slashes or apostrophes (e.g. "Abhava padartha").
# The sub-pattern is shared by the capture group and the lookahead below.
_TERM_SUBPATTERN = r"[A-Za-z][A-Za-z0-9\-\(\)\/' ]{0,80}?"

# The definition is captured lazily up to (but not including) the next line
# that itself looks like "Term:", or up to the end of the text. DOTALL lets the
# definition span several lines; the inline (?m) makes ^ match at line starts.
entry_pattern = re.compile(
    r"(?m)^(?P<term>" + _TERM_SUBPATTERN + r"):\s*(?P<def>.*?)"
    r"(?=^\s*" + _TERM_SUBPATTERN + r":|\Z)",
    flags=re.DOTALL,
)

def clean_text(t: str) -> str:
    """Normalize text extracted from a PDF page.

    Replaces or drops a few known extraction artifacts, squeezes runs of
    spaces/tabs into a single space and runs of newlines into a single
    newline, then strips leading/trailing whitespace. Single newlines are
    kept because entry detection relies on "Term:" appearing at line start.

    Args:
        t: Raw page text; any falsy value (empty string, None) is accepted.

    Returns:
        The cleaned text, or "" when the input is falsy.
    """
    if not t:
        return ""

    # One pass over the string handles every artifact (extend if needed).
    artifact_table = {
        0xF0AD: " ",   # rare bullet-like symbol -> space
        0x00AD: None,  # soft hyphen -> dropped
        0xFFFD: None,  # replacement character -> dropped
        0xFFFE: None,  # in-word separator some PDFs emit ("non<U+FFFE>existent")
    }
    cleaned = t.translate(artifact_table)

    # Collapse horizontal whitespace first, then blank lines.
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = re.sub(r"\n+", "\n", cleaned)
    return cleaned.strip()

# Columns of every extracted record; passing them to the DataFrame explicitly
# keeps the pipeline working (empty table, empty exports) when nothing matches.
record_columns = ["term", "definition", "source", "page"]

# The source file name is the same for every record, so compute it once.
source_name = Path(pdf_path).name

records = []

with pdfplumber.open(pdf_path) as pdf:
    # Walk the pages from start_page_index on; page_number is the 1-based
    # page number stored with each record.
    for page_number, page in enumerate(
        pdf.pages[start_page_index:], start=start_page_index + 1
    ):
        text = clean_text(page.extract_text() or "")

        # Remove standalone page numbers (e.g., "2" on its own line)
        text = re.sub(r"(?m)^\s*\d+\s*$", "", text).strip()

        # NOTE(review): each page is matched on its own, so text that precedes
        # the first "Term:" line of a page (e.g. the tail of a definition that
        # started on the previous page) is not captured by any entry.
        for m in entry_pattern.finditer(text):
            term = m.group("term").strip()
            definition = m.group("def").strip()

            # A word hyphenated at a line break ("non-\nexistent") would become
            # "non- existent" once newlines are turned into spaces; join the
            # two halves directly and keep the hyphen.
            definition = re.sub(r"(?<=\w)-[ \t]*\n\s*(?=\w)", "-", definition)
            # Flatten the remaining internal newlines inside the definition
            definition = re.sub(r"\s*\n\s*", " ", definition).strip()

            # Filter obvious non-entries (rare)
            if len(term) < 2 or len(definition) < 2:
                continue

            records.append({
                "term": term,
                "definition": definition,
                "source": source_name,
                "page": page_number,
            })

# De-duplicate on (term, definition); the first occurrence is kept.
df = (
    pd.DataFrame(records, columns=record_columns)
      .drop_duplicates(subset=["term", "definition"])
      .reset_index(drop=True)
)

csv_path = out_dir / "sanskrit_glossary_raw.csv"
jsonl_path = out_dir / "sanskrit_glossary_raw.jsonl"

df.to_csv(csv_path, index=False, encoding="utf-8")

# Write the JSONL from the de-duplicated table (not the raw list) so that both
# exports and the count printed below describe the same set of entries.
with open(jsonl_path, "w", encoding="utf-8") as f:
    for row in df.to_dict(orient="records"):
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Extracted entries:", len(df))
print("CSV:", csv_path)
print("JSONL:", jsonl_path)
df.head(10)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m119.7 MB/s[0m eta [36m0:00:00[0m
[?25hExtracted entries: 2506
CSV: /content/exports/sanskrit_glossary_raw.csv
JSONL: /content/exports/sanskrit_glossary_raw.jsonl


Unnamed: 0,term,definition,source,page
0,Abhanavarana,Screening the outshining Brahman; one of the t...,Glossary_Sanskrit_Terms.pdf,3
1,Abhasa,"Reflection, appearance, semblance, not true.",Glossary_Sanskrit_Terms.pdf,3
2,Abhasamatra,In name only.,Glossary_Sanskrit_Terms.pdf,3
3,Abhasavada,Doctrine holding that all creation is reflecti...,Glossary_Sanskrit_Terms.pdf,3
4,Abhati,"Shines, illumines.",Glossary_Sanskrit_Terms.pdf,3
5,Abhava,In which one’s self is meditated upon as zero ...,Glossary_Sanskrit_Terms.pdf,3
6,Abhavamatra,Of a merely negative character.,Glossary_Sanskrit_Terms.pdf,3
7,Abhavana,Non-thought.,Glossary_Sanskrit_Terms.pdf,3
8,Abhava padartha,A thing which cannot have existence in reality...,Glossary_Sanskrit_Terms.pdf,3
9,Abhavarupavritti,The function of thinking of a non- existent th...,Glossary_Sanskrit_Terms.pdf,3


In [None]:
from google.colab import files
# Hand both export files to the download helper, CSV first, then JSONL.
for export_file in (
    "/content/exports/sanskrit_glossary_raw.csv",
    "/content/exports/sanskrit_glossary_raw.jsonl",
):
    files.download(export_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>