I was going to select all the words first. Now I know to first combine all the text of the entire document into one. Then select the sentences. Then select the words from those sentences.

# Attempt to get a clean corpus
1. Collect all sentences for later contextual training
2. Challenges
   1. pymupdf4llm did not work
      1. used fitz
   2. fitz got the text but incomplete and corrupted text   
      1. tried using the OCR version

In [23]:
import re
import os

# Optional: List of Khasi stopwords (expand as needed)
khasi_stopwords = {
    'ka', 'bad', 'la', 'dei', 'ha', 'ba', 'u', 'ki', 'na', 'nga', 'phi',
    'kaei', 'kum', 'ban', 'lada', 'ne', 'da', 'ym', 'ngin', 'hangne'
}

# Define Khasi-specific cleaning
def clean_khasi_text(text, remove_stopwords=True):
    # Normalize Unicode
    text = text.strip()

    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove digits and punctuation (but keep Khasi letters)
    text = re.sub(r'[^a-zA-ZñÑïÏ\s]', '', text)

    # Lowercase
    text = text.lower()

    # Tokenize
    tokens = text.split()

    # Optional: Remove stopwords
    if remove_stopwords:
        tokens = [word for word in tokens if word not in khasi_stopwords]

    return ' '.join(tokens)

# Example usage
if __name__ == "__main__":
    input_file = "dictionaries\kha-eng-dictionary.txt"        # Replace with your input text
    output_file = "dictionaries\kha-eng-dictionary_cleaned.txt"   # Cleaned output

    if not os.path.exists(input_file):
        print(f"Input file '{input_file}' not found.")
    else:
        with open(input_file, 'r', encoding='utf-8') as infile:
            raw_lines = infile.readlines()

        cleaned_lines = [clean_khasi_text(line) for line in raw_lines if line.strip()]

        with open(output_file, 'w', encoding='utf-8') as outfile:
            outfile.write('\n'.join(cleaned_lines))

        print(f"Cleaned text saved to: {output_file}")


  input_file = "dictionaries\kha-eng-dictionary.txt"        # Replace with your input text
  output_file = "dictionaries\kha-eng-dictionary_cleaned.txt"   # Cleaned output


Cleaned text saved to: dictionaries\kha-eng-dictionary_cleaned.txt


# Attempt to get a Khasi vocabulary
1. Collect all Khasi words
2. Especially important because of special letters
3. | Character | Unicode  | Description      |
| --------- | -------- | ---------------- |
| Ñ         | `U+00D1` | Capital letter Ñ |
| ñ         | `U+00F1` | Small letter ñ   |
| Ï         | `U+00CF` | Capital letter Ï |
| ï         | `U+00EF` | Small letter ï   |

## Evaluation
    # Using text file as the source ignored the special characters
    # Using tesseract with OCR source and spa+fra (spanish + france) setting captured the special characters

In [1]:
import re
from collections import Counter

# --- Step 1: Clean and tokenize the text ---
def clean_and_tokenize(text):
    # Keep only Khasi letters, including ñ and ï
    # Remove digits and symbols, lowercase everything
    text = text.lower()
    text = re.sub(r'[^a-zñï\s]', ' ', text)     # Only retain Khasi-valid characters
    text = re.sub(r'\s+', ' ', text)            # Normalize whitespace
    tokens = text.strip().split()
    return tokens

# --- Step 2: Build vocabulary from a text corpus ---
def build_khasi_vocabulary(file_path, min_freq=1):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    tokens = clean_and_tokenize(text)
    counter = Counter(tokens)

    # Filter by frequency if needed
    vocab = {word: freq for word, freq in counter.items() if freq >= min_freq}

    return vocab

# --- Step 3: Write vocabulary to file (optional) ---
def save_vocab(vocab, output_path='outputs/khasi_vocab_attempt.txt'):
    with open(output_path, 'w', encoding='utf-8') as f:
        for word, freq in sorted(vocab.items(), key=lambda x: (-x[1], x[0])):
            f.write(f"{word}\t{freq}\n")
    print(f"Saved vocabulary with {len(vocab)} words to {output_path}")

# --- Main usage example ---
if __name__ == "__main__":
    corpus_path = r"downloads_text\2015.464360.Ka-Niam-ki-khasi-Ka-Niam-Tip-blei-tip-brieu-Ed-1st_djvu.txt"  # Replace with your actual file path
    vocab = build_khasi_vocabulary(corpus_path, min_freq=1)
    save_vocab(vocab)


Saved vocabulary with 2286 words to outputs/khasi_vocab_attempt.txt


Using tesseract with lat letters to allow for special characters in Khasi

In [1]:
import re
import os
from collections import Counter
from pdf2image import convert_from_path
import pytesseract

# --- Step 1: Convert PDF pages to images ---
def pdf_to_images(pdf_path, dpi=300):
    return convert_from_path(pdf_path, dpi=dpi)

# --- Step 2: Extract text from images using OCR ---
def ocr_extract_text_from_images(images):
    full_text = ''
    for i, img in enumerate(images):
        print(f"OCR processing page {i+1}...")
        page_text = pytesseract.image_to_string(img, lang='spa+fra')  # You can change lang to 'khasi' if available
        full_text += page_text + '\n'
    return full_text

# --- Step 3: Clean and tokenize Khasi text ---
def clean_and_tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-zñï\s]', ' ', text)     # Retain Khasi characters
    text = re.sub(r'\s+', ' ', text)            # Normalize spaces
    tokens = text.strip().split()
    return tokens

# --- Step 4: Build vocabulary ---
def build_khasi_vocabulary_from_pdf(pdf_path, min_freq=1):
    images = pdf_to_images(pdf_path)
    text = ocr_extract_text_from_images(images)
    tokens = clean_and_tokenize(text)
    counter = Counter(tokens)
    vocab = {word: freq for word, freq in counter.items() if freq >= min_freq}
    return vocab

# --- Step 5: Save vocabulary to file ---
def save_vocab(vocab, output_path=r'outputs\khasi_vocab_attempt_from_ocr_pdf.txt'):
    with open(output_path, 'w', encoding='utf-8') as f:
        for word, freq in sorted(vocab.items(), key=lambda x: (-x[1], x[0])):
            f.write(f"{word}\t{freq}\n")
    print(f"Saved vocabulary with {len(vocab)} words to {output_path}")

# --- Main usage ---
if __name__ == "__main__":
    pdf_path = r"downloads_ocr\2015.464362.Tynrai-Jingkheinfundamental-Arithmetic-Ed-4th.pdf"  # Replace with your scanned PDF file
    vocab = build_khasi_vocabulary_from_pdf(pdf_path, min_freq=1)
    save_vocab(vocab)

OCR processing page 1...
OCR processing page 2...
OCR processing page 3...
OCR processing page 4...
OCR processing page 5...
OCR processing page 6...
OCR processing page 7...
OCR processing page 8...
OCR processing page 9...
OCR processing page 10...
OCR processing page 11...
OCR processing page 12...
OCR processing page 13...
OCR processing page 14...
OCR processing page 15...
OCR processing page 16...
OCR processing page 17...
OCR processing page 18...
OCR processing page 19...
OCR processing page 20...
OCR processing page 21...
OCR processing page 22...
OCR processing page 23...
OCR processing page 24...
OCR processing page 25...
OCR processing page 26...
OCR processing page 27...
OCR processing page 28...
OCR processing page 29...
OCR processing page 30...
OCR processing page 31...
OCR processing page 32...
OCR processing page 33...
OCR processing page 34...
OCR processing page 35...
OCR processing page 36...
OCR processing page 37...
OCR processing page 38...
OCR processing page 3

In [None]:
import re
import os
from collections import Counter
from pdf2image import convert_from_path
import pytesseract

# --- Step 1: Convert PDF pages to images ---
def pdf_to_images(pdf_path, dpi=300):
    return convert_from_path(pdf_path, dpi=dpi)

# --- Step 2: Extract text from images using OCR ---
def ocr_extract_text_from_images(images):
    full_text = ''
    for i, img in enumerate(images):
        # print(f"OCR processing page {i+1}...")
        page_text = pytesseract.image_to_string(img, lang='spa+fra')  # You can change lang to 'khasi' if available
        full_text += page_text + '\n'
    return full_text

# --- Step 3: Clean and tokenize Khasi text ---
def clean_and_tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-zñï\s]', ' ', text)     # Retain Khasi characters
    text = re.sub(r'\s+', ' ', text)            # Normalize spaces
    tokens = text.strip().split()
    return tokens

# --- Step 4: Build vocabulary ---
def build_khasi_vocabulary_from_pdf(pdf_path, min_freq=1):
    images = pdf_to_images(pdf_path)
    text = ocr_extract_text_from_images(images)
    tokens = clean_and_tokenize(text)
    counter = Counter(tokens)
    vocab = {word: freq for word, freq in counter.items() if freq >= min_freq}
    return vocab

# --- Step 5: Save vocabulary to file ---
def save_vocab(vocab, output_path=r'outputs\khasi_vocab_attempt_from_ocr_pdf.txt'):
    with open(output_path, 'w', encoding='utf-8') as f:
        for word, freq in sorted(vocab.items(), key=lambda x: (-x[1], x[0])):
            f.write(f"{word}\t{freq}\n")
    print(f"Saved vocabulary with {len(vocab)} words to {output_path}")

# --- Main logic to process all PDFs ---
if __name__ == "__main__":
    input_folder = 'downloads_ocr'
    output_folder = 'outputs/vocabulary'

    for filename in os.listdir(input_folder):
        if filename.lower().endswith('.pdf'):
            input_path = os.path.join(input_folder, filename)
            base_name = os.path.splitext(filename)[0]
            output_path = os.path.join(output_folder, f"{base_name}_vocab.txt")

            print(f"\n📄 Processing: {filename}")
            try:
                vocab = build_khasi_vocabulary_from_pdf(input_path, min_freq=1)
                save_vocab(vocab, output_path)
            except Exception as e:
                print(f"❌ Failed to process {filename}: {e}")



📄 Processing: 2015.464359.Ki-Jingthoh-Halor-Ka-Kolshor-Bad-Ka-Politik-Ed-1st.pdf
Saved vocabulary with 4944 words to outputs/vocabulary\2015.464359.Ki-Jingthoh-Halor-Ka-Kolshor-Bad-Ka-Politik-Ed-1st_vocab.txt

📄 Processing: 2015.464360.Ka-Niam-ki-khasi-Ka-Niam-Tip-blei-tip-brieu-Ed-1st.pdf
Saved vocabulary with 2110 words to outputs/vocabulary\2015.464360.Ka-Niam-ki-khasi-Ka-Niam-Tip-blei-tip-brieu-Ed-1st_vocab.txt

📄 Processing: 2015.464361.Manik-Raitong.pdf
Saved vocabulary with 1201 words to outputs/vocabulary\2015.464361.Manik-Raitong_vocab.txt

📄 Processing: 2015.464362.Tynrai-Jingkheinfundamental-Arithmetic-Ed-4th.pdf
Saved vocabulary with 978 words to outputs/vocabulary\2015.464362.Tynrai-Jingkheinfundamental-Arithmetic-Ed-4th_vocab.txt

📄 Processing: 2015.464363.Improbed-Instruction-In-Khasi-Ka-Jingpynroi.pdf
Saved vocabulary with 10410 words to outputs/vocabulary\2015.464363.Improbed-Instruction-In-Khasi-Ka-Jingpynroi_vocab.txt

📄 Processing: 2015.464364.Ki-Saimuka-Na-Diengky