# Train an LLM to find meanings
* The gemma3, t5-base, mistral, BART, and T5-base-multi-sentence-doctor models do not work


## Attempt to clean the corpus using ChatGPT
* Cost: ₹3000
* Time: 70 minutes per file

In [None]:
import os
import openai

# Initialize the OpenAI client. The key is read from the OPENAI_API_KEY
# environment variable so the secret never has to be pasted into the notebook;
# an empty literal key makes every request fail authentication.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# OCR cleaning function using OpenAI
def ocr_cleaner_gpt(text, model="gpt-3.5-turbo", temperature=0.2):
    """Ask an OpenAI chat model to clean one line of Khasi OCR output.

    Args:
        text: The raw OCR sentence; it is embedded verbatim in the prompt.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string when the reply carries no text content.

    Raises:
        Any exception raised by the module-level ``client`` call is
        propagated unchanged so the caller can decide how to handle it.
    """
    prompt = f"""
    You are an expert in the Khasi language and OCR correction. Clean and correct the following sentence so that it makes clear and sensible meaning to a native Khasi speaker.

    Instructions:
    1. Fix OCR errors: correct misspellings, remove misread characters, and fix punctuation.
    2. Convert the entire sentence to lowercase.
    3. Remove any unnecessary metadata, page numbers, author tags, class numbers, or formatting artifacts that do not belong in natural language.
    4. Keep proper names or culturally important terms, but only if they make sense in context.
    5. Remove any words or phrases that disrupt the meaning or are likely OCR noise.

    OCR text: "{text}"

    Cleaned and corrected text:
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )

    # The message content is optional in the API response; calling .strip()
    # on None would raise AttributeError, so fall back to an empty string.
    content = response.choices[0].message.content
    return (content or "").strip()

# File paths
input_path = r"outputs/full_sentences"
output_path = r"outputs/ocr_cleaned_full_sentences"

# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Process each file; sorted() makes the processing order reproducible
for filename in sorted(os.listdir(input_path)):
    input_file_path = os.path.join(input_path, filename)

    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
    # opening one as a text file would abort the whole run.
    if not os.path.isfile(input_file_path):
        continue

    # The output keeps the input's base name but always uses a .txt extension
    output_file_path = os.path.join(output_path, os.path.splitext(filename)[0] + ".txt")

    # Mode 'w' truncates any earlier output for this file, so a re-run
    # regenerates it from the first line.
    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
         open(output_file_path, 'w', encoding='utf-8') as output_file:

        for line in input_file:
            line = line.strip()
            if not line:
                continue

            # Best effort: a line that fails is reported and left out of the
            # output rather than stopping the whole file.
            try:
                cleaned_text = ocr_cleaner_gpt(line)
                output_file.write(cleaned_text + "\n")
            except Exception as e:
                print(f"Error processing line: {line}\n{e}")

## Attempt using Ollama gemma3

In [None]:
import os
import ollama  # Make sure you have `pip install ollama`

# OCR cleaning function using Gemma via Ollama
def ocr_cleaner_gemma_ollama(text, model='gemma:3b'):
    """Ask a local Ollama model to clean one line of Khasi OCR output.

    Args:
        text: The raw OCR sentence; it is embedded verbatim in the prompt.
        model: Name of the local Ollama model to query.
            NOTE(review): the Ollama library appears to publish Gemma 3 under
            ``gemma3:<size>`` tags; confirm that ``gemma:3b`` exists locally
            (``ollama list``). If it does not, every call fails and is
            swallowed below.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string if the request failed for any reason (the error is printed).
    """
    prompt = f"""
You are an expert in the Khasi language and OCR correction. Clean and correct the following sentence so that it makes clear and sensible meaning to a native Khasi speaker.

Instructions:
1. Fix OCR errors: correct misspellings, remove misread characters, and fix punctuation.
2. Convert the entire sentence to lowercase.
3. Remove any unnecessary metadata, page numbers, author tags, class numbers, or formatting artifacts that do not belong in natural language.
4. Keep proper names or culturally important terms, but only if they make sense in context.
5. Remove any words or phrases that disrupt the meaning or are likely OCR noise.

OCR text: "{text}"

Cleaned and corrected text:
"""
    try:
        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        return response['message']['content'].strip()
    except Exception as e:
        # Deliberate best effort: report the failure and return an empty
        # result so a batch run keeps going.
        print(f"Error with ollama: {e}")
        return ""

# File paths
# NOTE(review): this is the same output directory the ChatGPT cell writes to,
# so running this cell overwrites those results file by file.
input_path = r"outputs/full_sentences"
output_path = r"outputs/ocr_cleaned_full_sentences"
os.makedirs(output_path, exist_ok=True)

# Process each file; sorted() makes the processing order reproducible
for filename in sorted(os.listdir(input_path)):
    input_file_path = os.path.join(input_path, filename)

    # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
    # opening one as a text file would abort the whole run.
    if not os.path.isfile(input_file_path):
        continue

    # The output keeps the input's base name but always uses a .txt extension
    output_file_path = os.path.join(output_path, os.path.splitext(filename)[0] + ".txt")

    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
         open(output_file_path, 'w', encoding='utf-8') as output_file:

        for line in input_file:
            line = line.strip()
            if not line:
                continue

            # The cleaner returns "" when the model call fails, so a failed
            # line shows up as a blank line in the output file.
            try:
                cleaned_text = ocr_cleaner_gemma_ollama(line)
                output_file.write(cleaned_text + "\n")
            except Exception as e:
                print(f"Error processing line: {line}\n{e}")


In [None]:
# KI JINGTHOH HALOR KA KOLSHOR BAD KA POLITIK O Da U Hipshon Roy Kharshjing 1993 Becas 10380 8 Class No RAS-AD Wo TADA Author SAS NES KR Title a AN ke NN Borrower'sl  -  Rarrnwaro  Ace.

# """
# You are an expert in the Khasi language and OCR correction. Clean and correct the following sentence so that it makes clear and sensible meaning to a native Khasi speaker.

# Instructions:
# 1. Fix OCR errors: correct misspellings, remove misread characters, and fix punctuation.
# 2. Convert the entire sentence to lowercase.
# 3. Remove any unnecessary metadata, page numbers, author tags, class numbers, or formatting artifacts that do not belong in natural language.
# 4. Keep proper names or culturally important terms, but only if they make sense in context.
# 5. Remove any words or phrases that disrupt the meaning or are likely OCR noise.

# OCR text: "KI JINGTHOH HALOR KA KOLSHOR BAD KA POLITIK O Da U Hipshon Roy Kharshjing 1993 Becas 10380 8 Class No RAS-AD Wo TADA Author SAS NES KR Title a AN ke NN Borrower'sl  -  Rarrnwaro  Ace."

# Cleaned and corrected text:
# """

## Attempt using BART

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

# Choose the compute device: GPU when CUDA is available, otherwise CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained BART checkpoint with its tokenizer and place the model
# on the chosen device
model_name = "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

# Function to clean the OCR text using BART
def ocr_cleaner_bart(text):
    """Run one OCR sentence through the loaded BART model and return its output.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The raw OCR sentence.

    Returns:
        The decoded generation (special tokens removed) with surrounding
        whitespace stripped.
    """
    # Task prefix followed by the raw text. A stray quote here previously
    # made the whole cell a SyntaxError.
    input_text = f"fix: {text}"

    # Tokenize input (truncated to the tokenizer's limit) and move it to the
    # same device as the model
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)

    # Generate with beam search; the attention mask is passed along so any
    # padding positions are ignored
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=512,
        num_beams=4,
        early_stopping=True,
    )

    # Decode the first (best) sequence back to text
    corrected_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return corrected_text.strip()

# Example usage: clean one raw OCR line and show what the model produced
text = "KI JINGTHOH HALOR KA KOLSHOR BAD KA POLITIK O Da U Hipshon Roy Kharshjing 1993 Becas 10380 8 Class No RAS-AD Wo TADA Author SAS NES KR Title a AN ke NN Borrower'sl  -  Rarrnwaro  Ace."
cleaned_example = ocr_cleaner_bart(text)
print(cleaned_example)


Note:    You are an expert in the Khasi language and OCR correction. Please use the following instructions to correct the following sentence: "KI JINGTHOH HALOR KA KOLSHOR BAD KA POLITIK O Da U Hipshon Roy Kharshjing 1993 Becas 10380 8 Class No RAS-AD Wo TADA Author SAS NES KR Title a AN ke NN Borrower'sl  -  Rarrnwaro  Ace." OCR text: "Ki Jingthoh HALor Ka KOLshor BAD Ka POLITIK"   Cleaned and corrected text: 1. Fix OCR errors: correct misspellings, remove misread characters, and fix punctuation. 2. Convert the entire sentence to lowercase. 3. Remove any unnecessary metadata, page numbers, author tags, class numbers, or formatting artifacts that do not belong in natural language. 4. Keep proper names or culturally important terms, but only if they make sense in context. 5. Clean up any typos or grammatical errors.   Instructions:  *   1. Correct OCR error:  #   2. Correct typos: #   3. Fix punctuation: .#   4. Remove unnecessary formatting artifacts: :#   5. Remove words or phrases th

## Attempt cleaning manually

In [9]:
import re

def clean_text(text: str) -> str:
    """Strip common scan artifacts from OCR text using regex heuristics.

    The rules run in a fixed order: page numbers, short all-uppercase lines,
    author/editor tag lines, catalog numbers, words hyphenated across a line
    break, single line breaks inside a paragraph, runs of blank lines, and
    repeated spaces/tabs.

    These are heuristics. For example, any line that begins with "By " is
    treated as an author tag, so genuine sentences can be affected.

    Args:
        text: Raw OCR text, possibly spanning many lines.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Remove page numbers like "Page 23", "[23]", or just "23" on a line of its own
    text = re.sub(r'\bPage\s*\d+\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'^\s*\[?\d+\]?\s*$', '', text, flags=re.MULTILINE)

    # Remove lines that are mostly uppercase and short (headers/footers)
    text = re.sub(r'^[A-Z\s,.\-]{5,50}$', '', text, flags=re.MULTILINE)

    # Remove author/editor tag lines. Only horizontal whitespace is allowed
    # inside the tag so the match cannot run past its own line into the body
    # text, and the preceding newline is kept so the neighbouring lines are
    # not fused together.
    text = re.sub(
        r'^[–—-]?[ \t]*(?:By|Edited by|Editor|Author)[ \t]+[\w \t.]+\n?',
        '',
        text,
        flags=re.IGNORECASE | re.MULTILINE,
    )

    # Remove class or catalog numbers (e.g. "Class 491.25", "Dewey 899.233")
    text = re.sub(r'\b(Class|Dewey|ISBN)\s*\d+(\.\d+)?\b', '', text, flags=re.IGNORECASE)

    # Rejoin words that were hyphenated across a line break
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)

    # A single newline is a wrapped line, not a paragraph break: make it a space
    text = re.sub(r'(?<!\n)\n(?![\n])', ' ', text)

    # Collapse runs of blank lines to exactly one paragraph break
    text = re.sub(r'\n{2,}', '\n\n', text)

    # Normalize extra spaces
    text = re.sub(r'[ \t]{2,}', ' ', text)

    # Strip overall leading/trailing whitespace
    return text.strip()

# Example usage: run the regex cleaner on one raw OCR line when executed directly
if __name__ == "__main__":
    sample_line = "KI JINGTHOH HALOR KA KOLSHOR BAD KA POLITIK O Da U Hipshon Roy Kharshjing 1993 Becas 10380 8 Class No RAS-AD Wo TADA Author SAS NES KR Title a AN ke NN Borrower'sl  -  Rarrnwaro  Ace."
    print(clean_text(sample_line))
    # To clean a whole file instead of the sample line:
    # with open("input_raw.txt", "r", encoding="utf-8") as f:
    #     raw = f.read()

    # cleaned = clean_text(raw)

    # with open("output_cleaned.txt", "w", encoding="utf-8") as f:
    #     f.write(cleaned)


KI JINGTHOH HALOR KA KOLSHOR BAD KA POLITIK O Da U Hipshon Roy Kharshjing 1993 Becas 10380 8 Class No RAS-AD Wo TADA Author SAS NES KR Title a AN ke NN Borrower'sl - Rarrnwaro Ace.
