In [4]:
# Install required dependencies
# These are notebook shell commands (leading "!"), not Python statements.
# poppler-utils: system PDF tools; pdf2image relies on them to render pages.
!apt-get install -y poppler-utils
# tesseract-ocr: the OCR engine binary that pytesseract invokes.
!sudo apt install tesseract-ocr -y
!sudo apt install tesseract-ocr-hin -y  # Hindi language data, needed for lang="hin"
# Python packages: OCR wrapper, PDF-to-image conversion, DataFrame and .xlsx writer.
!pip install pytesseract pdf2image pandas openpyxl

import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import os
import re

# ✅ Set up Tesseract for Hindi OCR
# Point pytesseract at the system binary and tell Tesseract where its
# language data (including the Hindi model) lives.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
os.environ["TESSDATA_PREFIX"] = "/usr/share/tesseract-ocr/4.00/tessdata/"

# ✅ Make sure your PDF file exists in /content/
pdf_path = "/content/your_file.pdf"  # Change this to your actual file name

# If the configured file is missing, fall back to an interactive upload and
# use the uploaded file instead of the (non-existent) configured path.
if not os.path.exists(pdf_path):
    print(f"❌ Error: File '{pdf_path}' not found. Upload the PDF file to /content/ and try again.")
    from google.colab import files
    uploaded = files.upload()  # Allow user to upload a file
    if not uploaded:
        # Nothing was uploaded, so there is no PDF to process; fail with a
        # clear message rather than a confusing error from the PDF converter.
        raise FileNotFoundError(f"No PDF available: '{pdf_path}' not found and no file was uploaded.")
    # files.upload() returns a dict keyed by file name and stores the files in
    # the current working directory; process the first uploaded file.
    pdf_path = os.path.abspath(next(iter(uploaded)))

# ✅ Convert PDF pages to images
images = convert_from_path(pdf_path)

# ✅ Function to clean illegal characters for Excel
def clean_text(text):
    """Remove characters that cannot be written to an Excel (.xlsx) cell.

    Only the control characters that the Excel writer rejects are stripped
    (plus DEL, which the previous implementation also removed). Tab, line
    feed and carriage return are valid in a cell and are kept, so words at
    the end of one OCR line are not fused with the start of the next line.

    Args:
        text: The raw string to clean.

    Returns:
        The string with the illegal control characters removed.
    """
    # Ranges deliberately skip \x09 (tab), \x0A (line feed) and \x0D
    # (carriage return); \x0C (form feed) is removed.
    return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)

# ✅ Extract Hindi text from images
# One cleaned string per page image, in page order; lang="hin" selects the
# Hindi Tesseract model.
extracted_text = [
    clean_text(pytesseract.image_to_string(img, lang="hin"))
    for img in images
]

# ✅ Save extracted text to an Excel file
# "Page" is numbered from 1 so each row lines up with its position in `images`.
df = pd.DataFrame({"Page": range(1, len(extracted_text) + 1), "Extracted Text": extracted_text})
excel_path = "/content/extracted_text.xlsx"
df.to_excel(excel_path, index=False)  # index=False: do not write the DataFrame index column

print("✅ Hindi text extracted and saved to:", excel_path)

# ✅ Automatically download the Excel file
# Imported here as well because the earlier import only runs when the PDF was missing.
from google.colab import files
files.download(excel_path)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr-hin is already the newest version (1:4.00~git30-7274cfa-1.1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
✅ Hindi text extracted and saved to: /content/extracted_text.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>