In [9]:
!pip install streamlit pyngrok opencv-python-headless pytesseract Pillow --quiet
!pkill streamlit || echo "No running streamlit process"


No running streamlit process


In [10]:
%%writefile streamlit_app.py
import streamlit as st
from PIL import Image
import numpy as np
import pytesseract
import cv2

def extract_text_from_image(image):
    try:
        img_np = np.array(image.convert('RGB'))
        img_cv = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
        text = pytesseract.image_to_string(img_cv)
        return text.strip() if text.strip() else "No text detected."
    except Exception as e:
        return f"⚠️ Error: {str(e)}"

st.set_page_config(page_title="OCR Text Extractor", page_icon="📄", layout="centered")
st.title("📄 OCR Text Extractor")
st.markdown("Upload an image to extract text using Tesseract OCR.")

uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

if uploaded_file:
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)
    if st.button("🔍 Extract Text"):
        with st.spinner("Extracting text..."):
            extracted_text = extract_text_from_image(image)
        st.success("✅ Extraction complete")
        st.text_area("Extracted Text", extracted_text, height=300)
else:
    st.info("Upload a valid image to get started.")


Overwriting streamlit_app.py


In [12]:
import time
from pyngrok import ngrok

# Start Streamlit (non-blocking)
get_ipython().system_raw("streamlit run streamlit_app.py &")

# Wait a few seconds to let Streamlit boot
time.sleep(5)

# Connect to ngrok tunnel
public_url = ngrok.connect(port=8501)
print(f"🔗 Public App URL: {public_url}")




PyngrokNgrokHTTPError: ngrok client exception, API returned 400: {"error_code":102,"status_code":400,"msg":"invalid tunnel configuration","details":{"err":"yaml: unmarshal errors:\n  line 1: field port not found in type config.HTTPv2Tunnel"}}


In [2]:
!pip install streamlit pyngrok opencv-python-headless pytesseract Pillow --quiet


In [3]:
%%writefile streamlit_app.py
import streamlit as st
from PIL import Image
import numpy as np
import pytesseract
import cv2
import io

# OCR function (refined from ocr.py or define here)
def extract_text_from_image(image):
    try:
        img_np = np.array(image.convert('RGB'))
        img_cv = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
        text = pytesseract.image_to_string(img_cv)
        return text.strip() if text.strip() else "No text detected."
    except Exception as e:
        return f"⚠️ Error: {str(e)}"

# Streamlit page settings
st.set_page_config(page_title="OCR Text Extractor", page_icon="📄", layout="centered")

st.markdown("<h1 style='text-align: center;'>📄 OCR Text Extractor</h1>", unsafe_allow_html=True)
st.markdown("<p style='text-align: center;'>Upload an image and click the button to extract printed or handwritten text using Tesseract OCR.</p>", unsafe_allow_html=True)
st.markdown("---")

uploaded_file = st.file_uploader("Upload an image (PNG, JPG, JPEG)", type=["png", "jpg", "jpeg"])

if uploaded_file:
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    if st.button("🔍 Extract Text"):
        with st.spinner("Analyzing Image..."):
            extracted_text = extract_text_from_image(image)
        st.success("✅ Text Extraction Complete")
        st.text_area("📋 Extracted Text", extracted_text, height=300)
else:
    st.info("Please upload a valid image file to begin.")


Overwriting streamlit_app.py


In [7]:
!ngrok authtoken 2wAoVE6iok87m1MDlwV2w13zOfd_3EzLpj2WfmkhiKkxrCx6

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [5]:
!pkill -f ngrok

In [11]:
# Kill ngrok process if running (works in Colab)
import os
os.system("pkill ngrok")


256

In [8]:
# Make sure old tunnel is killed
try:
    from pyngrok import ngrok
    ngrok.kill()
except:
    pass

# Connect again
public_url = ngrok.connect(8501)
print(f"✅ Streamlit app is live at: {public_url}")


✅ Streamlit app is live at: NgrokTunnel: "https://d598-34-145-136-82.ngrok-free.app" -> "http://localhost:8501"


In [3]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import spacy
import re

KeyboardInterrupt: 

# New section

In [None]:
import os
print(os.listdir('/content/'))

In [None]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import spacy
import re

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Path to uploaded PDF
pdf_path = "/content/2025042842.pdf"  # Corrected path assignment

# Convert PDF to images
print("Converting PDF to images...")
images = convert_from_path(pdf_path, dpi=300)

# OCR extraction
print("Running OCR on each page...")
full_text = ""
for i, image in enumerate(images):
    text = pytesseract.image_to_string(image)
    full_text += f"\n--- Page {i + 1} ---\n{text}"

# Save raw text
with open("extracted_text.txt", "w", encoding="utf-8") as f:
    f.write(full_text)

print("OCR complete. Starting NLP analysis...")

# NLP Entity Extraction
doc = nlp(full_text)
entities = [(ent.text, ent.label_) for ent in doc.ents]

# Simple clause detection
clauses = re.findall(r"(section\s\d+[a-zA-Z]*|clause\s\d+[a-zA-Z]*|article\s\d+)", full_text, re.IGNORECASE)

# Save final report
with open("legal_analysis_report.txt", "w", encoding="utf-8") as f:
    f.write("=== OCR TEXT ===\n")
    f.write(full_text)
    f.write("\n\n=== Named Entities ===\n")
    for text, label in entities:
        f.write(f"{text} -> {label}\n")
    f.write("\n\n=== Detected Clauses ===\n")
    for clause in clauses:
        f.write(f"{clause}\n")

print("✅ Analysis complete.")

# Download results
# Assuming 'files' is defined elsewhere for downloading
# files.download("extracted_text.txt")
# files.download("legal_analysis_report.txt")

Converting PDF to images...
Running OCR on each page...
OCR complete. Starting NLP analysis...
✅ Analysis complete.


In [13]:
!pip install gradio pytesseract opencv-python-headless Pillow --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.9/322.9 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
!pip install gradio pytesseract opencv-python-headless Pillow PyMuPDF --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/20.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/20.0 MB[0m [31m114.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/20.0 MB[0m [31m114.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m12.7/20.0 MB[0m [31m125.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m17.6/20.0 MB[0m [31m135.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m20.0/20.0 MB[0m [31m139.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m20.0/20.0 MB[0m [31m139.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m 

In [18]:
import gradio as gr
import pytesseract
from PIL import Image
import numpy as np
import cv2
import fitz  # PyMuPDF
import os

def extract_text_from_image(image: Image.Image) -> str:
    try:
        image_np = np.array(image.convert("RGB"))
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
        gray = cv2.GaussianBlur(gray, (3, 3), 0)
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        text = pytesseract.image_to_string(thresh)
        return text.strip() if text.strip() else "⚠️ No text detected."
    except Exception as e:
        return f"❌ OCR Error: {str(e)}"

def process_file(file_path) -> str:
    try:
        if isinstance(file_path, str):
            file_path = file_path  # Already a path string

        if file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
            image = Image.open(file_path)
            return extract_text_from_image(image)

        elif file_path.lower().endswith('.pdf'):
            text_output = ""
            doc = fitz.open(file_path)
            for i, page in enumerate(doc):
                pix = page.get_pixmap(dpi=300)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                page_text = extract_text_from_image(img)
                text_output += f"\n\n📄 Page {i+1}:\n{page_text}"
            return text_output.strip() if text_output.strip() else "⚠️ No text detected in PDF."
        else:
            return "❌ Unsupported file format. Upload PNG, JPG, JPEG, or PDF."
    except Exception as e:
        return f"❌ Error processing file: {str(e)}"

title = "📄 OCR Extractor (Image + PDF)"
description = "Upload an image or PDF containing printed or handwritten text. This app uses Tesseract OCR with preprocessing to extract accurate text."

gr.Interface(
    fn=process_file,
    inputs=gr.File(label="📁 Upload Image or PDF", file_types=[".png", ".jpg", ".jpeg", ".pdf"]),
    outputs=gr.Textbox(label="📋 Extracted Text", lines=20),
    title=title,
    description=description,
    theme="soft",
    allow_flagging="never"
).launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f5299d3f757c3589d0.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


