In [4]:
# Install necessary Python packages
!pip install PyPDF2 pdfplumber python-docx streamlit




In [6]:
import PyPDF2
import pdfplumber
import docx
from pathlib import Path

# --- PDF Extraction ---
def extract_text_from_pdf(file_path):
    """Extract the text of a PDF, preferring pdfplumber and falling back to PyPDF2.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The page texts joined with newlines. On the pdfplumber path, pages
        that yield no text are skipped; on the PyPDF2 path they contribute an
        empty line. Returns "" if both libraries raise.
    """
    try:
        # Try using pdfplumber first (better for layout)
        with pdfplumber.open(file_path) as pdf:
            # Extract every page exactly once: extract_text() is the costly
            # step, so its result is kept instead of being recomputed for
            # both the emptiness check and the joined output.
            page_texts = [page.extract_text() for page in pdf.pages]
        return "\n".join(page_text for page_text in page_texts if page_text)
    except Exception as e:
        print("pdfplumber failed, using PyPDF2:", e)
        # Fallback to PyPDF2
        try:
            reader = PyPDF2.PdfReader(file_path)
            return "\n".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e2:
            # Best effort: report the failure and hand back an empty result
            # rather than propagating the error to the caller.
            print("PyPDF2 failed too:", e2)
            return ""

# --- DOCX Extraction ---
def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file as one newline-joined string.

    Only the entries of ``Document.paragraphs`` are read, so content stored
    elsewhere in the file (presumably tables, headers and footers -- verify
    against the python-docx docs) is not part of the result.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The text of each paragraph, in order, separated by newlines.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

# --- TXT Extraction ---
def extract_text_from_txt(file_path):
    """Read a plain-text file and return its contents as a string.

    The file is decoded as UTF-8. A leading byte-order mark is stripped, and
    bytes that are not valid UTF-8 are replaced with U+FFFD instead of raising
    ``UnicodeDecodeError``, so a file saved in a legacy encoding still yields
    output rather than an error.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" decodes like "utf-8" but drops a leading BOM when present.
    with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
        return f.read()

# --- Auto-detect file type and extract text ---
def extract_text(file_path):
    """Extract text from a file, choosing the extractor by file extension.

    The extension is compared case-insensitively. Supported extensions are
    ``.pdf``, ``.docx`` and ``.txt``.

    Args:
        file_path: Path of the file to read.

    Returns:
        Whatever the matching extractor returns for ``file_path``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    ext = Path(file_path).suffix.lower()

    # Map each supported extension to the function that handles it.
    extractors = {
        ".pdf": extract_text_from_pdf,
        ".docx": extract_text_from_docx,
        ".txt": extract_text_from_txt,
    }

    extractor = extractors.get(ext)
    if extractor is None:
        raise ValueError(f"Unsupported file type: {ext}")
    return extractor(file_path)


In [7]:
import os
import tempfile

import streamlit as st

# NOTE: this cell is a Streamlit script. Streamlit apps are started with
# `streamlit run <script>.py`; executing the cell directly in a notebook
# kernel is not expected to render the UI.

st.title("Multi-format Text Extractor")

# Upload file
uploaded_file = st.file_uploader("Upload a PDF, DOCX, or TXT file", type=["pdf", "docx", "txt"])

if uploaded_file:
    # Save the upload to a uniquely named temporary file. The original suffix
    # is kept because extract_text() picks the extractor from the extension.
    # A unique name (instead of a fixed "temp_file<ext>" in the working
    # directory) keeps concurrent sessions from overwriting each other.
    upload_suffix = Path(uploaded_file.name).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=upload_suffix) as tmp:
        tmp.write(uploaded_file.getbuffer())
        temp_file_path = tmp.name

    # Extract text; the file is closed at this point so it can be reopened by
    # the extractor, and it is always removed afterwards, even on failure.
    try:
        extracted_text = extract_text(temp_file_path)
    finally:
        os.remove(temp_file_path)

    st.subheader("Extracted Text:")
    st.text_area("Text Output", value=extracted_text, height=300)



2025-10-03 09:07:14.351 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
