In [None]:
!pip install -q pdfplumber sentence-transformers gradio

In [None]:
import os
import re
import tempfile

import gradio as gr
import numpy as np
import pdfplumber
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Load the sentence-embedding model once; the cells below reuse this instance
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Reference phrases describing actionable work; they are embedded once here so
# each sentence can later be compared against the same vectors.
task_keywords = [
    'review',
    'complete',
    'submit',
    'schedule',
    'finalize',
    'prepare',
    'organize',
    'plan',
    'assign',
    'follow up',
    'update',
    'approve',
    'action item',
    'next step',
]
task_embeddings = model.encode(task_keywords)

In [None]:
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF, joined with newlines.

    Args:
        file_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The text of all pages that produced text, separated by ``"\\n"``.
        Pages whose extraction yields ``None`` or ``""`` are skipped. Failures
        are not raised: a string starting with ``"Error extracting text: "``
        is returned instead, which callers check for.
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # Run extraction exactly once per page; the previous version
            # extracted every non-empty page twice (filter + value).
            page_texts = (page.extract_text() for page in pdf.pages)
            # The generator is consumed by join() while the PDF is still open.
            return "\n".join(text for text in page_texts if text)
    except Exception as e:
        return f"Error extracting text: {str(e)}"

In [None]:
def is_task_sentence(sentence, threshold=0.6):
    """Decide whether a sentence looks like a task.

    The sentence is embedded with the shared ``model`` and compared against
    every vector in ``task_embeddings``; it counts as a task when the best
    cosine similarity is strictly greater than ``threshold``.
    """
    embedded = model.encode([sentence])
    best_score = cosine_similarity(embedded, task_embeddings).max()
    return best_score > threshold

In [None]:
def extract_tasks_from_text(text, max_tasks=10):
    """Return the sentences of ``text`` that ``is_task_sentence`` flags as tasks.

    Args:
        text: Document text to scan. ``None`` or an empty string yields ``[]``.
        max_tasks: Maximum number of tasks to return. Defaults to 10, the
            limit that used to be hard-coded. ``None`` removes the limit.

    Returns:
        list[str]: Up to ``max_tasks`` cleaned sentences in document order.
        Bracketed tags such as ``[HIGH]`` are removed and runs of whitespace
        are collapsed to single spaces.
    """
    if not text or (max_tasks is not None and max_tasks <= 0):
        return []

    tasks = []
    # Split after sentence-ending punctuation that is followed by whitespace
    for sentence in re.split(r'(?<=[.?!])\s+', text):
        if not sentence.strip():
            continue  # blank fragment: nothing worth sending to the model
        if not is_task_sentence(sentence):
            continue

        # Remove tags first and normalise whitespace afterwards, so deleting
        # a tag cannot leave a leading or doubled space in the result.
        cleaned = re.sub(r'\[[^\]]*\]', '', sentence)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        if cleaned:
            tasks.append(cleaned)

        # Stop as soon as the limit is reached: later sentences could never
        # be returned, so classifying them would be wasted work.
        if max_tasks is not None and len(tasks) >= max_tasks:
            break

    return tasks

In [None]:
def process_document(file):
    """Extract the text of an uploaded PDF and the tasks found in it.

    Args:
        file: Either an object with a ``read()`` method (its bytes are copied
            to a temporary ``.pdf`` file) or a path to an existing file.
            ``None`` means nothing was uploaded.

    Returns:
        dict: On success ``{"extracted_text": <str>, "tasks": <list of str>}``.
        On failure ``{"error": <message>}``; exceptions are reported this way
        rather than raised.
    """
    if file is None:
        return {"error": "No file uploaded"}

    created_tmp_path = None  # set only when this call creates a temp file
    try:
        if hasattr(file, "read"):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                created_tmp_path = tmp.name
                tmp.write(file.read())
            pdf_path = created_tmp_path
        elif isinstance(file, (str, os.PathLike)) and os.path.exists(file):
            pdf_path = file
        else:
            return {"error": "Invalid file input"}

        text = extract_text_from_pdf(pdf_path)
        # Match the exact prefix extract_text_from_pdf uses for failures, so a
        # document whose own text begins with "Error" is not treated as one.
        if text.startswith("Error extracting text:"):
            return {"error": text}

        return {
            "extracted_text": text,
            "tasks": extract_tasks_from_text(text),
        }

    except Exception as e:
        return {"error": f"Processing failed: {str(e)}"}

    finally:
        # The temp file is created with delete=False so it can be reopened by
        # path; remove it here so uploads do not pile up on disk.
        if created_tmp_path is not None:
            try:
                os.unlink(created_tmp_path)
            except OSError:
                pass

In [None]:
# Gradio UI
# Offer only the sample PDFs that are actually present, so a missing example
# file cannot break the interface; with none available, no examples are shown.
EXAMPLE_PDFS = ["./meeting_notes.pdf", "./project_plan.pdf"]
available_examples = [[path] for path in EXAMPLE_PDFS if os.path.exists(path)]

# NOTE(review): newer Gradio releases appear to rename `allow_flagging` to
# `flagging_mode` — confirm against the installed version.
iface = gr.Interface(
    fn=process_document,
    inputs=gr.File(label="📄 Upload PDF Document", file_types=['.pdf']),
    outputs=gr.JSON(label="🚀 Extracted Tasks"),
    title="DocAgent - AI Workflow Automator",
    description="Upload a PDF to extract actionable tasks using AI",
    examples=available_examples or None,
    allow_flagging="never"
)

iface.launch(debug=True)