In [1]:
# Run this in a Colab cell
# One-time environment setup. Every package the notebook needs is installed
# exactly once, in an order that leaves a working PyMuPDF behind.

# System packages: Tesseract is the OCR engine used by pytesseract,
# poppler-utils is required by pdf2image.
!apt-get install -y poppler-utils tesseract-ocr

# The PyPI distribution named `fitz` is unrelated to PyMuPDF but installs a
# module with the same import name; make sure it is gone before installing
# PyMuPDF, and never install it alongside PyMuPDF.
!pip uninstall -y fitz

# Python packages (union of everything the previous install lines requested,
# minus the conflicting `fitz` distribution).
!pip install -q pymupdf pdfplumber pdf2image pytesseract pillow dateparser spacy gradio plotly pandas matplotlib scikit-learn sentence-transformers

# spaCy English pipeline loaded by the application cell.
!python -m spacy download en_core_web_sm


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to loa

In [3]:
# Install required packages
!pip install PyMuPDF pdfplumber pytesseract dateparser spacy gradio plotly pandas scikit-learn sentence-transformers pillow

# Download spaCy model
!python -m spacy download en_core_web_sm

# Import libraries
import numpy as np
import os
import fitz  # PyMuPDF
import pdfplumber
import pytesseract
import dateparser
import spacy
import gradio as gr
import plotly.express as px
import pandas as pd
import traceback
from datetime import datetime, timedelta
from io import BytesIO
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
from PIL import Image
import warnings
import time

# Suppress warnings
warnings.filterwarnings('ignore')

# Configure environment
os.environ["TORCH_USE_CUDA_DSA"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

# Initialize models
# The two models are loaded independently so that a failure in one does not
# throw away (or skip loading) the other.

# spaCy pipeline; falls back to a blank English pipeline when the packaged
# model cannot be loaded.
try:
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
except Exception as e:
    print(f"Model loading failed: {str(e)}")
    nlp = spacy.blank("en")

# Sentence embedding model; falls back to an (unfitted) TF-IDF vectorizer.
# The fallback object has no `encode` method, so code that embeds text must
# be prepared to fit it instead.
try:
    model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
except Exception as e:
    print(f"Model loading failed: {str(e)}")
    from sklearn.feature_extraction.text import TfidfVectorizer
    model = TfidfVectorizer()

class DocumentAnalyzer:
    """Extract dated tasks from PDF files, then cluster, prioritise and chart them.

    Pipeline (see `analyze_documents`): read each PDF, extract its text
    (falling back to OCR), find date mentions per paragraph, cluster the
    resulting tasks by text similarity, assign a priority and build a
    timeline figure plus a summary table.
    """

    def __init__(self):
        # Tasks found by the most recent `analyze_documents` call.
        self.tasks = []
        # Levels are checked in insertion order; the first level with a
        # keyword occurring in the task text wins.
        self.priority_keywords = {
            "high": ["urgent", "asap", "critical", "immediately", "deadline"],
            "medium": ["review", "follow up", "update", "submit"],
            "low": ["optional", "when possible", "backlog"]
        }

    def extract_text(self, file_bytes):
        """Extract text from PDF with robust local OCR.

        Args:
            file_bytes: Raw bytes of a PDF file.

        Returns:
            The extracted text, one trailing newline per page.

        Raises:
            RuntimeError: If the OCR fallback (`_local_ocr`) fails.
        """
        text = ""
        try:
            # First try with PyMuPDF (fitz) for embedded text
            with fitz.open(stream=file_bytes, filetype="pdf") as doc:
                for page in doc:
                    text += page.get_text() + "\n"
        except Exception as e:
            print(f"Text extraction error: {str(e)}")
            return self._local_ocr(file_bytes)  # Fallback to OCR

        # No embedded text (e.g. a scanned document): run the OCR path once.
        # This call sits outside the try block so that an OCR failure is not
        # answered by running the same OCR a second time.
        if not text.strip():
            text = self._local_ocr(file_bytes)

        return text

    def _local_ocr(self, file_bytes):
        """Extract text page by page with pdfplumber, using Tesseract where needed.

        For each page the embedded text is tried first; only pages without
        text are rendered to an image, binarised and passed to pytesseract.

        Args:
            file_bytes: Raw bytes of a PDF file.

        Returns:
            The extracted text, one trailing newline per page.

        Raises:
            RuntimeError: If opening, rendering or recognising the PDF fails.
        """
        text = ""
        try:
            with pdfplumber.open(BytesIO(file_bytes)) as pdf:
                for page in pdf.pages:
                    # Try text extraction first
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                    else:
                        # Render the page, convert to greyscale and threshold
                        # at 128 to get a black/white image for Tesseract.
                        img = page.to_image(resolution=150).original
                        if img.mode != 'L':
                            img = img.convert('L')
                        img = img.point(lambda x: 0 if x < 128 else 255, '1')
                        page_text = pytesseract.image_to_string(img)
                        text += page_text + "\n"
            return text
        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise RuntimeError(f"OCR failed: {str(e)}") from e

    def extract_deadlines(self, text):
        """Find date mentions in `text` and return them with their paragraph.

        The text is split on blank lines; paragraphs of 20 characters or
        fewer (after stripping) are ignored. Every date found in a paragraph
        that passes `_is_valid_date` yields one entry.

        Args:
            text: Plain text of a document.

        Returns:
            A list of dicts with keys "text" (the stripped paragraph),
            "date" (a naive `datetime`) and "raw_date" (the matched string).
        """
        # `import dateparser` is not guaranteed to load the `search`
        # submodule, so import the function explicitly.
        from dateparser.search import search_dates

        settings = {
            'RELATIVE_BASE': datetime.now(),
            'PREFER_DAY_OF_MONTH': 'first',
            'PREFER_DATES_FROM': 'future',
        }

        deadlines = []
        for paragraph in text.split('\n\n'):
            paragraph = paragraph.strip()
            if len(paragraph) <= 20:
                continue

            # The language list is an argument of search_dates, not a
            # settings key.
            parsed_dates = search_dates(
                paragraph,
                languages=['en'],
                settings=settings
            )
            for date_str, date_obj in parsed_dates or []:
                # Normalise timezone-aware results to naive local time so
                # they can be compared with datetime.now() and mixed with
                # naive dates downstream.
                if date_obj.tzinfo is not None:
                    date_obj = date_obj.astimezone().replace(tzinfo=None)
                if self._is_valid_date(date_obj):
                    deadlines.append({
                        "text": paragraph,
                        "date": date_obj,
                        "raw_date": date_str
                    })
        return deadlines

    def _is_valid_date(self, date_obj):
        """Return True when `date_obj` is within one year back and five years ahead.

        Works for both naive and timezone-aware datetimes: "now" is taken in
        the timezone of `date_obj` (naive local time when it has none).
        """
        now = datetime.now(date_obj.tzinfo)
        min_date = now - timedelta(days=365)
        max_date = now + timedelta(days=365*5)
        return min_date <= date_obj <= max_date

    def cluster_tasks(self, tasks, eps=0.5, min_samples=2):
        """Label each task with a DBSCAN cluster id based on text similarity.

        Args:
            tasks: List of task dicts with a "text" key; modified in place.
            eps: DBSCAN neighbourhood radius (cosine distance).
            min_samples: DBSCAN minimum neighbourhood size.

        Returns:
            The same list, each task carrying an integer "cluster" key
            (-1 marks DBSCAN noise; 0 is used when fewer than two tasks
            are given).
        """
        if not tasks or len(tasks) < 2:
            for task in tasks:
                task["cluster"] = 0
            return tasks

        task_texts = [t["text"] for t in tasks]

        try:
            embeddings = model.encode(task_texts, convert_to_numpy=True)
        except Exception:
            # The embedding model is unavailable (or is the TF-IDF fallback,
            # which has no `encode`): use TF-IDF vectors instead.
            from sklearn.feature_extraction.text import TfidfVectorizer
            vectorizer = TfidfVectorizer()
            embeddings = vectorizer.fit_transform(task_texts).toarray()

        clustering = DBSCAN(
            eps=eps,
            min_samples=min_samples,
            metric='cosine',
            n_jobs=-1
        ).fit(embeddings)

        for i, label in enumerate(clustering.labels_):
            tasks[i]["cluster"] = int(label)
        return tasks

    def predict_priority(self, task):
        """Return "high", "medium" or "low" for a task.

        Keyword matches (case-insensitive substring search, checked in the
        order high, medium, low) take precedence. Otherwise the due date
        decides: fewer than 3 days left is "high", fewer than 7 is "medium".
        Tasks without a keyword or a near date are "low".
        """
        text = task["text"].lower()

        for priority, keywords in self.priority_keywords.items():
            if any(keyword in text for keyword in keywords):
                return priority

        due = task.get("date")
        if due:
            # Take "now" in the due date's timezone so aware and naive
            # datetimes are both supported.
            days_remaining = (due - datetime.now(due.tzinfo)).days
            if days_remaining < 3:
                return "high"
            if days_remaining < 7:
                return "medium"

        return "low"

    def build_gantt(self, tasks):
        """Create a Plotly timeline with one bar per dated task.

        Args:
            tasks: List of task dicts ("text", "date", optional "priority"
                and "cluster").

        Returns:
            A Plotly figure; an empty titled scatter figure when there are
            no tasks or none of them has a date.
        """
        if not tasks:
            return px.scatter(title="No tasks found")

        task_data = []
        for t in tasks:
            if t.get("date"):
                task_data.append({
                    "Task": t["text"][:60] + ("..." if len(t["text"]) > 60 else ""),
                    "Start": t["date"],
                    # A deadline is a point in time; give the bar a two-hour
                    # width so it is visible on the timeline.
                    "Finish": t["date"] + timedelta(hours=2),
                    "Priority": t.get("priority", "low"),
                    "Cluster": t.get("cluster", -1),
                    "Full Text": t["text"],
                    "Due Date": t["date"].strftime("%Y-%m-%d")
                })

        if not task_data:
            return px.scatter(title="No dated tasks found")

        df = pd.DataFrame(task_data)

        fig = px.timeline(
            df,
            x_start="Start",
            x_end="Finish",
            y="Task",
            color="Priority",
            hover_data=["Full Text", "Cluster", "Due Date"],
            color_discrete_map={
                "high": "#FF0000",
                "medium": "#FFA500",
                "low": "#008000"
            }
        )
        fig.update_layout(
            title="Task Timeline",
            xaxis_title="Timeline",
            yaxis_title="Tasks",
            hovermode="closest",
            showlegend=True,
            height=600  # Fixed height for stability
        )
        return fig

    def analyze_documents(self, files):
        """Main processing pipeline.

        Args:
            files: Iterable of file paths or objects with a `name` attribute
                holding the path (as passed by the Gradio file input).
                `None` is treated as an empty list.

        Returns:
            A tuple `(figure, table, status)`: the timeline figure (or None),
            a DataFrame with columns Task / Priority / Due Date / Cluster
            (empty when nothing was found or on error) and a status string.
        """
        self.tasks = []
        files = files or []
        file_count = len(files)
        processed_files = 0
        extracted_tasks = 0

        try:
            for file_info in files:
                # Resolve the path before the try block so the error message
                # below can always name the file.
                file_path = getattr(file_info, 'name', file_info)
                try:
                    with open(file_path, "rb") as f:
                        file_bytes = f.read()

                    text = self.extract_text(file_bytes)
                    if not text.strip():
                        continue

                    deadlines = self.extract_deadlines(text)
                    self.tasks.extend(deadlines)
                    extracted_tasks += len(deadlines)
                    processed_files += 1
                except Exception as e:
                    # One bad file must not abort the whole batch.
                    filename = os.path.basename(str(file_path))
                    print(f"❌ Error processing {filename}: {str(e)}")
                    continue

            if not self.tasks:
                return None, pd.DataFrame(), "No deadlines found in documents"

            self.cluster_tasks(self.tasks)

            for task in self.tasks:
                task["priority"] = self.predict_priority(task)

            table_data = []
            for t in self.tasks:
                table_data.append({
                    "Task": t["text"][:100] + ("..." if len(t["text"]) > 100 else ""),
                    "Priority": t["priority"].title(),
                    "Due Date": t["date"].strftime("%Y-%m-%d %H:%M") if t.get("date") else "N/A",
                    "Cluster": t.get("cluster", -1)
                })

            df = pd.DataFrame(table_data)

            status = f"✅ Processed {processed_files}/{file_count} files | Found {extracted_tasks} tasks"
            return self.build_gantt(self.tasks), df, status

        except Exception as e:
            traceback.print_exc()
            return None, pd.DataFrame(), f"❌ Error: {str(e)}"


analyzer = DocumentAnalyzer()


def _message_table(message):
    """Return a one-row table carrying `message`, using the results-table columns.

    Every placeholder, warning and error shown in the results table goes
    through this helper so the table always has the same four columns.
    """
    return pd.DataFrame({
        "Task": [message],
        "Priority": [""],
        "Due Date": [""],
        "Cluster": [""]
    })


# Create optimized Gradio interface with enhanced reliability
with gr.Blocks(title="PDF Task Analyzer", theme=gr.themes.Soft(), css=".gradio-container {max-width: 1200px}") as app:
    gr.Markdown("""
    <div style="text-align: center; padding: 10px;">
        <h1>📋 PDF Task Analyzer</h1>
        <p>Upload PDFs to extract deadlines and visualize tasks</p>
    </div>
    """)

    # Input section
    file_input = gr.File(
        file_types=[".pdf"],
        file_count="multiple",
        label="Upload PDF Documents"
    )

    # Action buttons
    with gr.Row():
        with gr.Column(scale=1):
            analyze_btn = gr.Button("Analyze Documents", variant="primary")
        with gr.Column(scale=1):
            clear_btn = gr.Button("Clear Results", variant="secondary")

    # Status output
    status_output = gr.Textbox(
        label="Status",
        value="✅ Ready to analyze documents",
        interactive=False
    )

    # Kept for backward compatibility only: progress is actually reported
    # through the `progress` default argument of `handle_analysis` below.
    progress_bar = gr.Progress()

    # Results output
    with gr.Tabs():
        with gr.Tab("📅 Task Timeline", id="timeline_tab"):
            gantt_plot = gr.Plot(
                label="Task Timeline",
                value=px.scatter(title="Upload PDFs and click Analyze")
            )

        with gr.Tab("📋 Extracted Tasks", id="tasks_tab"):
            results_table = gr.Dataframe(
                label="Extracted Tasks",
                headers=["Task", "Priority", "Due Date", "Cluster"],
                interactive=False,
                wrap=True,
                value=_message_table("Upload PDFs and click Analyze to see results")
            )

    def handle_analysis(files, progress=gr.Progress()):
        """Run the analyzer on the uploaded files.

        Returns a `(figure, table, status)` tuple for the plot, the results
        table and the status box. The table always has the four result
        columns, and the plot is never left empty: when the analyzer yields
        no figure or no rows, the status message is shown in their place.
        """
        progress(0, desc="Starting analysis...")
        if not files:
            return (
                px.scatter(title="Please upload PDF files first"),
                _message_table("No files uploaded"),
                "⚠️ Please upload PDF files first"
            )

        try:
            file_count = len(files)
            progress(0.1, desc=f"Processing {file_count} files...")

            fig, df, status = analyzer.analyze_documents(files)

            # The analyzer signals "nothing found" / "failed" with a None
            # figure and an empty frame; show the status text instead.
            if fig is None:
                fig = px.scatter(title=status)
            if df is None or df.empty:
                df = _message_table(status)

            progress(1.0, desc="Analysis complete")
            return fig, df, status
        except Exception as e:
            progress(1.0, desc="Analysis failed")
            return (
                px.scatter(title=f"Error: {str(e)}"),
                _message_table(f"Error: {str(e)}"),
                f"❌ Error: {str(e)}"
            )

    def clear_all():
        """Reset the plot, the results table and the status box to their initial state."""
        return (
            px.scatter(title="Upload PDFs and click Analyze"),
            _message_table("Upload PDFs and click Analyze to see results"),
            "✅ Ready to analyze documents"
        )

    analyze_btn.click(
        fn=handle_analysis,
        inputs=[file_input],
        outputs=[gantt_plot, results_table, status_output],
        api_name="analyze"
    )

    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[gantt_plot, results_table, status_output]
    )


# Run the application with enhanced reliability
print("Starting application...")

# Options shared by both launch attempts.
# NOTE: server_name="0.0.0.0" together with share=True makes the app
# reachable from outside this machine; that is intended for Colab use.
launch_options = dict(
    server_name="0.0.0.0",
    server_port=7872,
    share=True,
    debug=False,
    show_error=True
)

# First try with standard launch
try:
    app.launch(**launch_options)
except Exception as e:
    print(f"Standard launch failed: {str(e)}")
    print("Trying with queue enabled...")

    # Fallback to queued launch. `queue()` is called without arguments:
    # the `concurrency_count` parameter is no longer accepted by Gradio 4+,
    # and the default already processes one event per listener at a time.
    app.queue()

    # The first attempt may have failed because the fixed port is still in
    # use (e.g. the cell was re-run), so let Gradio pick a free port.
    launch_options["server_port"] = None
    app.launch(**launch_options)

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Starting application...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://edddec447045ae994f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugg