In [1]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [3]:
import json
import os
import pdfplumber

# --- Read the challenge description (persona, task, document list) ---------
json_path = "/content/challenge1b_input.json"
with open(json_path, mode="r", encoding="utf-8") as f:
    input_data = json.loads(f.read())

# --- Pick out the fields used by the rest of this cell ---------------------
persona = input_data["persona"]["role"]
task = input_data["job_to_be_done"]["task"]
documents = input_data["documents"]

print(
    f"Persona: {persona}",
    f"Task: {task}",
    f"Found {len(documents)} documents in input JSON.",
    sep="\n",
)

# --- Build {filename: full text} for every listed PDF that exists ----------
doc_texts = {}
for doc in documents:
    filename = doc["filename"]
    title = doc.get("title", "")  # read here but not used further in this cell
    pdf_path = os.path.join("/content", filename)

    if os.path.exists(pdf_path):
        with pdfplumber.open(pdf_path) as pdf:
            # Pages for which extract_text() yields nothing are left out;
            # the remaining page texts are joined with newlines.
            page_texts = (page.extract_text() for page in pdf.pages)
            doc_texts[filename] = "\n".join(t for t in page_texts if t)
    else:
        # A missing file is reported and skipped rather than aborting the run.
        print(f"[❌] Missing file: {pdf_path}")

print(f"[✅] Loaded {len(doc_texts)} PDFs successfully.")


Persona: Travel Planner
Task: Plan a trip of 4 days for a group of 10 college friends.
Found 7 documents in input JSON.
[✅] Loaded 7 PDFs successfully.


In [5]:
import json
import os
from collections import defaultdict
from datetime import datetime, timezone

import pdfplumber
import torch
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model used below to score paragraphs against the task
model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

# Constants
# NOTE: this path is relative to the current working directory, whereas the
# PDFs are looked up under "/content" further down in this cell.
INPUT_JSON = "challenge1b_input.json"

# Load input JSON. The encoding is passed explicitly so that non-ASCII text in
# the file is decoded the same way regardless of the platform's default locale.
with open(INPUT_JSON, "r", encoding="utf-8") as f:
    input_data = json.load(f)

# Extract document names, persona, and job description
input_documents = [doc["filename"] for doc in input_data["documents"]]
persona = input_data["persona"]["role"]
job_description = input_data["job_to_be_done"]["task"]

# Extract paragraphs with section titles and page numbers
def _looks_like_heading(line):
    """Return True if a stripped, non-empty line of page text looks like a heading."""
    # str.title() leaves digits and punctuation unchanged, so a line without
    # any letter (a page number such as "12", a lone bullet, ...) would pass
    # the Title Case comparison below. Require at least one letter.
    if not any(ch.isalpha() for ch in line):
        return False
    # NOTE: str.title() also capitalises the letter after an apostrophe, so a
    # heading such as "Chef's Tips" is not recognised by the Title Case test.
    return line.isupper() or (len(line.split()) <= 6 and line == line.title())


def extract_paragraphs(pdf_path):
    """Split every page of a PDF into titled paragraphs.

    The text extracted from each page is scanned line by line. A line counts
    as a heading when it contains at least one letter and is either entirely
    upper-case or has at most six words and is written in Title Case.
    Consecutive non-heading lines are stripped and joined with single spaces
    into one paragraph, which is attributed to the most recent heading seen on
    the same page ("" when no heading precedes it on that page). Blank lines
    are ignored.

    Args:
        pdf_path: Path of the PDF file; passed to ``pdfplumber.open``.

    Returns:
        A list of dicts in reading order, each with the keys ``"title"``,
        ``"text"`` and ``"page_number"`` (1-based). Pages for which no text
        can be extracted contribute no entries.
    """
    all_paragraphs = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if not text:
                continue
            # Title and paragraph state start fresh on every page.
            current_title = ""
            para_lines = []
            for raw_line in text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                if _looks_like_heading(line):
                    # A new heading closes the paragraph collected so far.
                    if para_lines:
                        all_paragraphs.append({
                            "title": current_title,
                            "text": " ".join(para_lines),
                            "page_number": page_num
                        })
                        para_lines = []
                    current_title = line
                else:
                    para_lines.append(line)
            # Emit whatever is still pending when the page ends.
            if para_lines:
                all_paragraphs.append({
                    "title": current_title,
                    "text": " ".join(para_lines),
                    "page_number": page_num
                })
    return all_paragraphs

# Step 1: Gather the paragraphs of every listed PDF, tagging each one with the
# name of the document it came from
all_candidates = []
for doc_name in input_documents:
    pdf_path = os.path.join("/content", doc_name)
    if os.path.exists(pdf_path):
        all_candidates.extend(
            dict(para, document=doc_name) for para in extract_paragraphs(pdf_path)
        )
    else:
        # Documents that are not on disk are reported and skipped.
        print(f"[❌] Missing file: {pdf_path}")

# Step 2: Embed the job description once; every paragraph is compared against it
job_embedding = model.encode(job_description, convert_to_tensor=True)

# Step 3: Bucket the candidate paragraphs by their source document
grouped_by_doc = defaultdict(list)
for candidate in all_candidates:
    grouped_by_doc[candidate["document"]].append(candidate)

# Step 4: From each document keep only the paragraph most similar to the job
selected_chunks = []
for doc_paras in grouped_by_doc.values():
    para_embeddings = model.encode(
        [p["text"] for p in doc_paras], convert_to_tensor=True
    )
    # cos_sim returns one row per query; there is a single query, so take row 0.
    similarities = util.cos_sim(job_embedding, para_embeddings)[0]
    top = int(torch.argmax(similarities))
    winner = doc_paras[top]
    winner["score"] = float(similarities[top])
    selected_chunks.append(winner)

# Step 5: Order the per-document winners from most to least similar
selected_chunks.sort(key=lambda chunk: chunk["score"], reverse=True)

# Step 6: Build output JSON
# Both lists follow the order of selected_chunks, so entry i of each list
# describes the same paragraph; importance_rank is the 1-based position.
extracted_sections = [
    {
        "document": para["document"],
        "section_title": para["title"],
        "importance_rank": rank,
        "page_number": para["page_number"]
    }
    for rank, para in enumerate(selected_chunks, 1)
]
subsection_analysis = [
    {
        "document": para["document"],
        "refined_text": para["text"],
        "page_number": para["page_number"]
    }
    for para in selected_chunks
]

output_json = {
    "metadata": {
        "input_documents": input_documents,
        "persona": persona,
        "job_to_be_done": job_description,
        # datetime.utcnow() is deprecated since Python 3.12. Take the current
        # UTC time from an aware datetime and drop tzinfo so the serialized
        # timestamp keeps the same offset-less ISO 8601 form as before.
        "processing_timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    },
    "extracted_sections": extracted_sections,
    "subsection_analysis": subsection_analysis
}

# Persist the assembled result to disk as JSON indented by two spaces
output_file = "final_output.json"
with open(output_file, mode="w") as f:
    f.write(json.dumps(output_json, indent=2))

print(f"✅ Output saved as {output_file}")


✅ Output saved as final_output.json
