In [1]:
# Test Service Connectivity
import requests
import subprocess
import json

# Endpoints probed before starting Week 2 work. Note that the FastAPI and
# "PostgreSQL (via API)" entries point at the same health URL.
services_to_test = dict(
    [
        ("FastAPI", "http://localhost:8000/api/v1/health"),
        ("PostgreSQL (via API)", "http://localhost:8000/api/v1/health"),
        ("Ollama", "http://localhost:11434/api/version"),
        ("OpenSearch", "http://localhost:9200/_cluster/health"),
        ("Airflow", "http://localhost:8080/health"),
    ]
)

print("WEEK 2 PREREQUISITE CHECK")
print("=" * 50)

all_healthy = True

for name, endpoint in services_to_test.items():
    # `problem` stays None only when the endpoint answered with HTTP 200.
    try:
        status = requests.get(endpoint, timeout=5).status_code
        problem = None if status == 200 else f"HTTP {status}"
    except requests.exceptions.ConnectionError:
        problem = "Not accessible"
    except Exception as err:
        # Any other failure is reported by its exception class name only.
        problem = type(err).__name__

    if problem is None:
        print(f"✓ {name}: Healthy")
    else:
        print(f"✗ {name}: {problem}")
        all_healthy = False

print()
print(
    "All services healthy! Ready for Week 2 development."
    if all_healthy
    else "Some services need attention. Check Week 1 notebook."
)

WEEK 2 PREREQUISITE CHECK
✓ FastAPI: Healthy
✓ PostgreSQL (via API): Healthy
✓ Ollama: Healthy
✓ OpenSearch: Healthy
✓ Airflow: Healthy

All services healthy! Ready for Week 2 development.


In [2]:
# ensure repo root and src are on sys.path
import sys, pathlib

def find_repo_root(start, marker="src"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Args:
        start: ``pathlib.Path`` to begin the upward search from.
        marker: Name of the child entry that identifies the repository root.

    Returns:
        The first matching ``pathlib.Path`` (``start`` itself is checked first),
        or ``None`` when neither ``start`` nor any parent contains ``marker``.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return None


def prepend_to_sys_path(*entries):
    """Place ``entries`` at the front of ``sys.path`` in the given order.

    Existing occurrences are removed first, so re-running the cell does not
    accumulate duplicate entries.
    """
    for entry in reversed([str(item) for item in entries]):
        while entry in sys.path:
            sys.path.remove(entry)
        sys.path.insert(0, entry)


# Works both when the notebook runs from the repo root and from a sub-folder:
# the search starts in the current working directory and walks up.
repo_root = pathlib.Path().resolve()
project_root = find_repo_root(repo_root)
if project_root is not None:
    # Resulting order is [<root>/src, <root>, ...].
    prepend_to_sys_path(project_root / "src", project_root)
else:
    print(f"WARNING: no 'src' directory found at or above {repo_root}; `src.*` imports will fail")
print("sys.path[0:3]:", sys.path[:3])

sys.path[0:3]: ['/Users/surekha/Documents/projects/RAG/research-assistant/src', '/Users/surekha/Documents/projects/RAG/research-assistant', '/Users/surekha/.local/share/uv/python/cpython-3.12.12-macos-aarch64-none/lib/python312.zip']


In [3]:
import asyncio
from datetime import datetime, timedelta

from src.services.arxiv.factory import make_arxiv_client

print("Testing ARXIV API Client")
print("=" * 40)

# Build the client through the project factory, then echo the settings it carries.
arxiv_client = make_arxiv_client()
for label, attribute in (
    ("Client created", "base_url"),
    ("Rate limit", "rate_limit_delay"),
    ("Max results", "max_results"),
    ("Category", "search_category"),
):
    print(f" {label}: {getattr(arxiv_client, attribute)}")
print()

Testing ARXIV API Client
 Client created: https://export.arxiv.org/api/query
 Rate limit: 3.0
 Max results: 15
 Category: cs.AI



In [4]:
async def test_fetch_papers():
    """Fetch the most recent paper via the notebook-level ``arxiv_client`` and summarise it.

    Returns:
        Whatever ``arxiv_client.fetch_papers`` returned, or ``[]`` if anything
        inside the test (fetching or printing) raised.
    """
    print("Test 1: Fetch Recent cs.AI papers")
    try:
        papers = await arxiv_client.fetch_papers(
            max_results=1, sort_by="submittedDate", sort_order="descending"
        )
        print(f"Fetched {len(papers)} papers")

        # Show at most two entries; nothing is shown for an empty result.
        preview = papers[:2] if papers else []
        for position, item in enumerate(preview, start=1):
            print(f"   {position}. [{item.arxiv_id}] {item.title[:60]}...")
            lead_authors = ", ".join(item.authors[:2])
            ellipsis = "..." if len(item.authors) > 2 else ""
            print(f"      Authors: {lead_authors}{ellipsis}")
            print(f"      Categories: {', '.join(item.categories)}")
            print(f"      Published: {item.published_date}")
            print()

        return papers
    except Exception as e:
        print(f"Error fetching papers: {e}")
        # A 503 in the error text is treated as a transient arXiv outage.
        if "503" in str(e):
            print(" arxiv API temporarily unavailable(normal)")
            print(" Rate limiting and error handling working correctly")
        return []

# Top-level `await` is run by the notebook kernel; `papers` is reused by the
# database storage cell further down.
papers = await test_fetch_papers()

Test 1: Fetch Recent cs.AI papers
Fetched 1 papers
   1. [2512.05117v1] The Universal Weight Subspace Hypothesis...
      Authors: Prakhar Kaushik, Shravan Chaudhari...
      Categories: cs.LG, cs.AI, cs.CV
      Published: 2025-12-04T18:59:58Z



In [5]:
# test date filtering
async def test_date_filtering(from_date="20250808", to_date="20250809", max_results=2):
    """Fetch papers restricted to a submission-date window and print a summary.

    Args:
        from_date: Start of the window, passed through to ``arxiv_client.fetch_papers``.
            The default uses a ``YYYYMMDD`` string; presumably that is the format the
            client expects - confirm against the client implementation.
        to_date: End of the window, same format as ``from_date``.
        max_results: Maximum number of papers to request.

    Returns:
        The list returned by ``arxiv_client.fetch_papers``, or ``[]`` if the
        fetch or the printing raised.
    """
    print("Test 2: Date Range Filtering")

    try:
        date_papers = await arxiv_client.fetch_papers(
            max_results=max_results, from_date=from_date, to_date=to_date
        )

        print(f" Date filtering test: {len(date_papers)} papers from {from_date}-{to_date}")
        # Iterating an empty result prints nothing, so no separate emptiness check is needed.
        for i, paper in enumerate(date_papers, 1):
            print(f"   {i}. [{paper.arxiv_id}] {paper.title[:60]}...")
            print(
                f"      Authors: {', '.join(paper.authors[:2])}{'...' if len(paper.authors) > 2 else ''}"
            )
            print(f"      Categories: {', '.join(paper.categories)}")
            print(f"      Published: {paper.published_date}")
            print()

        return date_papers

    except Exception as e:
        print(f"✗ Date filtering error: {e}")
        return []

# Run date filtering test; `date_papers` is the input to the PDF download test
# in a later cell.
date_papers = await test_date_filtering()

Test 2: Date Range Filtering
 Date filtering test: 2 papers from 20250808-20250809
   1. [2508.07111v1] Investigating Intersectional Bias in Large Language Models u...
      Authors: Falaah Arif Khan, Nivedha Sivakumar...
      Categories: cs.CL, cs.AI
      Published: 2025-08-09T22:24:40Z

   2. [2508.07107v2] Designing a Feedback-Driven Decision Support System for Dyna...
      Authors: Timothy Oluwapelumi Adeyemi, Nadiah Fahad AlOtaibi
      Categories: cs.AI, cs.CY
      Published: 2025-08-09T21:24:54Z



In [6]:
# test pdf download and caching locally
async def test_pdf_download(test_papers):

    print("Test 3: PDF Download & Caching")

    if not test_papers:
        print("No papers available for PDF download test")

    test_paper = test_papers[0]
    print(f"Testing PDF download for : {test_paper.arxiv_id}")
    print(f"Title: {test_paper.title[:60]}")

    try:
        pdf_path = await arxiv_client.download_pdf(test_paper)
        print(pdf_path)
        if pdf_path and pdf_path.exists():
            size_mb = pdf_path.stat().st_size / (1024*1024)
            print(f"PDF downloaded {pdf_path.name} ({size_mb:.2f}) MB")
            return pdf_path
        else:
            print("PDF downlod failed")
            return None
        
    except Exception as e:
        print(f"PDF download failed")
        return None
    
# Pass at most one paper; the slice is an empty list when `date_papers` is empty.
pdf_path = await test_pdf_download(date_papers[:1])

Test 3: PDF Download & Caching
Testing PDF download for : 2508.07111v1
Title: Investigating Intersectional Bias in Large Language Models u
data/arxiv_pdfs/2508.07111v1.pdf
PDF downloaded 2508.07111v1.pdf (6.81) MB


In [7]:
# Testing PDF parsing using docling

from src.services.pdf_parser.factory import make_pdf_parser_service
from src.config import get_settings
from pathlib import Path

print("Testing PDF parsing")

pdf_parser = make_pdf_parser_service()
cache_dir = Path(
    "data/arxiv_pdfs"
)

if cache_dir.exists():
    pdf_files = list(cache_dir.glob("*.pdf"))
    print(f"\n Found {len(pdf_files)} PDF files to test parsing")

    if pdf_files:
        test_pdf = pdf_files[0]
        print(f"Testing PDF parsing with: {test_pdf.name}")

        try:
            pdf_content = await pdf_parser.parse_pdf(test_pdf)

            if pdf_content:
                print(f" PDF parsing successful")
                print(f"Sections: {len(pdf_content.sections)}")
                print(f"Raw text length: {len(pdf_content.raw_text)} characters")
                print(f"Parser used: {pdf_content.parser_used}")

            if pdf_content.sections:
                first_section = pdf_content.sections[0]
                print(f" First section: {first_section.title}, ({len(first_section.content)}) chars")
            else:
                print("✗ PDF parsing failed (Docling compatibility issue)")
                print("This is expected - not all PDFs work with Docling")

        except Exception as e:
            print(f"✗ PDF parsing error: {e}")
            print("This demonstrates the error handling in action")
    else:
        print("No PDF files available for parsing test")
else:
    print("No PDF cache directory found")

2025-12-06 12:03:48,293 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-12-06 12:03:48,351 - INFO - Going to convert document batch...
2025-12-06 12:03:48,351 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 70256a236a6856c82de2c96fe229a58e
2025-12-06 12:03:48,358 - INFO - Loading plugin 'docling_defaults'
2025-12-06 12:03:48,360 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-12-06 12:03:48,365 - INFO - Loading plugin 'docling_defaults'
2025-12-06 12:03:48,367 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-12-06 12:03:48,400 - INFO - Accelerator device: 'mps'


Testing PDF parsing

 Found 1 PDF files to test parsing
Testing PDF parsing with: 2508.07111v1.pdf


2025-12-06 12:03:50,312 - INFO - Accelerator device: 'mps'
2025-12-06 12:03:50,890 - INFO - Processing document 2508.07111v1.pdf
2025-12-06 12:04:11,727 - INFO - Finished converting document 2508.07111v1.pdf in 23.44 sec.
2025-12-06 12:04:11,894 - INFO - Parsed data/arxiv_pdfs/2508.07111v1.pdf


 PDF parsing successful
Sections: 23
Raw text length: 85167 characters
Parser used: ParserType.DOCLING
 First section: Content, (84) chars


In [8]:
# database storage testing
from src.db.factory import make_database
from src.repositories.paper import PaperRepository
from src.schemas.arxiv.paper import PaperCreate
from dateutil import parser as date_parser

print("Test 5: Database Storage")

settings = get_settings()
database = make_database()
print("Database connection created")

# `papers` comes from the earlier fetch test cell.
if papers:
    test_paper = papers[0]
    print(f"Storing paper: {test_paper.arxiv_id}")

    try:
        with database.get_session() as session:
            paper_repo = PaperRepository(session)

            # The fetched date may be a string; parse it only in that case.
            published_date = test_paper.published_date
            if isinstance(published_date, str):
                published_date = date_parser.parse(published_date)

            paper_create = PaperCreate(
                arxiv_id=test_paper.arxiv_id,
                title=test_paper.title,
                authors=test_paper.authors,
                abstract=test_paper.abstract,
                categories=test_paper.categories,
                published_date=published_date,
                pdf_url=test_paper.pdf_url,
            )

            # upsert: the row may be new or already present, so report it as "stored".
            stored_paper = paper_repo.upsert(paper_create)
            if stored_paper:
                print(f"Paper stored with ID: {stored_paper.id}")
                print(f"arXiv ID: {stored_paper.arxiv_id}")
                print(f"Title: {stored_paper.title[:50]}...")
                print(f"Authors: {len(stored_paper.authors)} authors")
                print(f"Categories: {', '.join(stored_paper.categories)}")

                # Round-trip check: read the row back by its arXiv ID.
                retrieved_paper = paper_repo.get_by_arxiv_id(test_paper.arxiv_id)
                if retrieved_paper:
                    print("Paper retrieval test passed")
                else:
                    print("Paper retrieval failed")
            else:
                print("Paper storage failed")

    except Exception as e:
        print(f"Database error: {e}")

else:
    print("No papers available for database storage test")

2025-12-06 12:04:12,031 - INFO - Attempting to connect to PostgreSQL at: localhost:5432/rag_db


Test 5: Database Storage
postgresql+psycopg2://rag_user:rag_password@localhost:5432/rag_db
20


2025-12-06 12:04:12,178 - INFO - Database connection test successfully
2025-12-06 12:04:12,188 - INFO - All tables already exist - no new tables created
2025-12-06 12:04:12,188 - INFO - PostgreSQL database initialized successfully
2025-12-06 12:04:12,188 - INFO - Database: rag_db
2025-12-06 12:04:12,190 - INFO - Database connection established


Database connection created
Storing paper: 2512.05117v1
<src.models.paper.Paper object at 0x378891250>
Paper created with ID: 3b71bdc4-d4b8-47cf-808b-64af09ac6cda
Database ID: 3b71bdc4-d4b8-47cf-808b-64af09ac6cda
arXiv ID: 2512.05117v1
Title: The Universal Weight Subspace Hypothesis...
Authors: 5 authors
Categories: cs.LG, cs.AI, cs.CV
Paper retrieveal test passes


In [9]:
# Test Complete Pipeline
from src.services.metadata_fetcher import make_metadata_fetcher

print("Test 6: Complete Metadata Fetcher Pipeline")
print("=" * 50)

# Create metadata fetcher
metadata_fetcher = make_metadata_fetcher(arxiv_client, pdf_parser)
print("✓ Metadata fetcher service created")

# Test with small batch
print("Running small batch test (2 papers, no PDF processing for speed)...")

try:
    with database.get_session() as session:
        results = await metadata_fetcher.fetch_and_process_papers(
            max_results=10, process_pdfs=True, store_to_db=True, db_session=session
        )

    print("\nPIPELINE RESULTS:")
    print(f"Papers fetched: {results.get('papers_fetched', 0)}")
    print(f"PDFs downloaded: {results.get('pdfs_downloaded', 0)}")
    print(f"PDFs parsed: {results.get('pdfs_parsed', 0)}")
    print(f"Papers stored: {results.get('papers_stored', 0)}")
    print(f"Processing time: {results.get('processing_time', 0):.1f}s")
    print(f"Errors: {len(results.get('errors', []))}")

    if results.get("errors"):
        print("\nErrors encountered:")
        for error in results.get("errors", [])[:3]:  # Show first 3 errors
            print(f"   - {error}")

    if results.get("papers_fetched", 0) > 0:
        print("\nPipeline test successful!")
    else:
        print("\nNo papers fetched - may be arXiv API unavailability")

except Exception as e:
    print(f"Pipeline error: {e}")

2025-12-06 12:04:12,212 - INFO - Fetching 10 cs.AI papers from arXiv


Test 6: Complete Metadata Fetcher Pipeline
✓ Metadata fetcher service created
Running small batch test (2 papers, no PDF processing for speed)...


2025-12-06 12:04:12,276 - INFO - HTTP Request: GET https://export.arxiv.org/api/query?search_query=cat:cs.AI&start=0&max_results=10&sortBy=submittedDate&sortOrder=descending "HTTP/1.1 200 OK"
2025-12-06 12:04:12,283 - INFO - Fetched 10 papers
2025-12-06 12:04:12,284 - INFO - Starting async pipeline for 10 PDFs
2025-12-06 12:04:12,284 - INFO - Concurrent downloads: 5
2025-12-06 12:04:12,284 - INFO - Concurrent parsing: 1
2025-12-06 12:04:12,284 - INFO - Downloaded pdf path for paper: 2512.05117v1: data/arxiv_pdfs/2512.05117v1.pdf
2025-12-06 12:04:12,285 - INFO - Downloading PDF from https://arxiv.org/pdf/2512.05117v1
2025-12-06 12:04:12,285 - INFO - Downloaded pdf path for paper: 2512.05112v1: data/arxiv_pdfs/2512.05112v1.pdf
2025-12-06 12:04:12,285 - INFO - Downloading PDF from https://arxiv.org/pdf/2512.05112v1
2025-12-06 12:04:12,285 - INFO - Downloaded pdf path for paper: 2512.05110v1: data/arxiv_pdfs/2512.05110v1.pdf
2025-12-06 12:04:12,286 - INFO - Downloading PDF from https://arx


PIPELINE RESULTS:
Papers fetched: 10
PDFs downloaded: 7
PDFs parsed: 7
Papers stored: 10
Processing time: 133.7s
Errors: 3

Errors encountered:
   - Pipeline error for 2512.05112v1: Pipeline error for 2512.05112v1, continuing with metadata only
   - Pipeline error for 2512.05110v1: Pipeline error for 2512.05110v1, continuing with metadata only
   - Pipeline error for 2512.05098v1: Pipeline error for 2512.05098v1, continuing with metadata only

Pipeline test successful!
