In [1]:

import os
import json
import requests
import fitz                  # PyMuPDF
from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging

# ───────────────────────────────────────────────────────────────────────────────
# 1) DEFINE THE SPIDER WITH OVERWRITE SETTINGS
# ───────────────────────────────────────────────────────────────────────────────

class EfifoundationBeccsSpider(Spider):
    name = "efibeccs"
    allowed_domains = ["efifoundation.org"]
    start_urls = ["https://efifoundation.org/reports/"]
    
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
        "DOWNLOAD_DELAY": 1.0,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 2,
        "FEEDS": {
            "beccs_pdfs.json": {
                "format": "json",
                "encoding": "utf8",
                "indent": 2,
                "overwrite": True   # ← ensure a single valid JSON array
            }
        }
    }

    def parse(self, response):
        for href in response.css("a::attr(href)").getall():
            href = href.strip()
            if not href:
                continue

            url = response.urljoin(href)

            # If it’s a BECCS PDF, yield it
            if url.lower().endswith(".pdf") and "beccs" in url.lower():
                yield {
                    "pdf_url": url,
                    "referrer": response.url,
                    "filename": url.split("/")[-1]
                }

            # Else if it’s still under /reports/ (and not a PDF), follow it
            elif url.startswith("https://efifoundation.org/reports/") and not url.lower().endswith(".pdf"):
                yield response.follow(url, callback=self.parse)

# ───────────────────────────────────────────────────────────────────────────────
# 2) RUN THE CRAWL (OVERWRITE beccs_pdfs.json)
# ───────────────────────────────────────────────────────────────────────────────

configure_logging({"LOG_FORMAT": "%(levelname)s: %(message)s"})
process = CrawlerProcess()
process.crawl(EfifoundationBeccsSpider)
process.start()  # ← creates/overwrites beccs_pdfs.json as a clean JSON array

# ───────────────────────────────────────────────────────────────────────────────
# 3) LOAD beccs_pdfs.json SAFELY (ARRAY OR LINES)
# ───────────────────────────────────────────────────────────────────────────────

json_path = "beccs_pdfs.json"
with open(json_path, "r") as f:
    text = f.read().strip()

if not text:
    pdf_entries = []
elif text.startswith("["):
    # Valid JSON array
    pdf_entries = json.loads(text)
else:
    # Fallback: JSON lines (one object per line)
    pdf_entries = [
        json.loads(line)
        for line in text.splitlines()
        if line.strip()
    ]

print(f"\n🗂  Found {len(pdf_entries)} PDF entries in {json_path}\n")

# ───────────────────────────────────────────────────────────────────────────────
# 4) DOWNLOAD EACH PDF WITH A BROWSER-LIKE HEADER + EXTRACT METADATA
# ───────────────────────────────────────────────────────────────────────────────

os.makedirs("downloaded_pdfs", exist_ok=True)
pdf_metadata = []

for entry in pdf_entries:
    url = entry["pdf_url"]
    filename = url.split("/")[-1]
    local_path = os.path.join("downloaded_pdfs", filename)

    # Only download if missing
    if not os.path.exists(local_path):
        print(f"Downloading: {filename}")
        headers = {"User-Agent": "Mozilla/5.0"} 
        r = requests.get(url, headers=headers, timeout=30)
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            print(f"  ⚠️  Failed to download {filename}: {e}")
            continue

        with open(local_path, "wb") as fp:
            fp.write(r.content)

    # Extract metadata via PyMuPDF
    try:
        doc = fitz.open(local_path)
        meta = doc.metadata or {}
    except Exception as e:
        meta = {}
        print(f"  ⚠️  Could not read metadata for {filename}: {e}")

    pdf_metadata.append({
        "source_url": url,
        "title": meta.get("title", filename),
        "creation_date": meta.get("creationDate", ""),
        "filepath": local_path
    })

print(f"\n✅ Downloaded and extracted metadata for {len(pdf_metadata)} PDFs.\n")

# ───────────────────────────────────────────────────────────────────────────────
# 5) RANK ENTRIES AND PRINT
# ───────────────────────────────────────────────────────────────────────────────

def score(item):
    pts = 0
    domain = item["source_url"].split("/")[2]

    if domain.endswith((".gov", ".org", ".int", ".eu")):
        pts += 3
    elif ".edu" in domain:
        pts += 2
    else:
        pts += 1

    title_lower = (item["title"] or "").lower()
    if any(kw in title_lower for kw in ["beccs", "bioenergy", "carbon capture"]):
        pts += 1

    if "2024" in item["source_url"] or "2025" in item["source_url"]:
        pts += 1

    return pts

ranked = sorted(pdf_metadata, key=score, reverse=True)

print("📄 Ranked List of BECCS PDFs:\n")
for idx, doc in enumerate(ranked, 1):
    print(f"{idx}. {doc['title']}   [{score(doc)} pts]")
    print(f"    → {doc['source_url']}\n")


2025-06-02 18:03:21 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2025-06-02 18:03:21 [scrapy.utils.log] INFO: Versions: lxml 5.2.1.0, libxml2 2.10.4, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 10:07:17) [Clang 14.0.6 ], pyOpenSSL 24.0.0 (OpenSSL 3.0.15 3 Sep 2024), cryptography 42.0.5, Platform macOS-15.4.1-arm64-arm-64bit
2025-06-02 18:03:21 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2025-06-02 18:03:21 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2025-06-02 18:03:21 [scrapy.extensions.telnet] INFO: Telnet Password: 7a0be453f52a7d16
2025-06-02 18:03:21 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensi


🗂  Found 4 PDF entries in beccs_pdfs.json

Downloading: Survey-the-BECCS-Landscape_Report-v2.pdf


2025-06-02 18:04:24 [urllib3.connectionpool] DEBUG: https://efifoundation.org:443 "GET /wp-content/uploads/sites/3/2022/03/Survey-the-BECCS-Landscape_Report-v2.pdf HTTP/11" 200 7146036
2025-06-02 18:04:26 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): efifoundation.org:443


Downloading: Surveying-the-BECCS-Landscape_ExecutiveSummary_2022.pdf


2025-06-02 18:04:27 [urllib3.connectionpool] DEBUG: https://efifoundation.org:443 "GET /wp-content/uploads/sites/3/2022/03/Surveying-the-BECCS-Landscape_ExecutiveSummary_2022.pdf HTTP/11" 200 2712204



✅ Downloaded and extracted metadata for 4 PDFs.

📄 Ranked List of BECCS PDFs:

1.    [3 pts]
    → https://efifoundation.org/wp-content/uploads/sites/3/2022/03/Survey-the-BECCS-Landscape_Report-v2.pdf

2.    [3 pts]
    → https://efifoundation.org/wp-content/uploads/sites/3/2022/03/Survey-the-BECCS-Landscape_Report-v2.pdf

3.    [3 pts]
    → https://efifoundation.org/wp-content/uploads/sites/3/2022/03/Surveying-the-BECCS-Landscape_ExecutiveSummary_2022.pdf

4.    [3 pts]
    → https://efifoundation.org/wp-content/uploads/sites/3/2022/03/Surveying-the-BECCS-Landscape_ExecutiveSummary_2022.pdf

