# Findability Demonstration Notebook (parameterisable via Papermill)
=================================================================

This notebook ingests a CSV of identifiers (DOIs, repository URLs, or
accession numbers), resolves each to a canonical landing‑page URL, issues an
HTTP GET request with aiohttp, and assigns a binary *Findability* score based
on the response status (1 = 2xx, 0 = otherwise).  Results are saved to a CSV
inside `reports/` and displayed inline.

*Prerequisites*
---------------
```bash
pip install aiohttp pandas python-dotenv nbconvert papermill
```

*Execution (CLI)*
----------------
```bash
papermill notebooks/findability_demo.ipynb \
          notebooks/output.ipynb \
          -p csv_path data/sample_identifiers.csv \
          -p output_dir reports

jupyter nbconvert --to html notebooks/output.ipynb --output reports/latest.html
```

In [1]:
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('findability_demo.log', 'a')
    ]
)
logger = logging.getLogger(__name__)

In [2]:
import os, sys
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)

In [3]:
# %% [parameters]
# Papermill will inject values here; defaults are for local runs
csv_path = os.path.join(project_root,"data/sample_identifiers.csv")  # Path to input CSV
output_dir = os.path.join(project_root, "reports")  # Directory to write outputs

In [4]:
import asyncio
import json
import re
import time
from pathlib import Path
from typing import Literal, Tuple, Optional

import aiohttp
import pandas as pd

## Helper Functions

In [5]:
DOI_REGEX = re.compile(r"^10\.\d{4,9}/\S+$", re.IGNORECASE)
HTTP_REGEX = re.compile(r"^https?://", re.IGNORECASE)
# Mapping of *accession prefix* -> *landing‑page URL template*.
ACCESSION_TEMPLATES = {
    # NCBI resources
    "GSE": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={acc}",  # GEO Series
    "GSM": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={acc}",  # GEO Sample
    "SRR": "https://www.ncbi.nlm.nih.gov/sra/{acc}",                     # SRA Run
    "SRX": "https://www.ncbi.nlm.nih.gov/sra/{acc}",                     # SRA Experiment
    "SRP": "https://www.ncbi.nlm.nih.gov/sra/{acc}",                     # SRA Project
    "NC_": "https://www.ncbi.nlm.nih.gov/nuccore/{acc}",                 # GenBank RefSeq
    # EMBL‑EBI resources
    "ERR": "https://www.ebi.ac.uk/ena/browser/view/{acc}",               # ENA Run
    "ERP": "https://www.ebi.ac.uk/ena/browser/view/{acc}",               # ENA Project
    "ENCSR": "https://www.encodeproject.org/experiments/{acc}/",          # ENCODE
    # Proteomics / Expression / Other
    "E-": "https://www.ebi.ac.uk/arrayexpress/experiments/{acc}",        # ArrayExpress (prefixes E-MTAB‑, E-GEOD‑…)
    "PXD": "https://www.ebi.ac.uk/pride/archive/projects/{acc}",        # PRIDE dataset
}

In [6]:

def detect_type(identifier: str) -> Literal["doi", "url", "accession"]:
    """Classify identifier string."""
    logger.info(f"Detecting type for identifier: {identifier}")
    if HTTP_REGEX.match(identifier):
        return "URL"
    if DOI_REGEX.match(identifier):
        return "DOI"
    return "Accession"


async def resolve_doi(doi: str, session: aiohttp.ClientSession) -> str | None:
    """Resolve DOI to canonical URL using Crossref."""
    logger.info(f"Resolving DOI: {doi}")
    api_url = f"https://api.crossref.org/works/{doi}"
    try:
        async with session.get(api_url, timeout=10) as resp:
            if resp.status == 200:
                data = await resp.json()
                resolved_url = data.get("message", {}).get("URL")
                logger.info(f"Resolved DOI: {resolved_url}")
                return resolved_url
    except Exception:
        logger.error(f"Error resolving DOI: {doi}")
        return None
    return None

def resolve_accession(acc: str) -> Optional[str]:
    """Return landing‑page URL for supported accession prefixes.

    The function checks `ACCESSION_TEMPLATES` for the *longest* matching prefix
    (allowing multi‑char keys like "ENCSR" or "NC_"), then formats the URL. If
    no prefix matches, `None` is returned so the caller can skip scoring.
    """
    logger.info(f"Resolving accession: {acc}")
    try:
        for prefix in sorted(ACCESSION_TEMPLATES, key=len, reverse=True):
            if acc.startswith(prefix):
                template = ACCESSION_TEMPLATES[prefix]
                resolved_url = template.format(acc=acc)
                logger.info(f"Resolved accession: {resolved_url}")
                return resolved_url
    except Exception:
        logger.error(f"Error resolving accession: {acc}")
        return None
    return None


async def fetch_status(
    session: aiohttp.ClientSession, url: str
) -> Tuple[int | None, str | None, float]:
    """GET `url` and return (status, final_url, response_time_s)."""
    logger.info(f"Fetching status for URL: {url}")
    start = time.perf_counter()
    try:
        async with session.get(url, allow_redirects=True, timeout=10) as resp:
            status = resp.status
            final_url = str(resp.url)
            logger.info(f"Fetched status: {status}, final URL: {final_url}")
    except Exception:
        logger.error(f"Error fetching status for URL: {url}")
        status = None
        final_url = None
    duration = time.perf_counter() - start
    return status, final_url, duration

## Asynchronous pipeline


In [7]:
async def score_identifiers(df: pd.DataFrame) -> pd.DataFrame:
    logger.info("Starting score_identifiers")
    async def _placeholder_task():
    # This is a simple coroutine that returns the desired default tuple
        return (None, None, 0.0)
    async with aiohttp.ClientSession() as session:
        # First pass – resolve DOIs and accessions to URLs
        resolved: list[str | None] = []
        for ident, typ in zip(df["Asset"], df["AssetType"]):
            if typ == "DOI":
                url = await resolve_doi(ident, session)
            elif typ == "Accession":
                url = resolve_accession(ident)
            else:  # url
                url = ident
            resolved.append(url)
        df["resolved_url"] = resolved

        # Second pass – probe landing pages
        # tasks = [fetch_status(session, url) if url else (None, None, 0.0) for url in resolved]
        tasks = [fetch_status(session, url) if url else _placeholder_task() for url in resolved]
        results = await asyncio.gather(*tasks)
        logger.info("Completed fetch_status")
    statuses, finals, times = zip(*results)
    df["http_status"] = statuses
    df["final_url"] = finals
    df["response_s"] = times
    df["findable"] = [1 if 200 <= (s or 0) < 300 else 0 for s in statuses]
    logger.info("Completed score_identifiers")
    return df

## Main execution block


In [8]:
# Ensure output directory exists
out_dir = Path(output_dir)
out_dir.mkdir(parents=True, exist_ok=True)

# Read input
input_df = pd.read_csv(csv_path)
if "AssetType" not in input_df.columns:
    input_df["AssetType"] = input_df["Asset"].apply(detect_type)

# Run pipeline
logger.info("Starting pipeline")
result_df = await score_identifiers(input_df.copy())
logger.info("Completed pipeline")

# Save results
logger.info("Saving results")
result_csv = out_dir / "findability_results.csv"
result_df.to_csv(result_csv, index=False)
logger.info(f"Results written to {result_csv.relative_to(Path(project_root))}")
# Display
result_df

2025-05-20 16:53:28,155 - __main__ - INFO - Starting pipeline
2025-05-20 16:53:28,160 - __main__ - INFO - Starting score_identifiers
2025-05-20 16:53:28,165 - __main__ - INFO - Resolving DOI: 10.5061/dryad.4j0zpc8p9


2025-05-20 16:53:28,284 - __main__ - INFO - Resolving DOI: 10.6084/m9.figshare.6025748
2025-05-20 16:53:28,316 - __main__ - INFO - Fetching status for URL: https://zenodo.org/record/7673768
2025-05-20 16:53:28,317 - __main__ - INFO - Fetching status for URL: https://openneuro.org/datasets/ds004470/about
2025-05-20 16:53:28,319 - __main__ - INFO - Fetching status for URL: NC_045512
2025-05-20 16:53:28,320 - __main__ - ERROR - Error fetching status for URL: NC_045512
2025-05-20 16:53:28,428 - __main__ - INFO - Fetched status: 200, final URL: https://openneuro.org/datasets/ds004470/about
2025-05-20 16:53:30,895 - __main__ - INFO - Fetched status: 200, final URL: https://zenodo.org/records/7673769
2025-05-20 16:53:30,897 - __main__ - INFO - Completed fetch_status
2025-05-20 16:53:30,900 - __main__ - INFO - Completed score_identifiers
2025-05-20 16:53:30,901 - __main__ - INFO - Completed pipeline
2025-05-20 16:53:30,902 - __main__ - INFO - Saving results
2025-05-20 16:53:30,907 - __main__ -

Unnamed: 0,Repo,Asset,AssetType,resolved_url,http_status,final_url,response_s,findable
0,Zenodo,https://zenodo.org/record/7673768,URL,https://zenodo.org/record/7673768,200.0,https://zenodo.org/records/7673769,2.580201,1
1,OpenNeuro,https://openneuro.org/datasets/ds004470/about,URL,https://openneuro.org/datasets/ds004470/about,200.0,https://openneuro.org/datasets/ds004470/about,0.111066,1
2,Dryad,10.5061/dryad.4j0zpc8p9,DOI,,,,0.0,0
3,Figshare,10.6084/m9.figshare.6025748,DOI,,,,0.0,0
4,GenBank,NC_045512,Accession,NC_045512,,,0.000905,0


## Quick summary table

In [9]:
summary = result_df[["Asset", "findable"]]
print("\nFindability summary:\n", summary.to_string(index=False))


Findability summary:
                                         Asset  findable
            https://zenodo.org/record/7673768         1
https://openneuro.org/datasets/ds004470/about         1
                      10.5061/dryad.4j0zpc8p9         0
                  10.6084/m9.figshare.6025748         0
                                    NC_045512         0
