# Make Data Count Submission 

This work makes use of [MiniSom](https://github.com/JustGlowing/minisom) 
by Giuseppe Vettigli (2018).
External data source used for meta datasets: https://api.staging.crossref.org/swagger-ui/index.html#/Funders/get_funders__id__works

In [1]:
import os

# Silence TF/XLA/absl chatter that spams STDERR on Kaggle
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"        # 0=all,1=INFO,2=WARNING,3=ERROR
os.environ["ABSL_LOGGING_MIN_LOG_LEVEL"] = "3"
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

TRAIN_Y_PATH: str = "/kaggle/input/make-data-count-finding-data-references/train_labels.csv"
TRAIN_DIR_PATH:  str = "/kaggle/input/make-data-count-finding-data-references/train"
TEST_DIR_PATH:  str = "/kaggle/input/make-data-count-finding-data-references/train"

META_PAPER_API = "https://api.crossref.org/works/{doi}"
DEFAULT_SOURCE_TYPE = 'Unknown'
MODEL_ID = "all-MiniLM-L6-v2"

In [2]:
# Install dependencies 

!pip install -U sentence-transformers
!python -m sentence_transformers all-MiniLM-L6-v2
!pip install -U pypdf
!pip install pdfminer.six

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [8]:
# Data Helpers and Utilities 

import re
import io
import glob
import logging
import requests
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from pdfminer.high_level import extract_text
from dataclasses import dataclass, field, asdict
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union, Any

import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer

logger = logging.getLogger("kaggle_notebook")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter(
    "%(asctime)s | %(levelname)-8s | %(message)s", "%Y-%m-%d %H:%M:%S"
)
handler.setFormatter(formatter)

def _read_file_binary(path: str) -> bytes:
    with open(path, "rb") as f:
        return f.read()


def _clean_ws(text: str) -> str:
    text = re.sub(r"\r\n?", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)  # collapse >2 blank lines
    return text.strip()


def _pdf_to_text(path: str) -> str:
    """Extract text from PDF using pdfminer.six if available, else PyPDF2 as fallback."""
    # Try pdfminer.six (best quality)
    try:
        # Note: extract_text opens file internally; pass path.
        text = extract_text(path) or ""
        return _clean_ws(text)
    except Exception:
        pass

    # Fallback: PyPDF2
    try:
        import PyPDF2  # type: ignore
        text_chunks: List[str] = []
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for pg in reader.pages:
                try:
                    s = pg.extract_text() or ""
                except Exception:
                    s = ""
                if s:
                    text_chunks.append(s)
        return _clean_ws("\n\n".join(text_chunks))
    except Exception:
        return ""

def _xml_to_text(path: str) -> str:
    """Parse XML with lxml if available, else ElementTree. Extracts title/abstract/body-ish text."""
    xml_bytes = _read_file_binary(path)

    # Try lxml first (best for namespaces/xpaths).
    try:
        from lxml import etree  # type: ignore
        parser = etree.XMLParser(recover=True, huge_tree=True)
        root = etree.fromstring(xml_bytes, parser=parser)

        # Common scholarly XML patterns (JATS-ish)
        texts: List[str] = []

        # title
        titles = root.xpath("//article-title|//title-group//article-title|//title")
        titles = [t.text if isinstance(t, etree._Element) else str(t) for t in titles]
        titles = [t for t in titles if t]
        if titles:
            texts.append("# " + titles[0].strip())

        # abstract
        abs_nodes = root.xpath("//abstract//p|//Abstract//p|//abstract")
        for n in abs_nodes:
            s = "".join(n.itertext()) if hasattr(n, "itertext") else str(n)
            s = s.strip()
            if s:
                texts.append(s)

        # body
        body_nodes = root.xpath("//body//p|//sec//p|//Body//p")
        for n in body_nodes:
            s = "".join(n.itertext()) if hasattr(n, "itertext") else str(n)
            s = s.strip()
            if s:
                texts.append(s)

        # fallback: all text
        if not texts:
            all_text = " ".join(root.itertext())
            texts = [all_text]

        return _clean_ws("\n\n".join(texts))

    except Exception:
        # Fallback to stdlib ElementTree
        import xml.etree.ElementTree as ET

        try:
            root = ET.fromstring(xml_bytes)
        except Exception:
            return ""  # unreadable

        def itxt(el):
            try:
                return "".join(el.itertext())
            except Exception:
                return el.text or ""

        # Attempt similar sections by tag name
        parts: List[str] = []
        # naive title
        for tag in ("article-title", "title"):
            for n in root.iter(tag):
                s = (n.text or "").strip()
                if s:
                    parts.append("# " + s)

        # abstract
        for tag in ("abstract",):
            for n in root.iter(tag):
                s = itxt(n).strip()
                if s:
                    parts.append(s)

        # paragraphs
        for tag in ("p",):
            for n in root.iter(tag):
                s = itxt(n).strip()
                if s:
                    parts.append(s)

        if not parts:
            parts = [itxt(root)]

        return _clean_ws("\n\n".join([p for p in parts if p]))

@dataclass
class Author:
    family: Optional[str] = None
    given: Optional[str] = None
    literal: Optional[str] = None


@dataclass
class Issued:
    date_parts: List[List[int]] = field(default_factory=list)


@dataclass
class DoiResponse:
    type: str
    id: str
    categories: List[str]
    author: List[Author]
    issued: Issued
    abstract: str
    DOI: str
    publisher: str
    title: str
    URL: str
    copyright: str

    @staticmethod 
    def parse_response(data: Dict[str, Any]):
        authors = [Author(**a) for a in data.get("author", [])]
        issued = Issued(date_parts=data.get("issued", {}).get("date-parts", []))
        return DoiResponse(
            type=data.get("type", ""),
            id=data.get("id", ""),
            categories=data.get("categories", []),
            author=authors,
            issued=issued,
            abstract=data.get("abstract", ""),
            DOI=data.get("DOI", ""),
            publisher=data.get("publisher", ""),
            title=data.get("title", ""),
            URL=data.get("URL", ""),
            copyright=data.get("copyright", ""),
        )

@dataclass
class Article:
    article_id: str 
    text: str 
    extension: str 
    source: str = DEFAULT_SOURCE_TYPE
    dataset_id: str | None = None 
    embedding: np.ndarray | None = None

    @staticmethod
    def fetch_meta_external(input_doi: str) -> dict | None:
        url = META_PAPER_API.format(doi=input_doi)
        
        try:
            r = requests.get(url)
            return r.json()
        except Exception as e: 
            logger.error(e)
            return None

    @staticmethod
    def fetch_meta_doi(doi_url: str) -> DoiResponse | None:
        try:
            headers = {"Accept": "application/vnd.citationstyles.csl+json"}
            r = requests.get(doi_url, headers=headers, timeout=30)
            if r.status_code == 200:
                result = r.json()
                return DoiResponse.parse_response(result)
                
        except Exception as e: 
            logger.error(e)
            return None



In [4]:
def load_train_dataset(model: SentenceTransformer):
    """ Loads the dataset for training. """
    
    targets = pd.read_csv(TRAIN_Y_PATH)
    logger.info(f"Dataset Labels: {targets['type'].unique()}")
    
    for path in tqdm(Path("/kaggle/input").rglob("*"), desc="Loading Train datasets"):
        if path.parents[1].stem == 'train' and path.is_file():
            info = {}
            
            ext = path.suffix
            
            if ext == '.pdf': 
                text = _pdf_to_text(str(path))
            elif ext == '.xml': 
                text = _xml_to_text(str(path))

            meta = targets[targets['article_id'] == path.stem]
            
            info['extension'] = ext 
            info['text'] = text 
            info['article_id'] = path.stem
            info['embedding'] = model.encoder(info['text']) if text != '' else None
            
            if not meta.empty:
                metas = meta.iloc[0].to_dict()
                info["source"] = metas.get("type", DEFAULT_SOURCE_TYPE)
                info["dataset_id"] = metas.get("dataset_id", None)
            else:
                logger.warning("No label metadata found for %s", path.stem)

            yield Article(**info)


In [5]:
class DoiData(Dataset): 
    """ Doi Dataset handler. 
    Target types:
    'Unknown': missing from train dataset
    'Missing': missing from data - predefined in the dataset
    'Primary' / 'Secondary': Main Data Referencing Labels
    """
    
    def __init__(self):
        self.data = list(load_train_dataset())
        self.model = SentenceTransformer(MODEL_ID)
        
        assert len(self.data) > 0, "Empty dataset loaded to instance"
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int): 
        if idx > len(self.data) or idx < 0:
            raise ValueError('Index out of range.')
            
        article = self.data[idx]
        emb = self.model.encode(
            article.text,
            convert_to_numpy=True,               # ensures np.ndarray (not torch)
            normalize_embeddings=False           # set True if you want L2-normalized
        )
        assert emb.shape[0] == 384, f'Unexpected shape returned for encoded text: {emb.shape}'
        
        article.embedding = emb 
        # TODO: CONVERT TO X_train, y_train outputs
        if meta := Article.fetch_meta_doi(article.dataset_id):
            return {**asdict(article), **asdict(meta)}
            
        return asdict(article)
        
ds = DoiData()
cache_path = "/kaggle/working/train_dataset.parquet"
ds.build_cache(cache_path)

Loading Train datasets: 988it [54:04,  3.28s/it] 


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
df = pd.DataFrame(ds.data)

In [7]:
df.to_parquet('/kaggle/temp/train_dataset.parquet')

Unnamed: 0,article_id,text,extension,source,dataset_id,embedding
0,10.1590_1678-4685-gmb-2018-0055,# Mitochondrial genomes of genus\n\nAbstractTh...,.xml,Missing,https://doi.org/10.6084/m9.figshare.11609370.v1,
1,10.1021_jacs.2c06519,# Identification of Oxidation\nState +1 in a M...,.xml,Primary,https://doi.org/10.25377/sussex.21184705,
2,10.1107_s2056989015019891,"# Crystal structure of 1,1,2,2-tetra­methyl-1,...",.xml,Missing,https://doi.org/10.5517/cc1k2lx4,
3,10.1186_s12881-019-0773-3,# A pharmacogenetic study of patients with sch...,.xml,Missing,https://doi.org/10.6084/m9.figshare.7975355,
4,10.3762_bjoc.8.42,# Synthesis of mesomeric betaine compounds wit...,.xml,Missing,https://doi.org/10.5517/ccdc.csd.ccy1n8w,
...,...,...,...,...,...,...
919,10.1590_1414-431x20198292,Brazilian Journal of Medical and Biological Re...,.pdf,Missing,https://doi.org/10.6084/m9.figshare.8324420.v1,
920,10.1080_15476286.2016.1232238,RNA Biology\n\nISSN: 1547-6286 (Print) 1555-85...,.pdf,Missing,https://doi.org/10.6084/m9.figshare.3830103,
921,10.1371_journal.pntd.0005385,RESEARCH ARTICLE\n\nAdvances in neglected trop...,.pdf,Primary,https://doi.org/10.5061/dryad.72v34,
922,10.1371_journal.pone.0170126,RESEARCH ARTICLE\n\nNovel Porcine Epidemic Dia...,.pdf,Missing,KU363060,


In [12]:
df.to_parquet('/kaggle/working/train_dataset.parquet')

In [18]:
df.to_csv("/kaggle/working/train_data.csv", index=False)


In [10]:
# TEST URL EXTRACTION 

samples = [
    {
        "dataset_id": "https://doi.org/10.1098/rspb.2016.1151",
        "data": ["https://doi.org/10.5061/dryad.6m3n9"],
        "in_text_span": "The data we used in this publication can be accessed from Dryad at doi:10.5061/dryad.6m3n9.",
        "citation_type": "Primary",
    },
    {
        "dataset_id": "https://doi.org/10.1098/rspb.2018.1563",
        "data": ["https://doi.org/10.5061/dryad.c394c12"],
        "in_text_span": "Phenotypic data and gene sequences are available from the Dryad Digital Repository: http://dx.doi.org/10.5061/dryad.c394c12",
        "citation_type": "Primary",
    },
    {
        "dataset_id": "https://doi.org/10.1534/genetics.119.302868",
        "data": ["https://doi.org/10.25386/genetics.11365982"],
        "in_text_span": "The authors state that all data necessary for confirming the conclusions presented in the article are represented fully within the article. Supplemental material available at figshare: https://doi.org/10.25386/genetics.11365982.",
        "citation_type": "Primary",
    },
    {
        "dataset_id": "https://doi.org/10.1038/sdata.2014.33",
        "data": ["GSE37569", "GSE45042", "GSE28166"],
        "in_text_span": "Primary data for Agilent and Affymetrix microarray experiments are available at the NCBI Gene Expression Omnibus (GEO, http://www.ncbi.nlm.nih.gov/geo/) under the accession numbers GSE37569, GSE45042 , GSE28166",
        "citation_type": "Primary",
    },
    {
        "dataset_id": "https://doi.org/10.12688/wellcomeopenres.15142.1",
        "data": ["pdb 5yfp"],
        "in_text_span": "Figure 1. Evolution and structure of the exocyst... All structural images were modelled by the authors from PDB using UCSF Chimera.",
        "citation_type": "Secondary",
    },
    {
        "dataset_id": "https://doi.org/10.3389/fimmu.2021.690817",
        "data": ["E-MTAB-10217", "PRJE43395"],
        "in_text_span": "The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: https://www.ebi.ac.uk/arrayexpress/, E-MTAB-10217 and https://www.ebi.ac.uk/ena, PRJE43395.",
        "citation_type": "Secondary",
    },
]

import re
from typing import Optional

# -------- DOI detection --------
def is_doi(s: str) -> bool:
    """Check if a string looks like a DOI identifier."""
    s = s.strip()
    return bool(
        re.match(r"^(https?://(dx\.)?doi\.org/)?10\.\d{4,9}/\S+$", s, flags=re.I)
        or s.lower().startswith("doi:")
    )

def doi_to_url(s: str) -> Optional[str]:
    """Normalize a DOI string into https://doi.org/... form."""
    s = s.strip()
    # strip leading "doi:" or "http(s)://doi.org/"
    s = re.sub(r"^(?i)(doi:|https?://(dx\.)?doi\.org/)", "", s)
    m = re.match(r"^(10\.\d{4,9}/\S+)$", s)
    if m:
        return "https://doi.org/" + m.group(1)
    return None

# -------- Accession detection --------
def is_accession(s: str) -> bool:
    """Check if a string looks like an accession ID (non-DOI dataset identifier)."""
    s = s.strip()
    return (
        re.match(r"^GSE\d+$", s, re.I)          # GEO
        or re.match(r"^E-(MTAB|MEXP|GEO)-\d+$", s, re.I)  # ArrayExpress
        or re.match(r"^PRJ\w+\d+$", s, re.I)   # ENA/NCBI BioProject
        or re.match(r"^(pdb\s+)?[0-9A-Za-z]{4}$", s, re.I)  # PDB
    ) is not None

def accession_to_url(s: str) -> Optional[str]:
    """Map a known accession ID into the correct repository URL."""
    s = s.strip()

    # GEO
    if re.match(r"^GSE\d+$", s, re.I):
        return f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={s}"

    # ArrayExpress
    if re.match(r"^E-(MTAB|MEXP|GEO)-\d+$", s, re.I):
        return f"https://www.ebi.ac.uk/arrayexpress/experiments/{s}"

    # ENA/BioProject
    if re.match(r"^PRJ\w+\d+$", s, re.I):
        return f"https://www.ebi.ac.uk/ena/browser/view/{s}"

    # PDB
    if s.lower().startswith("pdb "):
        pdb_id = s.split()[1]
        return f"https://www.rcsb.org/structure/{pdb_id.lower()}"
    if len(s) == 4 and re.match(r"^[0-9A-Za-z]{4}$", s):
        return f"https://www.rcsb.org/structure/{s.lower()}"

    return None

examples = [
    "10.1371/journal.pone.0303785",
    "https://doi.org/10.5061/dryad.r6nq870",
    "doi:10.6084/m9.figshare.11609370.v1",
    "GSE12345",
    "E-MEXP-568",
    "E-MTAB-10217",
    "PRJE43395",
    "PDB 1Y2T",
    "5YFP",
]

for ex in examples:
    if is_doi(ex):
        print(ex, "-> DOI:", doi_to_url(ex))
    elif is_accession(ex):
        print(ex, "-> Accession:", accession_to_url(ex))
    else:
        print(ex, "-> Unknown")

error: global flags not at the start of the expression at position 1