# Make Data Count Data Preparation

Dataclasses to extract dataset from `.xml` and `.pdf` for model training / tunings. 
This also fetches meta datasets for the articles.


In [18]:
import os

# Silence TF/XLA/absl chatter that spams STDERR on Kaggle
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"        # 0=all,1=INFO,2=WARNING,3=ERROR
os.environ["ABSL_LOGGING_MIN_LOG_LEVEL"] = "3"
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

TRAIN_Y_PATH: str = "/kaggle/input/make-data-count-finding-data-references/train_labels.csv"
TRAIN_DIR_PATH:  str = "/kaggle/input/make-data-count-finding-data-references/train"
TEST_DIR_PATH:  str = "/kaggle/input/make-data-count-finding-data-references/train"

META_PAPER_API = "https://api.crossref.org/works/{doi}"
DEFAULT_SOURCE_TYPE = 'Unknown'
MODEL_ID = "all-MiniLM-L6-v2"

In [19]:
# Install dependencies 

#!pip install -U sentence-transformers
#!python -m sentence_transformers all-MiniLM-L6-v2
!pip install -U pypdf
!pip install pdfminer.six



In [20]:
# Data Helpers and Utilities 

import re
import io
import glob
import logging
import requests
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from pdfminer.high_level import extract_text
from dataclasses import dataclass, field, asdict
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union, Any, Optional

import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer

logger = logging.getLogger("kaggle_notebook")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter(
    "%(asctime)s | %(levelname)-8s | %(message)s", "%Y-%m-%d %H:%M:%S"
)
handler.setFormatter(formatter)

def _read_file_binary(path: str) -> bytes:
    with open(path, "rb") as f:
        return f.read()

def _clean_ws(text: str) -> str:
    text = re.sub(r"\r\n?", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)  # collapse >2 blank lines
    return text.strip()

def _pdf_to_text(path: str) -> str:
    """Extract text from PDF using pdfminer.six if available, else PyPDF2 as fallback."""
    # Try pdfminer.six (best quality)
    try:
        # Note: extract_text opens file internally; pass path.
        text = extract_text(path) or ""
        return _clean_ws(text)
    except Exception:
        pass

    # Fallback: PyPDF2
    try:
        import PyPDF2  # type: ignore
        text_chunks: List[str] = []
        with open(path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for pg in reader.pages:
                try:
                    s = pg.extract_text() or ""
                except Exception:
                    s = ""
                if s:
                    text_chunks.append(s)
        return _clean_ws("\n\n".join(text_chunks))
    except Exception:
        return ""

def _xml_to_text(path: str) -> str:
    """Parse XML with lxml if available, else ElementTree. Extracts title/abstract/body-ish text."""
    xml_bytes = _read_file_binary(path)

    # Try lxml first (best for namespaces/xpaths).
    try:
        from lxml import etree  # type: ignore
        parser = etree.XMLParser(recover=True, huge_tree=True)
        root = etree.fromstring(xml_bytes, parser=parser)

        # Common scholarly XML patterns (JATS-ish)
        texts: List[str] = []

        # title
        titles = root.xpath("//article-title|//title-group//article-title|//title")
        titles = [t.text if isinstance(t, etree._Element) else str(t) for t in titles]
        titles = [t for t in titles if t]
        if titles:
            texts.append("# " + titles[0].strip())

        # abstract
        abs_nodes = root.xpath("//abstract//p|//Abstract//p|//abstract")
        for n in abs_nodes:
            s = "".join(n.itertext()) if hasattr(n, "itertext") else str(n)
            s = s.strip()
            if s:
                texts.append(s)

        # body
        body_nodes = root.xpath("//body//p|//sec//p|//Body//p")
        for n in body_nodes:
            s = "".join(n.itertext()) if hasattr(n, "itertext") else str(n)
            s = s.strip()
            if s:
                texts.append(s)

        # fallback: all text
        if not texts:
            all_text = " ".join(root.itertext())
            texts = [all_text]

        return _clean_ws("\n\n".join(texts))

    except Exception:
        # Fallback to stdlib ElementTree
        import xml.etree.ElementTree as ET

        try:
            root = ET.fromstring(xml_bytes)
        except Exception:
            return ""  # unreadable

        def itxt(el):
            try:
                return "".join(el.itertext())
            except Exception:
                return el.text or ""

        # Attempt similar sections by tag name
        parts: List[str] = []
        # naive title
        for tag in ("article-title", "title"):
            for n in root.iter(tag):
                s = (n.text or "").strip()
                if s:
                    parts.append("# " + s)

        # abstract
        for tag in ("abstract",):
            for n in root.iter(tag):
                s = itxt(n).strip()
                if s:
                    parts.append(s)

        # paragraphs
        for tag in ("p",):
            for n in root.iter(tag):
                s = itxt(n).strip()
                if s:
                    parts.append(s)

        if not parts:
            parts = [itxt(root)]

        return _clean_ws("\n\n".join([p for p in parts if p]))

@dataclass
class Author:
    family: Optional[str] = None
    given: Optional[str] = None
    literal: Optional[str] = None

@dataclass
class Issued:
    date_parts: List[List[int]] = field(default_factory=list)

@dataclass
class DoiResponse:
    type: str
    id: str
    categories: List[str]
    author: List[Author]
    issued: Issued
    abstract: str
    DOI: str
    publisher: str
    title: str
    URL: str
    copyright: str

    @staticmethod 
    def parse_response(data: Dict[str, Any]):
        authors = [Author(**a) for a in data.get("author", [])]
        issued = Issued(date_parts=data.get("issued", {}).get("date-parts", []))
        return DoiResponse(
            type=data.get("type", ""),
            id=data.get("id", ""),
            categories=data.get("categories", []),
            author=authors,
            issued=issued,
            abstract=data.get("abstract", ""),
            DOI=data.get("DOI", ""),
            publisher=data.get("publisher", ""),
            title=data.get("title", ""),
            URL=data.get("URL", ""),
            copyright=data.get("copyright", ""),
        )

@dataclass
class Article:
    article_id: str 
    text: str 
    extension: str 
    source: str = DEFAULT_SOURCE_TYPE
    dataset_id: str | None = None 
    dataset_id_cited: str | None = None
    embedding: np.ndarray | None = None

    @staticmethod
    def fetch_meta_external(input_doi: str) -> dict | None:
        url = META_PAPER_API.format(doi=input_doi)
        
        try:
            r = requests.get(url)
            return r.json()
        except Exception as e: 
            logger.error(e)
            return None

    @staticmethod
    def fetch_meta_doi(doi_url: str) -> DoiResponse | None:
        try:
            headers = {"Accept": "application/vnd.citationstyles.csl+json"}
            r = requests.get(doi_url, headers=headers, timeout=30)
            if r.status_code == 200:
                result = r.json()
                return DoiResponse.parse_response(result)
                
        except Exception as e: 
            logger.error(e)
            return None

In [21]:
# Dataset ID / URL Cleaners & Converters 
import re 

# Test datasets that could possibly exist in the data
samples = [
    {
        "dataset_id": "https://doi.org/10.1098/rspb.2016.1151",
        "data": ["https://doi.org/10.5061/dryad.6m3n9"],
        "in_text_span": "The data we used in this publication can be accessed from Dryad at doi:10.5061/dryad.6m3n9.",
        "citation_type": "Primary",
    },
    {
        "dataset_id": "https://doi.org/10.1098/rspb.2018.1563",
        "data": ["https://doi.org/10.5061/dryad.c394c12"],
        "in_text_span": "Phenotypic data and gene sequences are available from the Dryad Digital Repository: http://dx.doi.org/10.5061/dryad.c394c12",
        "citation_type": "Primary",
    },
    {
        "dataset_id": "https://doi.org/10.1534/genetics.119.302868",
        "data": ["https://doi.org/10.25386/genetics.11365982"],
        "in_text_span": "The authors state that all data necessary for confirming the conclusions presented in the article are represented fully within the article. Supplemental material available at figshare: https://doi.org/10.25386/genetics.11365982.",
        "citation_type": "Primary",
    },
    {
        "dataset_id": "https://doi.org/10.1038/sdata.2014.33",
        "data": ["GSE37569", "GSE45042", "GSE28166"],
        "in_text_span": "Primary data for Agilent and Affymetrix microarray experiments are available at the NCBI Gene Expression Omnibus (GEO, http://www.ncbi.nlm.nih.gov/geo/) under the accession numbers GSE37569, GSE45042 , GSE28166",
        "citation_type": "Primary",
    },
    {
        "dataset_id": "https://doi.org/10.12688/wellcomeopenres.15142.1",
        "data": ["pdb 5yfp"],
        "in_text_span": "Figure 1. Evolution and structure of the exocyst... All structural images were modelled by the authors from PDB using UCSF Chimera.",
        "citation_type": "Secondary",
    },
    {
        "dataset_id": "https://doi.org/10.3389/fimmu.2021.690817",
        "data": ["E-MTAB-10217", "PRJE43395"],
        "in_text_span": "The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: https://www.ebi.ac.uk/arrayexpress/, E-MTAB-10217 and https://www.ebi.ac.uk/ena, PRJE43395.",
        "citation_type": "Secondary",
    },
]

ACCESSION_PATTERNS = [
    # DOI (bare "10." prefix, or full http(s) doi.org link, or "doi:10...")
    (re.compile(r"^(?:https?://(?:dx\.)?doi\.org/|doi:)?(10\.\d{4,9}/\S+)$", re.I),
     lambda m: f"https://doi.org/{m.group(1)}"),

    # GEO (Gene Expression Omnibus)
    (re.compile(r"^GSE\d+$", re.I),
     lambda m: f"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={m.group(0)}"),

    # ENA run/experiment (ERR/ERS/SRR/DRR/etc.)
    (re.compile(r"^(ERR|ERS|SRR|SRX|SRP|DRR|DRX|DRP|ERX|ERP)\d+$", re.I),
     lambda m: f"https://www.ebi.ac.uk/ena/browser/view/{m.group(0)}"),

    # dbSNP rs IDs
    (re.compile(r"^rs\d+$", re.I),
     lambda m: f"https://www.ncbi.nlm.nih.gov/snp/{m.group(0)}"),

    # PDB (4-char alphanumeric IDs)
    (re.compile(r"^[0-9A-Za-z]{4}$"),
     lambda m: f"https://www.rcsb.org/structure/{m.group(0)}"),

    # ChEMBL compounds/targets/assays
    (re.compile(r"^CHEMBL\d+$", re.I),
     lambda m: f"https://www.ebi.ac.uk/chembl/compound_report_card/{m.group(0)}/"),

    # DDBJ/GenBank/RefSeq nucleotide accessions (D10700, CP013147, NC_#######)
    (re.compile(r"^(?:[A-Z]{1,2}\d{5,6}|NC_\d+)$", re.I),
     lambda m: f"https://www.ncbi.nlm.nih.gov/nuccore/{m.group(0)}"),
]

def resolve_accession(acc: str) -> Optional[str]:
    """Return a best-effort URL for any accession/identifier/DOI."""
    if acc is None or (isinstance(acc, float) and pd.isna(acc)):
        return None
        
    s = str(acc).strip()
    if not s:
        return None

    # Try regex patterns
    for pattern, builder in ACCESSION_PATTERNS:
        m = pattern.match(s)
        if m:
            return builder(m)

    # Special-case string prefixes
    if s.upper().startswith("ENS"):  # Ensembl
        return f"https://www.ensembl.org/id/{s}"
    if s.upper().startswith("IPR"):  # InterPro
        return f"https://www.ebi.ac.uk/interpro/entry/{s.upper()}"
    if s.upper().startswith("CVCL_"):  # Cellosaurus
        return f"https://www.cellosaurus.org/{s.upper()}"
    if s.upper().startswith("EMPIAR-"):  # EMPIAR
        return f"https://www.ebi.ac.uk/empiar/{s.upper()}"
    if s.upper().startswith("HGNC:"):  # HGNC gene IDs
        return f"https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/{s.upper()}"
    if re.match(r"^K\d{5}$", s, flags=re.I):  # KEGG Orthology
        return f"https://www.genome.jp/dbget-bin/www_bget?ko:{s.upper()}"
    if s.upper().startswith("EPI_ISL_"):  # GISAID
        return f"https://www.gisaid.org/search?query={s}"

    # If it's already an HTTP(S) URL but didn't match DOI/PDB etc., keep as-is
    if s.lower().startswith("http"):
        return s

    # Fallback
    return s

In [25]:
def load_train_dataset():
    """ Loads the dataset for training. """
    
    targets = pd.read_csv(TRAIN_Y_PATH)
    logger.info(f"Total distinct ref type Labels: {targets['type'].unique()}")
    
    for path in tqdm(Path("/kaggle/input").rglob("*"), desc="Loading Train datasets"):
        if path.parents[1].stem == 'train' and path.is_file():
            info = {}
            
            ext = path.suffix
            
            if ext == '.pdf': 
                text = _pdf_to_text(str(path))
            elif ext == '.xml': 
                text = _xml_to_text(str(path))

            meta = targets[targets['article_id'] == path.stem]
            
            info['extension'] = ext 
            info['text'] = text 
            info['article_id'] = path.stem
            # info['embedding'] = model.encoder(info['text']) if text != '' else None
            
            if not meta.empty:
                metas = meta.iloc[0].to_dict()
                info["source"] = metas.get("type", DEFAULT_SOURCE_TYPE)
                info["dataset_id"] = metas.get("dataset_id", None)
                info["dataset_id_cited"] = resolve_accession(info["dataset_id"]) if info["dataset_id"] is not None else None
            else:
                logger.warning("No metadata found for %s", path.stem)

            yield Article(**info)


In [27]:
class DoiData(Dataset): 
    """ Doi Dataset handler. 
    Target types:
    'Unknown': missing from train dataset
    'Missing': missing from data - predefined in the dataset
    'Primary' / 'Secondary': Main Data Referencing Labels
    """
    
    def __init__(self):
        self.data = list(load_train_dataset())
        
        assert len(self.data) > 0, "Empty dataset loaded to instance"
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int): 
        if idx > len(self.data) or idx < 0:
            raise ValueError('Index out of range.')

        # TODO: CONVERT TO X_train, y_train outputs
        article = self.data[idx]
        
        if article.dataset_id_cited:
            if meta := Article.fetch_meta_doi(article.dataset_id_cited):
                return {**asdict(article), **asdict(meta)}
                
        return asdict(article)
        
ds = DoiData()

Loading Train datasets: 988it [53:56,  3.28s/it] 


In [39]:
full_data = list(ds)

In [42]:
data = pd.DataFrame.from_records(full_data)

In [45]:
data.shape

(924, 18)

In [46]:
# Saving to /kaggle/working dir as parquet OR csv
data.to_parquet('/kaggle/working/train_dataset.parquet')
data.to_csv('/kaggle/working/train_dataset.csv', index=False)