In [3]:
# -----------------------------------------------------------------------------------
# 0.  Notebook boiler-plate  ────────────────────────────────────────────────────────
# -----------------------------------------------------------------------------------
!pip install --quiet requests lxml tqdm pandas

import json, re, time, requests, os, gzip
import camelot
import pandas as pd
import glob
import pdfplumber # Import pdfplumber
import pandas as pd, textwrap, os, sys, pprint, itertools
from pathlib import Path
from datetime import datetime
from typing import List, Dict
from lxml import etree        # robust XML parser
from tqdm import tqdm # progress-bar in Jupyter
from pebble import ProcessPool
from concurrent.futures import TimeoutError as PebbleTimeoutError
import docx 


# --- Configuration: Keywords and Regular Expressions ---
FORWARD_KEYWORDS = ['forward', 'fw', 'fp', 'fwd', 'sense', 'forw']
REVERSE_KEYWORDS = ['reverse', 'rev', 'rp', 'rv', 'antisense', 'revs']
GENE_HEADER_KEYWORDS = ['gene', 'target', 'name', 'symbol', 'oligonucleotide', 'oligos', 'locus']
ALLOWED_DNA_CHARS_FOR_FILTER = "ACTG"


# -----------------------------------------------------------------------------------
# 0.5  abhi, change htis directoy here───────────────────────────────────────────────
# -----------------------------------------------------------------------------------
DATA_DIR = Path("data/psc")
DATA_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR = Path("data/psc/output")
OUT_DIR.mkdir(parents=True, exist_ok=True)


# Europe PMC imposes a 30 req/min soft limit → be polite.
HEADERS   = {"User-Agent": "PrimerMiner/0.1 (tmolley@ucsd.edu)"}
BASE_URL  = "https://www.ebi.ac.uk/europepmc/webservices/rest/"


# -----------------------------------------------------------------------------------
# 1.  Search Europe PMC for the latest 10 OA papers mentioning “GAPDH”  ─────────────
# -----------------------------------------------------------------------------------
def cleanup_downloaded_files(dirs_to_clean: List[Path]):
    """
    Deletes all files within a list of specified directories.
    This is used to manage storage space between batch runs.
    """
    for directory in dirs_to_clean:
        if not directory.is_dir():
            print(f"Cleanup skipped: Directory '{directory}' does not exist.")
            continue

        print(f"  Cleaning files in: {directory}")
        files_deleted = 0
        for item in directory.iterdir():
            # Ensure we only delete files, not subdirectories
            if item.is_file():
                try:
                    item.unlink()
                    files_deleted += 1
                except Exception as e:
                    print(f"    Could not delete file {item}: {e}")
        print(f"    ...deleted {files_deleted} file(s).")
        
def europepmc_search_all(query: str,
                         max_records: int = 10000000,
                         page_size: int = 1000,
                         throttle: float = 0.3) -> List[Dict]:
    """
    Fetch up to `max_records` hits for `query`, paging with Europe PMC's
    cursorMark API. Returns a list of JSON result dicts.
    This version is updated to handle cases where no 'nextCursorMark' is returned.
    """
    hits, retrieved = [], 0
    cursor = "*"

    while retrieved < max_records:
        url = (f"{BASE_URL}search"
               f"?query={requests.utils.quote(query)}"
               f"&format=json"
               f"&pageSize={page_size}"
               f"&cursorMark={cursor}")
        r = requests.get(url, headers=HEADERS, timeout=40)
        r.raise_for_status()
        data = r.json()

        if "resultList" not in data:
            raise RuntimeError(f"Europe PMC API error: {data.get('error', data)}")

        batch = data["resultList"]["result"]
        if not batch:  # no more results
            break

        hits.extend(batch)
        retrieved += len(batch)
        
        # --- MODIFICATION ---
        # Gracefully handle the end of results when 'nextCursorMark' is absent.
        # The .get() method returns None if the key is not found, preventing a KeyError.
        cursor = data.get("nextCursorMark")
        if not cursor:
            # No more pages to fetch, so we exit the loop.
            break
        # --- END MODIFICATION ---

        time.sleep(throttle)  # be polite (≤ 30 req / min)

    return hits[:max_records]
    
def load_hgnc_symbol_set(cache_dir="hgnc_cache") -> set[str]:
    # Use Path directly instead of pathlib.Path
    cache_path = Path(cache_dir) / "homo_sapiens.gene_info.gz"
    if not cache_path.exists():
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        url = ("https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/"
               "Mammalia/Homo_sapiens.gene_info.gz")
        print("⏬  downloading HGNC symbol list …") # Corrected emoji display if needed
        try:
            r = requests.get(url, timeout=60)
            r.raise_for_status()
            cache_path.write_bytes(r.content)
        except requests.exceptions.RequestException as e:
            print(f"Error downloading HGNC symbols: {e}")
            print("Proceeding without HGNC symbol validation for XML gene names.")
            return set() # Return an empty set if download fails

    symbols = set()
    try:
        with gzip.open(cache_path, "rt", encoding='utf-8') as fh: # Added encoding
            for line in fh:
                if line.startswith("#"):
                    continue
                cols = line.rstrip("\n").split("\t")
                if len(cols) > 2: # Ensure there are enough columns
                    symbols.add(cols[2].upper())  # official symbol column
    except Exception as e:
        print(f"Error reading or processing HGNC file {cache_path}: {e}")
        print("Proceeding without HGNC symbol validation for XML gene names.")
        return set() # Return an empty set if file processing fails
    
    if not symbols:
        print(f"Warning: HGNC symbol set is empty after attempting to load from {cache_path}.")
    else:
        print(f"Loaded {len(symbols)} HGNC symbols.")
    return symbols
    
# ---------------------------------------------------------------------------
# 3  Helper: fetch XML only if we don't have it already
# ---------------------------------------------------------------------------
def get_or_download_xml(pmcid: str, dest_dir: Path) -> Path:
    """
    Return the path to the XML file for `pmcid`.
    If the file is already on disk, just return the path.
    Otherwise download from Europe PMC and save it.
    """
    fpath = dest_dir / f"{pmcid}.xml"
    if fpath.exists():                # ✅ cached
        return fpath

    # 🠞 not cached → download
    url = f"{BASE_URL}{pmcid}/fullTextXML"
    r   = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    fpath.write_text(r.text, encoding="utf-8")
    return fpath


# -----------------------------------------------------------------------------------
# 4.  Download full-text XML and supplement for each PMCID  ─────────────────────────
# -----------------------------------------------------------------------------------
def fetch_fulltext_xml(pmcid: str) -> str:
    """Return the raw JATS XML for a given PMCID from Europe PMC."""
    url = f"{BASE_URL}{pmcid}/fullTextXML"
    r   = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return r.text

def save_xml(pmcid: str, xml_text: str) -> Path:
    """Persist the XML to disk and return the file path."""
    fpath = DATA_DIR / f"{pmcid}.xml"
    fpath.write_text(xml_text, encoding="utf-8")
    return fpath
    
def safe_json(resp):
    """Return resp.json() or an empty dict if the payload isn’t JSON."""
    try:
        if resp.text.strip():
            return resp.json()
    except json.JSONDecodeError:
        pass
    return {}
    
def rename_supp_files(pmcid: str, file_paths: list[Path]) -> list[Path]:
    """
    Rename each Path in `file_paths` to <pmcid>_supp<N>.<ext>
    Returns a list of new Path objects (same length/order as input).
    """
    new_paths = []
    for i, old_path in enumerate(sorted(file_paths), 1):
        new_name = f"{pmcid}_supp{i}{old_path.suffix.lower()}"
        new_path = old_path.with_name(new_name)
        if new_path.exists():            # avoid accidental overwrite
            new_path = old_path          # keep original
        else:
            old_path.rename(new_path)
        new_paths.append(new_path)
    return new_paths
    
def fetch_supplements_from_xml(pmcid: str,
                               xml_path: Path,
                               dest_dir: Path,
                               pause: float = 0.2) -> list[Path]:
    """
    Download *only* PDF supplementary files referenced in the JATS XML.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)
    if not xml_path.exists():
        print(f"⚠️  XML file missing for {pmcid}")
        return []

    # ---- 1. gather every xlink:href ----------------------------------------
    root = etree.parse(str(xml_path))
    ns   = {"x": "http://www.w3.org/1999/xlink"}
    hrefs = set()
    hrefs.update(root.xpath("//supplementary-material/@x:href", namespaces=ns))
    hrefs.update(root.xpath("//supplementary-material//media/@x:href", namespaces=ns))
    hrefs.update(root.xpath("//ext-link[contains(@ext-link-type,'supplement')]/@x:href",
                            namespaces=ns))

    if not hrefs:
        return []

    # ---- 2. download PDF files only ---------------------------------------
    base  = f"https://europepmc.org/articles/{pmcid}/bin/"
    saved = []

    for href in hrefs:
        url   = href if href.startswith("http") else base + href.lstrip("/")
        fname = url.split("/")[-1].split("?")[0]

        # **PDF filter**
        if not fname.lower().endswith((".pdf", ".docx")):
            continue

        fpath = dest_dir / fname
        if fpath.exists():
            saved.append(fpath)
            continue

        try:
            r = requests.get(url, headers=HEADERS, timeout=60)
            r.raise_for_status()
            fpath.write_bytes(r.content)
            saved.append(fpath)
            time.sleep(pause)
        except requests.HTTPError as e:
            print(f"⚠️  {pmcid}  {fname} →", e)

    return saved
# -----------------------------------------------------------------------------------
# 5.  supplmental reader functions  ────────────────────────────────────────────────-
# -----------------------------------------------------------------------------------

def parse_docx_table_to_dataframe(docx_table_obj):
    """Converts a python-docx table object to a pandas DataFrame."""
    data = []
    keys = None
    
    # Attempt to use the first row as headers
    if docx_table_obj.rows:
        potential_header_cells = [cell.text.strip() for cell in docx_table_obj.rows[0].cells]
        # Crude check: if not too many empty cells and not overly long strings, assume header
        if len(potential_header_cells) > 0 and \
           sum(1 for h in potential_header_cells if not h) < len(potential_header_cells) / 2 and \
           all(len(h) < 100 for h in potential_header_cells): # Avoid very long cell content as header
            keys = potential_header_cells
            
    for i, row in enumerate(docx_table_obj.rows):
        text = [cell.text.strip() for cell in row.cells]
        if keys and i == 0 and len(docx_table_obj.rows) > 1: # If keys were taken from first row and there are more rows
            continue # Skip adding the header row to data list
        
        # If keys were set and match current row length, create a dict for this row
        if keys and len(text) == len(keys) and (i > 0 or len(docx_table_obj.rows) == 1):
             data.append(dict(zip(keys, text)))
        else: # Fallback to list of lists if no keys or length mismatch
            data.append(text) # Will have default integer columns if this path is taken mostly

    if not data:
        return pd.DataFrame()

    try:
        df = pd.DataFrame(data)
        # If dataframe was created from list of lists and keys were determined but not used as columns yet
        if keys and not isinstance(data[0], dict) and len(keys) == df.shape[1]:
            df.columns = keys
    except Exception: # Fallback if DataFrame creation is problematic
        df = pd.DataFrame()

    # If after all that, columns are still default integers, and first data row looks like header
    if not df.empty and all(isinstance(col, int) for col in df.columns) and df.shape[0] > 0:
        first_data_row_values = [str(c).strip() for c in df.iloc[0]]
        # Simple heuristic: if first row doesn't contain typical long sequences and has text
        is_first_row_likely_header = not any(is_valid_primer_sequence(extract_sequence_and_direction_from_cell(str(c))[0] or "", 18, 40) for c in first_data_row_values) and \
                                   any(len(str(c)) > 0 for c in first_data_row_values)
        if is_first_row_likely_header and df.shape[0] > 1:
            df.columns = first_data_row_values
            df = df[1:].reset_index(drop=True)
            
    return df

def extract_primers_from_docx(docx_path: Path, pmcid: str, hgnc_symbols: set) -> List[Dict]:
    """
    Extracts primer sequences and context from DOCX files (tables and prose).
    """
    found_docx_primers = []
    if not docx_path.exists():
        return found_docx_primers

    try:
        doc = docx.Document(docx_path) # from python-docx library
    except Exception as e:
        print(f"    Error opening DOCX file {docx_path.name}: {e}")
        return found_docx_primers

    # --- 1. Process Tables in DOCX ---
    if doc.tables:
        # print(f"    Found {len(doc.tables)} table(s) in DOCX: {docx_path.name}")
        for table_idx, docx_table_obj in enumerate(doc.tables):
            # print(f"      Processing DOCX Table {table_idx+1}...")
            df_from_docx = parse_docx_table_to_dataframe(docx_table_obj)
            if not df_from_docx.empty:
                # print(f"        Parsed DOCX table to DataFrame (shape: {df_from_docx.shape})")
                # Use your existing scan_dataframe_for_primers function
                source_desc = f"DOCX Table {table_idx+1}" # Page info is not direct like PDF
                primers_from_table = scan_dataframe_for_primers(
                    df_from_docx, 
                    pmcid,
                    source_desc, 
                    docx_path.name, # Source File
                    hgnc_symbols
                )
                if primers_from_table:
                    found_docx_primers.extend(primers_from_table)
            # else:
                # print(f"      DOCX Table {table_idx+1} resulted in an empty DataFrame.")

    # --- 2. Process Prose Text (paragraphs) in DOCX ---
    # print(f"    Scanning {len(doc.paragraphs)} paragraphs in DOCX: {docx_path.name}...")
    unique_prose_blocks_docx = set()
    dna_candidate_regex_prose = re.compile(r'\b((?:[ACGUTNRYKMSWBDHV]\s*){15,50})\b', re.IGNORECASE)

    for para_idx, para in enumerate(doc.paragraphs):
        text_block = para.text.strip()
        if not text_block or len(text_block) < 20 or text_block in unique_prose_blocks_docx:
            continue
        unique_prose_blocks_docx.add(text_block)

        for match in dna_candidate_regex_prose.finditer(text_block):
            raw_dna_match_with_spaces = match.group(1)
            core_bases, direction_from_text_body, _ = \
                extract_sequence_and_direction_from_cell(raw_dna_match_with_spaces)

            if core_bases and is_valid_primer_sequence(core_bases):
                sequence = core_bases
                probable_direction = direction_from_text_body if direction_from_text_body else "Unknown"
                probable_gene = "Unknown"
                
                match_start_in_block = match.start()
                match_end_in_block = match.end()
                
                context_chars = 80
                text_before = text_block[max(0, match_start_in_block - context_chars) : match_start_in_block]
                text_after = text_block[match_end_in_block : min(len(text_block), match_end_in_block + context_chars)]
                
                if probable_direction == "Unknown":
                    if any(k in text_before.lower() or k in text_after.lower() for k in FORWARD_KEYWORDS):
                        probable_direction = "Forward"
                    elif any(k in text_before.lower() or k in text_after.lower() for k in REVERSE_KEYWORDS):
                        probable_direction = "Reverse"
                
                gene_search_context = text_before + raw_dna_match_with_spaces + text_after
                potential_gene_matches = re.findall(r'\b([A-Z][A-Za-z0-9-]{2,15})\b', gene_search_context) # Case sensitive for initial find
                best_dist = float('inf')
                common_prose_noise = {"PCR", "DNA", "RNA", "PRIMER", "FORWARD", "REVERSE", "SEQUENCE", "METHOD", "FIGURE", "TABLE"} | set(k.upper() for k in FORWARD_KEYWORDS) | set(k.upper() for k in REVERSE_KEYWORDS)

                for pg_cand_raw in potential_gene_matches:
                    pg_cand_upper = pg_cand_raw.upper()
                    # Check if it's an HGNC symbol OR just a plausible non-noise capitalized word
                    is_hgnc_or_plausible = (pg_cand_upper in hgnc_symbols) or \
                                           (len(pg_cand_raw) > 2 and pg_cand_upper not in common_prose_noise and not pg_cand_raw.islower())

                    if is_hgnc_or_plausible:
                        try:
                            for m_pg in re.finditer(re.escape(pg_cand_raw), gene_search_context): # Match original case
                                primer_rel_start = len(text_before)
                                dist = abs(m_pg.start() - primer_rel_start)
                                if dist < best_dist and dist < context_chars : 
                                    best_dist = dist
                                    cleaned_pg_cand = clean_gene_name(pg_cand_raw)
                                    if cleaned_pg_cand != "Unknown": # Ensure clean_gene_name doesn't reject it
                                        probable_gene = cleaned_pg_cand
                        except: pass 
                
                original_context_snip = f"...{text_block[max(0,match_start_in_block-30):min(len(text_block),match_end_in_block+30)]}..."
                found_docx_primers.append({
                    "PMCID": pmcid, "Gene": probable_gene, "Sequence": sequence,
                    "Orientation": probable_direction, "Source File": docx_path.name, 
                    "Page": f"DOCX Paragraph", # Can't get page number easily from python-docx paragraphs
                    "Original Cell Text": original_context_snip 
                })
    return found_docx_primers

    
def extract_pmcid_from_filename(filename):
    if not isinstance(filename, str):
        return "UnknownPMCID"
    # Regex to capture characters before "_supp" followed by anything and ".pdf"
    # This handles variations like _supp1.pdf, _supplemental.pdf, etc.
    match = re.match(r"^(.*?)_supp(?:lemental|lement|s)?\d*\.pdf$", filename, re.IGNORECASE)
    if match:
        pmcid_candidate = match.group(1)
        # Further ensure it looks like a PMCID if that's the strict requirement
        if pmcid_candidate.upper().startswith("PMC"):
            return pmcid_candidate
        # If not starting with PMC, it might be another ID format used before "_supp"
        return pmcid_candidate 
    
    # Fallback if no "_supp" pattern but filename itself is a PMCID
    pmc_direct_match = re.match(r"^(PMC\d+)\.pdf$", filename, re.IGNORECASE)
    if pmc_direct_match:
        return pmc_direct_match.group(1)

    # Generic fallback: return filename without extension if no specific pattern found
    return os.path.splitext(filename)[0]


def is_plausible_dna_segment(text_segment):
    if not isinstance(text_segment, str) or not text_segment.strip():
        return False
    allowed_interspersed_chars = " -" 
    has_dna_char = False
    for char_val in text_segment.upper():
        if char_val in ALLOWED_DNA_CHARS_FOR_FILTER:
            has_dna_char = True
        elif char_val in allowed_interspersed_chars:
            continue
        elif char_val.isalpha(): 
            return False 
    return has_dna_char

def extract_sequence_and_direction_from_cell(cell_text):
    if not isinstance(cell_text, str) or not cell_text.strip():
        return None, None, ""

    text_body = cell_text.strip()
    original_text_for_context = text_body
    inferred_direction = None
    
    # This section for identifying direction keywords (e.g., "Forward", "Rv") is preserved as it works well.
    direction_affixes = {
        "Forward": [
            (r"^(?:FP[\s:-]*|FWD[\s:-]*|FORWARD[\s:-]*|SENSE[\s:-]*)", "prefix"),
            (r"\s*\((?:Forward|FWD|FP|Sense)\)$", "suffix")
        ],
        "Reverse": [
            (r"^(?:RP[\s:-]*|REV[\s:-]*|REVERSE[\s:-]*|ANTISENSE[\s:-]*)", "prefix"),
            (r"\s*\((?:Reverse|REV|RP|Antisense)\)$", "suffix")
        ]
    }
    prime_5_prefix_pattern = r"^(?:5['`´‘’]?[-–—]?)"
    prime_3_suffix_pattern = r"(?:[-–—]?3['`´‘’]?)$"

    text_body_changed_in_pass = True
    while text_body_changed_in_pass:
        text_body_before_affix_pass = text_body
        text_body_changed_in_pass = False
        current_pass_direction_found = False
        for dir_key, patterns in direction_affixes.items():
            if current_pass_direction_found and inferred_direction: break
            for pattern_str, affix_type in patterns:
                original_len_text_body = len(text_body)
                if affix_type == "prefix":
                    match = re.match(pattern_str, text_body, re.IGNORECASE)
                    if match:
                        if not inferred_direction: inferred_direction = dir_key
                        text_body = text_body[match.end():].strip()
                        current_pass_direction_found = True
                elif affix_type == "suffix":
                    match = re.search(pattern_str, text_body, re.IGNORECASE)
                    if match:
                        if not (match.start() == 0 and inferred_direction):
                            if not inferred_direction: inferred_direction = dir_key
                            text_body = text_body[:match.start()].strip()
                
                if len(text_body) != original_len_text_body:
                    text_body_changed_in_pass = True
                    if current_pass_direction_found: break
            if current_pass_direction_found and inferred_direction: break

    text_body = re.sub(prime_5_prefix_pattern, "", text_body, flags=re.IGNORECASE).strip()
    text_body = re.sub(prime_3_suffix_pattern, "", text_body, flags=re.IGNORECASE).strip()

    # --- REPLACEMENT LOGIC ---
    # The old logic (using is_plausible_dna_segment and filter) is replaced.
    # This new regex approach is more robust and solves both of your issues.

    # 1. Find all contiguous blocks of valid DNA characters (A, C, T, G).
    #    - We set a minimum length (e.g., 10) to avoid matching short acronyms.
    #    - This cleanly separates primers from gene names ("GAPDH") and ignores junk ("â€²").
    primer_candidates = re.findall(r'([ACTG]{10,})', text_body, re.IGNORECASE)
    
    if not primer_candidates:
        return None, inferred_direction, original_text_for_context

    # 2. From the candidates found, choose the longest one that passes full validation.
    best_sequence = ""
    for candidate in primer_candidates:
        # Your 'is_valid_primer_sequence' function already checks for the correct length (e.g., 18-40).
        if is_valid_primer_sequence(candidate) and len(candidate) > len(best_sequence):
            best_sequence = candidate
    
    if best_sequence:
        # 3. Return the cleaned, validated, uppercase sequence if found.
        return best_sequence.upper(), inferred_direction, original_text_for_context
    else:
        # No candidate passed the final validation (e.g., all were too short or too long).
        return None, inferred_direction, original_text_for_context

def is_valid_primer_sequence(sequence_bases, min_len=18, max_len=40):
    if not isinstance(sequence_bases, str): return False 
    if not (min_len <= len(sequence_bases) <= max_len):
        return False
    if not all(char_val.upper() in ALLOWED_DNA_CHARS_FOR_FILTER for char_val in sequence_bases):
        return False
    return True

def clean_gene_name(gene_text):
    if not isinstance(gene_text, str) or not gene_text.strip() or gene_text.lower() == 'nan':
        return "Unknown"
    match = re.match(r"([a-zA-Z0-9_.\-]+(?:[-/][a-zA-Z0-9_.\-]+)*)", gene_text.strip())
    if match:
        name = match.group(1)
        if len(name) > 1 and not (name.isnumeric() and len(name) > 4): 
            # Convert common direction keywords found in gene cells to "Unknown" gene
            # if they were mistakenly identified as gene name
            if name.lower() not in (FORWARD_KEYWORDS + REVERSE_KEYWORDS + ["sense", "antisense", "forward primer", "reverse primer"]):
                 # Avoid names that are purely sequence like
                if not (len(name) > 15 and all(c.upper() in ALLOWED_DNA_CHARS_FOR_FILTER for c in name)):
                    return name
    return "Unknown" 


def find_primers_sequence_first(pdf_path, page_spec="all"):
    all_found_primers_details = []
    pdf_filename = os.path.basename(pdf_path)

    if not os.path.exists(pdf_path):
        print(f"Error: PDF file not found at {pdf_path}")
        return all_found_primers_details

    print(f"Attempting to read tables from '{pdf_filename}' pages: {page_spec}")
    
    start_camelot_time = time.time()
    lattice_tables_content = []
    stream_tables_content = []

    try:
        tables_lattice_obj = camelot.read_pdf(pdf_path, pages=page_spec, flavor='lattice', line_scale=30, shift_text=[' '], copy_text=['v'], suppress_stdout=True)
        if tables_lattice_obj.n > 0: 
            lattice_tables_content = list(tables_lattice_obj) 
    except Exception as e_lattice:
        print(f"  Lattice flavor failed for {pdf_filename}. Error: {e_lattice}")

    try:
        tables_stream_obj = camelot.read_pdf(pdf_path, pages=page_spec, flavor='stream', suppress_stdout=True)
        if tables_stream_obj.n > 0: 
            stream_tables_content = list(tables_stream_obj) 
    except Exception as e_stream:
        print(f"  Stream flavor failed for {pdf_filename}. Error: {e_stream}")
    
    end_camelot_time = time.time()
    print(f"  Camelot PDF processing took: {end_camelot_time - start_camelot_time:.2f} seconds.")
        
    combined_tables_list = lattice_tables_content + stream_tables_content
        
    if not combined_tables_list:
        print(f"No tables found by Camelot (lattice or stream combined) in {pdf_filename} on pages '{page_spec}'.")
        return all_found_primers_details
    # --- End Timing for Camelot ---  
    
    start_python_processing_time = time.time()
    unique_table_identifiers = set() 

    for table_idx, table_obj in enumerate(combined_tables_list): 
        try:
            table_content_hash = hash(table_obj.df.to_string())
            table_id = (table_obj.page, table_content_hash, table_obj.flavor) 
            if table_id in unique_table_identifiers: continue
            unique_table_identifiers.add(table_id)
        except Exception: pass

        df = table_obj.df 
        # print(f"\n--- Scanning Table {table_idx+1} (Page {table_obj.page}, Shape {df.shape}, Flavor {table_obj.flavor}) ---")

        if df.empty or df.shape[0] < 1 or df.shape[1] < 1: continue

        # Get column headers for context (from df.columns and potentially df.iloc[0])
        headers_from_df_columns = [str(col).strip().lower() for col in df.columns] 
        first_row_values_as_headers = [str(df.iloc[0, c_idx]).strip().lower() for c_idx in range(min(df.shape[1], len(df.iloc[0])))] if df.shape[0] > 0 else []
    
        for r_idx in range(df.shape[0]):
            # Attempt to determine a contextual gene for the entire row first
            row_context_gene = "Unknown"
            # Check first cell of the row for gene, if it's not a sequence itself
            if df.shape[1] > 0:
                first_cell_text = str(df.iloc[r_idx, 0]).strip()
                if first_cell_text and first_cell_text.lower() != 'nan':
                    temp_bases, _, _ = extract_sequence_and_direction_from_cell(first_cell_text)
                    if not (temp_bases and is_valid_primer_sequence(temp_bases, min_len=15)):
                        cleaned_name = clean_gene_name(first_cell_text)
                        if cleaned_name != "Unknown": row_context_gene = cleaned_name
            
            for c_idx in range(df.shape[1]):
                try:
                    raw_cell_text = str(df.iloc[r_idx, c_idx]).strip()
                    if not raw_cell_text or raw_cell_text.lower() == 'nan' or len(raw_cell_text) < 10: 
                        continue

                    core_bases, direction_from_cell, original_cell_text = \
                        extract_sequence_and_direction_from_cell(raw_cell_text)

                    if core_bases and is_valid_primer_sequence(core_bases):
                        probable_gene = row_context_gene # Start with row-level gene context
                        probable_direction = direction_from_cell if direction_from_cell else "Unknown"

                        # If gene is still unknown from row context, try column to the left (if not first col)
                        if probable_gene == "Unknown" and c_idx > 0:
                            gene_cand_text = str(df.iloc[r_idx, c_idx - 1]).strip()
                            if gene_cand_text and gene_cand_text.lower() != 'nan':
                                temp_bases, _, _ = extract_sequence_and_direction_from_cell(gene_cand_text)
                                if not (temp_bases and is_valid_primer_sequence(temp_bases, min_len=15)):
                                    cleaned_name = clean_gene_name(gene_cand_text)
                                    if cleaned_name != "Unknown": probable_gene = cleaned_name
                        
                        # Refine gene if current cell (which has primer) also has a gene-like prefix not caught by direction stripping
                        if probable_gene == "Unknown" or len(probable_gene) < 2 : # If context gene weak or unknown
                            gene_part_from_primer_cell_match = re.match(r"([a-zA-Z0-9_.\-]+)\s*[:\-(]", original_cell_text)
                            if gene_part_from_primer_cell_match:
                                potential_gene_prefix = clean_gene_name(gene_part_from_primer_cell_match.group(1))
                                if potential_gene_prefix != "Unknown":
                                    probable_gene = potential_gene_prefix


                        # Attempt to find Direction from column header if not found in cell:
                        if probable_direction == "Unknown":
                            current_col_header = ""
                            # Prioritize Camelot's direct column name if it's text
                            if c_idx < len(headers_from_df_columns) and not all(isinstance(c, int) for c in df.columns):
                                current_col_header = headers_from_df_columns[c_idx]
                            # Else, if columns were int, first row might be header
                            elif c_idx < len(first_row_values_as_headers): 
                                current_col_header = first_row_values_as_headers[c_idx]

                            if current_col_header: 
                                if any(k in current_col_header for k in FORWARD_KEYWORDS): probable_direction = "Forward"
                                elif any(k in current_col_header for k in REVERSE_KEYWORDS): probable_direction = "Reverse"
                        
                        # Final fallback for direction if sequence is in one cell, and gene name is in cell above, 
                        # and direction keywords are beside gene name in cell above
                        if probable_direction == "Unknown" and r_idx > 0 and probable_gene != "Unknown":
                            cell_above_text = str(df.iloc[r_idx-1, c_idx]).strip().lower()
                            if probable_gene.lower() in cell_above_text: # if gene name is part of cell above
                                if any(k in cell_above_text for k in FORWARD_KEYWORDS): probable_direction = "Forward"
                                elif any(k in cell_above_text for k in REVERSE_KEYWORDS): probable_direction = "Reverse"


                        all_found_primers_details.append({
                            "Sequence": core_bases, "Probable Gene": probable_gene, 
                            "Probable Direction": probable_direction, "Source File": pdf_filename, 
                            "Page": table_obj.page, "Original Cell Text": original_cell_text
                        })
                except Exception: continue

    
     # --- Transform to new output format (with .get() for safety) ---
    final_results_transformed = []
    for p_info in all_found_primers_details:
        # Use .get() to provide default values if a key is somehow missing from p_info
        source_file = p_info.get("Source File", pdf_filename) # Fallback to current pdf_filename
        probable_gene = p_info.get("Probable Gene", "Unknown")
        sequence_val = p_info.get("Sequence", "") # Must have sequence
        probable_direction = p_info.get("Probable Direction", "Unknown")
        page_val = p_info.get("Page", 0) # Default page to 0 if missing
        original_text = p_info.get("Original Cell Text", "")

        if not sequence_val: # Skip if no sequence was extracted for this entry
            continue

        pmcid = extract_pmcid_from_filename(source_file)
        
        orientation = "Unknown"
        if probable_direction: 
            direction_lower = probable_direction.lower()
            if "forward" in direction_lower or "sense" in direction_lower:
                orientation = "Forward"
            elif "reverse" in direction_lower or "antisense" in direction_lower:
                orientation = "Reverse"

        final_results_transformed.append({
            "PMCID": pmcid,
            "Gene": probable_gene,
            "Sequence": sequence_val,
            "Orientation": orientation,
            "Source File": source_file,
            "Page": page_val,
            "Original Cell Text": original_text
        })
    
    end_python_processing_time = time.time()
    # print(f"  Python post-Camelot processing took: {end_python_processing_time - start_python_processing_time:.2f} seconds.")
            
    return final_results_transformed

# Add this function definition with your other helper functions

# -----------------------------------------------------------------------------------
# 6.  main text reader functions  ────────────────────────────────────────────────-
# -----------------------------------------------------------------------------------
def process_pdf_with_timeout_wrapper(args_tuple):
    """
    Wrapper function to call find_primers_sequence_first for a single PDF.
    Handles arguments and exceptions for multiprocessing.
    """
    pdf_path, page_spec, hgnc_symbols_set, pdf_filename_for_log = args_tuple
    # print(f"  Worker starting for: {pdf_filename_for_log}, pages: {page_spec}") # Optional: for verbose worker start
    try:
        result = find_primers_sequence_first(pdf_path, page_spec, hgnc_symbols_set)
        # print(f"  Worker finished for: {pdf_filename_for_log}. Found {len(result)} primer items.") # Optional
        return result
    except Exception as e:
        print(f"  ERROR in worker process for {pdf_filename_for_log}: {e}")
        import traceback
        traceback.print_exc() # Print full traceback from worker for debugging
        return [] # Return empty list on error within the worker
        
def get_element_text_content(element) -> str:
    """Extracts and concatenates all text within an XML element."""
    return " ".join(text.strip() for text in element.xpath(".//text()") if text.strip()).strip()
def parse_xml_table_to_dataframe(table_etree_element):
    """Converts an lxml etree <table> element to a pandas DataFrame."""
    headers = []
    header_elements = table_etree_element.xpath("./thead/tr/th | ./tr[1]/th | ./thead/tr/td | ./tr[1]/td") # More comprehensive header find
    
    # If the first row has only <th>, it's likely the header row
    if header_elements and all(el.tag == 'th' for el in table_etree_element.xpath("./tr[1]/*")):
         headers = [get_element_text_content(th).strip() for th in header_elements]
    elif header_elements : # if first row has mix of th/td or just td but are headers from xpath
         headers = [get_element_text_content(th).strip() for th in header_elements]


    rows_data = []
    # Get all table rows (tr)
    tr_elements = table_etree_element.xpath("./tbody/tr | ./tr") # Get body rows or all rows if no tbody
    
    start_row_index = 0
    if headers and len(tr_elements) > 0 : # If headers were found from tr[1] or thead
        # Check if the first tr_element was used for headers
        first_tr_cells_for_header_check = tr_elements[0].xpath("./th | ./td")
        first_tr_text_for_header_check = [get_element_text_content(c).strip() for c in first_tr_cells_for_header_check]
        if first_tr_text_for_header_check == headers:
            start_row_index = 1 # Data rows start from the next row

    for r_idx in range(start_row_index, len(tr_elements)):
        tr = tr_elements[r_idx]
        cells = tr.xpath("./td | ./th") # Cells can be td or th (e.g. row headers)
        row_text = [get_element_text_content(cell).strip() for cell in cells]
        if any(rt.strip() for rt in row_text): # Only add non-empty rows
            rows_data.append(row_text)

    if not rows_data:
        return pd.DataFrame()

    df = pd.DataFrame(rows_data)
    if headers and len(headers) == df.shape[1]:
        df.columns = headers
    elif df.shape[0] > 0 and not headers: # No explicit headers found, try to use first data row if suitable
        first_row_is_data_like = any(is_valid_primer_sequence(extract_sequence_and_direction_from_cell(str(c))[0] or "", 18,40) for c in df.iloc[0])
        if not first_row_is_data_like and df.shape[0] > 1 :
            df.columns = [str(c).strip() for c in df.iloc[0]]
            df = df[1:].reset_index(drop=True)
        # Else: use default integer column names from pandas
    return df

def scan_dataframe_for_primers(df: pd.DataFrame, pmcid: str, 
                               source_description: str, # e.g., "XML Table in file.xml" or "PDF Page X"
                               source_filename_context: str, # filename for XML, or page number for PDF
                               hgnc_symbols: set) -> List[Dict]:
    """
    Scans a given DataFrame (from PDF or XML table) for primer sequences.
    """
    found_primers_in_df = []
    if df.empty:
        return found_primers_in_df

    df_headers_original_case = [str(col).strip() for col in df.columns] # Keep original case for display/debug if needed
    df_headers_lower = [h.lower() for h in df_headers_original_case]

    for r_idx in range(df.shape[0]):
        row_context_gene = "Unknown"
        # Try to get row context gene from the first cell if it's text-like and not a sequence
        if df.shape[1] > 0:
            first_cell_text_for_gene = str(df.iloc[r_idx, 0]).strip()
            if first_cell_text_for_gene and first_cell_text_for_gene.lower() != 'nan':
                temp_bases_gene, _, _ = extract_sequence_and_direction_from_cell(first_cell_text_for_gene)
                # A gene name shouldn't be a long valid primer sequence
                if not (temp_bases_gene and is_valid_primer_sequence(temp_bases_gene, min_len=15)): # Check if it's NOT a primer
                    cleaned_name = clean_gene_name(first_cell_text_for_gene) # clean_gene_name does further checks
                    if cleaned_name != "Unknown":
                         # Validate against HGNC if it looks like a gene symbol
                        if cleaned_name.upper() in hgnc_symbols or (len(cleaned_name) > 4 and not any(k in cleaned_name.lower() for k in FORWARD_KEYWORDS + REVERSE_KEYWORDS)): # Allow longer non-HGNC names
                            row_context_gene = cleaned_name

        for c_idx in range(df.shape[1]):
            try:
                raw_cell_text = str(df.iloc[r_idx, c_idx]).strip()
                if not raw_cell_text or raw_cell_text.lower() == 'nan' or len(raw_cell_text) < 10:
                    continue

                core_bases, direction_from_cell, original_cell_text = \
                    extract_sequence_and_direction_from_cell(raw_cell_text)

                if core_bases and is_valid_primer_sequence(core_bases):
                    sequence = core_bases
                    probable_direction = direction_from_cell if direction_from_cell else "Unknown"
                    probable_gene = row_context_gene # Start with row context

                    # If gene still "Unknown" from row context, try column to the left
                    if probable_gene == "Unknown" and c_idx > 0:
                        gene_cand_text_left = str(df.iloc[r_idx, c_idx - 1]).strip()
                        if gene_cand_text_left and gene_cand_text_left.lower() != 'nan':
                            temp_bases_left, _, _ = extract_sequence_and_direction_from_cell(gene_cand_text_left)
                            if not (temp_bases_left and is_valid_primer_sequence(temp_bases_left, min_len=15)):
                                cleaned_name_left = clean_gene_name(gene_cand_text_left)
                                if cleaned_name_left != "Unknown": probable_gene = cleaned_name_left
                    
                    # If gene is still "Unknown" and current cell has gene-like prefix before primer
                    if probable_gene == "Unknown" or len(probable_gene) < 2:
                        gene_part_match = re.match(r"([a-zA-Z0-9_.\-]+(?:[-/][a-zA-Z0-9_.\-]+)*)\s*[:\-(]", original_cell_text)
                        if gene_part_match:
                            potential_gene_prefix = clean_gene_name(gene_part_match.group(1))
                            if potential_gene_prefix != "Unknown": probable_gene = potential_gene_prefix
                    
                    # Use column header for direction if not found in cell
                    if probable_direction == "Unknown" and c_idx < len(df_headers_lower):
                        col_header_text = df_headers_lower[c_idx]
                        if any(k in col_header_text for k in FORWARD_KEYWORDS):
                            probable_direction = "Forward"
                        elif any(k in col_header_text for k in REVERSE_KEYWORDS):
                            probable_direction = "Reverse"
                    
                    # If gene is still "Unknown", check current column header (if not a direction/sequence keyword itself)
                    if probable_gene == "Unknown" and c_idx < len(df_headers_lower):
                        header_text = df_headers_lower[c_idx]
                        is_dir_header = any(k in header_text for k in FORWARD_KEYWORDS + REVERSE_KEYWORDS)
                        is_seq_header = any(k in header_text for k in ['sequence', 'primer', 'oligo', 'probe'])
                        if not is_dir_header and not is_seq_header and len(df_headers_original_case[c_idx]) > 1 : # Use original case for clean_gene_name
                            cleaned_header_gene = clean_gene_name(df_headers_original_case[c_idx])
                            if cleaned_header_gene != "Unknown": probable_gene = cleaned_header_gene

                    found_primers_in_df.append({
                        "PMCID": pmcid, "Gene": probable_gene, "Sequence": sequence,
                        "Orientation": probable_direction, "Source File": source_filename_context, # Use the context string
                        "Page": source_description, # General description (e.g. "XML Table", "PDF Page X")
                        "Original Cell Text": original_cell_text
                    })
            except Exception: # Catch errors within cell processing
                continue
    return found_primers_in_df


def extract_primers_from_xml(xml_path: Path, pmcid: str, hgnc_symbols: set) -> List[Dict]:
    found_xml_primers = []
    if not xml_path.exists(): return found_xml_primers

    try:
        with open(xml_path, 'rb') as f: xml_content_bytes = f.read()
        parser = etree.XMLParser(recover=True, strip_cdata=True, resolve_entities=False, no_network=True)
        root = etree.fromstring(xml_content_bytes, parser=parser)
    except Exception as e:
        print(f"  Error parsing XML for {pmcid} ({xml_path.name}): {e}")
        return found_xml_primers

    # --- 1. Process Tables within XML ---
    table_elements = root.xpath("//table[ancestor::table-wrap]") # More specific: tables within table-wraps
    if not table_elements: # Fallback to any table element
        table_elements = root.xpath("//table")
    
    print(f"  Found {len(table_elements)} <table> elements in XML for {pmcid}.")
    for idx, table_el in enumerate(table_elements):
        # print(f"    Processing XML Table {idx+1} in {pmcid}...")
        df_from_xml = parse_xml_table_to_dataframe(table_el)
        if not df_from_xml.empty:
            source_desc = f"XML Table {idx+1} (cols: {list(df_from_xml.columns)})"
            primers_from_table = scan_dataframe_for_primers(df_from_xml, pmcid, source_desc, xml_path.name, hgnc_symbols)
            if primers_from_table:
                found_xml_primers.extend(primers_from_table)
        # else:
            # print(f"      XML Table {idx+1} parsed to empty DataFrame.")

    # --- 2. Process Prose Text ---
    # Define XPaths for prose content, trying to exclude table cell content already processed
    prose_xpaths = [
        "//sec[translate(@sec-type,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='methods']//p[not(.//table)]",
        "//sec[translate(@sec-type,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='materialsmethods']//p[not(.//table)]",
        "//body//p[not(ancestor::table-wrap) and not(ancestor::fig) and not(.//table)]",
        "//fig//caption//p[not(.//table)]",
        "//table-wrap//caption//p[not(.//table)]", # Caption of table-wrap if it doesn't contain a table itself
        "//table-wrap-foot//p[not(.//table)]"
    ]
    unique_prose_blocks = set()
    dna_candidate_regex = re.compile(r'\b((?:[ACGUTNRYKMSWBDHV]\s*){15,50})\b', re.IGNORECASE)

    for xp in prose_xpaths:
        try:
            elements = root.xpath(xp)
            for elem in elements:
                text_block = get_element_text_content(elem)
                if not text_block or len(text_block) < 20 or text_block in unique_prose_blocks:
                    continue
                unique_prose_blocks.add(text_block)

                for match in dna_candidate_regex.finditer(text_block):
                    raw_dna_match_with_spaces = match.group(1)
                    core_bases, direction_from_text_body, _ = \
                        extract_sequence_and_direction_from_cell(raw_dna_match_with_spaces) # Use this to clean and get initial direction

                    if core_bases and is_valid_primer_sequence(core_bases):
                        sequence = core_bases
                        probable_direction = direction_from_text_body if direction_from_text_body else "Unknown"
                        probable_gene = "Unknown"
                        
                        match_start_in_block = match.start()
                        match_end_in_block = match.end()
                        
                        # Context for direction and gene (search around the raw_dna_match_with_spaces)
                        context_chars = 80 # How many chars before/after for context
                        text_before = text_block[max(0, match_start_in_block - context_chars) : match_start_in_block]
                        text_after = text_block[match_end_in_block : min(len(text_block), match_end_in_block + context_chars)]
                        
                        # Refine direction from local context if not found by extract_sequence_and_direction_from_cell
                        if probable_direction == "Unknown":
                            if any(k in text_before.lower() or k in text_after.lower() for k in FORWARD_KEYWORDS):
                                probable_direction = "Forward"
                            elif any(k in text_before.lower() or k in text_after.lower() for k in REVERSE_KEYWORDS):
                                probable_direction = "Reverse"
                        
                        # Gene finding in prose (simplified example, can be improved)
                        # Look for capitalized words or HGNC symbols near the primer
                        gene_search_context = text_before + raw_dna_match_with_spaces + text_after
                        potential_genes = re.findall(r'\b([A-Z][A-Za-z0-9-]{2,15})\b', gene_search_context) # Find capitalized words
                        best_dist = float('inf')
                        
                        common_prose_noise = {"PCR", "DNA", "RNA", "PRIMER", "FORWARD", "REVERSE", "SEQUENCE", "METHOD", "FIGURE", "TABLE"} | set(k.upper() for k in FORWARD_KEYWORDS) | set(k.upper() for k in REVERSE_KEYWORDS)

                        for pg_cand in potential_genes:
                            pg_upper = pg_cand.upper()
                            if pg_upper in hgnc_symbols and pg_upper not in common_prose_noise:
                                # Find position of pg_cand relative to raw_dna_match_with_spaces in gene_search_context
                                for m_pg in re.finditer(re.escape(pg_cand), gene_search_context, re.IGNORECASE):
                                    # Relative start of primer in gene_search_context
                                    primer_rel_start = len(text_before) 
                                    dist = abs(m_pg.start() - primer_rel_start)
                                    if dist < best_dist and dist < context_chars : # Must be close
                                        best_dist = dist
                                        probable_gene = clean_gene_name(pg_cand)
                        
                        original_context_snip = f"...{text_block[max(0,match_start_in_block-30):min(len(text_block),match_end_in_block+30)]}..."
                        found_xml_primers.append({
                            "PMCID": pmcid, "Gene": probable_gene, "Sequence": sequence,
                            "Orientation": probable_direction, "Source File": xml_path.name, 
                            "Page": "Main Text Prose", "Original Cell Text": original_context_snip
                        })
        except Exception: continue # Ignore errors for a single XPath or element
            
    return found_xml_primers


def find_primers_sequence_first(pdf_path, page_spec="all", hgnc_symbols: set = set()):
    all_found_primers_details = [] # This will store dicts from scan_dataframe_for_primers
    pdf_filename = os.path.basename(pdf_path)
    
    # Timeout for each Camelot call (lattice and stream)
    CAMELOT_CALL_TIMEOUT = 60  # Seconds (e.g., 1.5 minutes per Camelot flavor call)

    # print(f"Attempting to read tables from '{pdf_filename}' pages: {page_spec}") # Already printed in main
    
    start_camelot_time = time.time()
    lattice_tables_content = []
    stream_tables_content = []

    # Camelot parameters
    camelot_params_lattice = {
        'pages': page_spec, 
        'flavor': 'lattice', 
        'line_scale': 30, 
        'shift_text': [' '], 
        'copy_text': ['v'], 
        'suppress_stdout': True
    }
    camelot_params_stream = {
        'pages': page_spec, 
        'flavor': 'stream', 
        'suppress_stdout': True
    }

    with ProcessPool(max_workers=1) as pool: # Using 1 worker as we do calls sequentially
        # Lattice
        try:
            # print(f"  Submitting 'lattice' task for {pdf_filename}...")
            future_lattice = pool.schedule(camelot.read_pdf, args=[pdf_path], kwargs=camelot_params_lattice, timeout=CAMELOT_CALL_TIMEOUT)
            tables_lattice_obj = future_lattice.result()  # Blocks until result or timeout
            if tables_lattice_obj and tables_lattice_obj.n > 0:
                lattice_tables_content = list(tables_lattice_obj)
            # print(f"  Lattice found {len(lattice_tables_content)} table(s).")
        except PebbleTimeoutError:
            print(f"  Lattice flavor TIMED OUT for {pdf_filename} after {CAMELOT_CALL_TIMEOUT}s on pages '{page_spec}'.")
        except Exception as e_lattice:
            print(f"  Lattice flavor failed for {pdf_filename} (pages '{page_spec}'). Error: {e_lattice}")

        # Stream
        try:
            # print(f"  Submitting 'stream' task for {pdf_filename}...")
            future_stream = pool.schedule(camelot.read_pdf, args=[pdf_path], kwargs=camelot_params_stream, timeout=CAMELOT_CALL_TIMEOUT)
            tables_stream_obj = future_stream.result() # Blocks until result or timeout
            if tables_stream_obj and tables_stream_obj.n > 0:
                stream_tables_content = list(tables_stream_obj)
            # print(f"  Stream found {len(stream_tables_content)} table(s).")
        except PebbleTimeoutError:
            print(f"  Stream flavor TIMED OUT for {pdf_filename} after {CAMELOT_CALL_TIMEOUT}s on pages '{page_spec}'.")
        except Exception as e_stream:
            print(f"  Stream flavor failed for {pdf_filename} (pages '{page_spec}'). Error: {e_stream}")
    
    end_camelot_time = time.time()
    print(f"  Camelot PDF processing for pages '{page_spec}' took: {end_camelot_time - start_camelot_time:.2f} seconds (including any timeouts).")
        
    combined_tables_list = lattice_tables_content + stream_tables_content
        
    if not combined_tables_list:
        # print(f"  No tables found by Camelot in {pdf_filename} on pages '{page_spec}'.") # Already handled by individual prints
        return [] # Return empty list, not all_found_primers_details which isn't defined yet in this scope
        
    # print(f"  Found {len(combined_tables_list)} potential table regions in {pdf_filename}.")

    start_python_processing_time = time.time()
    # --- (The rest of your table scanning, cell iteration, and context gathering logic) ---
    # This part iterates `combined_tables_list` and calls `scan_dataframe_for_primers`
    # or directly processes df from table_obj to populate `all_found_primers_details`
    
    unique_table_identifiers = set()
    for table_idx, table_obj in enumerate(combined_tables_list):
        try:
            table_content_hash = hash(table_obj.df.to_string())
            table_id = (table_obj.page, table_content_hash, table_obj.flavor)
            if table_id in unique_table_identifiers: continue
            unique_table_identifiers.add(table_id)
        except Exception: pass
        
        df_from_pdf = table_obj.df
        if df_from_pdf.empty: continue
        
        source_desc = f"PDF Page {table_obj.page}, Table {table_idx+1} ({table_obj.flavor})"
        # Extract PMCID from the PDF filename to pass to scan_dataframe_for_primers
        current_pmcid = extract_pmcid_from_filename(pdf_filename)

        # Ensure scan_dataframe_for_primers is defined and accessible
        primers_from_pdf_table = scan_dataframe_for_primers(
            df_from_pdf, 
            current_pmcid, 
            source_desc,
            pdf_filename, # filename_or_context
            hgnc_symbols
        )
        if primers_from_pdf_table:
            all_found_primers_details.extend(primers_from_pdf_table)

    end_python_processing_time = time.time()
    # print(f"  Python post-Camelot processing took: {end_python_processing_time - start_python_processing_time:.2f} seconds.")
            
    # The `all_found_primers_details` list contains dictionaries already in the desired final format
    # as produced by `scan_dataframe_for_primers`.
    # No further transformation loop is strictly needed here if `scan_dataframe_for_primers`
    # correctly sets all required keys ("PMCID", "Gene", "Orientation", etc.).
    # The error you had previously (KeyError: 'Probable Gene') would be in the loop below if it existed.
    # The `scan_dataframe_for_primers` should be outputting with "Gene" and "Orientation".
    
    # For clarity, let's assume scan_dataframe_for_primers returns the final desired structure:
    # {"PMCID": ..., "Gene": ..., "Sequence": ..., "Orientation": ..., "Source File": ..., "Page": ..., "Original Cell Text": ...}
    
    return all_found_primers_details


# ###################################################################################
# # --- (Main Execution Block - if __name__ == "__main__": ...) ---
# ###################################################################################
if __name__ == "__main__":
    # --- 0. Configuration ---
    records_to_fetch = 500 # Example from your code
    START_YEAR = 2025
    END_YEAR_PROCESSING = 2015 # Example from your code
    N_LAST_PAGES = 5 # Used for PDF page targeting

    print("Loading HGNC symbols...")
    HGNC = load_hgnc_symbol_set()
    # xml_paths = [] # This isn't strictly necessary if processing XML immediately

    # --- Main Year Loop ---
    for year_to_process in range(START_YEAR, END_YEAR_PROCESSING - 1, -1): # Use year_to_process
        print(f"\n{'='*60}")
        print(f"STARTING PROCESSING FOR YEAR: {year_to_process}")
        print(f"{'='*60}\n")

        master_primer_list_for_year = [] # Initialize for the current year's results

        # --- 1. Define year-specific query for Europe PMC ---
        base_query = 'OPEN_ACCESS:y AND HAS_FT:y AND (METHODS:"qPCR" OR "RT-PCR" OR "real time PCR") AND (ABSTRACT:"pluripotent" OR "iPSC" OR "PSC" OR "hPSC" OR "ESC" OR "hESC")'
        query = f"{base_query} AND (FIRST_PDATE:{year_to_process})"
        print(f"Searching Europe PMC with query: {query}")
        
        epmc_hits_for_year = []
        try:
            epmc_hits_for_year = europepmc_search_all(query, max_records=records_per_year)
            print(f"Retrieved {len(epmc_hits_for_year)} records for {year_to_process}.")
        except Exception as e_search:
            print(f"Could not complete search for year {year_to_process}. Error: {e_search}")
            continue

        if not epmc_hits_for_year:
            print(f"No records found for {year_to_process}. Skipping to next year.")
            continue

        # --- 2. Process each paper for this year ---
        for hit_index, hit in enumerate(tqdm(epmc_hits_for_year, desc=f"Processing Papers for {year_to_process}")):
            current_pmcid_str = None
            if isinstance(hit, dict):
                current_pmcid_str = hit.get("pmcid") or hit.get("id")
            else:
                current_pmcid_str = str(hit)
            
            if not (current_pmcid_str and current_pmcid_str.startswith("PMC")):
                continue
            
            print(f"\n  Processing PMCID: {current_pmcid_str} ({hit_index+1}/{len(epmc_hits_for_year)}) for year {year_to_process}")
            
            primers_found_for_this_pmcid_count = 0 # Initialize for THIS PMCID

            try:
                xml_file_for_current_paper = get_or_download_xml(current_pmcid_str, DATA_DIR)
                
                # --- 2c. Read primers from main text XML ---
                if xml_file_for_current_paper.exists():
                    # print(f"    Extracting primers from XML: {xml_file_for_current_paper.name}")
                    xml_primers = extract_primers_from_xml(xml_file_for_current_paper, current_pmcid_str, HGNC)
                    if xml_primers:
                        master_primer_list_for_year.extend(xml_primers)
                        primers_found_for_this_pmcid_count += len(xml_primers)
                    # print(f"      Found {len(xml_primers)} primer items from XML. Total for {current_pmcid_str}: {primers_found_for_this_pmcid_count}")

                # --- 2d. Fetch, Rename, and Process Supplements for THIS PMCID ---
                renamed_supp_files_for_this_pmcid = [] # Initialize to ensure it's always defined

                if primers_found_for_this_pmcid_count > 8:
                    print(f"      Sufficient primers ({primers_found_for_this_pmcid_count}) found for {current_pmcid_str} from XML. Skipping its supplement processing.")
                else:
                    # print(f"    Fetching supplements for {current_pmcid_str} (XML found: {primers_found_for_this_pmcid_count})...")
                    downloaded_supp_files = fetch_supplements_from_xml(current_pmcid_str, xml_file_for_current_paper, DATA_DIR / "supp")
                    
                    if downloaded_supp_files:
                        renamed_supp_files_for_this_pmcid = rename_supp_files(current_pmcid_str, downloaded_supp_files)
                        # print(f"      ↳ {len(renamed_supp_files_for_this_pmcid)} supplement(s) to process for {current_pmcid_str}.")
                    # else:
                        # print(f"      No supplements found or downloaded for {current_pmcid_str}.")

                # Process the collected supplements for THIS PMCID
                if renamed_supp_files_for_this_pmcid: # Only loop if there are supplements and we haven't skipped due to XML
                    for supp_file_path_obj in tqdm(renamed_supp_files_for_this_pmcid, desc=f"Supps for {current_pmcid_str}", leave=False):
                        if primers_found_for_this_pmcid_count > 8:
                            print(f"      Sufficient primers ({primers_found_for_this_pmcid_count}) now found for {current_pmcid_str}. Skipping its remaining supplements.")
                            break # Correctly breaks from THIS PMCID's supplement loop

                        supp_file_path_str = str(supp_file_path_obj)
                        supp_filename_for_log = os.path.basename(supp_file_path_str)
                        # print(f"\n        Processing Supplement File: {supp_filename_for_log}")
                        
                        extracted_supp_primers = []
                        page_spec_for_camelot = "all" 
                        process_this_file_fully = True # Renamed from process_this_pdf_fully

                        # Determine page_spec and if it's a reporting summary (for PDFs)
                        if supp_filename_for_log.lower().endswith(".pdf"):
                            try:
                                with pdfplumber.open(supp_file_path_str) as pdf_doc:
                                    if not pdf_doc.pages:
                                        process_this_file_fully = False
                                    else:
                                        total_pages = len(pdf_doc.pages)
                                        first_page_text = pdf_doc.pages[0].extract_text() if pdf_doc.pages[0] else ""
                                        if first_page_text and "reporting summary" in first_page_text.lower():
                                            page_spec_for_camelot = "1"
                                        elif total_pages == 0: page_spec_for_camelot = "1"
                                        elif total_pages <= N_LAST_PAGES: page_spec_for_camelot = f"1-{total_pages}"
                                        else:
                                            start_page = total_pages - N_LAST_PAGES + 1
                                            page_spec_for_camelot = f"{start_page}-{total_pages}"
                            except Exception as e_pg_cnt:
                                # print(f"        Error getting page count for {supp_filename_for_log}: {e_pg_cnt}. Defaulting to all pages.")
                                page_spec_for_camelot = "all" # Keep process_this_file_fully as True to attempt parsing

                        # Process the file based on its type
                        if process_this_file_fully:
                            if supp_filename_for_log.lower().endswith(".pdf"):
                                extracted_supp_primers = find_primers_sequence_first(supp_file_path_str, page_spec=page_spec_for_camelot, hgnc_symbols=HGNC)
                            elif supp_filename_for_log.lower().endswith(".docx"):
                                extracted_supp_primers = extract_primers_from_docx(supp_file_path_obj, current_pmcid_str, HGNC)
                            # Add more elif for other file types if needed

                        if extracted_supp_primers:
                            master_primer_list_for_year.extend(extracted_supp_primers)
                            primers_found_for_this_pmcid_count += len(extracted_supp_primers)
                        # print(f"        Finished {supp_filename_for_log}. Found {len(extracted_supp_primers)}. Total for PMCID {current_pmcid_str}: {primers_found_for_this_pmcid_count}")
            
            except requests.HTTPError as e:
                print(f"    ⚠️ HTTPError for {current_pmcid_str} (XML or Supplement download): {e}")
            except Exception as e_paper_processing: # This catches other errors during THIS paper's processing
                print(f"    ⚠️ Unexpected error processing paper {current_pmcid_str}: {e_paper_processing}")
                import traceback
                traceback.print_exc() # This will show where the NameError occurred if it's within this try
                # Continue to the next paper
            
            time.sleep(0.3) # Politeness delay for Europe PMC
        # --- End of loop for individual papers (epmc_hits_for_year) ---

        # --- Save results for the CURRENT YEAR ---
        if master_primer_list_for_year:
            print(f"\n--- Consolidating and Saving Results for YEAR {year_to_process} ---")
            # ... (rest of your CSV saving logic for the year, which seems correct) ...
            final_df_year = pd.DataFrame(master_primer_list_for_year)
            if not final_df_year.empty:
                expected_columns = ['PMCID', 'Gene', 'Sequence', 'Orientation', 'Source File', 'Page', 'Original Cell Text']
                actual_columns_in_df = final_df_year.columns.tolist()
                subset_for_duplicates = [col for col in expected_columns if col in actual_columns_in_df]
                if not subset_for_duplicates: final_df_year.drop_duplicates(inplace=True, keep='first')
                else: final_df_year.drop_duplicates(subset=subset_for_duplicates, inplace=True, keep='first')
                print(f"Total unique primer entries found for {year_to_process}: {len(final_df_year)}")
                # (Print and save logic)
                try:
                    output_dir = DATA_DIR / "yearly_results"; output_dir.mkdir(parents=True, exist_ok=True)
                    output_csv_path_year = output_dir / f"master_extracted_primers_{year_to_process}.csv"
                    final_df_year.to_csv(output_csv_path_year, index=False)
                    print(f"  ✅ Saved {len(final_df_year)} unique extracted primers for {year_to_process} to {output_csv_path_year}")
                except Exception as e_csv: print(f"  ❌ Error saving CSV for {year_to_process}: {e_csv}")
            else: print(f"No primer data to save for {year_to_process} (DataFrame empty).")
        else: print(f"No primers extracted for YEAR {year_to_process}.")
        print(f"###### FINISHED PROCESSING FOR YEAR: {year_to_process} ######")

    print("\n\nBatch processing complete for all specified years.")

Loading HGNC symbols...
Loaded 193356 HGNC symbols.

STARTING PROCESSING FOR YEAR: 2025

Searching Europe PMC with query: OPEN_ACCESS:y AND HAS_FT:y AND (METHODS:"qPCR" OR "RT-PCR" OR "real time PCR") AND (ABSTRACT:"pluripotent" OR "iPSC" OR "PSC" OR "hPSC" OR "ESC" OR "hESC") AND (FIRST_PDATE:2025)
Retrieved 100 records for 2025.


Processing Papers for 2025:   0%|                                                      | 0/100 [00:00<?, ?it/s]


  Processing PMCID: PMC12076839 (1/100) for year 2025
  Found 0 <table> elements in XML for PMC12076839.



Supps for PMC12076839:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC12076839:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  1.34it/s][A
                                                                                                               [A

      Sufficient primers (42) now found for PMC12076839. Skipping its remaining supplements.


Processing Papers for 2025:   1%|▍                                             | 1/100 [00:23<38:46, 23.50s/it]


  Processing PMCID: PMC12008852 (2/100) for year 2025
  Found 0 <table> elements in XML for PMC12008852.



Supps for PMC12008852:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC12008852: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.33it/s][A
Processing Papers for 2025:   2%|▉                                             | 2/100 [00:27<19:40, 12.05s/it][A


  Processing PMCID: PMC11836331 (3/100) for year 2025
  Found 0 <table> elements in XML for PMC11836331.



Supps for PMC11836331:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11836331:  25%|█████████████▎                                       | 1/4 [00:16<00:48, 16.22s/it][ACropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '3-7' took: 15.74 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11836331:  50%|██████████████████████████▌                          | 2/4 [00:18<00:15,  7.93s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 1.99 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11836331:  75%|███████████████████████████████████████▊             | 3/4 [00:29<00:09,  9.62s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-4' took: 10.76 seconds (including any timeouts).
      Sufficient primers (39) now found for PMC11836331. Skipping its remaining supplements.


Processing Papers for 2025:   3%|█▎                                          | 3/100 [01:56<1:16:11, 47.13s/it]


  Processing PMCID: PMC11815987 (4/100) for year 2025
  Found 2 <table> elements in XML for PMC11815987.


Processing Papers for 2025:   4%|█▊                                            | 4/100 [01:58<46:46, 29.23s/it]


  Processing PMCID: PMC11872338 (5/100) for year 2025
  Found 0 <table> elements in XML for PMC11872338.



Supps for PMC11872338:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11872338: 100%|█████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.55s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 8.30 seconds (including any timeouts).


Processing Papers for 2025:   5%|██▏                                         | 5/100 [02:56<1:02:39, 39.57s/it]


  Processing PMCID: PMC11934218 (6/100) for year 2025
  Found 0 <table> elements in XML for PMC11934218.



Supps for PMC11934218:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11934218: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.72it/s][A
Processing Papers for 2025:   6%|██▊                                           | 6/100 [03:10<48:49, 31.16s/it][A


  Processing PMCID: PMC12034926 (7/100) for year 2025
  Found 0 <table> elements in XML for PMC12034926.



Supps for PMC12034926:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC12034926: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.19it/s][A
Processing Papers for 2025:   7%|███▏                                          | 7/100 [03:15<34:47, 22.44s/it][A


  Processing PMCID: PMC11955862 (8/100) for year 2025
  Found 4 <table> elements in XML for PMC11955862.
      Sufficient primers (14) found for PMC11955862 from XML. Skipping its supplement processing.


Processing Papers for 2025:   8%|███▋                                          | 8/100 [03:17<24:23, 15.90s/it]


  Processing PMCID: PMC11965213 (9/100) for year 2025
  Found 0 <table> elements in XML for PMC11965213.



Supps for PMC11965213:   0%|                                                             | 0/2 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '6-10' took: 12.05 seconds (including any timeouts).



Supps for PMC11965213:  50%|██████████████████████████▌                          | 1/2 [00:12<00:12, 12.41s/it][A
Supps for PMC11965213: 100%|█████████████████████████████████████████████████████| 2/2 [00:12<00:00,  5.18s/it][A
Processing Papers for 2025:   9%|████▏                                         | 9/100 [03:35<25:02, 16.51s/it][A


  Processing PMCID: PMC11883997 (10/100) for year 2025
  Found 0 <table> elements in XML for PMC11883997.



Supps for PMC11883997:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11883997: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.56it/s][A
Processing Papers for 2025:  10%|████▌                                        | 10/100 [03:40<19:24, 12.94s/it][A


  Processing PMCID: PMC11959787 (11/100) for year 2025
  Found 0 <table> elements in XML for PMC11959787.


Processing Papers for 2025:  11%|████▉                                        | 11/100 [03:42<14:13,  9.59s/it]


  Processing PMCID: PMC12064636 (12/100) for year 2025
  Found 2 <table> elements in XML for PMC12064636.
      Sufficient primers (25) found for PMC12064636 from XML. Skipping its supplement processing.


Processing Papers for 2025:  12%|█████▍                                       | 12/100 [03:43<10:33,  7.20s/it]


  Processing PMCID: PMC12037078 (13/100) for year 2025
  Found 0 <table> elements in XML for PMC12037078.
      Sufficient primers (19) found for PMC12037078 from XML. Skipping its supplement processing.


Processing Papers for 2025:  13%|█████▊                                       | 13/100 [03:45<08:09,  5.62s/it]


  Processing PMCID: PMC11933898 (14/100) for year 2025
  Found 0 <table> elements in XML for PMC11933898.


Processing Papers for 2025:  14%|██████▎                                      | 14/100 [03:47<06:12,  4.33s/it]


  Processing PMCID: PMC12001006 (15/100) for year 2025
  Found 0 <table> elements in XML for PMC12001006.



Supps for PMC12001006:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC12001006: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.52it/s][A
Processing Papers for 2025:  15%|██████▊                                      | 15/100 [03:51<06:07,  4.33s/it][A


  Processing PMCID: PMC11882705 (17/100) for year 2025
  Found 2 <table> elements in XML for PMC11882705.
      Sufficient primers (40) found for PMC11882705 from XML. Skipping its supplement processing.


Processing Papers for 2025:  17%|███████▋                                     | 17/100 [03:53<03:43,  2.69s/it]


  Processing PMCID: PMC12082684 (18/100) for year 2025
  Found 1 <table> elements in XML for PMC12082684.



Supps for PMC12082684:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC12082684: 100%|█████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.39s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '5-9' took: 11.17 seconds (including any timeouts).


Processing Papers for 2025:  18%|████████                                     | 18/100 [04:09<08:12,  6.00s/it]


  Processing PMCID: PMC12011625 (19/100) for year 2025
  Found 0 <table> elements in XML for PMC12011625.


Processing Papers for 2025:  19%|████████▌                                    | 19/100 [04:10<06:29,  4.80s/it]


  Processing PMCID: PMC12010622 (20/100) for year 2025
  Found 2 <table> elements in XML for PMC12010622.
      Sufficient primers (52) found for PMC12010622 from XML. Skipping its supplement processing.


Processing Papers for 2025:  20%|█████████                                    | 20/100 [04:12<05:22,  4.03s/it]


  Processing PMCID: PMC12032275 (21/100) for year 2025
  Found 0 <table> elements in XML for PMC12032275.



Supps for PMC12032275:   0%|                                                             | 0/3 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '20-24' took: 7.64 seconds (including any timeouts).
      Sufficient primers (52) now found for PMC12032275. Skipping its remaining supplements.


Processing Papers for 2025:  21%|█████████▍                                   | 21/100 [04:29<10:11,  7.74s/it]


  Processing PMCID: PMC11799516 (22/100) for year 2025
  Found 0 <table> elements in XML for PMC11799516.


Processing Papers for 2025:  22%|█████████▉                                   | 22/100 [04:31<07:47,  5.99s/it]


  Processing PMCID: PMC12008586 (23/100) for year 2025
  Found 0 <table> elements in XML for PMC12008586.



Supps for PMC12008586:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC12008586:  50%|██████████████████████████▌                          | 1/2 [00:06<00:06,  6.49s/it][A

  Camelot PDF processing for pages '27-31' took: 6.22 seconds (including any timeouts).



Supps for PMC12008586: 100%|█████████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.66s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '40-44' took: 6.65 seconds (including any timeouts).


Processing Papers for 2025:  23%|██████████▎                                  | 23/100 [04:54<14:01, 10.93s/it]


  Processing PMCID: PMC11993987 (24/100) for year 2025
  Found 3 <table> elements in XML for PMC11993987.


Processing Papers for 2025:  24%|██████████▊                                  | 24/100 [04:56<10:23,  8.20s/it]


  Processing PMCID: PMC11780910 (25/100) for year 2025
  Found 0 <table> elements in XML for PMC11780910.



Supps for PMC11780910:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '7-11' took: 10.26 seconds (including any timeouts).


Processing Papers for 2025:  25%|███████████▎                                 | 25/100 [05:12<13:18, 10.64s/it]


  Processing PMCID: PMC12002989 (26/100) for year 2025
  Found 0 <table> elements in XML for PMC12002989.



Supps for PMC12002989:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC12002989:  50%|██████████████████████████▌                          | 1/2 [00:12<00:12, 12.74s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '13-17' took: 12.48 seconds (including any timeouts).
      Sufficient primers (76) now found for PMC12002989. Skipping its remaining supplements.


Processing Papers for 2025:  26%|███████████▋                                 | 26/100 [05:32<16:30, 13.39s/it]


  Processing PMCID: PMC12090657 (27/100) for year 2025
  Found 0 <table> elements in XML for PMC12090657.



Supps for PMC12090657:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC12090657: 100%|█████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.22s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 14.56 seconds (including any timeouts).


Processing Papers for 2025:  27%|████████████▏                                | 27/100 [05:51<18:24, 15.13s/it]


  Processing PMCID: PMC12095295 (28/100) for year 2025
  Found 0 <table> elements in XML for PMC12095295.



Supps for PMC12095295:   0%|                                                             | 0/3 [00:00<?, ?it/s][A
Supps for PMC12095295:  33%|█████████████████▋                                   | 1/3 [00:00<00:00,  5.13it/s][A
Supps for PMC12095295:  67%|███████████████████████████████████▎                 | 2/3 [00:00<00:00,  3.36it/s][A
Supps for PMC12095295: 100%|█████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.27it/s][A
Processing Papers for 2025:  28%|████████████▌                                | 28/100 [06:26<25:03, 20.88s/it][A


  Processing PMCID: PMC11906159 (29/100) for year 2025
  Found 1 <table> elements in XML for PMC11906159.



Supps for PMC11906159:   0%|                                                             | 0/7 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11906159:  14%|███████▌                                             | 1/7 [00:02<00:13,  2.18s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.02 seconds (including any timeouts).



Supps for PMC11906159:  29%|███████████████▏                                     | 2/7 [00:04<00:10,  2.02s/it][ACropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 1.78 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11906159:  43%|██████████████████████▋                              | 3/7 [00:06<00:08,  2.21s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.11 seconds (including any timeouts).



Supps for PMC11906159:  57%|██████████████████████████████▎                      | 4/7 [00:08<00:06,  2.18s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 1.92 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11906159:  71%|█████████████████████████████████████▊               | 5/7 [00:24<00:14,  7.19s/it][A

  Camelot PDF processing for pages '1-5' took: 15.77 seconds (including any timeouts).



Supps for PMC11906159: 100%|█████████████████████████████████████████████████████| 7/7 [00:24<00:00,  3.61s/it][A
Processing Papers for 2025:  29%|█████████████                                | 29/100 [07:17<35:30, 30.01s/it][A


  Processing PMCID: PMC12069886 (30/100) for year 2025
  Found 0 <table> elements in XML for PMC12069886.



Supps for PMC12069886:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC12069886:  50%|██████████████████████████▌                          | 1/2 [00:09<00:09,  9.68s/it][A

  Camelot PDF processing for pages '12-16' took: 9.53 seconds (including any timeouts).



Supps for PMC12069886: 100%|█████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.40s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '23-27' took: 9.80 seconds (including any timeouts).


Processing Papers for 2025:  30%|█████████████▌                               | 30/100 [07:44<34:06, 29.23s/it]


  Processing PMCID: PMC11964241 (31/100) for year 2025
  Found 0 <table> elements in XML for PMC11964241.


Processing Papers for 2025:  31%|█████████████▉                               | 31/100 [07:46<24:05, 20.96s/it]


  Processing PMCID: PMC11979110 (32/100) for year 2025
  Found 2 <table> elements in XML for PMC11979110.
      Sufficient primers (23) found for PMC11979110 from XML. Skipping its supplement processing.


Processing Papers for 2025:  32%|██████████████▍                              | 32/100 [07:48<17:13, 15.20s/it]


  Processing PMCID: PMC11928952 (33/100) for year 2025
  Found 6 <table> elements in XML for PMC11928952.


Processing Papers for 2025:  33%|██████████████▊                              | 33/100 [07:49<12:24, 11.12s/it]


  Processing PMCID: PMC11848452 (34/100) for year 2025
  Found 0 <table> elements in XML for PMC11848452.
      Sufficient primers (9) found for PMC11848452 from XML. Skipping its supplement processing.


Processing Papers for 2025:  34%|███████████████▎                             | 34/100 [07:51<09:10,  8.33s/it]


  Processing PMCID: PMC12036290 (35/100) for year 2025
  Found 0 <table> elements in XML for PMC12036290.



Supps for PMC12036290:   0%|                                                             | 0/3 [00:00<?, ?it/s][A
Supps for PMC12036290:  33%|█████████████████▋                                   | 1/3 [00:32<01:04, 32.41s/it][A

  Camelot PDF processing for pages '8-12' took: 31.13 seconds (including any timeouts).



Supps for PMC12036290:  67%|███████████████████████████████████▎                 | 2/3 [00:43<00:20, 20.13s/it][A

  Camelot PDF processing for pages '1-5' took: 11.30 seconds (including any timeouts).



Supps for PMC12036290: 100%|█████████████████████████████████████████████████████| 3/3 [00:45<00:00, 11.78s/it][A
Processing Papers for 2025:  35%|███████████████▋                             | 35/100 [09:17<34:20, 31.70s/it][A


  Processing PMCID: PMC11980850 (36/100) for year 2025
  Found 0 <table> elements in XML for PMC11980850.



Supps for PMC11980850:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '6-10' took: 6.42 seconds (including any timeouts).


Processing Papers for 2025:  36%|████████████████▏                            | 36/100 [09:29<27:14, 25.54s/it]


  Processing PMCID: PMC12041927 (37/100) for year 2025
  Found 1 <table> elements in XML for PMC12041927.



Supps for PMC12041927:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC12041927: 100%|█████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.52s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-5' took: 1.47 seconds (including any timeouts).


Processing Papers for 2025:  37%|████████████████▋                            | 37/100 [09:34<20:24, 19.44s/it]


  Processing PMCID: PMC12094700 (38/100) for year 2025
  Found 2 <table> elements in XML for PMC12094700.



Supps for PMC12094700:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC12094700:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  6.50it/s][A
Supps for PMC12094700: 100%|█████████████████████████████████████████████████████| 2/2 [00:00<00:00,  5.00it/s][A
Processing Papers for 2025:  38%|█████████████████                            | 38/100 [09:39<15:41, 15.18s/it][A


  Processing PMCID: PMC12083436 (39/100) for year 2025
  Found 0 <table> elements in XML for PMC12083436.
      Sufficient primers (22) found for PMC12083436 from XML. Skipping its supplement processing.


Processing Papers for 2025:  39%|█████████████████▌                           | 39/100 [09:41<11:18, 11.12s/it]


  Processing PMCID: PMC11966042 (40/100) for year 2025
  Found 0 <table> elements in XML for PMC11966042.



Supps for PMC11966042:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11966042: 100%|█████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.39s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 10.97 seconds (including any timeouts).


Processing Papers for 2025:  40%|██████████████████                           | 40/100 [10:05<15:10, 15.17s/it]


  Processing PMCID: PMC12048643 (41/100) for year 2025
  Found 1 <table> elements in XML for PMC12048643.



Supps for PMC12048643:   0%|                                                             | 0/3 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '9-13' took: 13.79 seconds (including any timeouts).
      Sufficient primers (70) now found for PMC12048643. Skipping its remaining supplements.


Processing Papers for 2025:  41%|██████████████████▍                          | 41/100 [10:28<17:12, 17.50s/it]


  Processing PMCID: PMC12069897 (42/100) for year 2025
  Found 2 <table> elements in XML for PMC12069897.


Processing Papers for 2025:  42%|██████████████████▉                          | 42/100 [10:30<12:28, 12.91s/it]


  Processing PMCID: PMC11980838 (43/100) for year 2025
  Found 0 <table> elements in XML for PMC11980838.



Supps for PMC11980838:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11980838: 100%|█████████████████████████████████████████████████████| 1/1 [00:11<00:00, 11.93s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '27-31' took: 11.42 seconds (including any timeouts).


Processing Papers for 2025:  43%|███████████████████▎                         | 43/100 [11:01<17:20, 18.26s/it]


  Processing PMCID: PMC12086357 (44/100) for year 2025
  Found 0 <table> elements in XML for PMC12086357.
      Sufficient primers (11) found for PMC12086357 from XML. Skipping its supplement processing.


Processing Papers for 2025:  44%|███████████████████▊                         | 44/100 [11:03<12:18, 13.19s/it]


  Processing PMCID: PMC12020857 (45/100) for year 2025
  Found 1 <table> elements in XML for PMC12020857.



Supps for PMC12020857:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC12020857: 100%|█████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.87s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '6-10' took: 13.56 seconds (including any timeouts).


Processing Papers for 2025:  45%|████████████████████▎                        | 45/100 [11:21<13:33, 14.79s/it]


  Processing PMCID: PMC11826378 (46/100) for year 2025
  Found 2 <table> elements in XML for PMC11826378.



Supps for PMC11826378:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11826378: 100%|█████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.90s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.56 seconds (including any timeouts).


Processing Papers for 2025:  46%|████████████████████▋                        | 46/100 [11:28<11:03, 12.30s/it]


  Processing PMCID: PMC11756937 (47/100) for year 2025
  Found 3 <table> elements in XML for PMC11756937.



Supps for PMC11756937:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11756937: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.96it/s][A
Processing Papers for 2025:  47%|█████████████████████▏                       | 47/100 [11:30<08:19,  9.43s/it][A


  Processing PMCID: PMC11608845 (48/100) for year 2025
  Found 0 <table> elements in XML for PMC11608845.
      Sufficient primers (20) found for PMC11608845 from XML. Skipping its supplement processing.


Processing Papers for 2025:  48%|█████████████████████▌                       | 48/100 [11:32<06:12,  7.16s/it]


  Processing PMCID: PMC11889117 (49/100) for year 2025
  Found 0 <table> elements in XML for PMC11889117.



Supps for PMC11889117:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Lattice flavor TIMED OUT for PMC11889117_supp1.pdf after 60s on pages '1-4'.



Supps for PMC11889117:  25%|█████████████                                       | 1/4 [02:30<07:32, 150.89s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Stream flavor TIMED OUT for PMC11889117_supp1.pdf after 60s on pages '1-4'.
  Camelot PDF processing for pages '1-4' took: 122.69 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11889117:  50%|██████████████████████████▌                          | 2/4 [02:54<02:32, 76.10s/it][A

  Camelot PDF processing for pages '3-7' took: 22.24 seconds (including any timeouts).



Supps for PMC11889117:  75%|███████████████████████████████████████▊             | 3/4 [03:05<00:46, 46.33s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '17-21' took: 10.59 seconds (including any timeouts).



Supps for PMC11889117: 100%|█████████████████████████████████████████████████████| 4/4 [03:07<00:00, 28.90s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.00 seconds (including any timeouts).


Processing Papers for 2025:  49%|█████████████████████                      | 49/100 [15:24<1:03:19, 74.50s/it]


  Processing PMCID: PMC11961953 (50/100) for year 2025
  Found 1 <table> elements in XML for PMC11961953.


Processing Papers for 2025:  50%|██████████████████████▌                      | 50/100 [15:26<43:58, 52.76s/it]


  Processing PMCID: PMC11816654 (51/100) for year 2025
  Found 0 <table> elements in XML for PMC11816654.
      Sufficient primers (14) found for PMC11816654 from XML. Skipping its supplement processing.


Processing Papers for 2025:  51%|██████████████████████▉                      | 51/100 [15:27<30:32, 37.40s/it]


  Processing PMCID: PMC11983251 (52/100) for year 2025
  Found 0 <table> elements in XML for PMC11983251.



Supps for PMC11983251:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11983251: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.96it/s][A
Processing Papers for 2025:  52%|███████████████████████▍                     | 52/100 [16:06<30:17, 37.86s/it][A


  Processing PMCID: PMC12009284 (53/100) for year 2025
  Found 0 <table> elements in XML for PMC12009284.



Supps for PMC12009284:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Stream flavor failed for PMC12009284_supp1.pdf (pages '7-11'). Error: list index out of range
  Camelot PDF processing for pages '7-11' took: 16.94 seconds (including any timeouts).


Processing Papers for 2025:  53%|███████████████████████▊                     | 53/100 [16:27<25:40, 32.78s/it]


  Processing PMCID: PMC11953279 (54/100) for year 2025
  Found 1 <table> elements in XML for PMC11953279.
      Sufficient primers (17) found for PMC11953279 from XML. Skipping its supplement processing.


Processing Papers for 2025:  54%|████████████████████████▎                    | 54/100 [16:29<18:04, 23.58s/it]


  Processing PMCID: PMC11948778 (55/100) for year 2025
  Found 0 <table> elements in XML for PMC11948778.
    ⚠️ Unexpected error processing paper PMC11948778: HTTPSConnectionPool(host='europepmc.org', port=443): Read timed out. (read timeout=60)


Traceback (most recent call last):
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\socket.py", line 716, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\ssl.py", line 1275, in recv_into
    return self.read(nbytes, buffer)
  File "C:\


  Processing PMCID: PMC11821750 (56/100) for year 2025
  Found 3 <table> elements in XML for PMC11821750.
      Sufficient primers (54) found for PMC11821750 from XML. Skipping its supplement processing.


Processing Papers for 2025:  56%|█████████████████████████▏                   | 56/100 [17:35<18:43, 25.54s/it]


  Processing PMCID: PMC11760979 (57/100) for year 2025
  Found 0 <table> elements in XML for PMC11760979.



Supps for PMC11760979:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11760979: 100%|█████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.12s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '13-17' took: 13.52 seconds (including any timeouts).


Processing Papers for 2025:  57%|█████████████████████████▋                   | 57/100 [17:54<16:57, 23.67s/it]


  Processing PMCID: PMC12027929 (58/100) for year 2025
  Found 2 <table> elements in XML for PMC12027929.


Processing Papers for 2025:  58%|██████████████████████████                   | 58/100 [17:56<11:58, 17.12s/it]


  Processing PMCID: PMC11919623 (59/100) for year 2025
  Found 8 <table> elements in XML for PMC11919623.


Processing Papers for 2025:  59%|██████████████████████████▌                  | 59/100 [17:58<08:34, 12.54s/it]


  Processing PMCID: PMC12081669 (60/100) for year 2025
  Found 0 <table> elements in XML for PMC12081669.
    ⚠️ Unexpected error processing paper PMC12081669: HTTPSConnectionPool(host='europepmc.org', port=443): Read timed out. (read timeout=60)


Traceback (most recent call last):
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\socket.py", line 716, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\ssl.py", line 1275, in recv_into
    return self.read(nbytes, buffer)
  File "C:\


  Processing PMCID: PMC11791066 (61/100) for year 2025
  Found 1 <table> elements in XML for PMC11791066.



Supps for PMC11791066:   0%|                                                             | 0/3 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11791066:  33%|█████████████████▋                                   | 1/3 [00:08<00:16,  8.25s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-2' took: 7.08 seconds (including any timeouts).
      Sufficient primers (9) now found for PMC11791066. Skipping its remaining supplements.


Processing Papers for 2025:  61%|███████████████████████████▍                 | 61/100 [19:16<15:33, 23.94s/it]


  Processing PMCID: PMC11976395 (62/100) for year 2025
  Found 0 <table> elements in XML for PMC11976395.



Supps for PMC11976395:   0%|                                                             | 0/1 [00:00<?, ?it/s][A

  Lattice flavor TIMED OUT for PMC11976395_supp1.pdf after 60s on pages '1-5'.



Supps for PMC11976395: 100%|████████████████████████████████████████████████████| 1/1 [02:04<00:00, 124.12s/it][A
                                                                                                               [A

  Stream flavor TIMED OUT for PMC11976395_supp1.pdf after 60s on pages '1-5'.
  Camelot PDF processing for pages '1-5' took: 122.72 seconds (including any timeouts).


Processing Papers for 2025:  62%|███████████████████████████▉                 | 62/100 [21:46<39:09, 61.82s/it]


  Processing PMCID: PMC11978882 (63/100) for year 2025
  Found 0 <table> elements in XML for PMC11978882.



Supps for PMC11978882:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '16-20' took: 18.89 seconds (including any timeouts).



Supps for PMC11978882:  25%|█████████████▎                                       | 1/4 [00:20<01:00, 20.14s/it][A
Supps for PMC11978882:  50%|██████████████████████████▌                          | 2/4 [00:22<00:19,  9.58s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.01 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11978882:  75%|███████████████████████████████████████▊             | 3/4 [00:26<00:07,  7.03s/it][A

  Camelot PDF processing for pages '1' took: 2.89 seconds (including any timeouts).



Supps for PMC11978882: 100%|█████████████████████████████████████████████████████| 4/4 [00:37<00:00,  8.81s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '32-36' took: 9.83 seconds (including any timeouts).


Processing Papers for 2025:  63%|████████████████████████████▎                | 63/100 [22:35<35:40, 57.85s/it]


  Processing PMCID: PMC11904521 (64/100) for year 2025
  Found 2 <table> elements in XML for PMC11904521.
      Sufficient primers (50) found for PMC11904521 from XML. Skipping its supplement processing.


Processing Papers for 2025:  64%|████████████████████████████▊                | 64/100 [22:37<24:36, 41.02s/it]


  Processing PMCID: PMC11865489 (65/100) for year 2025
  Found 0 <table> elements in XML for PMC11865489.



Supps for PMC11865489:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Stream flavor failed for PMC11865489_supp1.pdf (pages '18-22'). Error: list index out of range
  Camelot PDF processing for pages '18-22' took: 17.50 seconds (including any timeouts).
      Sufficient primers (53) now found for PMC11865489. Skipping its remaining supplements.


Processing Papers for 2025:  65%|█████████████████████████████▎               | 65/100 [23:49<29:21, 50.33s/it]


  Processing PMCID: PMC11809796 (66/100) for year 2025
  Found 0 <table> elements in XML for PMC11809796.


Processing Papers for 2025:  66%|█████████████████████████████▋               | 66/100 [23:51<20:20, 35.89s/it]


  Processing PMCID: PMC11989054 (67/100) for year 2025
  Found 2 <table> elements in XML for PMC11989054.
      Sufficient primers (17) found for PMC11989054 from XML. Skipping its supplement processing.


Processing Papers for 2025:  67%|██████████████████████████████▏              | 67/100 [23:53<14:09, 25.74s/it]


  Processing PMCID: PMC11921648 (68/100) for year 2025
  Found 0 <table> elements in XML for PMC11921648.



Supps for PMC11921648:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC11921648:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  3.36it/s][A
Supps for PMC11921648: 100%|█████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.47it/s][A
Processing Papers for 2025:  68%|██████████████████████████████▌              | 68/100 [24:01<10:58, 20.57s/it][A


  Processing PMCID: PMC11794859 (69/100) for year 2025
  Found 1 <table> elements in XML for PMC11794859.



Supps for PMC11794859:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Lattice flavor TIMED OUT for PMC11794859_supp1.pdf after 60s on pages '1-5'.



Supps for PMC11794859:  25%|█████████████                                       | 1/4 [02:23<07:09, 143.18s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Stream flavor TIMED OUT for PMC11794859_supp1.pdf after 60s on pages '1-5'.
  Camelot PDF processing for pages '1-5' took: 122.87 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

  Camelot PDF processing for pages '12-16' took: 30.75 seconds (including any timeouts).



Supps for PMC11794859:  75%|███████████████████████████████████████▊             | 3/4 [03:10<00:49, 49.20s/it][ACropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '20-24' took: 15.17 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11794859: 100%|█████████████████████████████████████████████████████| 4/4 [03:12<00:00, 30.77s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.22 seconds (including any timeouts).


Processing Papers for 2025:  69%|███████████████████████████████              | 69/100 [27:25<38:55, 75.33s/it]


  Processing PMCID: PMC12043976 (70/100) for year 2025
  Found 0 <table> elements in XML for PMC12043976.



Supps for PMC12043976:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC12043976:  25%|█████████████▎                                       | 1/4 [00:03<00:11,  4.00s/it][A

  Camelot PDF processing for pages '1' took: 3.00 seconds (including any timeouts).



Supps for PMC12043976:  50%|██████████████████████████▌                          | 2/4 [00:16<00:18,  9.00s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '37-41' took: 11.03 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC12043976:  75%|██

  Camelot PDF processing for pages '30-34' took: 12.10 seconds (including any timeouts).


Processing Papers for 2025:  70%|███████████████████████████████▍             | 70/100 [28:04<32:13, 64.45s/it]


  Processing PMCID: PMC11844183 (71/100) for year 2025
  Found 1 <table> elements in XML for PMC11844183.



Supps for PMC11844183:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC11844183:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  1.27it/s][A
                                                                                                               [A

      Sufficient primers (16) now found for PMC11844183. Skipping its remaining supplements.


Processing Papers for 2025:  71%|███████████████████████████████▉             | 71/100 [28:10<22:44, 47.06s/it]


  Processing PMCID: PMC11942111 (72/100) for year 2025
  Found 1 <table> elements in XML for PMC11942111.
      Sufficient primers (10) found for PMC11942111 from XML. Skipping its supplement processing.


Processing Papers for 2025:  72%|████████████████████████████████▍            | 72/100 [28:12<15:37, 33.48s/it]


  Processing PMCID: PMC12070373 (73/100) for year 2025
  Found 1 <table> elements in XML for PMC12070373.



Supps for PMC12070373:   0%|                                                             | 0/1 [00:00<?, ?it/s][A

  Lattice flavor TIMED OUT for PMC12070373_supp1.pdf after 60s on pages '8-12'.



Supps for PMC12070373: 100%|████████████████████████████████████████████████████| 1/1 [01:58<00:00, 118.83s/it][A

  Camelot PDF processing for pages '8-12' took: 118.52 seconds (including any timeouts).



Processing Papers for 2025:  73%|████████████████████████████████▊            | 73/100 [30:15<27:11, 60.41s/it][A


  Processing PMCID: PMC11940372 (74/100) for year 2025
  Found 2 <table> elements in XML for PMC11940372.
      Sufficient primers (14) found for PMC11940372 from XML. Skipping its supplement processing.


Processing Papers for 2025:  74%|█████████████████████████████████▎           | 74/100 [30:17<18:35, 42.92s/it]


  Processing PMCID: PMC12018958 (75/100) for year 2025
  Found 1 <table> elements in XML for PMC12018958.



Supps for PMC12018958:   0%|                                                             | 0/3 [00:00<?, ?it/s][A
Supps for PMC12018958:  33%|█████████████████▋                                   | 1/3 [00:00<00:00,  9.86it/s][A
                                                                                                               [A

      Sufficient primers (48) now found for PMC12018958. Skipping its remaining supplements.


Processing Papers for 2025:  75%|█████████████████████████████████▊           | 75/100 [30:31<14:11, 34.07s/it]


  Processing PMCID: PMC11842287 (76/100) for year 2025
  Found 4 <table> elements in XML for PMC11842287.
      Sufficient primers (16) found for PMC11842287 from XML. Skipping its supplement processing.


Processing Papers for 2025:  76%|██████████████████████████████████▏          | 76/100 [30:32<09:44, 24.37s/it]


  Processing PMCID: PMC11866429 (77/100) for year 2025
  Found 2 <table> elements in XML for PMC11866429.



Supps for PMC11866429:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC11866429:  50%|██████████████████████████▌                          | 1/2 [00:15<00:15, 15.70s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '18-22' took: 15.42 seconds (including any timeouts).
      Sufficient primers (19) now found for PMC11866429. Skipping its remaining supplements.


Processing Papers for 2025:  77%|██████████████████████████████████▋          | 77/100 [30:55<09:06, 23.74s/it]


  Processing PMCID: PMC12027382 (78/100) for year 2025
  Found 1 <table> elements in XML for PMC12027382.
      Sufficient primers (44) found for PMC12027382 from XML. Skipping its supplement processing.


Processing Papers for 2025:  78%|███████████████████████████████████          | 78/100 [30:56<06:16, 17.13s/it]


  Processing PMCID: PMC12102190 (79/100) for year 2025
  Found 0 <table> elements in XML for PMC12102190.



Supps for PMC12102190:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '51-55' took: 72.82 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC12102190:  75%|███████████████████████████████████████▊             | 3/4 [01:25<00:22, 22.45s/it][A

  Camelot PDF processing for pages '1' took: 2.44 seconds (including any timeouts).



Supps for PMC12102190: 100%|█████████████████████████████████████████████████████| 4/4 [01:37<00:00, 18.96s/it][A
                                                                                                               [A

  Stream flavor failed for PMC12102190_supp4.pdf (pages '26-30'). Error: list index out of range
  Camelot PDF processing for pages '26-30' took: 10.71 seconds (including any timeouts).


Processing Papers for 2025:  79%|███████████████████████████████████▌         | 79/100 [32:53<16:23, 46.85s/it]


  Processing PMCID: PMC11911636 (80/100) for year 2025
  Found 1 <table> elements in XML for PMC11911636.



Supps for PMC11911636:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11911636: 100%|█████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.77s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-5' took: 15.41 seconds (including any timeouts).


Processing Papers for 2025:  80%|████████████████████████████████████         | 80/100 [33:12<12:53, 38.67s/it]


  Processing PMCID: PMC12072796 (81/100) for year 2025
  Found 1 <table> elements in XML for PMC12072796.
      Sufficient primers (12) found for PMC12072796 from XML. Skipping its supplement processing.


Processing Papers for 2025:  81%|████████████████████████████████████▍        | 81/100 [33:14<08:44, 27.59s/it]


  Processing PMCID: PMC11667823 (82/100) for year 2025
  Found 0 <table> elements in XML for PMC11667823.



Supps for PMC11667823:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '13-17' took: 10.29 seconds (including any timeouts).


Processing Papers for 2025:  82%|████████████████████████████████████▉        | 82/100 [33:31<07:21, 24.54s/it]


  Processing PMCID: PMC11992592 (83/100) for year 2025
  Found 2 <table> elements in XML for PMC11992592.



Supps for PMC11992592:   0%|                                                             | 0/3 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11992592:  33%|█████████████████▋                                   | 1/3 [00:07<00:15,  7.60s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropB

  Camelot PDF processing for pages '1-3' took: 7.07 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11992592:  67%|███████████████████████████████████▎                 | 2/3 [00:21<00:11, 11.16s/it][A

  Camelot PDF processing for pages '8-12' took: 13.13 seconds (including any timeouts).





  Camelot PDF processing for pages '47-51' took: 15.21 seconds (including any timeouts).


Supps for PMC11992592: 100%|█████████████████████████████████████████████████████| 3/3 [00:37<00:00, 13.41s/it][A
Processing Papers for 2025:  83%|█████████████████████████████████████▎       | 83/100 [34:18<08:49, 31.16s/it][A


  Processing PMCID: PMC12036138 (84/100) for year 2025
  Found 0 <table> elements in XML for PMC12036138.



Supps for PMC12036138:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC12036138: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.31it/s][A
Processing Papers for 2025:  84%|█████████████████████████████████████▊       | 84/100 [34:29<06:43, 25.19s/it][A


  Processing PMCID: PMC12087157 (85/100) for year 2025
  Found 0 <table> elements in XML for PMC12087157.


Processing Papers for 2025:  85%|██████████████████████████████████████▎      | 85/100 [34:30<04:29, 17.98s/it]


  Processing PMCID: PMC11913185 (86/100) for year 2025
  Found 3 <table> elements in XML for PMC11913185.


Processing Papers for 2025:  86%|██████████████████████████████████████▋      | 86/100 [34:32<03:01, 12.94s/it]


  Processing PMCID: PMC11928506 (87/100) for year 2025
  Found 0 <table> elements in XML for PMC11928506.



Supps for PMC11928506:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '18-22' took: 22.77 seconds (including any timeouts).



Supps for PMC11928506:  50%|██████████████████████████▌                          | 2/4 [00:25<00:21, 10.73s/it][A

  Camelot PDF processing for pages '1-1' took: 1.78 seconds (including any timeouts).



Supps for PMC11928506:  75%|███████████████████████████████████████▊             | 3/4 [00:26<00:06,  6.46s/it][A

  Camelot PDF processing for pages '1-5' took: 1.33 seconds (including any timeouts).



Supps for PMC11928506: 100%|█████████████████████████████████████████████████████| 4/4 [00:42<00:00, 10.25s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '67-71' took: 14.83 seconds (including any timeouts).


Processing Papers for 2025:  87%|███████████████████████████████████████▏     | 87/100 [35:27<05:31, 25.53s/it]


  Processing PMCID: PMC11973445 (88/100) for year 2025
  Found 1 <table> elements in XML for PMC11973445.
      Sufficient primers (46) found for PMC11973445 from XML. Skipping its supplement processing.


Processing Papers for 2025:  88%|███████████████████████████████████████▌     | 88/100 [35:29<03:42, 18.52s/it]


  Processing PMCID: PMC11999995 (89/100) for year 2025
  Found 1 <table> elements in XML for PMC11999995.


Processing Papers for 2025:  89%|████████████████████████████████████████     | 89/100 [35:30<02:28, 13.46s/it]


  Processing PMCID: PMC12060085 (90/100) for year 2025
  Found 0 <table> elements in XML for PMC12060085.



Supps for PMC12060085:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC12060085: 100%|█████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.35s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-2' took: 2.99 seconds (including any timeouts).


Processing Papers for 2025:  90%|████████████████████████████████████████▌    | 90/100 [35:37<01:54, 11.47s/it]


  Processing PMCID: PMC11990000 (91/100) for year 2025
  Found 0 <table> elements in XML for PMC11990000.


Processing Papers for 2025:  91%|████████████████████████████████████████▉    | 91/100 [35:39<01:16,  8.55s/it]


  Processing PMCID: PMC11735105 (92/100) for year 2025
  Found 0 <table> elements in XML for PMC11735105.



Supps for PMC11735105:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11735105: 100%|█████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.22s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '37-41' took: 9.94 seconds (including any timeouts).


Processing Papers for 2025:  92%|█████████████████████████████████████████▍   | 92/100 [35:54<01:23, 10.47s/it]


  Processing PMCID: PMC12012330 (93/100) for year 2025
  Found 0 <table> elements in XML for PMC12012330.
      Sufficient primers (29) found for PMC12012330 from XML. Skipping its supplement processing.


Processing Papers for 2025:  93%|█████████████████████████████████████████▊   | 93/100 [35:56<00:55,  7.93s/it]


  Processing PMCID: PMC12077006 (94/100) for year 2025
  Found 0 <table> elements in XML for PMC12077006.



Supps for PMC12077006:   0%|                                                             | 0/2 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC12077006:  50%|██████████████████████████▌                          | 1/2 [00:02<00:02,  2.18s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.01 seconds (including any timeouts).
      Sufficient primers (36) now found for PMC12077006. Skipping its remaining supplements.


Processing Papers for 2025:  94%|██████████████████████████████████████████▎  | 94/100 [36:07<00:53,  8.95s/it]


  Processing PMCID: PMC11928738 (95/100) for year 2025
  Found 0 <table> elements in XML for PMC11928738.



Supps for PMC11928738:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '9-13' took: 19.72 seconds (including any timeouts).
      Sufficient primers (45) now found for PMC11928738. Skipping its remaining supplements.


Processing Papers for 2025:  95%|██████████████████████████████████████████▊  | 95/100 [36:44<01:27, 17.45s/it]


  Processing PMCID: PMC11960892 (96/100) for year 2025
  Found 0 <table> elements in XML for PMC11960892.


Processing Papers for 2025:  96%|███████████████████████████████████████████▏ | 96/100 [36:46<00:51, 12.79s/it]


  Processing PMCID: PMC11743952 (97/100) for year 2025
  Found 0 <table> elements in XML for PMC11743952.


Processing Papers for 2025:  97%|███████████████████████████████████████████▋ | 97/100 [36:48<00:28,  9.44s/it]


  Processing PMCID: PMC11954210 (98/100) for year 2025
  Found 0 <table> elements in XML for PMC11954210.



Supps for PMC11954210:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2025:  98%|████████████████████████████████████████████ | 98/100 [36:58<00:19,  9.61s/it][A


  Processing PMCID: PMC11940393 (99/100) for year 2025
  Found 0 <table> elements in XML for PMC11940393.


Processing Papers for 2025:  99%|████████████████████████████████████████████▌| 99/100 [37:00<00:07,  7.28s/it]


  Processing PMCID: PMC11981745 (100/100) for year 2025
  Found 5 <table> elements in XML for PMC11981745.


Processing Papers for 2025: 100%|████████████████████████████████████████████| 100/100 [37:01<00:00, 22.22s/it]



--- Consolidating and Saving Results for YEAR 2025 ---
Total unique primer entries found for 2025: 1666
  ✅ Saved 1666 unique extracted primers for 2025 to data\psc\yearly_results\master_extracted_primers_2025.csv
###### FINISHED PROCESSING FOR YEAR: 2025 ######

STARTING PROCESSING FOR YEAR: 2024

Searching Europe PMC with query: OPEN_ACCESS:y AND HAS_FT:y AND (METHODS:"qPCR" OR "RT-PCR" OR "real time PCR") AND (ABSTRACT:"pluripotent" OR "iPSC" OR "PSC" OR "hPSC" OR "ESC" OR "hESC") AND (FIRST_PDATE:2024)
Retrieved 100 records for 2024.


Processing Papers for 2024:   0%|                                                      | 0/100 [00:00<?, ?it/s]


  Processing PMCID: PMC11787772 (1/100) for year 2024
  Found 3 <table> elements in XML for PMC11787772.



Supps for PMC11787772:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11787772: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.74it/s][A
Processing Papers for 2024:   1%|▍                                             | 1/100 [00:05<09:35,  5.81s/it][A


  Processing PMCID: PMC11252539 (2/100) for year 2024
  Found 0 <table> elements in XML for PMC11252539.



Supps for PMC11252539:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC11252539:  50%|██████████████████████████▌                          | 1/2 [00:15<00:14, 15.00s/it][A

  Camelot PDF processing for pages '13-17' took: 14.82 seconds (including any timeouts).



Supps for PMC11252539: 100%|█████████████████████████████████████████████████████| 2/2 [00:31<00:00, 16.08s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '27-31' took: 15.92 seconds (including any timeouts).


Processing Papers for 2024:   2%|▉                                             | 2/100 [00:45<41:41, 25.53s/it]


  Processing PMCID: PMC11462042 (3/100) for year 2024
  Found 1 <table> elements in XML for PMC11462042.



Supps for PMC11462042:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11462042: 100%|█████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.74s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '20-24' took: 10.54 seconds (including any timeouts).


Processing Papers for 2024:   3%|█▍                                            | 3/100 [01:05<37:11, 23.00s/it]


  Processing PMCID: PMC11762643 (4/100) for year 2024
  Found 2 <table> elements in XML for PMC11762643.


Processing Papers for 2024:   4%|█▊                                            | 4/100 [01:07<23:29, 14.68s/it]


  Processing PMCID: PMC11412063 (5/100) for year 2024
  Found 0 <table> elements in XML for PMC11412063.



Supps for PMC11412063:   0%|                                                             | 0/2 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '5-9' took: 13.81 seconds (including any timeouts).
      Sufficient primers (12) now found for PMC11412063. Skipping its remaining supplements.


Processing Papers for 2024:   5%|██▎                                           | 5/100 [01:29<27:37, 17.44s/it]


  Processing PMCID: PMC11762450 (6/100) for year 2024
  Found 2 <table> elements in XML for PMC11762450.


Processing Papers for 2024:   6%|██▊                                           | 6/100 [01:32<19:26, 12.41s/it]


  Processing PMCID: PMC10986062 (7/100) for year 2024
  Found 1 <table> elements in XML for PMC10986062.



Supps for PMC10986062:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10986062: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.58it/s][A
Processing Papers for 2024:   7%|███▏                                          | 7/100 [01:37<15:41, 10.12s/it][A


  Processing PMCID: PMC11582570 (8/100) for year 2024
  Found 0 <table> elements in XML for PMC11582570.



Supps for PMC11582570:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11582570: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.10it/s][A
Processing Papers for 2024:   8%|███▋                                          | 8/100 [01:44<13:49,  9.01s/it][A


  Processing PMCID: PMC10940796 (9/100) for year 2024
  Found 0 <table> elements in XML for PMC10940796.


Processing Papers for 2024:   9%|████▏                                         | 9/100 [01:45<10:03,  6.63s/it]


  Processing PMCID: PMC11277421 (10/100) for year 2024
  Found 0 <table> elements in XML for PMC11277421.


Processing Papers for 2024:  10%|████▌                                        | 10/100 [01:47<07:41,  5.13s/it]


  Processing PMCID: PMC11638181 (11/100) for year 2024
  Found 0 <table> elements in XML for PMC11638181.



Supps for PMC11638181:   0%|                                                             | 0/2 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11638181:  50%|██████████████████████████▌                          | 1/2 [00:03<00:03,  3.20s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.53 seconds (including any timeouts).
      Sufficient primers (88) now found for PMC11638181. Skipping its remaining supplements.


Processing Papers for 2024:  11%|████▉                                        | 11/100 [01:59<10:44,  7.24s/it]


  Processing PMCID: PMC11126453 (12/100) for year 2024
  Found 0 <table> elements in XML for PMC11126453.



Supps for PMC11126453:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11126453: 100%|█████████████████████████████████████████████████████| 1/1 [00:17<00:00, 17.56s/it][A
                                                       

  Camelot PDF processing for pages '3-7' took: 17.05 seconds (including any timeouts).


Processing Papers for 2024:  12%|█████▍                                       | 12/100 [02:20<16:57, 11.56s/it]


  Processing PMCID: PMC11600670 (13/100) for year 2024
  Found 0 <table> elements in XML for PMC11600670.



Supps for PMC11600670:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11600670: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.90it/s][A
Processing Papers for 2024:  13%|█████▊                                       | 13/100 [02:24<13:19,  9.19s/it][A


  Processing PMCID: PMC11647242 (14/100) for year 2024
  Found 5 <table> elements in XML for PMC11647242.



Supps for PMC11647242:   0%|                                                             | 0/1 [00:00<?, ?it/s][A

  Lattice flavor TIMED OUT for PMC11647242_supp1.pdf after 60s on pages '3-7'.





  Camelot PDF processing for pages '3-7' took: 64.75 seconds (including any timeouts).


Supps for PMC11647242: 100%|█████████████████████████████████████████████████████| 1/1 [01:05<00:00, 65.04s/it][A
Processing Papers for 2024:  14%|██████▎                                      | 14/100 [03:36<40:17, 28.11s/it][A


  Processing PMCID: PMC11595023 (15/100) for year 2024
  Found 0 <table> elements in XML for PMC11595023.


Processing Papers for 2024:  15%|██████▊                                      | 15/100 [03:38<28:41, 20.26s/it]


  Processing PMCID: PMC11341592 (16/100) for year 2024
  Found 2 <table> elements in XML for PMC11341592.


Processing Papers for 2024:  16%|███████▏                                     | 16/100 [03:39<20:27, 14.61s/it]


  Processing PMCID: PMC11367854 (17/100) for year 2024
  Found 0 <table> elements in XML for PMC11367854.



Supps for PMC11367854:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC11367854:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  4.72it/s][A
                                                                                                               [A

      Sufficient primers (11) now found for PMC11367854. Skipping its remaining supplements.


Processing Papers for 2024:  17%|███████▋                                     | 17/100 [03:45<16:32, 11.96s/it]


  Processing PMCID: PMC11260683 (18/100) for year 2024
  Found 0 <table> elements in XML for PMC11260683.



Supps for PMC11260683:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '10-14' took: 16.13 seconds (including any timeouts).


Processing Papers for 2024:  18%|████████                                     | 18/100 [04:16<23:56, 17.52s/it]


  Processing PMCID: PMC10847391 (20/100) for year 2024
  Found 0 <table> elements in XML for PMC10847391.



Supps for PMC10847391:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2024:  20%|█████████                                    | 20/100 [04:21<14:09, 10.62s/it][A


  Processing PMCID: PMC11772670 (21/100) for year 2024
  Found 1 <table> elements in XML for PMC11772670.



Supps for PMC11772670:   0%|                                                             | 0/2 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '18-22' took: 28.59 seconds (including any timeouts).



Supps for PMC11772670: 100%|█████████████████████████████████████████████████████| 2/2 [00:38<00:00, 17.60s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '3-7' took: 9.08 seconds (including any timeouts).


Processing Papers for 2024:  21%|█████████▍                                   | 21/100 [05:30<33:17, 25.28s/it]


  Processing PMCID: PMC11431361 (22/100) for year 2024
  Found 2 <table> elements in XML for PMC11431361.


Processing Papers for 2024:  22%|█████████▉                                   | 22/100 [05:32<24:53, 19.15s/it]


  Processing PMCID: PMC11554259 (23/100) for year 2024
  Found 0 <table> elements in XML for PMC11554259.



Supps for PMC11554259:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11554259: 100%|█████████████████████████████████████████████████████| 1/1 [00:20<00:00, 21.00s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-4' took: 20.54 seconds (including any timeouts).


Processing Papers for 2024:  23%|██████████▎                                  | 23/100 [06:06<29:36, 23.08s/it]


  Processing PMCID: PMC11140888 (24/100) for year 2024
  Found 0 <table> elements in XML for PMC11140888.


Processing Papers for 2024:  24%|██████████▊                                  | 24/100 [06:08<21:38, 17.09s/it]


  Processing PMCID: PMC11530062 (25/100) for year 2024
  Found 0 <table> elements in XML for PMC11530062.



Supps for PMC11530062:   0%|                                                             | 0/8 [00:00<?, ?it/s][A
Supps for PMC11530062:  12%|██████▋                                              | 1/8 [00:03<00:24,  3.45s/it][ACropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.75 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11530062:  25%|█████████████▎                                       | 2/8 [00:08<00:25,  4.28s/it][A

  Camelot PDF processing for pages '1-1' took: 4.01 seconds (including any timeouts).



Supps for PMC11530062:  38%|███████████████████▉                                 | 3/8 [00:23<00:46,  9.22s/it][A

  Camelot PDF processing for pages '1-1' took: 12.12 seconds (including any timeouts).



Supps for PMC11530062:  50%|██████████████████████████▌                          | 4/8 [00:47<01:00, 15.03s/it][A

  Camelot PDF processing for pages '1-1' took: 19.33 seconds (including any timeouts).



Supps for PMC11530062:  62%|█████████████████████████████████▏                   | 5/8 [00:50<00:32, 10.75s/it][A

  Camelot PDF processing for pages '1-1' took: 2.89 seconds (including any timeouts).



Supps for PMC11530062:  75%|███████████████████████████████████████▊             | 6/8 [00:57<00:18,  9.31s/it][A

  Camelot PDF processing for pages '1-1' took: 5.56 seconds (including any timeouts).



Supps for PMC11530062:  88%|██████████████████████████████████████████████▍      | 7/8 [01:02<00:08,  8.19s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 4.89 seconds (including any timeouts).


Processing Papers for 2024:  25%|███████████▎                                 | 25/100 [07:46<50:25, 40.33s/it]


  Processing PMCID: PMC10873380 (26/100) for year 2024
  Found 0 <table> elements in XML for PMC10873380.
      Sufficient primers (12) found for PMC10873380 from XML. Skipping its supplement processing.


Processing Papers for 2024:  26%|███████████▋                                 | 26/100 [07:48<35:56, 29.15s/it]


  Processing PMCID: PMC11352932 (27/100) for year 2024
  Found 0 <table> elements in XML for PMC11352932.


Processing Papers for 2024:  27%|████████████▏                                | 27/100 [07:50<25:48, 21.21s/it]


  Processing PMCID: PMC11612254 (28/100) for year 2024
  Found 0 <table> elements in XML for PMC11612254.



Supps for PMC11612254:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11612254: 100%|█████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.83s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '10-14' took: 10.66 seconds (including any timeouts).


Processing Papers for 2024:  28%|████████████▌                                | 28/100 [08:05<23:20, 19.45s/it]


  Processing PMCID: PMC11016801 (29/100) for year 2024
  Found 1 <table> elements in XML for PMC11016801.



Supps for PMC11016801:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11016801: 100%|█████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.79s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '6-10' took: 8.64 seconds (including any timeouts).


Processing Papers for 2024:  29%|█████████████                                | 29/100 [08:18<20:53, 17.65s/it]


  Processing PMCID: PMC11570519 (30/100) for year 2024
  Found 0 <table> elements in XML for PMC11570519.



Supps for PMC11570519:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC11570519:  50%|██████████████████████████▌                          | 1/2 [00:14<00:14, 14.59s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '8-12' took: 14.34 seconds (including any timeouts).
      Sufficient primers (56) now found for PMC11570519. Skipping its remaining supplements.


Processing Papers for 2024:  30%|█████████████▌                               | 30/100 [08:39<21:40, 18.58s/it]


  Processing PMCID: PMC11226837 (31/100) for year 2024
  Found 0 <table> elements in XML for PMC11226837.



Supps for PMC11226837:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11226837: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.83it/s][A
Processing Papers for 2024:  31%|█████████████▉                               | 31/100 [09:04<23:33, 20.49s/it][A


  Processing PMCID: PMC11704612 (32/100) for year 2024
  Found 1 <table> elements in XML for PMC11704612.



Supps for PMC11704612:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC11704612:  50%|██████████████████████████▌                          | 1/2 [00:24<00:24, 24.78s/it][A

  Camelot PDF processing for pages '4-8' took: 24.58 seconds (including any timeouts).



Supps for PMC11704612: 100%|█████████████████████████████████████████████████████| 2/2 [00:58<00:00, 30.19s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '20-24' took: 28.86 seconds (including any timeouts).


Processing Papers for 2024:  32%|██████████████▍                              | 32/100 [10:12<39:05, 34.50s/it]


  Processing PMCID: PMC11719772 (33/100) for year 2024
  Found 3 <table> elements in XML for PMC11719772.
      Sufficient primers (9) found for PMC11719772 from XML. Skipping its supplement processing.


Processing Papers for 2024:  33%|██████████████▊                              | 33/100 [10:13<27:32, 24.66s/it]


  Processing PMCID: PMC10853177 (34/100) for year 2024
  Found 3 <table> elements in XML for PMC10853177.
      Sufficient primers (32) found for PMC10853177 from XML. Skipping its supplement processing.


Processing Papers for 2024:  34%|███████████████▎                             | 34/100 [10:15<19:36, 17.83s/it]


  Processing PMCID: PMC10963097 (35/100) for year 2024
  Found 0 <table> elements in XML for PMC10963097.



Supps for PMC10963097:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10963097: 100%|█████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.73s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-3' took: 6.41 seconds (including any timeouts).


Processing Papers for 2024:  35%|███████████████▋                             | 35/100 [10:25<16:49, 15.53s/it]


  Processing PMCID: PMC11201801 (36/100) for year 2024
  Found 0 <table> elements in XML for PMC11201801.


Processing Papers for 2024:  36%|████████████████▏                            | 36/100 [10:27<12:08, 11.39s/it]


  Processing PMCID: PMC11578294 (37/100) for year 2024
  Found 0 <table> elements in XML for PMC11578294.
    ⚠️ Unexpected error processing paper PMC11578294: HTTPSConnectionPool(host='europepmc.org', port=443): Read timed out. (read timeout=60)


Traceback (most recent call last):
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\socket.py", line 716, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\ssl.py", line 1275, in recv_into
    return self.read(nbytes, buffer)
  File "C:\


  Processing PMCID: PMC11452544 (38/100) for year 2024
  Found 0 <table> elements in XML for PMC11452544.



Supps for PMC11452544:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11452544: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.27it/s][A
Processing Papers for 2024:  38%|█████████████████                            | 38/100 [11:37<21:32, 20.85s/it][A


  Processing PMCID: PMC11272160 (39/100) for year 2024
  Found 0 <table> elements in XML for PMC11272160.



Supps for PMC11272160:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11272160: 100%|█████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.16s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-5' took: 14.11 seconds (including any timeouts).


Processing Papers for 2024:  39%|█████████████████▌                           | 39/100 [12:02<22:37, 22.26s/it]


  Processing PMCID: PMC11728883 (40/100) for year 2024
  Found 12 <table> elements in XML for PMC11728883.


Processing Papers for 2024:  40%|██████████████████                           | 40/100 [12:04<16:14, 16.24s/it]


  Processing PMCID: PMC11096619 (41/100) for year 2024
  Found 0 <table> elements in XML for PMC11096619.



Supps for PMC11096619:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC11096619:  50%|██████████████████████████▌                          | 1/2 [00:13<00:13, 13.23s/it][A

  Camelot PDF processing for pages '10-14' took: 13.02 seconds (including any timeouts).



Supps for PMC11096619: 100%|█████████████████████████████████████████████████████| 2/2 [00:27<00:00, 13.96s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '27-31' took: 13.46 seconds (including any timeouts).


Processing Papers for 2024:  41%|██████████████████▍                          | 41/100 [12:59<27:21, 27.83s/it]


  Processing PMCID: PMC11342089 (42/100) for year 2024
  Found 0 <table> elements in XML for PMC11342089.
    ⚠️ Unexpected error processing paper PMC11342089: HTTPSConnectionPool(host='europepmc.org', port=443): Read timed out. (read timeout=60)


Traceback (most recent call last):
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\socket.py", line 716, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\ssl.py", line 1275, in recv_into
    return self.read(nbytes, buffer)
  File "C:\


  Processing PMCID: PMC11667033 (43/100) for year 2024
  Found 0 <table> elements in XML for PMC11667033.



Supps for PMC11667033:   0%|                                                             | 0/2 [00:00<?, ?it/s][A

  Camelot PDF processing for pages '3-7' took: 71.58 seconds (including any timeouts).



Supps for PMC11667033:  50%|██████████████████████████▌                          | 1/2 [01:12<01:12, 72.03s/it][A
                                                                                                               [A

      Sufficient primers (30) now found for PMC11667033. Skipping its remaining supplements.


Processing Papers for 2024:  43%|███████████████████▎                         | 43/100 [15:27<49:38, 52.26s/it]


  Processing PMCID: PMC10941625 (44/100) for year 2024
  Found 0 <table> elements in XML for PMC10941625.


Processing Papers for 2024:  44%|███████████████████▊                         | 44/100 [15:29<34:42, 37.18s/it]


  Processing PMCID: PMC11301355 (45/100) for year 2024
  Found 0 <table> elements in XML for PMC11301355.



Supps for PMC11301355:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11301355: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.53it/s][A
Processing Papers for 2024:  45%|████████████████████▎                        | 45/100 [15:35<25:16, 27.58s/it][A


  Processing PMCID: PMC11347211 (46/100) for year 2024
  Found 0 <table> elements in XML for PMC11347211.


Processing Papers for 2024:  46%|████████████████████▋                        | 46/100 [15:36<17:48, 19.79s/it]


  Processing PMCID: PMC11393772 (47/100) for year 2024
  Found 0 <table> elements in XML for PMC11393772.



Supps for PMC11393772:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11393772: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.42it/s][A
Processing Papers for 2024:  47%|█████████████████████▏                       | 47/100 [15:45<14:35, 16.53s/it][A


  Processing PMCID: PMC11118134 (48/100) for year 2024
  Found 0 <table> elements in XML for PMC11118134.


Processing Papers for 2024:  48%|█████████████████████▌                       | 48/100 [15:47<10:24, 12.01s/it]


  Processing PMCID: PMC11275010 (49/100) for year 2024
  Found 0 <table> elements in XML for PMC11275010.


Processing Papers for 2024:  49%|██████████████████████                       | 49/100 [15:49<07:39,  9.01s/it]


  Processing PMCID: PMC10797036 (50/100) for year 2024
  Found 0 <table> elements in XML for PMC10797036.



Supps for PMC10797036:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2024:  50%|██████████████████████▌                      | 50/100 [16:13<11:25, 13.70s/it][A


  Processing PMCID: PMC11624181 (51/100) for year 2024
  Found 0 <table> elements in XML for PMC11624181.



Supps for PMC11624181:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11624181: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.20it/s][A
Processing Papers for 2024:  51%|██████████████████████▉                      | 51/100 [16:18<08:55, 10.93s/it][A


  Processing PMCID: PMC11390961 (52/100) for year 2024
  Found 0 <table> elements in XML for PMC11390961.



Supps for PMC11390961:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2024:  52%|███████████████████████▍                     | 52/100 [16:22<07:02,  8.81s/it][A


  Processing PMCID: PMC11685540 (53/100) for year 2024
  Found 0 <table> elements in XML for PMC11685540.



Supps for PMC11685540:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11685540:  25%|█████████████▎                                       | 1/4 [00:03<00:11,  3.98s/it][A

  Camelot PDF processing for pages '1' took: 2.88 seconds (including any timeouts).



Supps for PMC11685540:  50%|██████████████████████████▌                          | 2/4 [00:10<00:11,  5.58s/it][A

  Camelot PDF processing for pages '1-3' took: 5.97 seconds (including any timeouts).



Supps for PMC11685540:  75%|███████████████████████████████████████▊             | 3/4 [00:35<00:14, 14.25s/it][A

  Camelot PDF processing for pages '11-15' took: 23.10 seconds (including any timeouts).



Processing Papers for 2024:  53%|███████████████████████▊                     | 53/100 [17:07<15:28, 19.77s/it][A


  Processing PMCID: PMC11608089 (54/100) for year 2024
  Found 3 <table> elements in XML for PMC11608089.


Processing Papers for 2024:  54%|████████████████████████▎                    | 54/100 [17:09<10:59, 14.33s/it]


  Processing PMCID: PMC11342912 (55/100) for year 2024
  Found 0 <table> elements in XML for PMC11342912.
      Sufficient primers (14) found for PMC11342912 from XML. Skipping its supplement processing.


Processing Papers for 2024:  55%|████████████████████████▊                    | 55/100 [17:10<07:54, 10.54s/it]


  Processing PMCID: PMC11178176 (56/100) for year 2024
  Found 1 <table> elements in XML for PMC11178176.



Supps for PMC11178176:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11178176: 100%|█████████████████████████████████████████████████████| 1/1 [00:17<00:00, 17.15s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 16.84 seconds (including any timeouts).


Processing Papers for 2024:  56%|█████████████████████████▏                   | 56/100 [17:32<10:10, 13.86s/it]


  Processing PMCID: PMC11078715 (57/100) for year 2024
  Found 0 <table> elements in XML for PMC11078715.


Processing Papers for 2024:  57%|█████████████████████████▋                   | 57/100 [17:33<07:18, 10.20s/it]


  Processing PMCID: PMC11668817 (58/100) for year 2024
  Found 0 <table> elements in XML for PMC11668817.


Processing Papers for 2024:  58%|██████████████████████████                   | 58/100 [17:35<05:17,  7.56s/it]


  Processing PMCID: PMC11307172 (59/100) for year 2024
  Found 3 <table> elements in XML for PMC11307172.


Processing Papers for 2024:  59%|██████████████████████████▌                  | 59/100 [17:36<03:54,  5.71s/it]


  Processing PMCID: PMC11563035 (60/100) for year 2024
  Found 2 <table> elements in XML for PMC11563035.



Supps for PMC11563035:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11563035: 100%|█████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.40s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 9.29 seconds (including any timeouts).


Processing Papers for 2024:  60%|███████████████████████████                  | 60/100 [17:57<06:52, 10.31s/it]


  Processing PMCID: PMC11649531 (61/100) for year 2024
  Found 2 <table> elements in XML for PMC11649531.
      Sufficient primers (10) found for PMC11649531 from XML. Skipping its supplement processing.


Processing Papers for 2024:  61%|███████████████████████████▍                 | 61/100 [17:59<05:01,  7.72s/it]


  Processing PMCID: PMC11438508 (62/100) for year 2024
  Found 0 <table> elements in XML for PMC11438508.



Supps for PMC11438508:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11438508: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.71it/s][A
Processing Papers for 2024:  62%|███████████████████████████▉                 | 62/100 [18:03<04:08,  6.55s/it][A


  Processing PMCID: PMC11487784 (63/100) for year 2024
  Found 0 <table> elements in XML for PMC11487784.



Supps for PMC11487784:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '8-12' took: 12.43 seconds (including any timeouts).


Processing Papers for 2024:  63%|████████████████████████████▎                | 63/100 [18:37<09:12, 14.93s/it]


  Processing PMCID: PMC10792855 (64/100) for year 2024
  Found 3 <table> elements in XML for PMC10792855.



Supps for PMC10792855:   0%|                                                             | 0/7 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10792855:  14%|███████▌                                             | 1/7 [00:02<00:12,  2.07s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.01 seconds (including any timeouts).



Supps for PMC10792855:  29%|███████████████▏                                     | 2/7 [00:05<00:13,  2.73s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 3.12 seconds (including any timeouts).



Supps for PMC10792855:  43%|██████████████████████▋                              | 3/7 [00:08<00:11,  2.96s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 3.14 seconds (including any timeouts).



Supps for PMC10792855:  57%|██████████████████████████████▎                      | 4/7 [00:11<00:09,  3.03s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 3.00 seconds (including any timeouts).



Supps for PMC10792855:  71%|█████████████████████████████████████▊               | 5/7 [00:14<00:06,  3.06s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 3.00 seconds (including any timeouts).



Supps for PMC10792855:  86%|█████████████████████████████████████████████▍       | 6/7 [00:17<00:02,  2.95s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.57 seconds (including any timeouts).



Supps for PMC10792855: 100%|█████████████████████████████████████████████████████| 7/7 [00:20<00:00,  2.93s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.77 seconds (including any timeouts).


Processing Papers for 2024:  64%|████████████████████████████▊                | 64/100 [19:13<12:39, 21.10s/it]


  Processing PMCID: PMC11791114 (65/100) for year 2024
  Found 0 <table> elements in XML for PMC11791114.



Supps for PMC11791114:   0%|                                                             | 0/2 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '2-6' took: 17.41 seconds (including any timeouts).



Supps for PMC11791114: 100%|█████████████████████████████████████████████████████| 2/2 [00:18<00:00,  9.41s/it][A
Processing Papers for 2024:  65%|█████████████████████████████▎               | 65/100 [19:45<14:18, 24.53s/it][A


  Processing PMCID: PMC11604831 (66/100) for year 2024
  Found 1 <table> elements in XML for PMC11604831.



Supps for PMC11604831:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '16-20' took: 14.10 seconds (including any timeouts).


Processing Papers for 2024:  66%|█████████████████████████████▋               | 66/100 [20:05<13:06, 23.14s/it]


  Processing PMCID: PMC11659569 (67/100) for year 2024
  Found 1 <table> elements in XML for PMC11659569.



Supps for PMC11659569:   0%|                                                             | 0/1 [00:00<?, ?it/s][A


  Camelot PDF processing for pages '7-11' took: 59.88 seconds (including any timeouts).


Supps for PMC11659569: 100%|█████████████████████████████████████████████████████| 1/1 [01:00<00:00, 60.18s/it][A
Processing Papers for 2024:  67%|██████████████████████████████▏              | 67/100 [21:09<19:29, 35.44s/it][A


  Processing PMCID: PMC11624199 (68/100) for year 2024
  Found 1 <table> elements in XML for PMC11624199.



Supps for PMC11624199:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2024:  68%|██████████████████████████████▌              | 68/100 [21:12<13:43, 25.74s/it][A


  Processing PMCID: PMC11896400 (69/100) for year 2024
  Found 0 <table> elements in XML for PMC11896400.
      Sufficient primers (15) found for PMC11896400 from XML. Skipping its supplement processing.


Processing Papers for 2024:  69%|███████████████████████████████              | 69/100 [21:14<09:36, 18.60s/it]


  Processing PMCID: PMC11707394 (70/100) for year 2024
  Found 2 <table> elements in XML for PMC11707394.
      Sufficient primers (54) found for PMC11707394 from XML. Skipping its supplement processing.


Processing Papers for 2024:  70%|███████████████████████████████▍             | 70/100 [21:16<06:47, 13.58s/it]


  Processing PMCID: PMC11303166 (71/100) for year 2024
  Found 2 <table> elements in XML for PMC11303166.
      Sufficient primers (10) found for PMC11303166 from XML. Skipping its supplement processing.


Processing Papers for 2024:  71%|███████████████████████████████▉             | 71/100 [21:18<04:50, 10.01s/it]


  Processing PMCID: PMC11464881 (72/100) for year 2024
  Found 1 <table> elements in XML for PMC11464881.
      Sufficient primers (38) found for PMC11464881 from XML. Skipping its supplement processing.


Processing Papers for 2024:  72%|████████████████████████████████▍            | 72/100 [21:20<03:31,  7.54s/it]


  Processing PMCID: PMC11442691 (73/100) for year 2024
  Found 0 <table> elements in XML for PMC11442691.
      Sufficient primers (11) found for PMC11442691 from XML. Skipping its supplement processing.


Processing Papers for 2024:  73%|████████████████████████████████▊            | 73/100 [21:21<02:36,  5.79s/it]


  Processing PMCID: PMC10826047 (74/100) for year 2024
  Found 0 <table> elements in XML for PMC10826047.


Processing Papers for 2024:  74%|█████████████████████████████████▎           | 74/100 [21:23<01:59,  4.59s/it]


  Processing PMCID: PMC11480346 (75/100) for year 2024
  Found 0 <table> elements in XML for PMC11480346.



Supps for PMC11480346:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11480346: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.30it/s][A
Processing Papers for 2024:  75%|█████████████████████████████████▊           | 75/100 [21:34<02:38,  6.34s/it][A


  Processing PMCID: PMC12096905 (76/100) for year 2024
  Found 0 <table> elements in XML for PMC12096905.



Supps for PMC12096905:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '10-14' took: 18.61 seconds (including any timeouts).



Supps for PMC12096905: 100%|█████████████████████████████████████████████████████| 1/1 [00:19<00:00, 19.20s/it][A
Processing Papers for 2024:  76%|██████████████████████████████████▏          | 76/100 [21:57<04:34, 11.43s/it][A


  Processing PMCID: PMC11502794 (77/100) for year 2024
  Found 0 <table> elements in XML for PMC11502794.



Supps for PMC11502794:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '9-13' took: 12.90 seconds (including any timeouts).


Processing Papers for 2024:  77%|██████████████████████████████████▋          | 77/100 [22:15<05:07, 13.39s/it]


  Processing PMCID: PMC11652941 (78/100) for year 2024
  Found 2 <table> elements in XML for PMC11652941.
      Sufficient primers (20) found for PMC11652941 from XML. Skipping its supplement processing.


Processing Papers for 2024:  78%|███████████████████████████████████          | 78/100 [22:17<03:37,  9.90s/it]


  Processing PMCID: PMC11605830 (79/100) for year 2024
  Found 0 <table> elements in XML for PMC11605830.



Supps for PMC11605830:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11605830: 100%|█████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.58s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-5' took: 10.03 seconds (including any timeouts).


Processing Papers for 2024:  79%|███████████████████████████████████▌         | 79/100 [22:31<03:54, 11.14s/it]


  Processing PMCID: PMC11810796 (80/100) for year 2024
  Found 0 <table> elements in XML for PMC11810796.



Supps for PMC11810796:   0%|                                                             | 0/3 [00:00<?, ?it/s][A
Supps for PMC11810796:  33%|█████████████████▋                                   | 1/3 [00:14<00:28, 14.42s/it][A

  Camelot PDF processing for pages '8-12' took: 14.04 seconds (including any timeouts).
  Lattice flavor TIMED OUT for PMC11810796_supp2.pdf after 60s on pages '1-4'.



Supps for PMC11810796:  67%|███████████████████████████████████▎                 | 2/3 [02:46<01:35, 95.39s/it][A

  Stream flavor TIMED OUT for PMC11810796_supp2.pdf after 60s on pages '1-4'.
  Camelot PDF processing for pages '1-4' took: 123.04 seconds (including any timeouts).



Supps for PMC11810796: 100%|█████████████████████████████████████████████████████| 3/3 [02:53<00:00, 55.18s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '22-26' took: 5.86 seconds (including any timeouts).


Processing Papers for 2024:  80%|████████████████████████████████████         | 80/100 [25:33<20:47, 62.40s/it]


  Processing PMCID: PMC10867012 (81/100) for year 2024
  Found 0 <table> elements in XML for PMC10867012.



Supps for PMC10867012:   0%|                                                             | 0/3 [00:00<?, ?it/s][A
Supps for PMC10867012:  33%|█████████████████▋                                   | 1/3 [00:20<00:40, 20.27s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '27-31' took: 19.93 seconds (including any timeouts).
      Sufficient primers (118) now found for PMC10867012. Skipping its remaining supplements.


Processing Papers for 2024:  81%|████████████████████████████████████▍        | 81/100 [26:05<16:53, 53.35s/it]


  Processing PMCID: PMC11357809 (82/100) for year 2024
  Found 50 <table> elements in XML for PMC11357809.
      Sufficient primers (9) found for PMC11357809 from XML. Skipping its supplement processing.


Processing Papers for 2024:  82%|████████████████████████████████████▉        | 82/100 [26:07<11:24, 38.05s/it]


  Processing PMCID: PMC10881400 (83/100) for year 2024
  Found 0 <table> elements in XML for PMC10881400.



Supps for PMC10881400:   0%|                                                             | 0/2 [00:00<?, ?it/s][A


  Camelot PDF processing for pages '17-21' took: 22.55 seconds (including any timeouts).


Supps for PMC10881400:  50%|██████████████████████████▌                          | 1/2 [00:23<00:23, 23.07s/it][A
                                                                                                               [A

      Sufficient primers (142) now found for PMC10881400. Skipping its remaining supplements.


Processing Papers for 2024:  83%|█████████████████████████████████████▎       | 83/100 [26:38<10:11, 35.96s/it]


  Processing PMCID: PMC11912026 (84/100) for year 2024
  Found 3 <table> elements in XML for PMC11912026.
      Sufficient primers (21) found for PMC11912026 from XML. Skipping its supplement processing.


Processing Papers for 2024:  84%|█████████████████████████████████████▊       | 84/100 [26:40<06:50, 25.63s/it]


  Processing PMCID: PMC10787008 (85/100) for year 2024
  Found 1 <table> elements in XML for PMC10787008.
      Sufficient primers (26) found for PMC10787008 from XML. Skipping its supplement processing.


Processing Papers for 2024:  85%|██████████████████████████████████████▎      | 85/100 [26:42<04:37, 18.48s/it]


  Processing PMCID: PMC11111830 (86/100) for year 2024
  Found 1 <table> elements in XML for PMC11111830.



Supps for PMC11111830:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC11111830: 100%|█████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.98s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '7-11' took: 10.72 seconds (including any timeouts).


Processing Papers for 2024:  86%|██████████████████████████████████████▋      | 86/100 [27:09<04:57, 21.25s/it]


  Processing PMCID: PMC11342178 (87/100) for year 2024
  Found 41 <table> elements in XML for PMC11342178.
      Sufficient primers (20) found for PMC11342178 from XML. Skipping its supplement processing.


Processing Papers for 2024:  87%|███████████████████████████████████████▏     | 87/100 [27:12<03:21, 15.53s/it]


  Processing PMCID: PMC10966914 (88/100) for year 2024
  Found 1 <table> elements in XML for PMC10966914.
      Sufficient primers (10) found for PMC10966914 from XML. Skipping its supplement processing.


Processing Papers for 2024:  88%|███████████████████████████████████████▌     | 88/100 [27:13<02:15, 11.31s/it]


  Processing PMCID: PMC11530573 (89/100) for year 2024
  Found 0 <table> elements in XML for PMC11530573.



Supps for PMC11530573:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '19-23' took: 28.76 seconds (including any timeouts).



Supps for PMC11530573:  25%|█████████████▎                                       | 1/4 [00:29<01:28, 29.49s/it][A
                                                                                                               [A

      Sufficient primers (26) now found for PMC11530573. Skipping its remaining supplements.


Processing Papers for 2024:  89%|████████████████████████████████████████     | 89/100 [29:40<09:33, 52.15s/it]


  Processing PMCID: PMC11411316 (90/100) for year 2024
  Found 0 <table> elements in XML for PMC11411316.



Supps for PMC11411316:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC11411316:  50%|██████████████████████████▌                          | 1/2 [00:13<00:13, 13.89s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '21-25' took: 13.66 seconds (including any timeouts).
      Sufficient primers (120) now found for PMC11411316. Skipping its remaining supplements.


Processing Papers for 2024:  90%|████████████████████████████████████████▌    | 90/100 [30:04<07:16, 43.70s/it]


  Processing PMCID: PMC11603134 (91/100) for year 2024
  Found 0 <table> elements in XML for PMC11603134.



Supps for PMC11603134:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Lattice flavor TIMED OUT for PMC11603134_supp1.pdf after 60s on pages '1-4'.



Supps for PMC11603134:  25%|█████████████                                       | 1/4 [02:33<07:40, 153.38s/it][A

  Stream flavor TIMED OUT for PMC11603134_supp1.pdf after 60s on pages '1-4'.
  Camelot PDF processing for pages '1-4' took: 122.83 seconds (including any timeouts).



Supps for PMC11603134:  50%|██████████████████████████                          | 2/4 [04:11<04:02, 121.05s/it][A


  Camelot PDF processing for pages '19-23' took: 96.67 seconds (including any timeouts).
      Sufficient primers (70) now found for PMC11603134. Skipping its remaining supplements.


Processing Papers for 2024:  91%|████████████████████████████████████████    | 91/100 [34:27<16:24, 109.39s/it][A


  Processing PMCID: PMC10843182 (92/100) for year 2024
  Found 1 <table> elements in XML for PMC10843182.



Supps for PMC10843182:   0%|                                                             | 0/2 [00:00<?, ?it/s][A


  Camelot PDF processing for pages '16-20' took: 15.90 seconds (including any timeouts).


Supps for PMC10843182:  50%|██████████████████████████▌                          | 1/2 [00:16<00:16, 16.22s/it][A
                                                                                                               [A

      Sufficient primers (16) now found for PMC10843182. Skipping its remaining supplements.


Processing Papers for 2024:  92%|█████████████████████████████████████████▍   | 92/100 [34:52<11:11, 83.94s/it]


  Processing PMCID: PMC11083256 (93/100) for year 2024
  Found 1 <table> elements in XML for PMC11083256.
      Sufficient primers (22) found for PMC11083256 from XML. Skipping its supplement processing.


Processing Papers for 2024:  93%|█████████████████████████████████████████▊   | 93/100 [34:53<06:55, 59.29s/it]


  Processing PMCID: PMC11246529 (94/100) for year 2024
  Found 0 <table> elements in XML for PMC11246529.
    ⚠️ Unexpected error processing paper PMC11246529: HTTPSConnectionPool(host='europepmc.org', port=443): Read timed out. (read timeout=60)


Traceback (most recent call last):
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\socket.py", line 716, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\ssl.py", line 1275, in recv_into
    return self.read(nbytes, buffer)
  File "C:\


  Processing PMCID: PMC11426554 (95/100) for year 2024
  Found 3 <table> elements in XML for PMC11426554.



Supps for PMC11426554:   0%|                                                             | 0/3 [00:00<?, ?it/s][A
Supps for PMC11426554:  67%|███████████████████████████████████▎                 | 2/3 [00:00<00:00, 19.23it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.43 seconds (including any timeouts).


Processing Papers for 2024:  95%|██████████████████████████████████████████▊  | 95/100 [37:07<05:17, 63.57s/it]


  Processing PMCID: PMC11545028 (96/100) for year 2024
  Found 0 <table> elements in XML for PMC11545028.


Processing Papers for 2024:  96%|███████████████████████████████████████████▏ | 96/100 [37:09<03:00, 45.10s/it]


  Processing PMCID: PMC11106629 (97/100) for year 2024
  Found 0 <table> elements in XML for PMC11106629.


Processing Papers for 2024:  97%|███████████████████████████████████████████▋ | 97/100 [37:11<01:36, 32.09s/it]


  Processing PMCID: PMC11593313 (98/100) for year 2024
  Found 0 <table> elements in XML for PMC11593313.


Processing Papers for 2024:  98%|████████████████████████████████████████████ | 98/100 [37:13<00:45, 22.95s/it]


  Processing PMCID: PMC11467398 (99/100) for year 2024
  Found 1 <table> elements in XML for PMC11467398.



Supps for PMC11467398:   0%|                                                             | 0/3 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC11467398:  33%|█████████████████▋                                   | 1/3 [00:12<00:25, 12.83s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropB

  Camelot PDF processing for pages '2-6' took: 12.55 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

  Camelot PDF processing for pages '25-29' took: 28.64 seconds (including any timeouts).



Supps for PMC11467398: 100%|█████████████████████████████████████████████████████| 3/3 [00:52<00:00, 17.07s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '5-9' took: 10.36 seconds (including any timeouts).


Processing Papers for 2024:  99%|████████████████████████████████████████████▌| 99/100 [38:14<00:34, 34.53s/it]


  Processing PMCID: PMC11303561 (100/100) for year 2024
  Found 0 <table> elements in XML for PMC11303561.



Supps for PMC11303561:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '20-24' took: 11.27 seconds (including any timeouts).


Processing Papers for 2024: 100%|████████████████████████████████████████████| 100/100 [38:30<00:00, 23.11s/it]



--- Consolidating and Saving Results for YEAR 2024 ---
Total unique primer entries found for 2024: 1708
  ✅ Saved 1708 unique extracted primers for 2024 to data\psc\yearly_results\master_extracted_primers_2024.csv
###### FINISHED PROCESSING FOR YEAR: 2024 ######

STARTING PROCESSING FOR YEAR: 2023

Searching Europe PMC with query: OPEN_ACCESS:y AND HAS_FT:y AND (METHODS:"qPCR" OR "RT-PCR" OR "real time PCR") AND (ABSTRACT:"pluripotent" OR "iPSC" OR "PSC" OR "hPSC" OR "ESC" OR "hESC") AND (FIRST_PDATE:2023)
Retrieved 100 records for 2023.


Processing Papers for 2023:   0%|                                                      | 0/100 [00:00<?, ?it/s]


  Processing PMCID: PMC10899886 (2/100) for year 2023
  Found 1 <table> elements in XML for PMC10899886.
      Sufficient primers (10) found for PMC10899886 from XML. Skipping its supplement processing.


Processing Papers for 2023:   2%|▉                                             | 2/100 [00:01<01:14,  1.31it/s]


  Processing PMCID: PMC10525589 (3/100) for year 2023
  Found 1 <table> elements in XML for PMC10525589.


Processing Papers for 2023:   3%|█▍                                            | 3/100 [00:03<01:53,  1.18s/it]


  Processing PMCID: PMC10498698 (4/100) for year 2023
  Found 1 <table> elements in XML for PMC10498698.
      Sufficient primers (20) found for PMC10498698 from XML. Skipping its supplement processing.


Processing Papers for 2023:   4%|█▊                                            | 4/100 [00:04<02:02,  1.28s/it]


  Processing PMCID: PMC10565870 (5/100) for year 2023
  Found 4 <table> elements in XML for PMC10565870.


Processing Papers for 2023:   5%|██▎                                           | 5/100 [00:06<02:15,  1.42s/it]


  Processing PMCID: PMC10500938 (6/100) for year 2023
  Found 0 <table> elements in XML for PMC10500938.



Supps for PMC10500938:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10500938: 100%|█████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.67s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-4' took: 12.26 seconds (including any timeouts).


Processing Papers for 2023:   6%|██▊                                           | 6/100 [00:23<10:13,  6.53s/it]


  Processing PMCID: PMC10679777 (7/100) for year 2023
  Found 0 <table> elements in XML for PMC10679777.



Supps for PMC10679777:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10679777:  50%|██████████████████████████▌                          | 1/2 [00:18<00:18, 18.76s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '8-12' took: 18.50 seconds (including any timeouts).
      Sufficient primers (80) now found for PMC10679777. Skipping its remaining supplements.


Processing Papers for 2023:   7%|███▏                                          | 7/100 [00:49<20:11, 13.03s/it]


  Processing PMCID: PMC10845017 (8/100) for year 2023
  Found 3 <table> elements in XML for PMC10845017.



Supps for PMC10845017:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10845017: 100%|█████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.80s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 3.36 seconds (including any timeouts).


Processing Papers for 2023:   8%|███▋                                          | 8/100 [00:57<17:08, 11.18s/it]


  Processing PMCID: PMC10608731 (9/100) for year 2023
  Found 1 <table> elements in XML for PMC10608731.


Processing Papers for 2023:   9%|████▏                                         | 9/100 [00:58<12:28,  8.23s/it]


  Processing PMCID: PMC10094141 (10/100) for year 2023
  Found 2 <table> elements in XML for PMC10094141.
      Sufficient primers (16) found for PMC10094141 from XML. Skipping its supplement processing.


Processing Papers for 2023:  10%|████▌                                        | 10/100 [01:00<09:31,  6.35s/it]


  Processing PMCID: PMC10623941 (11/100) for year 2023
  Found 0 <table> elements in XML for PMC10623941.



Supps for PMC10623941:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10623941: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.27it/s][A
Processing Papers for 2023:  11%|████▉                                        | 11/100 [01:17<13:50,  9.33s/it][A


  Processing PMCID: PMC10601309 (12/100) for year 2023
  Found 0 <table> elements in XML for PMC10601309.



Supps for PMC10601309:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10601309: 100%|█████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.31s/it][A
                                                       

  Camelot PDF processing for pages '3-7' took: 6.85 seconds (including any timeouts).


Processing Papers for 2023:  12%|█████▍                                       | 12/100 [01:55<26:43, 18.22s/it]


  Processing PMCID: PMC10518684 (13/100) for year 2023
  Found 1 <table> elements in XML for PMC10518684.
      Sufficient primers (30) found for PMC10518684 from XML. Skipping its supplement processing.


Processing Papers for 2023:  13%|█████▊                                       | 13/100 [01:57<19:17, 13.30s/it]


  Processing PMCID: PMC10629087 (14/100) for year 2023
  Found 0 <table> elements in XML for PMC10629087.



Supps for PMC10629087:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10629087: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.22it/s][A
Processing Papers for 2023:  14%|██████▎                                      | 14/100 [02:08<18:05, 12.62s/it][A


  Processing PMCID: PMC10683363 (15/100) for year 2023
  Found 13 <table> elements in XML for PMC10683363.


Processing Papers for 2023:  15%|██████▊                                      | 15/100 [02:13<14:31, 10.25s/it]


  Processing PMCID: PMC10427969 (16/100) for year 2023
  Found 2 <table> elements in XML for PMC10427969.


Processing Papers for 2023:  16%|███████▏                                     | 16/100 [02:15<10:50,  7.74s/it]


  Processing PMCID: PMC10662141 (17/100) for year 2023
  Found 1 <table> elements in XML for PMC10662141.



Supps for PMC10662141:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10662141:  25%|█████████████▎                                       | 1/4 [00:09<00:29,  9.80s/it][ACropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-2' took: 9.50 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10662141:  50%|██████████████████████████▌                          | 2/4 [00:12<00:11,  5.91s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.55 seconds (including any timeouts).



Supps for PMC10662141:  75%|███████████████████████████████████████▊             | 3/4 [00:15<00:04,  4.21s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.00 seconds (including any timeouts).
      Sufficient primers (18) now found for PMC10662141. Skipping its remaining supplements.


Processing Papers for 2023:  17%|███████▋                                     | 17/100 [02:43<19:21, 14.00s/it]


  Processing PMCID: PMC10281086 (18/100) for year 2023
  Found 3 <table> elements in XML for PMC10281086.
      Sufficient primers (12) found for PMC10281086 from XML. Skipping its supplement processing.


Processing Papers for 2023:  18%|████████                                     | 18/100 [02:45<13:59, 10.23s/it]


  Processing PMCID: PMC10185738 (19/100) for year 2023
  Found 0 <table> elements in XML for PMC10185738.



Supps for PMC10185738:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10185738:  50%|██████████████████████████▌                          | 1/2 [00:09<00:09,  9.50s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '4-8' took: 9.27 seconds (including any timeouts).
      Sufficient primers (48) now found for PMC10185738. Skipping its remaining supplements.


Processing Papers for 2023:  19%|████████▌                                    | 19/100 [03:01<16:12, 12.01s/it]


  Processing PMCID: PMC10598731 (20/100) for year 2023
  Found 3 <table> elements in XML for PMC10598731.


Processing Papers for 2023:  20%|█████████                                    | 20/100 [03:04<12:15,  9.20s/it]


  Processing PMCID: PMC10724057 (21/100) for year 2023
  Found 0 <table> elements in XML for PMC10724057.



Supps for PMC10724057:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10724057:  50%|██████████████████████████▌                          | 1/2 [00:14<00:14, 14.11s/it][A

  Camelot PDF processing for pages '17-21' took: 13.89 seconds (including any timeouts).



Supps for PMC10724057: 100%|█████████████████████████████████████████████████████| 2/2 [00:29<00:00, 14.83s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '33-37' took: 14.33 seconds (including any timeouts).


Processing Papers for 2023:  21%|█████████▍                                   | 21/100 [03:40<22:52, 17.38s/it]


  Processing PMCID: PMC9954118 (22/100) for year 2023
  Found 1 <table> elements in XML for PMC9954118.


Processing Papers for 2023:  22%|█████████▉                                   | 22/100 [03:42<16:36, 12.77s/it]


  Processing PMCID: PMC10870039 (23/100) for year 2023
  Found 0 <table> elements in XML for PMC10870039.



Supps for PMC10870039:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '14-18' took: 17.44 seconds (including any timeouts).


Processing Papers for 2023:  23%|██████████▎                                  | 23/100 [04:05<20:12, 15.75s/it]


  Processing PMCID: PMC10789563 (24/100) for year 2023
  Found 3 <table> elements in XML for PMC10789563.


Processing Papers for 2023:  24%|██████████▊                                  | 24/100 [04:06<14:32, 11.48s/it]


  Processing PMCID: PMC10185607 (25/100) for year 2023
  Found 0 <table> elements in XML for PMC10185607.



Supps for PMC10185607:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Processing Papers for 2023:  25%|███████████▎                                 | 25/100 [04:10<11:30,  9.21s/it][A


  Processing PMCID: PMC10257648 (26/100) for year 2023
  Found 0 <table> elements in XML for PMC10257648.



Supps for PMC10257648:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10257648: 100%|█████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.90s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1' took: 4.79 seconds (including any timeouts).


Processing Papers for 2023:  26%|███████████▋                                 | 26/100 [04:20<11:35,  9.40s/it]


  Processing PMCID: PMC10294964 (27/100) for year 2023
  Found 1 <table> elements in XML for PMC10294964.
      Sufficient primers (16) found for PMC10294964 from XML. Skipping its supplement processing.


Processing Papers for 2023:  27%|████████████▏                                | 27/100 [04:22<08:41,  7.14s/it]


  Processing PMCID: PMC10724406 (29/100) for year 2023
  Found 0 <table> elements in XML for PMC10724406.



Supps for PMC10724406:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10724406: 100%|█████████████████████████████████████████████████████| 1/1 [00:22<00:00, 22.85s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '27-31' took: 22.40 seconds (including any timeouts).


Processing Papers for 2023:  29%|█████████████                                | 29/100 [04:50<12:18, 10.40s/it]


  Processing PMCID: PMC10722899 (30/100) for year 2023
  Found 1 <table> elements in XML for PMC10722899.
      Sufficient primers (28) found for PMC10722899 from XML. Skipping its supplement processing.


Processing Papers for 2023:  30%|█████████████▌                               | 30/100 [04:52<09:39,  8.27s/it]


  Processing PMCID: PMC10845156 (31/100) for year 2023
  Found 3 <table> elements in XML for PMC10845156.
      Sufficient primers (11) found for PMC10845156 from XML. Skipping its supplement processing.


Processing Papers for 2023:  31%|█████████████▉                               | 31/100 [04:54<07:26,  6.47s/it]


  Processing PMCID: PMC10235054 (32/100) for year 2023
  Found 0 <table> elements in XML for PMC10235054.



Supps for PMC10235054:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '11-15' took: 18.80 seconds (including any timeouts).


Processing Papers for 2023:  32%|██████████████▍                              | 32/100 [05:18<12:48, 11.29s/it]


  Processing PMCID: PMC10147832 (33/100) for year 2023
  Found 0 <table> elements in XML for PMC10147832.



Supps for PMC10147832:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10147832:  50%|██████████████████████████▌                          | 1/2 [00:18<00:18, 18.63s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '7-11' took: 18.36 seconds (including any timeouts).
      Sufficient primers (24) now found for PMC10147832. Skipping its remaining supplements.


Processing Papers for 2023:  33%|██████████████▊                              | 33/100 [05:43<16:50, 15.08s/it]


  Processing PMCID: PMC10502105 (34/100) for year 2023
  Found 0 <table> elements in XML for PMC10502105.



Supps for PMC10502105:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Lattice flavor TIMED OUT for PMC10502105_supp1.pdf after 60s on pages '1-4'.



Supps for PMC10502105:  25%|█████████████                                       | 1/4 [02:35<07:46, 155.65s/it][A

  Stream flavor TIMED OUT for PMC10502105_supp1.pdf after 60s on pages '1-4'.
  Camelot PDF processing for pages '1-4' took: 122.54 seconds (including any timeouts).



Supps for PMC10502105:  50%|██████████████████████████▌                          | 2/4 [03:02<02:39, 79.61s/it][A

  Camelot PDF processing for pages '18-22' took: 16.79 seconds (including any timeouts).



Supps for PMC10502105:  75%|███████████████████████████████████████▊             | 3/4 [03:13<00:48, 48.28s/it][A

  Camelot PDF processing for pages '32-36' took: 10.61 seconds (including any timeouts).



Supps for PMC10502105: 100%|█████████████████████████████████████████████████████| 4/4 [03:14<00:00, 29.99s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 1.79 seconds (including any timeouts).


Processing Papers for 2023:  34%|██████████████▌                            | 34/100 [09:09<1:16:50, 69.85s/it]


  Processing PMCID: PMC10604460 (35/100) for year 2023
  Found 0 <table> elements in XML for PMC10604460.


Processing Papers for 2023:  35%|███████████████▋                             | 35/100 [09:11<54:18, 50.13s/it]


  Processing PMCID: PMC10619266 (36/100) for year 2023
  Found 0 <table> elements in XML for PMC10619266.



Supps for PMC10619266:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10619266: 100%|█████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.89s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-5' took: 12.53 seconds (including any timeouts).


Processing Papers for 2023:  36%|████████████████▏                            | 36/100 [09:28<42:57, 40.27s/it]


  Processing PMCID: PMC10730704 (37/100) for year 2023
  Found 0 <table> elements in XML for PMC10730704.



Supps for PMC10730704:   0%|                                                             | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10730704:  25%|█████████████▎                                       | 1/4 [00:04<00:12,  4.24s/it][A

  Camelot PDF processing for pages '1' took: 2.88 seconds (including any timeouts).



Supps for PMC10730704:  50%|██████████████████████████▌                          | 2/4 [00:19<00:21, 10.89s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '18-22' took: 14.96 seconds (including any timeouts).
      Sufficient primers (76) now found for PMC10730704. Skipping its remaining supplements.


Processing Papers for 2023:  37%|████████████████▋                            | 37/100 [09:58<39:08, 37.29s/it]


  Processing PMCID: PMC10376867 (38/100) for year 2023
  Found 3 <table> elements in XML for PMC10376867.


Processing Papers for 2023:  38%|█████████████████                            | 38/100 [10:00<27:41, 26.80s/it]


  Processing PMCID: PMC10160548 (39/100) for year 2023
  Found 2 <table> elements in XML for PMC10160548.



Supps for PMC10160548:   0%|                                                             | 0/3 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10160548:  33%|█████████████████▋                                   | 1/3 [00:01<00:03,  1.61s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-4' took: 1.57 seconds (including any timeouts).



Supps for PMC10160548:  67%|███████████████████████████████████▎                 | 2/3 [00:03<00:01,  1.95s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 1.99 seconds (including any timeouts).



Supps for PMC10160548: 100%|█████████████████████████████████████████████████████| 3/3 [00:05<00:00,  2.02s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 1.90 seconds (including any timeouts).


Processing Papers for 2023:  39%|█████████████████▌                           | 39/100 [10:15<23:39, 23.26s/it]


  Processing PMCID: PMC10916649 (40/100) for year 2023
  Found 0 <table> elements in XML for PMC10916649.



Supps for PMC10916649:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Stream flavor failed for PMC10916649_supp1.pdf (pages '8-12'). Error: list index out of range
  Camelot PDF processing for pages '8-12' took: 10.93 seconds (including any timeouts).


Processing Papers for 2023:  40%|██████████████████                           | 40/100 [10:31<21:04, 21.08s/it]


  Processing PMCID: PMC11392812 (41/100) for year 2023
  Found 0 <table> elements in XML for PMC11392812.



Supps for PMC11392812:   0%|                                                             | 0/2 [00:00<?, ?it/s][A

  Lattice flavor TIMED OUT for PMC11392812_supp1.pdf after 60s on pages '4-8'.



Supps for PMC11392812:  50%|██████████████████████████                          | 1/2 [02:02<02:02, 122.85s/it][A

  Stream flavor TIMED OUT for PMC11392812_supp1.pdf after 60s on pages '4-8'.
  Camelot PDF processing for pages '4-8' took: 122.63 seconds (including any timeouts).
  Lattice flavor TIMED OUT for PMC11392812_supp2.pdf after 60s on pages '1-3'.



Supps for PMC11392812: 100%|████████████████████████████████████████████████████| 2/2 [04:32<00:00, 138.61s/it][A
                                                                                                               [A

  Stream flavor TIMED OUT for PMC11392812_supp2.pdf after 60s on pages '1-3'.
  Camelot PDF processing for pages '1-3' took: 122.88 seconds (including any timeouts).


Processing Papers for 2023:  41%|█████████████████▋                         | 41/100 [15:11<1:36:48, 98.45s/it]


  Processing PMCID: PMC10293804 (42/100) for year 2023
  Found 0 <table> elements in XML for PMC10293804.
      Sufficient primers (12) found for PMC10293804 from XML. Skipping its supplement processing.


Processing Papers for 2023:  42%|██████████████████                         | 42/100 [15:13<1:07:17, 69.62s/it]


  Processing PMCID: PMC10890977 (43/100) for year 2023
  Found 2 <table> elements in XML for PMC10890977.


Processing Papers for 2023:  43%|███████████████████▎                         | 43/100 [15:14<46:47, 49.25s/it]


  Processing PMCID: PMC10828682 (44/100) for year 2023
  Found 0 <table> elements in XML for PMC10828682.



Supps for PMC10828682:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10828682:  50%|██████████████████████████▌                          | 1/2 [00:10<00:10, 10.19s/it][A

  Camelot PDF processing for pages '17-21' took: 10.04 seconds (including any timeouts).



Supps for PMC10828682: 100%|█████████████████████████████████████████████████████| 2/2 [00:24<00:00, 12.46s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '31-35' took: 13.10 seconds (including any timeouts).


Processing Papers for 2023:  44%|███████████████████▊                         | 44/100 [15:46<41:01, 43.95s/it]


  Processing PMCID: PMC10607858 (45/100) for year 2023
  Found 2 <table> elements in XML for PMC10607858.
      Sufficient primers (24) found for PMC10607858 from XML. Skipping its supplement processing.


Processing Papers for 2023:  45%|████████████████████▎                        | 45/100 [15:47<28:39, 31.26s/it]


  Processing PMCID: PMC10031306 (46/100) for year 2023
  Found 0 <table> elements in XML for PMC10031306.



Supps for PMC10031306:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10031306:  50%|██████████████████████████▌                          | 1/2 [00:14<00:14, 14.57s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '8-12' took: 14.36 seconds (including any timeouts).
      Sufficient primers (48) now found for PMC10031306. Skipping its remaining supplements.


Processing Papers for 2023:  46%|████████████████████▋                        | 46/100 [16:41<34:07, 37.92s/it]


  Processing PMCID: PMC10375497 (47/100) for year 2023
  Found 0 <table> elements in XML for PMC10375497.


Processing Papers for 2023:  47%|█████████████████████▏                       | 47/100 [16:43<23:54, 27.06s/it]


  Processing PMCID: PMC10734062 (48/100) for year 2023
  Found 0 <table> elements in XML for PMC10734062.



Supps for PMC10734062:   0%|                                                             | 0/2 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10734062:  50%|██████████████████████████▌                          | 1/2 [00:05<00:05,  5.91s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-3' took: 5.64 seconds (including any timeouts).


Processing Papers for 2023:  48%|█████████████████████▌                       | 48/100 [16:53<19:10, 22.12s/it]


  Processing PMCID: PMC10542758 (49/100) for year 2023
  Found 1 <table> elements in XML for PMC10542758.



Supps for PMC10542758:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10542758: 100%|█████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.77s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 9.60 seconds (including any timeouts).


Processing Papers for 2023:  49%|██████████████████████                       | 49/100 [17:06<16:29, 19.41s/it]


  Processing PMCID: PMC10686799 (50/100) for year 2023
  Found 0 <table> elements in XML for PMC10686799.



Supps for PMC10686799:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10686799: 100%|█████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.71s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 8.41 seconds (including any timeouts).


Processing Papers for 2023:  50%|██████████████████████▌                      | 50/100 [17:18<14:21, 17.24s/it]


  Processing PMCID: PMC10018142 (51/100) for year 2023
  Found 0 <table> elements in XML for PMC10018142.
      Sufficient primers (12) found for PMC10018142 from XML. Skipping its supplement processing.


Processing Papers for 2023:  51%|██████████████████████▉                      | 51/100 [17:20<10:12, 12.50s/it]


  Processing PMCID: PMC10705959 (52/100) for year 2023
  Found 1 <table> elements in XML for PMC10705959.
      Sufficient primers (46) found for PMC10705959 from XML. Skipping its supplement processing.


Processing Papers for 2023:  52%|███████████████████████▍                     | 52/100 [17:22<07:26,  9.31s/it]


  Processing PMCID: PMC10115238 (53/100) for year 2023
  Found 1 <table> elements in XML for PMC10115238.


Processing Papers for 2023:  53%|███████████████████████▊                     | 53/100 [17:23<05:30,  7.03s/it]


  Processing PMCID: PMC10502778 (54/100) for year 2023
  Found 1 <table> elements in XML for PMC10502778.


Processing Papers for 2023:  54%|████████████████████████▎                    | 54/100 [17:25<04:07,  5.38s/it]


  Processing PMCID: PMC9913306 (55/100) for year 2023
  Found 0 <table> elements in XML for PMC9913306.


Processing Papers for 2023:  55%|████████████████████████▊                    | 55/100 [17:27<03:12,  4.27s/it]


  Processing PMCID: PMC10905541 (56/100) for year 2023
  Found 1 <table> elements in XML for PMC10905541.



Supps for PMC10905541:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2023:  56%|█████████████████████████▏                   | 56/100 [17:30<02:56,  4.02s/it][A


  Processing PMCID: PMC10122725 (57/100) for year 2023
  Found 0 <table> elements in XML for PMC10122725.



Supps for PMC10122725:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '10-14' took: 17.94 seconds (including any timeouts).


Processing Papers for 2023:  57%|█████████████████████████▋                   | 57/100 [17:52<06:45,  9.43s/it]


  Processing PMCID: PMC10080795 (58/100) for year 2023
  Found 0 <table> elements in XML for PMC10080795.
      Sufficient primers (12) found for PMC10080795 from XML. Skipping its supplement processing.


Processing Papers for 2023:  58%|██████████████████████████                   | 58/100 [17:54<05:00,  7.15s/it]


  Processing PMCID: PMC10532123 (59/100) for year 2023
  Found 2 <table> elements in XML for PMC10532123.


Processing Papers for 2023:  59%|██████████████████████████▌                  | 59/100 [17:56<03:45,  5.50s/it]


  Processing PMCID: PMC10262954 (60/100) for year 2023
  Found 3 <table> elements in XML for PMC10262954.
      Sufficient primers (16) found for PMC10262954 from XML. Skipping its supplement processing.


Processing Papers for 2023:  60%|███████████████████████████                  | 60/100 [17:57<02:52,  4.32s/it]


  Processing PMCID: PMC10783620 (61/100) for year 2023
  Found 1 <table> elements in XML for PMC10783620.
      Sufficient primers (9) found for PMC10783620 from XML. Skipping its supplement processing.


Processing Papers for 2023:  61%|███████████████████████████▍                 | 61/100 [17:59<02:21,  3.63s/it]


  Processing PMCID: PMC11226232 (62/100) for year 2023
  Found 3 <table> elements in XML for PMC11226232.



Supps for PMC11226232:   0%|                                                             | 0/4 [00:00<?, ?it/s][A
Supps for PMC11226232:  25%|█████████████▎                                       | 1/4 [00:03<00:09,  3.09s/it][A

  Camelot PDF processing for pages '1-1' took: 2.88 seconds (including any timeouts).



Supps for PMC11226232:  50%|██████████████████████████▌                          | 2/4 [00:05<00:04,  2.43s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 1.88 seconds (including any timeouts).


Processing Papers for 2023:  62%|███████████████████████████▉                 | 62/100 [18:55<12:13, 19.30s/it]


  Processing PMCID: PMC10446497 (63/100) for year 2023
  Found 0 <table> elements in XML for PMC10446497.



Supps for PMC10446497:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10446497: 100%|█████████████████████████████████████████████████████| 1/1 [00:16<00:00, 16.53s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '18-22' took: 16.08 seconds (including any timeouts).


Processing Papers for 2023:  63%|████████████████████████████▎                | 63/100 [19:18<12:36, 20.43s/it]


  Processing PMCID: PMC10408301 (64/100) for year 2023
  Found 0 <table> elements in XML for PMC10408301.



Supps for PMC10408301:   0%|                                                             | 0/2 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '8-12' took: 18.19 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10408301: 100%|█████████████████████████████████████████████████████| 2/2 [00:25<00:00, 11.72s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-3' took: 6.75 seconds (including any timeouts).


Processing Papers for 2023:  64%|████████████████████████████▊                | 64/100 [20:01<16:13, 27.03s/it]


  Processing PMCID: PMC9918541 (65/100) for year 2023
  Found 0 <table> elements in XML for PMC9918541.



Supps for PMC9918541:   0%|                                                              | 0/3 [00:00<?, ?it/s][A
Supps for PMC9918541:  33%|██████████████████                                    | 1/3 [00:12<00:24, 12.18s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '7-11' took: 11.67 seconds (including any timeouts).



Supps for PMC9918541:  67%|████████████████████████████████████                  | 2/3 [00:14<00:06,  6.16s/it][A

  Camelot PDF processing for pages '1-1' took: 1.90 seconds (including any timeouts).
  Lattice flavor TIMED OUT for PMC9918541_supp3.pdf after 60s on pages '1-3'.



Supps for PMC9918541: 100%|██████████████████████████████████████████████████████| 3/3 [02:46<00:00, 72.94s/it][A
                                                                                                               [A

  Stream flavor TIMED OUT for PMC9918541_supp3.pdf after 60s on pages '1-3'.
  Camelot PDF processing for pages '1-3' took: 122.69 seconds (including any timeouts).


Processing Papers for 2023:  65%|█████████████████████████████▎               | 65/100 [23:31<47:51, 82.06s/it]


  Processing PMCID: PMC10353567 (67/100) for year 2023
  Found 3 <table> elements in XML for PMC10353567.
      Sufficient primers (16) found for PMC10353567 from XML. Skipping its supplement processing.


Processing Papers for 2023:  67%|██████████████████████████████▏              | 67/100 [23:33<24:30, 44.56s/it]


  Processing PMCID: PMC10670511 (68/100) for year 2023
  Found 1 <table> elements in XML for PMC10670511.
      Sufficient primers (28) found for PMC10670511 from XML. Skipping its supplement processing.


Processing Papers for 2023:  68%|██████████████████████████████▌              | 68/100 [23:34<18:05, 33.92s/it]


  Processing PMCID: PMC10729375 (69/100) for year 2023
  Found 4 <table> elements in XML for PMC10729375.


Processing Papers for 2023:  69%|███████████████████████████████              | 69/100 [23:36<13:10, 25.51s/it]


  Processing PMCID: PMC10362734 (70/100) for year 2023
  Found 0 <table> elements in XML for PMC10362734.



Supps for PMC10362734:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10362734:  50%|██████████████████████████▌                          | 1/2 [00:10<00:10, 10.32s/it][A

  Camelot PDF processing for pages '6-10' took: 10.16 seconds (including any timeouts).



Supps for PMC10362734: 100%|█████████████████████████████████████████████████████| 2/2 [00:21<00:00, 10.93s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '20-24' took: 10.18 seconds (including any timeouts).


Processing Papers for 2023:  70%|███████████████████████████████▍             | 70/100 [24:24<15:46, 31.56s/it]


  Processing PMCID: PMC10656301 (71/100) for year 2023
  Found 0 <table> elements in XML for PMC10656301.



Supps for PMC10656301:   0%|                                                             | 0/2 [00:00<?, ?it/s][A

  Camelot PDF processing for pages '16-20' took: 9.83 seconds (including any timeouts).



Supps for PMC10656301:  50%|██████████████████████████▌                          | 1/2 [00:10<00:10, 10.33s/it][A
Supps for PMC10656301: 100%|█████████████████████████████████████████████████████| 2/2 [00:21<00:00, 10.72s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '31-35' took: 10.06 seconds (including any timeouts).


Processing Papers for 2023:  71%|███████████████████████████████▉             | 71/100 [25:14<17:47, 36.80s/it]


  Processing PMCID: PMC10261893 (72/100) for year 2023
  Found 1 <table> elements in XML for PMC10261893.



Supps for PMC10261893:   0%|                                                             | 0/2 [00:00<?, ?it/s][A

  Camelot PDF processing for pages '8-12' took: 34.63 seconds (including any timeouts).



Supps for PMC10261893:  50%|██████████████████████████▌                          | 1/2 [00:34<00:34, 34.97s/it][A
                                                                                                               [A

      Sufficient primers (100) now found for PMC10261893. Skipping its remaining supplements.


Processing Papers for 2023:  72%|████████████████████████████████▍            | 72/100 [25:56<17:52, 38.30s/it]


  Processing PMCID: PMC10530593 (73/100) for year 2023
  Found 3 <table> elements in XML for PMC10530593.
    ⚠️ Unexpected error processing paper PMC10530593: HTTPSConnectionPool(host='europepmc.org', port=443): Read timed out. (read timeout=60)


Traceback (most recent call last):
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\socket.py", line 716, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\ssl.py", line 1275, in recv_into
    return self.read(nbytes, buffer)
  File "C:\


  Processing PMCID: PMC10582276 (74/100) for year 2023
  Found 0 <table> elements in XML for PMC10582276.



Supps for PMC10582276:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10582276: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.12it/s][A
Processing Papers for 2023:  74%|█████████████████████████████████▎           | 74/100 [27:54<20:52, 48.19s/it][A


  Processing PMCID: PMC10244346 (75/100) for year 2023
  Found 0 <table> elements in XML for PMC10244346.



Supps for PMC10244346:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10244346: 100%|█████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.85s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '23-27' took: 15.39 seconds (including any timeouts).


Processing Papers for 2023:  75%|█████████████████████████████████▊           | 75/100 [28:24<17:54, 42.97s/it]


  Processing PMCID: PMC10837345 (76/100) for year 2023
  Found 0 <table> elements in XML for PMC10837345.



Supps for PMC10837345:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '11-15' took: 12.57 seconds (including any timeouts).


Processing Papers for 2023:  76%|██████████████████████████████████▏          | 76/100 [28:41<14:07, 35.30s/it]


  Processing PMCID: PMC10705360 (77/100) for year 2023
  Found 0 <table> elements in XML for PMC10705360.


Processing Papers for 2023:  77%|██████████████████████████████████▋          | 77/100 [28:43<09:43, 25.37s/it]


  Processing PMCID: PMC10280094 (78/100) for year 2023
  Found 0 <table> elements in XML for PMC10280094.



Supps for PMC10280094:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10280094:  50%|██████████████████████████▌                          | 1/2 [00:11<00:11, 11.91s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '4-8' took: 11.70 seconds (including any timeouts).
      Sufficient primers (24) now found for PMC10280094. Skipping its remaining supplements.


Processing Papers for 2023:  78%|███████████████████████████████████          | 78/100 [29:01<08:29, 23.18s/it]


  Processing PMCID: PMC10088155 (79/100) for year 2023
  Found 0 <table> elements in XML for PMC10088155.



Supps for PMC10088155:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10088155:  50%|██████████████████████████▌                          | 1/2 [00:00<00:00,  7.92it/s][A
                                                                                                               [A

      Sufficient primers (35) now found for PMC10088155. Skipping its remaining supplements.


Processing Papers for 2023:  79%|███████████████████████████████████▌         | 79/100 [29:07<06:16, 17.93s/it]


  Processing PMCID: PMC10186475 (80/100) for year 2023
  Found 0 <table> elements in XML for PMC10186475.
⚠️  PMC10186475  Data_Sheet_1.docx → 404 Client Error: Not Found for url: https://europepmc.org/backend/ptpmcrender.fcgi?acc=PMC10186475&blobtype=image&blobname=Data_Sheet_1.docx


Processing Papers for 2023:  80%|████████████████████████████████████         | 80/100 [29:40<07:26, 22.35s/it]


  Processing PMCID: PMC10471463 (81/100) for year 2023
  Found 0 <table> elements in XML for PMC10471463.
      Sufficient primers (16) found for PMC10471463 from XML. Skipping its supplement processing.


Processing Papers for 2023:  81%|████████████████████████████████████▍        | 81/100 [29:41<05:06, 16.13s/it]


  Processing PMCID: PMC10720139 (82/100) for year 2023
  Found 3 <table> elements in XML for PMC10720139.



Supps for PMC10720139:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10720139: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.81it/s][A
Processing Papers for 2023:  82%|████████████████████████████████████▉        | 82/100 [29:45<03:42, 12.39s/it][A


  Processing PMCID: PMC10828825 (83/100) for year 2023
  Found 0 <table> elements in XML for PMC10828825.



Supps for PMC10828825:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10828825:  50%|██████████████████████████▌                          | 1/2 [00:09<00:09,  9.33s/it][A

  Camelot PDF processing for pages '12-16' took: 9.16 seconds (including any timeouts).



Supps for PMC10828825: 100%|█████████████████████████████████████████████████████| 2/2 [00:18<00:00,  8.95s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '29-33' took: 7.75 seconds (including any timeouts).


Processing Papers for 2023:  83%|█████████████████████████████████████▎       | 83/100 [30:18<05:14, 18.51s/it]


  Processing PMCID: PMC10517333 (84/100) for year 2023
  Found 0 <table> elements in XML for PMC10517333.
      Sufficient primers (52) found for PMC10517333 from XML. Skipping its supplement processing.


Processing Papers for 2023:  84%|█████████████████████████████████████▊       | 84/100 [30:19<03:34, 13.42s/it]


  Processing PMCID: PMC10130150 (85/100) for year 2023
  Found 0 <table> elements in XML for PMC10130150.
      Sufficient primers (21) found for PMC10130150 from XML. Skipping its supplement processing.


Processing Papers for 2023:  85%|██████████████████████████████████████▎      | 85/100 [30:21<02:29,  9.95s/it]


  Processing PMCID: PMC10572416 (86/100) for year 2023
  Found 0 <table> elements in XML for PMC10572416.


Processing Papers for 2023:  86%|██████████████████████████████████████▋      | 86/100 [30:23<01:44,  7.46s/it]


  Processing PMCID: PMC10772262 (87/100) for year 2023
  Found 0 <table> elements in XML for PMC10772262.
      Sufficient primers (18) found for PMC10772262 from XML. Skipping its supplement processing.


Processing Papers for 2023:  87%|███████████████████████████████████████▏     | 87/100 [30:24<01:13,  5.67s/it]


  Processing PMCID: PMC10465332 (88/100) for year 2023
  Found 0 <table> elements in XML for PMC10465332.



Supps for PMC10465332:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10465332: 100%|█████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.25s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-3' took: 4.02 seconds (including any timeouts).


Processing Papers for 2023:  88%|███████████████████████████████████████▌     | 88/100 [30:32<01:15,  6.31s/it]


  Processing PMCID: PMC10095437 (89/100) for year 2023
  Found 1 <table> elements in XML for PMC10095437.


Processing Papers for 2023:  89%|████████████████████████████████████████     | 89/100 [30:34<00:55,  5.01s/it]


  Processing PMCID: PMC10685538 (90/100) for year 2023
  Found 1 <table> elements in XML for PMC10685538.
      Sufficient primers (10) found for PMC10685538 from XML. Skipping its supplement processing.


Processing Papers for 2023:  90%|████████████████████████████████████████▌    | 90/100 [30:35<00:39,  3.98s/it]


  Processing PMCID: PMC10682145 (91/100) for year 2023
  Found 2 <table> elements in XML for PMC10682145.
      Sufficient primers (9) found for PMC10682145 from XML. Skipping its supplement processing.


Processing Papers for 2023:  91%|████████████████████████████████████████▉    | 91/100 [30:38<00:30,  3.40s/it]


  Processing PMCID: PMC10036466 (92/100) for year 2023
  Found 0 <table> elements in XML for PMC10036466.



Supps for PMC10036466:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10036466:  50%|██████████████████████████▌                          | 1/2 [00:13<00:13, 13.11s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 11.47 seconds (including any timeouts).
      Sufficient primers (52) now found for PMC10036466. Skipping its remaining supplements.


Processing Papers for 2023:  92%|█████████████████████████████████████████▍   | 92/100 [30:56<01:02,  7.85s/it]


  Processing PMCID: PMC10523463 (93/100) for year 2023
  Found 0 <table> elements in XML for PMC10523463.



Supps for PMC10523463:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Supps for PMC10523463: 100%|█████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.91s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-5' took: 12.53 seconds (including any timeouts).


Processing Papers for 2023:  93%|█████████████████████████████████████████▊   | 93/100 [31:13<01:13, 10.57s/it]


  Processing PMCID: PMC10696029 (94/100) for year 2023
  Found 0 <table> elements in XML for PMC10696029.



Supps for PMC10696029:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10696029:  50%|██████████████████████████▌                          | 1/2 [00:37<00:37, 37.43s/it][A

  Camelot PDF processing for pages '5-9' took: 36.54 seconds (including any timeouts).



Supps for PMC10696029: 100%|█████████████████████████████████████████████████████| 2/2 [00:37<00:00, 15.49s/it][A
Processing Papers for 2023:  94%|██████████████████████████████████████████▎  | 94/100 [32:02<02:12, 22.07s/it][A


  Processing PMCID: PMC10357017 (95/100) for year 2023
  Found 0 <table> elements in XML for PMC10357017.



Supps for PMC10357017:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10357017:  50%|██████████████████████████▌                          | 1/2 [00:20<00:20, 20.63s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropB

  Camelot PDF processing for pages '16-20' took: 20.11 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC10357017: 100%|█████████████████████████████████████████████████████| 2/2 [00:38<00:00, 19.02s/it][A
                                                                    

  Camelot PDF processing for pages '12-16' took: 17.38 seconds (including any timeouts).


Processing Papers for 2023:  95%|██████████████████████████████████████████▊  | 95/100 [32:47<02:24, 28.95s/it]


  Processing PMCID: PMC10249273 (96/100) for year 2023
  Found 0 <table> elements in XML for PMC10249273.



Supps for PMC10249273:   0%|                                                             | 0/3 [00:00<?, ?it/s][A
Supps for PMC10249273:  33%|█████████████████▋                                   | 1/3 [00:00<00:00,  3.83it/s][A
                                                                                                               [A

      Sufficient primers (76) now found for PMC10249273. Skipping its remaining supplements.


Processing Papers for 2023:  96%|███████████████████████████████████████████▏ | 96/100 [32:58<01:34, 23.61s/it]


  Processing PMCID: PMC10575631 (97/100) for year 2023
  Found 0 <table> elements in XML for PMC10575631.



Supps for PMC10575631:   0%|                                                             | 0/2 [00:00<?, ?it/s][A
Supps for PMC10575631:  50%|██████████████████████████▌                          | 1/2 [00:15<00:15, 15.91s/it][A

  Camelot PDF processing for pages '1-5' took: 15.21 seconds (including any timeouts).



Supps for PMC10575631: 100%|█████████████████████████████████████████████████████| 2/2 [00:16<00:00,  6.63s/it][A
Processing Papers for 2023:  97%|███████████████████████████████████████████▋ | 97/100 [33:19<01:08, 22.84s/it][A


  Processing PMCID: PMC10802105 (98/100) for year 2023
  Found 0 <table> elements in XML for PMC10802105.


Processing Papers for 2023:  98%|████████████████████████████████████████████ | 98/100 [33:20<00:32, 16.34s/it]


  Processing PMCID: PMC10288792 (99/100) for year 2023
  Found 3 <table> elements in XML for PMC10288792.


Processing Papers for 2023:  99%|████████████████████████████████████████████▌| 99/100 [33:22<00:12, 12.05s/it]


  Processing PMCID: PMC10062451 (100/100) for year 2023
  Found 3 <table> elements in XML for PMC10062451.


Processing Papers for 2023: 100%|████████████████████████████████████████████| 100/100 [33:24<00:00, 20.04s/it]



--- Consolidating and Saving Results for YEAR 2023 ---
Total unique primer entries found for 2023: 1776
  ✅ Saved 1776 unique extracted primers for 2023 to data\psc\yearly_results\master_extracted_primers_2023.csv
###### FINISHED PROCESSING FOR YEAR: 2023 ######

STARTING PROCESSING FOR YEAR: 2022

Searching Europe PMC with query: OPEN_ACCESS:y AND HAS_FT:y AND (METHODS:"qPCR" OR "RT-PCR" OR "real time PCR") AND (ABSTRACT:"pluripotent" OR "iPSC" OR "PSC" OR "hPSC" OR "ESC" OR "hESC") AND (FIRST_PDATE:2022)
Retrieved 100 records for 2022.


Processing Papers for 2022:   0%|                                                      | 0/100 [00:00<?, ?it/s]


  Processing PMCID: PMC9669397 (1/100) for year 2022
  Found 0 <table> elements in XML for PMC9669397.



Supps for PMC9669397:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9669397:  50%|███████████████████████████                           | 1/2 [00:17<00:17, 17.63s/it][A

  Camelot PDF processing for pages '18-22' took: 17.36 seconds (including any timeouts).



Supps for PMC9669397: 100%|██████████████████████████████████████████████████████| 2/2 [00:38<00:00, 19.37s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '35-39' took: 19.37 seconds (including any timeouts).


Processing Papers for 2022:   1%|▍                                           | 1/100 [00:44<1:13:59, 44.84s/it]


  Processing PMCID: PMC8967593 (2/100) for year 2022
  Found 0 <table> elements in XML for PMC8967593.



Supps for PMC8967593:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC8967593: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.04it/s][A
Processing Papers for 2022:   2%|▉                                             | 2/100 [00:48<33:22, 20.43s/it][A


  Processing PMCID: PMC8848615 (3/100) for year 2022
  Found 4 <table> elements in XML for PMC8848615.



Supps for PMC8848615:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC8848615: 100%|██████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.43s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-2' took: 6.29 seconds (including any timeouts).


Processing Papers for 2022:   3%|█▍                                            | 3/100 [00:59<26:05, 16.14s/it]


  Processing PMCID: PMC11235200 (4/100) for year 2022
  Found 1 <table> elements in XML for PMC11235200.


Processing Papers for 2022:   4%|█▊                                            | 4/100 [01:01<16:46, 10.48s/it]


  Processing PMCID: PMC9640545 (5/100) for year 2022
  Found 0 <table> elements in XML for PMC9640545.



Supps for PMC9640545:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC9640545: 100%|██████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.57s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 17.84 seconds (including any timeouts).


Processing Papers for 2022:   5%|██▎                                           | 5/100 [01:23<23:10, 14.63s/it]


  Processing PMCID: PMC9701587 (6/100) for year 2022
  Found 1 <table> elements in XML for PMC9701587.



Supps for PMC9701587:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9701587:  50%|███████████████████████████                           | 1/2 [00:09<00:09,  9.31s/it][A

  Stream flavor failed for PMC9701587_supp1.pdf (pages '1-5'). Error: list index out of range
  Camelot PDF processing for pages '1-5' took: 9.18 seconds (including any timeouts).





  Stream flavor failed for PMC9701587_supp2.pdf (pages '18-22'). Error: list index out of range
  Camelot PDF processing for pages '18-22' took: 9.28 seconds (including any timeouts).


Supps for PMC9701587: 100%|██████████████████████████████████████████████████████| 2/2 [00:19<00:00,  9.59s/it][A
Processing Papers for 2022:   6%|██▊                                           | 6/100 [01:55<32:18, 20.62s/it][A


  Processing PMCID: PMC9254250 (7/100) for year 2022
  Found 0 <table> elements in XML for PMC9254250.



Supps for PMC9254250:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9254250: 100%|██████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.44s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '10-14' took: 18.06 seconds (including any timeouts).


Processing Papers for 2022:   7%|███▏                                          | 7/100 [02:17<32:53, 21.22s/it]


  Processing PMCID: PMC9313526 (8/100) for year 2022
  Found 0 <table> elements in XML for PMC9313526.



Supps for PMC9313526:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2022:   8%|███▋                                          | 8/100 [02:27<26:51, 17.51s/it][A


  Processing PMCID: PMC9391418 (9/100) for year 2022
  Found 0 <table> elements in XML for PMC9391418.



Supps for PMC9391418:   0%|                                                              | 0/1 [00:00<?, ?it/s][A

  Stream flavor failed for PMC9391418_supp1.pdf (pages '1-4'). Error: list index out of range
  Camelot PDF processing for pages '1-4' took: 10.36 seconds (including any timeouts).



Supps for PMC9391418: 100%|██████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.59s/it][A
Processing Papers for 2022:   9%|████▏                                         | 9/100 [02:41<24:58, 16.47s/it][A


  Processing PMCID: PMC9221973 (10/100) for year 2022
  Found 6 <table> elements in XML for PMC9221973.


Processing Papers for 2022:  10%|████▌                                        | 10/100 [02:43<17:55, 11.95s/it]


  Processing PMCID: PMC9811443 (11/100) for year 2022
  Found 0 <table> elements in XML for PMC9811443.



Supps for PMC9811443:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9811443: 100%|██████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.99s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '13-17' took: 13.53 seconds (including any timeouts).


Processing Papers for 2022:  11%|████▉                                        | 11/100 [03:01<20:42, 13.96s/it]


  Processing PMCID: PMC9647332 (12/100) for year 2022
  Found 2 <table> elements in XML for PMC9647332.



Supps for PMC9647332:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2022:  12%|█████▍                                       | 12/100 [03:15<20:09, 13.75s/it][A


  Processing PMCID: PMC9794550 (13/100) for year 2022
  Found 0 <table> elements in XML for PMC9794550.



Supps for PMC9794550:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9794550: 100%|██████████████████████████████████████████████████████| 1/1 [00:24<00:00, 24.57s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 23.81 seconds (including any timeouts).


Processing Papers for 2022:  13%|█████▊                                       | 13/100 [03:50<29:36, 20.42s/it]


  Processing PMCID: PMC8915496 (14/100) for year 2022
  Found 0 <table> elements in XML for PMC8915496.



Supps for PMC8915496:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC8915496:  50%|███████████████████████████                           | 1/2 [00:00<00:00,  3.97it/s][A
                                                                                                               [A

      Sufficient primers (10) now found for PMC8915496. Skipping its remaining supplements.


Processing Papers for 2022:  14%|██████▎                                      | 14/100 [03:56<22:51, 15.94s/it]


  Processing PMCID: PMC8935725 (15/100) for year 2022
  Found 0 <table> elements in XML for PMC8935725.



Supps for PMC8935725:   0%|                                                              | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC8935725:  25%|█████████████▌                                        | 1/4 [00:02<00:08,  2.79s/it][ACropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.45 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox

Supps for PMC8935725:  50%|███████████████████████████                           | 2/4 [00:05<00:05,  2.61s/it][ACropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.22 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox

Supps for PMC8935725:  75%|████████████████████████████████████████▌             | 3/4 [00:08<00:02,  2.89s/it][ACropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '1-1' took: 2.56 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox

Supps for PMC8935725: 100%|██████████████████████████████████████████████████████| 4/4 [00:11<00:00,  2.88s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.45 seconds (including any timeouts).


Processing Papers for 2022:  15%|██████▊                                      | 15/100 [04:15<23:55, 16.89s/it]


  Processing PMCID: PMC10621662 (16/100) for year 2022
  Found 0 <table> elements in XML for PMC10621662.


Processing Papers for 2022:  16%|███████▏                                     | 16/100 [04:17<17:28, 12.48s/it]


  Processing PMCID: PMC9777247 (17/100) for year 2022
  Found 0 <table> elements in XML for PMC9777247.


Processing Papers for 2022:  17%|███████▋                                     | 17/100 [04:19<12:42,  9.19s/it]


  Processing PMCID: PMC9682313 (18/100) for year 2022
  Found 1 <table> elements in XML for PMC9682313.
      Sufficient primers (30) found for PMC9682313 from XML. Skipping its supplement processing.


Processing Papers for 2022:  18%|████████                                     | 18/100 [04:20<09:23,  6.88s/it]


  Processing PMCID: PMC9671909 (19/100) for year 2022
  Found 0 <table> elements in XML for PMC9671909.



Supps for PMC9671909:   0%|                                                              | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '21-25' took: 18.41 seconds (including any timeouts).
      Sufficient primers (70) now found for PMC9671909. Skipping its remaining supplements.


Processing Papers for 2022:  19%|████████▌                                    | 19/100 [04:49<18:01, 13.35s/it]


  Processing PMCID: PMC9254461 (20/100) for year 2022
  Found 1 <table> elements in XML for PMC9254461.
      Sufficient primers (14) found for PMC9254461 from XML. Skipping its supplement processing.


Processing Papers for 2022:  20%|█████████                                    | 20/100 [04:50<13:04,  9.81s/it]


  Processing PMCID: PMC9554908 (21/100) for year 2022
  Found 0 <table> elements in XML for PMC9554908.



Supps for PMC9554908:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Stream flavor failed for PMC9554908_supp1.pdf (pages '5-9'). Error: list index out of range
  Camelot PDF processing for pages '5-9' took: 8.13 seconds (including any timeouts).


Processing Papers for 2022:  21%|█████████▍                                   | 21/100 [05:03<14:12, 10.78s/it]


  Processing PMCID: PMC9391520 (22/100) for year 2022
  Found 0 <table> elements in XML for PMC9391520.



Supps for PMC9391520:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9391520:  50%|███████████████████████████                           | 1/2 [00:12<00:12, 12.25s/it][A

  Camelot PDF processing for pages '13-17' took: 12.03 seconds (including any timeouts).



Supps for PMC9391520: 100%|██████████████████████████████████████████████████████| 2/2 [00:25<00:00, 12.85s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '27-31' took: 12.25 seconds (including any timeouts).


Processing Papers for 2022:  22%|█████████▉                                   | 22/100 [05:38<23:24, 18.01s/it]


  Processing PMCID: PMC9154343 (23/100) for year 2022
  Found 0 <table> elements in XML for PMC9154343.
      Sufficient primers (11) found for PMC9154343 from XML. Skipping its supplement processing.


Processing Papers for 2022:  23%|██████████▎                                  | 23/100 [05:40<16:48, 13.09s/it]


  Processing PMCID: PMC9162946 (24/100) for year 2022
  Found 0 <table> elements in XML for PMC9162946.


Processing Papers for 2022:  24%|██████████▊                                  | 24/100 [05:42<12:16,  9.69s/it]


  Processing PMCID: PMC9652995 (25/100) for year 2022
  Found 5 <table> elements in XML for PMC9652995.
      Sufficient primers (36) found for PMC9652995 from XML. Skipping its supplement processing.


Processing Papers for 2022:  25%|███████████▎                                 | 25/100 [05:43<09:00,  7.20s/it]


  Processing PMCID: PMC8991583 (26/100) for year 2022
  Found 4 <table> elements in XML for PMC8991583.
      Sufficient primers (19) found for PMC8991583 from XML. Skipping its supplement processing.


Processing Papers for 2022:  26%|███████████▋                                 | 26/100 [05:45<06:53,  5.59s/it]


  Processing PMCID: PMC8910499 (27/100) for year 2022
  Found 0 <table> elements in XML for PMC8910499.


Processing Papers for 2022:  27%|████████████▏                                | 27/100 [05:46<05:18,  4.36s/it]


  Processing PMCID: PMC9385691 (28/100) for year 2022
  Found 3 <table> elements in XML for PMC9385691.


Processing Papers for 2022:  28%|████████████▌                                | 28/100 [05:48<04:14,  3.53s/it]


  Processing PMCID: PMC9033780 (29/100) for year 2022
  Found 0 <table> elements in XML for PMC9033780.



Supps for PMC9033780:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '14-18' took: 14.54 seconds (including any timeouts).


Processing Papers for 2022:  29%|█████████████                                | 29/100 [06:08<09:54,  8.38s/it]


  Processing PMCID: PMC9821383 (30/100) for year 2022
  Found 0 <table> elements in XML for PMC9821383.


Processing Papers for 2022:  30%|█████████████▌                               | 30/100 [06:09<07:22,  6.32s/it]


  Processing PMCID: PMC9137077 (31/100) for year 2022
  Found 2 <table> elements in XML for PMC9137077.


Processing Papers for 2022:  31%|█████████████▉                               | 31/100 [06:11<05:46,  5.03s/it]


  Processing PMCID: PMC9437366 (32/100) for year 2022
  Found 0 <table> elements in XML for PMC9437366.



Supps for PMC9437366:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9437366: 100%|██████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.69s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '7-11' took: 12.44 seconds (including any timeouts).


Processing Papers for 2022:  32%|██████████████▍                              | 32/100 [06:27<09:31,  8.41s/it]


  Processing PMCID: PMC9890535 (33/100) for year 2022
  Found 1 <table> elements in XML for PMC9890535.
      Sufficient primers (17) found for PMC9890535 from XML. Skipping its supplement processing.


Processing Papers for 2022:  33%|██████████████▊                              | 33/100 [06:29<07:10,  6.42s/it]


  Processing PMCID: PMC9023812 (34/100) for year 2022
  Found 1 <table> elements in XML for PMC9023812.



Supps for PMC9023812:   0%|                                                              | 0/2 [00:00<?, ?it/s][A

  Camelot PDF processing for pages '10-14' took: 13.12 seconds (including any timeouts).



Supps for PMC9023812:  50%|███████████████████████████                           | 1/2 [00:13<00:13, 13.63s/it][A
Supps for PMC9023812: 100%|██████████████████████████████████████████████████████| 2/2 [00:29<00:00, 14.72s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '25-29' took: 14.34 seconds (including any timeouts).


Processing Papers for 2022:  34%|███████████████▎                             | 34/100 [07:05<16:46, 15.24s/it]


  Processing PMCID: PMC8972137 (35/100) for year 2022
  Found 0 <table> elements in XML for PMC8972137.



Supps for PMC8972137:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC8972137: 100%|██████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.61s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-4' took: 6.34 seconds (including any timeouts).


Processing Papers for 2022:  35%|███████████████▋                             | 35/100 [07:15<14:45, 13.62s/it]


  Processing PMCID: PMC9236614 (36/100) for year 2022
  Found 0 <table> elements in XML for PMC9236614.



Supps for PMC9236614:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9236614: 100%|██████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.96s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-2' took: 3.35 seconds (including any timeouts).


Processing Papers for 2022:  36%|████████████████▏                            | 36/100 [07:25<13:28, 12.63s/it]


  Processing PMCID: PMC10082602 (37/100) for year 2022
  Found 3 <table> elements in XML for PMC10082602.



Supps for PMC10082602:   0%|                                                             | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2022:  37%|████████████████▋                            | 37/100 [07:28<10:18,  9.81s/it][A


  Processing PMCID: PMC8998132 (38/100) for year 2022
  Found 0 <table> elements in XML for PMC8998132.


Processing Papers for 2022:  38%|█████████████████                            | 38/100 [07:30<07:36,  7.36s/it]


  Processing PMCID: PMC9860076 (39/100) for year 2022
  Found 0 <table> elements in XML for PMC9860076.



Supps for PMC9860076:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9860076:  50%|███████████████████████████                           | 1/2 [00:12<00:12, 12.55s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '6-10' took: 12.35 seconds (including any timeouts).
      Sufficient primers (53) now found for PMC9860076. Skipping its remaining supplements.


Processing Papers for 2022:  39%|█████████████████▌                           | 39/100 [07:51<11:30, 11.31s/it]


  Processing PMCID: PMC9251045 (40/100) for year 2022
  Found 0 <table> elements in XML for PMC9251045.


Processing Papers for 2022:  40%|██████████████████                           | 40/100 [07:52<08:19,  8.33s/it]


  Processing PMCID: PMC8851157 (41/100) for year 2022
  Found 0 <table> elements in XML for PMC8851157.



Supps for PMC8851157:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC8851157:  50%|███████████████████████████                           | 1/2 [00:11<00:11, 11.83s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '10-14' took: 11.62 seconds (including any timeouts).
      Sufficient primers (115) now found for PMC8851157. Skipping its remaining supplements.


Processing Papers for 2022:  41%|██████████████████▍                          | 41/100 [08:42<20:24, 20.75s/it]


  Processing PMCID: PMC9925186 (42/100) for year 2022
  Found 1 <table> elements in XML for PMC9925186.
      Sufficient primers (32) found for PMC9925186 from XML. Skipping its supplement processing.


Processing Papers for 2022:  42%|██████████████████▉                          | 42/100 [08:43<14:28, 14.98s/it]


  Processing PMCID: PMC8828260 (43/100) for year 2022
  Found 1 <table> elements in XML for PMC8828260.



Supps for PMC8828260:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC8828260: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.92it/s][A
Processing Papers for 2022:  43%|███████████████████▎                         | 43/100 [09:03<15:41, 16.52s/it][A


  Processing PMCID: PMC9102156 (44/100) for year 2022
  Found 0 <table> elements in XML for PMC9102156.


Processing Papers for 2022:  44%|███████████████████▊                         | 44/100 [09:05<11:16, 12.08s/it]


  Processing PMCID: PMC9801682 (45/100) for year 2022
  Found 3 <table> elements in XML for PMC9801682.


Processing Papers for 2022:  45%|████████████████████▎                        | 45/100 [09:06<08:07,  8.87s/it]


  Processing PMCID: PMC9523398 (46/100) for year 2022
  Found 1 <table> elements in XML for PMC9523398.
      Sufficient primers (12) found for PMC9523398 from XML. Skipping its supplement processing.


Processing Papers for 2022:  46%|████████████████████▋                        | 46/100 [09:08<06:06,  6.79s/it]


  Processing PMCID: PMC9030920 (47/100) for year 2022
  Found 1 <table> elements in XML for PMC9030920.
      Sufficient primers (39) found for PMC9030920 from XML. Skipping its supplement processing.


Processing Papers for 2022:  47%|█████████████████████▏                       | 47/100 [09:10<04:37,  5.24s/it]


  Processing PMCID: PMC9904404 (48/100) for year 2022
  Found 3 <table> elements in XML for PMC9904404.


Processing Papers for 2022:  48%|█████████████████████▌                       | 48/100 [09:11<03:32,  4.09s/it]


  Processing PMCID: PMC9550256 (49/100) for year 2022
  Found 3 <table> elements in XML for PMC9550256.
      Sufficient primers (20) found for PMC9550256 from XML. Skipping its supplement processing.


Processing Papers for 2022:  49%|██████████████████████                       | 49/100 [09:13<02:50,  3.34s/it]


  Processing PMCID: PMC9160024 (50/100) for year 2022
  Found 0 <table> elements in XML for PMC9160024.



Supps for PMC9160024:   0%|                                                              | 0/3 [00:00<?, ?it/s][A
Supps for PMC9160024:  33%|██████████████████                                    | 1/3 [00:00<00:01,  1.35it/s][A
                                                                                                               [A

      Sufficient primers (17) now found for PMC9160024. Skipping its remaining supplements.


Processing Papers for 2022:  50%|██████████████████████▌                      | 50/100 [09:25<05:03,  6.07s/it]


  Processing PMCID: PMC9558835 (51/100) for year 2022
  Found 0 <table> elements in XML for PMC9558835.


Processing Papers for 2022:  51%|██████████████████████▉                      | 51/100 [09:27<03:55,  4.81s/it]


  Processing PMCID: PMC9186763 (52/100) for year 2022
  Found 0 <table> elements in XML for PMC9186763.



Supps for PMC9186763:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC9186763: 100%|██████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.75s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.22 seconds (including any timeouts).


Processing Papers for 2022:  52%|███████████████████████▍                     | 52/100 [09:33<04:08,  5.17s/it]


  Processing PMCID: PMC9068652 (54/100) for year 2022
  Found 3 <table> elements in XML for PMC9068652.


Processing Papers for 2022:  54%|████████████████████████▎                    | 54/100 [09:35<02:27,  3.21s/it]


  Processing PMCID: PMC8869365 (55/100) for year 2022
  Found 0 <table> elements in XML for PMC8869365.


Processing Papers for 2022:  55%|████████████████████████▊                    | 55/100 [09:37<02:08,  2.86s/it]


  Processing PMCID: PMC9665373 (56/100) for year 2022
  Found 0 <table> elements in XML for PMC9665373.



Supps for PMC9665373:   0%|                                                              | 0/6 [00:00<?, ?it/s][A
Supps for PMC9665373:  17%|█████████                                             | 1/6 [00:00<00:00,  9.42it/s][A
Supps for PMC9665373:  33%|██████████████████                                    | 2/6 [00:00<00:00,  9.47it/s][A
                                                                                                               [A

      Sufficient primers (66) now found for PMC9665373. Skipping its remaining supplements.


Processing Papers for 2022:  56%|█████████████████████████▏                   | 56/100 [09:46<03:18,  4.50s/it]


  Processing PMCID: PMC9308249 (57/100) for year 2022
  Found 1 <table> elements in XML for PMC9308249.
      Sufficient primers (36) found for PMC9308249 from XML. Skipping its supplement processing.


Processing Papers for 2022:  57%|█████████████████████████▋                   | 57/100 [09:48<02:45,  3.84s/it]


  Processing PMCID: PMC8744258 (58/100) for year 2022
  Found 1 <table> elements in XML for PMC8744258.



Supps for PMC8744258:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
                                                                                                               [A

      Sufficient primers (11) now found for PMC8744258. Skipping its remaining supplements.


Processing Papers for 2022:  58%|██████████████████████████                   | 58/100 [09:53<02:57,  4.22s/it]


  Processing PMCID: PMC8973792 (59/100) for year 2022
  Found 1 <table> elements in XML for PMC8973792.
      Sufficient primers (30) found for PMC8973792 from XML. Skipping its supplement processing.


Processing Papers for 2022:  59%|██████████████████████████▌                  | 59/100 [09:55<02:22,  3.48s/it]


  Processing PMCID: PMC8987729 (60/100) for year 2022
  Found 0 <table> elements in XML for PMC8987729.



Supps for PMC8987729:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC8987729:  50%|███████████████████████████                           | 1/2 [00:14<00:14, 14.05s/it][A

  Camelot PDF processing for pages '1-5' took: 13.73 seconds (including any timeouts).



Supps for PMC8987729: 100%|██████████████████████████████████████████████████████| 2/2 [00:17<00:00,  7.73s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 3.14 seconds (including any timeouts).


Processing Papers for 2022:  60%|███████████████████████████                  | 60/100 [10:27<07:52, 11.80s/it]


  Processing PMCID: PMC9739763 (61/100) for year 2022
  Found 5 <table> elements in XML for PMC9739763.


Processing Papers for 2022:  61%|███████████████████████████▍                 | 61/100 [10:30<05:57,  9.16s/it]


  Processing PMCID: PMC8956733 (62/100) for year 2022
  Found 0 <table> elements in XML for PMC8956733.



Supps for PMC8956733:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC8956733: 100%|██████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.57s/it][A
Processing Papers for 2022:  62%|███████████████████████████▉                 | 62/100 [10:35<05:02,  7.95s/it][A


  Processing PMCID: PMC9426463 (63/100) for year 2022
  Found 0 <table> elements in XML for PMC9426463.



Supps for PMC9426463:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2022:  63%|████████████████████████████▎                | 63/100 [11:30<13:32, 21.95s/it][A


  Processing PMCID: PMC9692531 (64/100) for year 2022
  Found 7 <table> elements in XML for PMC9692531.
      Sufficient primers (38) found for PMC9692531 from XML. Skipping its supplement processing.


Processing Papers for 2022:  64%|████████████████████████████▊                | 64/100 [11:32<09:38, 16.06s/it]


  Processing PMCID: PMC9729447 (65/100) for year 2022
  Found 3 <table> elements in XML for PMC9729447.
      Sufficient primers (12) found for PMC9729447 from XML. Skipping its supplement processing.


Processing Papers for 2022:  65%|█████████████████████████████▎               | 65/100 [11:34<06:47, 11.65s/it]


  Processing PMCID: PMC8802056 (66/100) for year 2022
  Found 0 <table> elements in XML for PMC8802056.
      Sufficient primers (30) found for PMC8802056 from XML. Skipping its supplement processing.


Processing Papers for 2022:  66%|█████████████████████████████▋               | 66/100 [11:35<04:53,  8.63s/it]


  Processing PMCID: PMC9253108 (67/100) for year 2022
  Found 0 <table> elements in XML for PMC9253108.



Supps for PMC9253108:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9253108:  50%|███████████████████████████                           | 1/2 [00:13<00:13, 13.78s/it][A

  Camelot PDF processing for pages '8-12' took: 13.54 seconds (including any timeouts).
  Lattice flavor TIMED OUT for PMC9253108_supp2.pdf after 60s on pages '1-2'.



Supps for PMC9253108: 100%|██████████████████████████████████████████████████████| 2/2 [02:49<00:00, 97.12s/it][A
                                                                                                               [A

  Stream flavor TIMED OUT for PMC9253108_supp2.pdf after 60s on pages '1-2'.
  Camelot PDF processing for pages '1-2' took: 122.57 seconds (including any timeouts).


Processing Papers for 2022:  67%|██████████████████████████████▏              | 67/100 [14:43<34:10, 62.14s/it]


  Processing PMCID: PMC9437536 (68/100) for year 2022
  Found 2 <table> elements in XML for PMC9437536.


Processing Papers for 2022:  68%|██████████████████████████████▌              | 68/100 [14:45<23:31, 44.09s/it]


  Processing PMCID: PMC9733929 (69/100) for year 2022
  Found 0 <table> elements in XML for PMC9733929.



Supps for PMC9733929:   0%|                                                              | 0/1 [00:00<?, ?it/s][A


  Camelot PDF processing for pages '114-118' took: 58.37 seconds (including any timeouts).


Supps for PMC9733929: 100%|██████████████████████████████████████████████████████| 1/1 [00:59<00:00, 59.39s/it][A
Processing Papers for 2022:  69%|███████████████████████████████              | 69/100 [15:50<26:09, 50.64s/it][A


  Processing PMCID: PMC9729200 (70/100) for year 2022
  Found 0 <table> elements in XML for PMC9729200.
    ⚠️ Unexpected error processing paper PMC9729200: HTTPSConnectionPool(host='europepmc.org', port=443): Read timed out. (read timeout=60)


Traceback (most recent call last):
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\site-packages\urllib3\connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 1377, in getresponse
    response.begin()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 320, in begin
    version, status, reason = self._read_status()
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\http\client.py", line 281, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\socket.py", line 716, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\tommo\anaconda3\envs\camtest\lib\ssl.py", line 1275, in recv_into
    return self.read(nbytes, buffer)
  File "C:\


  Processing PMCID: PMC9778678 (71/100) for year 2022
  Found 0 <table> elements in XML for PMC9778678.


Processing Papers for 2022:  71%|███████████████████████████████▉             | 71/100 [16:55<18:35, 38.46s/it]


  Processing PMCID: PMC9017214 (72/100) for year 2022
  Found 1 <table> elements in XML for PMC9017214.



Supps for PMC9017214:   0%|                                                              | 0/2 [00:00<?, ?it/s][A

  Lattice flavor TIMED OUT for PMC9017214_supp1.pdf after 60s on pages '7-11'.



Supps for PMC9017214:  50%|██████████████████████████▌                          | 1/2 [02:03<02:03, 123.01s/it][A

  Stream flavor TIMED OUT for PMC9017214_supp1.pdf after 60s on pages '7-11'.
  Camelot PDF processing for pages '7-11' took: 122.91 seconds (including any timeouts).
  Lattice flavor TIMED OUT for PMC9017214_supp2.pdf after 60s on pages '25-29'.



Supps for PMC9017214: 100%|█████████████████████████████████████████████████████| 2/2 [04:07<00:00, 123.80s/it][A
                                                                                                               [A

  Stream flavor TIMED OUT for PMC9017214_supp2.pdf after 60s on pages '25-29'.
  Camelot PDF processing for pages '25-29' took: 122.62 seconds (including any timeouts).


Processing Papers for 2022:  72%|███████████████████████████████▋            | 72/100 [21:52<54:06, 115.96s/it]


  Processing PMCID: PMC9790837 (73/100) for year 2022
  Found 0 <table> elements in XML for PMC9790837.



Supps for PMC9790837:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC9790837: 100%|██████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.32s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.11 seconds (including any timeouts).


Processing Papers for 2022:  73%|████████████████████████████████▊            | 73/100 [21:57<37:19, 82.94s/it]


  Processing PMCID: PMC9656397 (74/100) for year 2022
  Found 3 <table> elements in XML for PMC9656397.


Processing Papers for 2022:  74%|█████████████████████████████████▎           | 74/100 [21:59<25:22, 58.56s/it]


  Processing PMCID: PMC9299513 (75/100) for year 2022
  Found 0 <table> elements in XML for PMC9299513.



Supps for PMC9299513:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9299513: 100%|██████████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.18s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '2-6' took: 13.88 seconds (including any timeouts).


Processing Papers for 2022:  75%|█████████████████████████████████▊           | 75/100 [22:17<19:19, 46.39s/it]


  Processing PMCID: PMC9863323 (76/100) for year 2022
  Found 0 <table> elements in XML for PMC9863323.


Processing Papers for 2022:  76%|██████████████████████████████████▏          | 76/100 [22:19<13:11, 32.97s/it]


  Processing PMCID: PMC9737271 (77/100) for year 2022
  Found 6 <table> elements in XML for PMC9737271.


Processing Papers for 2022:  77%|██████████████████████████████████▋          | 77/100 [22:21<09:07, 23.79s/it]


  Processing PMCID: PMC8851680 (78/100) for year 2022
    ⚠️ HTTPError for PMC8851680 (XML or Supplement download): 404 Client Error: Not Found for url: https://www.ebi.ac.uk/europepmc/webservices/rest/PMC8851680/fullTextXML


Processing Papers for 2022:  78%|███████████████████████████████████          | 78/100 [22:22<06:12, 16.95s/it]


  Processing PMCID: PMC9796812 (79/100) for year 2022
  Found 1 <table> elements in XML for PMC9796812.
      Sufficient primers (50) found for PMC9796812 from XML. Skipping its supplement processing.


Processing Papers for 2022:  79%|███████████████████████████████████▌         | 79/100 [22:24<04:20, 12.42s/it]


  Processing PMCID: PMC10655821 (80/100) for year 2022
  Found 0 <table> elements in XML for PMC10655821.



Supps for PMC10655821:   0%|                                                             | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '16-20' took: 12.22 seconds (including any timeouts).


Processing Papers for 2022:  80%|████████████████████████████████████         | 80/100 [22:41<04:33, 13.69s/it]


  Processing PMCID: PMC9532846 (81/100) for year 2022
  Found 1 <table> elements in XML for PMC9532846.



Supps for PMC9532846:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9532846:  50%|███████████████████████████                           | 1/2 [00:09<00:09,  9.22s/it][A

  Stream flavor failed for PMC9532846_supp1.pdf (pages '10-14'). Error: list index out of range
  Camelot PDF processing for pages '10-14' took: 9.06 seconds (including any timeouts).



Supps for PMC9532846: 100%|██████████████████████████████████████████████████████| 2/2 [00:20<00:00, 10.25s/it][A
                                                                                                               [A

  Stream flavor failed for PMC9532846_supp2.pdf (pages '36-40'). Error: list index out of range
  Camelot PDF processing for pages '36-40' took: 10.46 seconds (including any timeouts).


Processing Papers for 2022:  81%|████████████████████████████████████▍        | 81/100 [23:08<05:37, 17.77s/it]


  Processing PMCID: PMC9744233 (82/100) for year 2022
  Found 0 <table> elements in XML for PMC9744233.



Supps for PMC9744233:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9744233: 100%|██████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.95s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '19-23' took: 10.71 seconds (including any timeouts).


Processing Papers for 2022:  82%|████████████████████████████████████▉        | 82/100 [23:23<05:05, 16.97s/it]


  Processing PMCID: PMC9684429 (83/100) for year 2022
  Found 1 <table> elements in XML for PMC9684429.



Supps for PMC9684429:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9684429: 100%|██████████████████████████████████████████████████████| 1/1 [00:09<00:00,  9.72s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '13-17' took: 9.43 seconds (including any timeouts).


Processing Papers for 2022:  83%|█████████████████████████████████████▎       | 83/100 [23:37<04:32, 16.05s/it]


  Processing PMCID: PMC9668692 (84/100) for year 2022
  Found 1 <table> elements in XML for PMC9668692.



Supps for PMC9668692:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9668692: 100%|██████████████████████████████████████████████████████| 1/1 [01:31<00:00, 91.46s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '14-18' took: 90.38 seconds (including any timeouts).


Processing Papers for 2022:  84%|█████████████████████████████████████▊       | 84/100 [25:14<10:46, 40.43s/it]


  Processing PMCID: PMC9613702 (85/100) for year 2022
  Found 3 <table> elements in XML for PMC9613702.



Supps for PMC9613702:   0%|                                                              | 0/3 [00:00<?, ?it/s][A
Supps for PMC9613702:  33%|██████████████████                                    | 1/3 [00:16<00:33, 16.50s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '40-44' took: 15.96 seconds (including any timeouts).
      Sufficient primers (90) now found for PMC9613702. Skipping its remaining supplements.


Processing Papers for 2022:  85%|██████████████████████████████████████▎      | 85/100 [25:39<08:57, 35.81s/it]


  Processing PMCID: PMC9196909 (86/100) for year 2022
  Found 0 <table> elements in XML for PMC9196909.
      Sufficient primers (13) found for PMC9196909 from XML. Skipping its supplement processing.


Processing Papers for 2022:  86%|██████████████████████████████████████▋      | 86/100 [25:41<05:59, 25.68s/it]


  Processing PMCID: PMC8928726 (87/100) for year 2022
  Found 0 <table> elements in XML for PMC8928726.



Supps for PMC8928726:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC8928726: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.81it/s][A
Processing Papers for 2022:  87%|███████████████████████████████████████▏     | 87/100 [26:07<05:33, 25.64s/it][A


  Processing PMCID: PMC9018716 (88/100) for year 2022
  Found 0 <table> elements in XML for PMC9018716.



Supps for PMC9018716:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9018716:  50%|███████████████████████████                           | 1/2 [00:13<00:13, 13.50s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '21-25' took: 12.47 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC9018716: 100%|██████████████████████████████████████████████████████| 2/2 [00:38<00:00, 20.30s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-5' took: 23.59 seconds (including any timeouts).


Processing Papers for 2022:  88%|███████████████████████████████████████▌     | 88/100 [26:53<06:22, 31.85s/it]


  Processing PMCID: PMC9628677 (89/100) for year 2022
  Found 0 <table> elements in XML for PMC9628677.



Supps for PMC9628677:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9628677:  50%|███████████████████████████                           | 1/2 [00:17<00:17, 17.29s/it][A

  Camelot PDF processing for pages '3-7' took: 16.87 seconds (including any timeouts).



Supps for PMC9628677: 100%|██████████████████████████████████████████████████████| 2/2 [00:20<00:00,  9.08s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-2' took: 2.89 seconds (including any timeouts).


Processing Papers for 2022:  89%|████████████████████████████████████████     | 89/100 [27:19<05:31, 30.10s/it]


  Processing PMCID: PMC9311890 (90/100) for year 2022
  Found 0 <table> elements in XML for PMC9311890.


Processing Papers for 2022:  90%|████████████████████████████████████████▌    | 90/100 [27:21<03:35, 21.54s/it]


  Processing PMCID: PMC9643422 (91/100) for year 2022
  Found 0 <table> elements in XML for PMC9643422.



Supps for PMC9643422:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC9643422: 100%|█████████████████████████████████████████

  Camelot PDF processing for pages '4-8' took: 9.59 seconds (including any timeouts).


Processing Papers for 2022:  91%|████████████████████████████████████████▉    | 91/100 [27:46<03:24, 22.69s/it]


  Processing PMCID: PMC9392338 (92/100) for year 2022
  Found 1 <table> elements in XML for PMC9392338.



Supps for PMC9392338:   0%|                                                              | 0/3 [00:00<?, ?it/s][A
                                                                                                               [A

      Sufficient primers (9) now found for PMC9392338. Skipping its remaining supplements.


Processing Papers for 2022:  92%|█████████████████████████████████████████▍   | 92/100 [28:15<03:16, 24.59s/it]


  Processing PMCID: PMC9790072 (93/100) for year 2022
  Found 1 <table> elements in XML for PMC9790072.


Processing Papers for 2022:  93%|█████████████████████████████████████████▊   | 93/100 [28:17<02:03, 17.71s/it]


  Processing PMCID: PMC9696876 (94/100) for year 2022
  Found 1 <table> elements in XML for PMC9696876.
      Sufficient primers (14) found for PMC9696876 from XML. Skipping its supplement processing.


Processing Papers for 2022:  94%|██████████████████████████████████████████▎  | 94/100 [28:19<01:17, 12.93s/it]


  Processing PMCID: PMC9209417 (95/100) for year 2022
  Found 1 <table> elements in XML for PMC9209417.



Supps for PMC9209417:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9209417: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.62it/s][A
Processing Papers for 2022:  95%|██████████████████████████████████████████▊  | 95/100 [28:22<00:50, 10.17s/it][A


  Processing PMCID: PMC9307681 (96/100) for year 2022
  Found 13 <table> elements in XML for PMC9307681.


Processing Papers for 2022:  96%|███████████████████████████████████████████▏ | 96/100 [28:24<00:30,  7.61s/it]


  Processing PMCID: PMC9978832 (97/100) for year 2022
  Found 1 <table> elements in XML for PMC9978832.
      Sufficient primers (20) found for PMC9978832 from XML. Skipping its supplement processing.


Processing Papers for 2022:  97%|███████████████████████████████████████████▋ | 97/100 [28:26<00:17,  5.80s/it]


  Processing PMCID: PMC9514851 (98/100) for year 2022
  Found 0 <table> elements in XML for PMC9514851.
      Sufficient primers (47) found for PMC9514851 from XML. Skipping its supplement processing.


Processing Papers for 2022:  98%|████████████████████████████████████████████ | 98/100 [28:28<00:09,  4.66s/it]


  Processing PMCID: PMC8718141 (99/100) for year 2022
  Found 0 <table> elements in XML for PMC8718141.



Supps for PMC8718141:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '14-18' took: 11.05 seconds (including any timeouts).


Processing Papers for 2022:  99%|████████████████████████████████████████████▌| 99/100 [28:43<00:07,  7.81s/it]


  Processing PMCID: PMC9148100 (100/100) for year 2022
  Found 0 <table> elements in XML for PMC9148100.


Processing Papers for 2022: 100%|████████████████████████████████████████████| 100/100 [28:44<00:00, 17.25s/it]



--- Consolidating and Saving Results for YEAR 2022 ---
Total unique primer entries found for 2022: 1372
  ✅ Saved 1372 unique extracted primers for 2022 to data\psc\yearly_results\master_extracted_primers_2022.csv
###### FINISHED PROCESSING FOR YEAR: 2022 ######

STARTING PROCESSING FOR YEAR: 2021

Searching Europe PMC with query: OPEN_ACCESS:y AND HAS_FT:y AND (METHODS:"qPCR" OR "RT-PCR" OR "real time PCR") AND (ABSTRACT:"pluripotent" OR "iPSC" OR "PSC" OR "hPSC" OR "ESC" OR "hESC") AND (FIRST_PDATE:2021)
Retrieved 100 records for 2021.


Processing Papers for 2021:   0%|                                                      | 0/100 [00:00<?, ?it/s]


  Processing PMCID: PMC8625075 (1/100) for year 2021
  Found 1 <table> elements in XML for PMC8625075.


Processing Papers for 2021:   1%|▍                                             | 1/100 [00:02<03:50,  2.33s/it]


  Processing PMCID: PMC8560749 (2/100) for year 2021
  Found 1 <table> elements in XML for PMC8560749.


Processing Papers for 2021:   2%|▉                                             | 2/100 [00:03<02:59,  1.83s/it]


  Processing PMCID: PMC8047170 (3/100) for year 2021
  Found 2 <table> elements in XML for PMC8047170.
      Sufficient primers (11) found for PMC8047170 from XML. Skipping its supplement processing.


Processing Papers for 2021:   3%|█▍                                            | 3/100 [00:05<02:43,  1.68s/it]


  Processing PMCID: PMC8437760 (4/100) for year 2021
  Found 3 <table> elements in XML for PMC8437760.


Processing Papers for 2021:   4%|█▊                                            | 4/100 [00:07<02:52,  1.80s/it]


  Processing PMCID: PMC8306397 (5/100) for year 2021
  Found 4 <table> elements in XML for PMC8306397.


Processing Papers for 2021:   5%|██▎                                           | 5/100 [00:10<03:34,  2.25s/it]


  Processing PMCID: PMC8586142 (6/100) for year 2021
  Found 0 <table> elements in XML for PMC8586142.



Supps for PMC8586142:   0%|                                                              | 0/4 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '35-39' took: 15.94 seconds (including any timeouts).



Supps for PMC8586142:  50%|███████████████████████████                           | 2/4 [00:26<00:25, 12.75s/it][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


  Camelot PDF processing for pages '14-18' took: 9.80 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC8586142:  75%|████████████████████████████████████████▌             | 3/4 [00:42<00:14, 14.14s/it][A

  Camelot PDF processing for pages '1-4' took: 14.08 seconds (including any timeouts).



Supps for PMC8586142: 100%|██████████████████████████████████████████████████████| 4/4 [00:44<00:00,  9.47s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 1.99 seconds (including any timeouts).


Processing Papers for 2021:   6%|██▊                                           | 6/100 [01:07<32:41, 20.87s/it]


  Processing PMCID: PMC8611312 (7/100) for year 2021
  Found 0 <table> elements in XML for PMC8611312.



Supps for PMC8611312:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC8611312: 100%|██████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.23s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-2' took: 2.67 seconds (including any timeouts).


Processing Papers for 2021:   7%|███▏                                          | 7/100 [01:14<25:09, 16.23s/it]


  Processing PMCID: PMC8225148 (8/100) for year 2021
  Found 2 <table> elements in XML for PMC8225148.
      Sufficient primers (28) found for PMC8225148 from XML. Skipping its supplement processing.


Processing Papers for 2021:   8%|███▋                                          | 8/100 [01:15<17:47, 11.60s/it]


  Processing PMCID: PMC9970692 (9/100) for year 2021
  Found 3 <table> elements in XML for PMC9970692.



Supps for PMC9970692:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC9970692: 100%|██████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.09s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-1' took: 2.01 seconds (including any timeouts).


Processing Papers for 2021:   9%|████▏                                         | 9/100 [01:21<14:40,  9.68s/it]


  Processing PMCID: PMC8184003 (10/100) for year 2021
  Found 0 <table> elements in XML for PMC8184003.



Supps for PMC8184003:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Processing Papers for 2021:  10%|████▌                                        | 10/100 [01:24<11:24,  7.60s/it][A


  Processing PMCID: PMC8653577 (11/100) for year 2021
  Found 1 <table> elements in XML for PMC8653577.


Processing Papers for 2021:  11%|████▉                                        | 11/100 [01:26<08:44,  5.89s/it]


  Processing PMCID: PMC7940445 (12/100) for year 2021
  Found 0 <table> elements in XML for PMC7940445.



Supps for PMC7940445:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC7940445:  50%|███████████████████████████                           | 1/2 [00:10<00:10, 10.90s/it][A

  Camelot PDF processing for pages '23-27' took: 10.68 seconds (including any timeouts).



Supps for PMC7940445: 100%|██████████████████████████████████████████████████████| 2/2 [00:24<00:00, 12.32s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '38-42' took: 12.44 seconds (including any timeouts).


Processing Papers for 2021:  12%|█████▍                                       | 12/100 [01:57<19:56, 13.60s/it]


  Processing PMCID: PMC9209596 (13/100) for year 2021
  Found 3 <table> elements in XML for PMC9209596.
      Sufficient primers (23) found for PMC9209596 from XML. Skipping its supplement processing.


Processing Papers for 2021:  13%|█████▊                                       | 13/100 [01:58<14:20,  9.90s/it]


  Processing PMCID: PMC8284774 (14/100) for year 2021
  Found 0 <table> elements in XML for PMC8284774.



Supps for PMC8284774:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC8284774: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.28it/s][A
Processing Papers for 2021:  14%|██████▎                                      | 14/100 [02:03<11:46,  8.21s/it][A


  Processing PMCID: PMC8698385 (15/100) for year 2021
  Found 1 <table> elements in XML for PMC8698385.
      Sufficient primers (11) found for PMC8698385 from XML. Skipping its supplement processing.


Processing Papers for 2021:  15%|██████▊                                      | 15/100 [02:04<08:49,  6.23s/it]


  Processing PMCID: PMC8636751 (16/100) for year 2021
  Found 0 <table> elements in XML for PMC8636751.



Supps for PMC8636751:   0%|                                                              | 0/1 [00:00<?, ?it/s][A
Supps for PMC8636751: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.65it/s][A
Processing Papers for 2021:  16%|███████▏                                     | 16/100 [02:29<16:24, 11.72s/it][A


  Processing PMCID: PMC8302568 (17/100) for year 2021
  Found 0 <table> elements in XML for PMC8302568.



Supps for PMC8302568:   0%|                                                              | 0/3 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '5-9' took: 16.77 seconds (including any timeouts).


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC8302568: 100%|██████████████████████████████████████████████████████| 3/3 [00:32<00:00, 10.23s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-3' took: 13.95 seconds (including any timeouts).


Processing Papers for 2021:  17%|███████▋                                     | 17/100 [03:09<28:15, 20.43s/it]


  Processing PMCID: PMC7969605 (18/100) for year 2021
  Found 0 <table> elements in XML for PMC7969605.



Supps for PMC7969605:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing fr

  Camelot PDF processing for pages '10-14' took: 10.68 seconds (including any timeouts).


Processing Papers for 2021:  18%|████████                                     | 18/100 [03:28<27:15, 19.95s/it]


  Processing PMCID: PMC8750013 (19/100) for year 2021
  Found 0 <table> elements in XML for PMC8750013.


Processing Papers for 2021:  19%|████████▌                                    | 19/100 [03:30<19:25, 14.39s/it]


  Processing PMCID: PMC8472244 (20/100) for year 2021
  Found 0 <table> elements in XML for PMC8472244.


Processing Papers for 2021:  20%|█████████                                    | 20/100 [03:32<14:14, 10.68s/it]


  Processing PMCID: PMC8773696 (21/100) for year 2021
  Found 5 <table> elements in XML for PMC8773696.
      Sufficient primers (36) found for PMC8773696 from XML. Skipping its supplement processing.


Processing Papers for 2021:  21%|█████████▍                                   | 21/100 [03:33<10:33,  8.02s/it]


  Processing PMCID: PMC8166824 (22/100) for year 2021
  Found 1 <table> elements in XML for PMC8166824.
      Sufficient primers (14) found for PMC8166824 from XML. Skipping its supplement processing.


Processing Papers for 2021:  22%|█████████▉                                   | 22/100 [03:35<07:53,  6.07s/it]


  Processing PMCID: PMC8394524 (23/100) for year 2021
  Found 1 <table> elements in XML for PMC8394524.


Processing Papers for 2021:  23%|██████████▎                                  | 23/100 [03:37<06:13,  4.85s/it]


  Processing PMCID: PMC8362881 (24/100) for year 2021
  Found 0 <table> elements in XML for PMC8362881.



Supps for PMC8362881:   0%|                                                              | 0/1 [00:00<?, ?it/s][ACropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox

Supps for PMC8362881: 100%|█████████████████████████████████████████

  Camelot PDF processing for pages '4-8' took: 11.68 seconds (including any timeouts).


Processing Papers for 2021:  24%|██████████▊                                  | 24/100 [04:17<19:39, 15.52s/it]


  Processing PMCID: PMC9298993 (25/100) for year 2021
  Found 0 <table> elements in XML for PMC9298993.



Supps for PMC9298993:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC9298993:  50%|███████████████████████████                           | 1/2 [00:03<00:03,  3.04s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '1-2' took: 2.89 seconds (including any timeouts).


Processing Papers for 2021:  25%|███████████▎                                 | 25/100 [04:25<16:29, 13.19s/it]


  Processing PMCID: PMC8025391 (26/100) for year 2021
  Found 0 <table> elements in XML for PMC8025391.


Processing Papers for 2021:  26%|███████████▋                                 | 26/100 [04:27<11:59,  9.72s/it]


  Processing PMCID: PMC8611307 (27/100) for year 2021
  Found 0 <table> elements in XML for PMC8611307.


Processing Papers for 2021:  27%|████████████▏                                | 27/100 [04:28<08:50,  7.27s/it]


  Processing PMCID: PMC8197809 (28/100) for year 2021
  Found 0 <table> elements in XML for PMC8197809.


Processing Papers for 2021:  28%|████████████▌                                | 28/100 [04:30<06:39,  5.55s/it]


  Processing PMCID: PMC8072067 (29/100) for year 2021
  Found 0 <table> elements in XML for PMC8072067.



Supps for PMC8072067:   0%|                                                              | 0/2 [00:00<?, ?it/s][A
Supps for PMC8072067:  50%|███████████████████████████                           | 1/2 [00:12<00:12, 12.63s/it][A

  Camelot PDF processing for pages '41-45' took: 12.33 seconds (including any timeouts).



Supps for PMC8072067: 100%|██████████████████████████████████████████████████████| 2/2 [00:25<00:00, 12.81s/it][A
                                                                                                               [A

  Camelot PDF processing for pages '58-62' took: 11.67 seconds (including any timeouts).


Processing Papers for 2021:  29%|█████████████                                | 29/100 [05:12<19:25, 16.41s/it]


  Processing PMCID: PMC8793999 (30/100) for year 2021
  Found 1 <table> elements in XML for PMC8793999.


Processing Papers for 2021:  29%|█████████████                                | 29/100 [05:46<14:09, 11.97s/it]


KeyboardInterrupt: 

In [4]:
import pandas as pd
import glob
import os
from pathlib import Path
from collections import Counter # Import Counter

def aggregate_primer_data(input_dir: Path, output_file: Path):
    """
    Reads all 'master_extracted_primers_YYYY.csv' files from the input directory,
    aggregates primer sequences, counts occurrences, lists source PMCIDs,
    Orientations, and Genes (with counts), and saves the result to a new CSV file.

    Args:
        input_dir (Path): The directory containing the yearly CSV files.
        output_file (Path): The path to save the aggregated CSV file.
    """
    print(f"Looking for yearly CSV files in: {input_dir}")

    csv_files = glob.glob(str(input_dir / "master_extracted_primers_*.csv"))

    if not csv_files:
        print(f"No CSV files found in {input_dir} matching the pattern 'master_extracted_primers_*.csv'.")
        print("Please ensure your yearly CSV files are in that directory and named correctly.")
        return

    print(f"Found {len(csv_files)} CSV files to process:")
    for f in csv_files:
        print(f"  - {os.path.basename(f)}")

    all_dataframes = []
    required_columns = ['Sequence', 'PMCID', 'Orientation', 'Gene']

    for file_path in csv_files:
        try:
            df = pd.read_csv(file_path)
            missing_cols = [col for col in required_columns if col not in df.columns]
            if missing_cols:
                print(f"Warning: Skipping file {os.path.basename(file_path)} as it's missing column(s): {', '.join(missing_cols)}.")
                continue
            all_dataframes.append(df)
        except pd.errors.EmptyDataError:
            print(f"Warning: Skipping empty file {os.path.basename(file_path)}.")
        except Exception as e:
            print(f"Error reading {os.path.basename(file_path)}: {e}")

    if not all_dataframes:
        print("No data loaded from CSV files. Aggregation cannot proceed.")
        return

    master_df = pd.concat(all_dataframes, ignore_index=True)
    print(f"\nTotal rows loaded from all CSVs: {len(master_df)}")

    if master_df.empty:
        print("The combined dataframe is empty. Nothing to aggregate.")
        return

    # Standardize and clean data
    master_df['Sequence'] = master_df['Sequence'].astype(str).str.upper()
    master_df['PMCID'] = master_df['PMCID'].astype(str)
    master_df['Orientation'] = master_df['Orientation'].astype(str).str.strip().fillna('Unknown')
    master_df['Gene'] = master_df['Gene'].astype(str).str.strip().fillna('Unknown')
    
    master_df['Orientation'] = master_df['Orientation'].apply(lambda x: 'Unknown' if x == '' else x)
    master_df['Gene'] = master_df['Gene'].apply(lambda x: 'Unknown' if x == '' else x)

    master_df.dropna(subset=['Sequence', 'PMCID'], inplace=True)
    master_df = master_df[master_df['Sequence'] != '']

    print(f"Processing {len(master_df)} rows for aggregation...")

    # --- MODIFIED AGGREGATION FOR GENES ---
    def aggregate_gene_counts(series):
        # Count occurrences of each gene in the series
        counts = Counter(str(g) for g in series)
        # Sort by count (descending), then by gene name (ascending) for tie-breaking
        sorted_counts = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
        # Format as "GeneName (count)"
        return ', '.join([f"{gene} ({count})" for gene, count in sorted_counts])

    aggregated_data = master_df.groupby('Sequence').agg(
        Occurrence_Count=('Sequence', 'size'),
        Source_PMCIDs=('PMCID', lambda x: ', '.join(sorted(list(set(str(i) for i in x))))),
        Source_Orientations=('Orientation', lambda x: ', '.join(sorted(list(set(str(i) for i in x))))),
        Source_Genes=('Gene', aggregate_gene_counts) # Use the new aggregation function
    ).reset_index()
    # --- END MODIFICATION ---

    aggregated_data = aggregated_data.sort_values(by='Occurrence_Count', ascending=False)

    print(f"\nAggregation complete. Found {len(aggregated_data)} unique primer sequences.")

    try:
        output_file.parent.mkdir(parents=True, exist_ok=True)
        aggregated_data.to_csv(output_file, index=False)
        print(f"\nSuccessfully saved aggregated primer data to: {output_file}")
    except Exception as e:
        print(f"\nError saving aggregated data to {output_file}: {e}")

if __name__ == "__main__":
    INPUT_DATA_DIR = Path("data/psc/yearly_results") 
    AGGREGATED_OUTPUT_FILE = INPUT_DATA_DIR / "aggregated_primer_summary_with_details.csv"

    aggregate_primer_data(INPUT_DATA_DIR, AGGREGATED_OUTPUT_FILE)

    print("\n--- Script Finished ---")
    # Example of how to quickly view the top results if needed:
    # if AGGREGATED_OUTPUT_FILE.exists():
    #     top_results_df = pd.read_csv(AGGREGATED_OUTPUT_FILE)
    #     print("\nTop 10 most frequent primers:")
    #     print(top_results_df.head(10).to_string())


Looking for yearly CSV files in: data\psc\yearly_results
Found 4 CSV files to process:
  - master_extracted_primers_2022.csv
  - master_extracted_primers_2023.csv
  - master_extracted_primers_2024.csv
  - master_extracted_primers_2025.csv

Total rows loaded from all CSVs: 6522
Processing 6522 rows for aggregation...

Aggregation complete. Found 4751 unique primer sequences.

Successfully saved aggregated primer data to: data\psc\yearly_results\aggregated_primer_summary_with_details.csv

--- Script Finished ---
