# ✅ Clean Inference Notebook Template

This notebook includes:
- PDF text extraction with Marker-pdf; a file that fails to parse is reported and treated as empty text
- DOI cleaning, standardization, and extraction
- Accession ID (GSE/SRP) fallback when no DOI is found
- Model loading and prediction
- Submission file creation


In [ ]:
# 📦 Imports
import os
import re
import pandas as pd
from glob import glob
from tqdm import tqdm
import torch
from marker import Document
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [ ]:
# ✅ Load model and tokenizer
# Checkpoint directory (placeholder path; point it at the real dataset before running).
model_path = "/kaggle/input/..."

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model = model.eval()  # switch the module to evaluation mode before predicting
model.to(device)

In [ ]:
# ✅ Utility functions
def extract_article_id(filename):
    """Derive an article ID from a PDF file name.

    The last four characters (expected to be the ``.pdf`` suffix) are dropped
    unconditionally, then every underscore in what remains becomes a slash.

    Args:
        filename: Base name of the file, e.g. ``'10.1002_abc.pdf'``.

    Returns:
        The derived ID, e.g. ``'10.1002/abc'``.
    """
    stem = filename[:-4]
    return '/'.join(stem.split('_'))

def clean_doi(doi):
    """Remove trailing punctuation from a DOI string.

    Any run of the characters ``)``, ``]``, ``>``, ``.``, ``,`` and ``;`` at
    the end of the string is dropped; everything before it is left alone.

    Args:
        doi: The DOI text to trim.

    Returns:
        ``doi`` without the trailing punctuation run.
    """
    trailing_junk = ')]>.,;'
    end = len(doi)
    # Walk back from the end while the last kept character is punctuation.
    while end and doi[end - 1] in trailing_junk:
        end -= 1
    return doi[:end]

def standardize_doi(doi):
    """Normalize a DOI-like value to a lower-case ``https://doi.org/`` URL.

    The value is converted to ``str``, stripped of surrounding whitespace and
    lower-cased *before* the prefix is inspected, so ``DOI:10.x`` and
    ``doi:10.x`` are treated alike. Whitespace after a ``doi:`` prefix is
    dropped so ``doi: 10.x`` does not yield a URL with an embedded space.

    Args:
        doi: A DOI in URL form, ``doi:`` form, bare ``10.`` form, or any
            other identifier.

    Returns:
        - values starting with ``http``: returned lower-cased, unchanged
          otherwise;
        - ``doi:<id>`` and bare ``10.<...>`` values: ``https://doi.org/<id>``;
        - anything else: the stripped, lower-cased input.
    """
    doi = str(doi).strip().lower()
    if doi.startswith('http'):
        return doi
    if doi.startswith('doi:'):
        # lstrip() tolerates the common "doi: 10.x" spelling.
        return 'https://doi.org/' + doi[4:].lstrip()
    if doi.startswith('10.'):
        return 'https://doi.org/' + doi
    return doi

def extract_dataset_dois(text):
    """Collect the distinct ``doi.org`` URLs that appear in ``text``.

    Each match of the URL pattern is first passed through ``clean_doi`` so
    that sentence punctuation picked up by the regex (a trailing ``.``,
    ``,``, ``;`` or ``]``) is removed, then through ``standardize_doi``.
    Without the cleaning step the same DOI could be reported in several
    spellings depending on where it sat in a sentence.

    Args:
        text: The document text to scan.

    Returns:
        A sorted list of unique, standardized DOI URLs (empty if none match).
    """
    pattern = r'https?://doi\.org/10\.[^\s\)<>]+'
    unique = {standardize_doi(clean_doi(hit)) for hit in re.findall(pattern, text)}
    return sorted(unique)

def find_accession_ids_in_text(text):
    """Find GSE and SRP accession IDs in ``text``.

    Matching is case-insensitive. All GSE hits come first (in order of
    appearance), followed by all SRP hits; duplicates are kept.

    Args:
        text: The document text to scan.

    Returns:
        A list of the matched IDs, lower-cased.
    """
    accession_patterns = (r'\b(GSE\d+)', r'\b(SRP\d+)')
    return [
        hit.lower()
        for pat in accession_patterns
        for hit in re.findall(pat, text, re.IGNORECASE)
    ]

In [ ]:
# ✅ Extract text with Marker-pdf
def extract_text(pdf_path):
    """Return the text of one PDF, or ``''`` if it cannot be read.

    Best-effort by design: any exception raised while building the document
    or reading its text is printed and swallowed so that a single bad file
    does not stop the run.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The extracted text; an empty string when extraction fails or the
        document's text is empty/``None``.
    """
    # NOTE(review): relies on `marker.Document(path).text` — confirm this
    # matches the installed Marker version's API.
    try:
        extracted = Document(pdf_path).text
    except Exception as e:
        print(f'Error extracting {pdf_path}: {e}')
        return ''
    return extracted if extracted else ''

In [ ]:
# ✅ Process all PDFs
# Directory holding the test PDFs (placeholder path; fill in before running).
pdf_dir = '/kaggle/input/.../test/PDF'

# Sort the matches so the processing order is deterministic.
pdf_files = glob(os.path.join(pdf_dir, '*.pdf'))
pdf_files.sort()

# One record per PDF: the ID derived from the file name plus the extracted text.
rows = []
for pdf_path in tqdm(pdf_files):
    file_name = os.path.basename(pdf_path)
    record = {
        'article_id': extract_article_id(file_name),
        'text': extract_text(pdf_path),
    }
    rows.append(record)

In [ ]:
# ✅ Predict
# NOTE(review): assumes the checkpoint's class indices follow this order —
# confirm against the training configuration.
label_map = {0:'Primary',1:'Secondary',2:'Missing'}

batch_size = 8  # number of documents scored per forward pass
preds = []
for start in range(0, len(rows), batch_size):
    texts = [row['text'] for row in rows[start:start + batch_size]]
    # Only the first 512 tokens of each document are scored.
    encoded = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt',
    )
    batch = encoded.to(device)
    with torch.no_grad():
        logits = model(**batch).logits
        # Highest-scoring class index per document, as plain Python ints.
        preds.extend(logits.argmax(dim=1).cpu().tolist())

In [ ]:
# ✅ Assemble submission
# Fixed column set: keeps the frame well-formed (and the filter below valid)
# even when there are no rows at all, e.g. when no PDFs were found.
submission_columns = ['article_id', 'dataset_id', 'type']

results = []
for i, r in enumerate(rows):
    dois = extract_dataset_dois(r['text'])
    accs = find_accession_ids_in_text(r['text'])
    # Prefer a DOI (first in the sorted list); otherwise the first accession
    # ID; otherwise leave the dataset ID empty.
    # NOTE(review): rows with an empty dataset_id are still submitted —
    # confirm that this is intended.
    dataset_id = dois[0] if dois else (accs[0] if accs else '')
    results.append({
        # File-name form of the ID: slashes go back to underscores.
        'article_id': r['article_id'].replace('/', '_'),
        'dataset_id': dataset_id,
        'type': label_map[preds[i]],
    })

df = pd.DataFrame(results, columns=submission_columns)
# Bracket access instead of `df.type`: works on an empty frame and cannot be
# confused with a DataFrame attribute.
df = df[df['type'] != 'Missing'].drop_duplicates()
df.insert(0, 'row_id', range(len(df)))
df.to_csv('submission.csv', index=False)