<a href="https://colab.research.google.com/github/xyshuai/LLM-generated-reference-checker/blob/main/LLM_Generated_Reference_Verification_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
📚 LLM-Generated Reference Verification Tool - Interactive Version

Usage:
1. Run this cell
2. Paste references into the text box (one per line)
3. Click "🚀 Run Verification" button
4. View results with color highlighting

✓ Supports: APA, Chicago, Harvard, IEEE, ACM, MLA

✓ Primary API: OpenAlex (comprehensive, has retraction data)
✓ Fallback API: Crossref (broader coverage)
"""

# ================= Install dependencies (if needed) =================
# Install rapidfuzz on demand. Only a failed import should trigger the
# installation, so catch ImportError instead of using a bare ``except``
# (which would also swallow KeyboardInterrupt and SystemExit).
try:
    import rapidfuzz  # noqa: F401  (imported only to probe availability)
except ImportError:
    import subprocess
    import sys

    # Run pip through the interpreter that executes this code so the package
    # lands in the same environment; unlike the ``!pip`` shell magic this is
    # also valid Python outside IPython. ``check=False`` keeps the original
    # best-effort behaviour: a failed install surfaces at the later import.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "rapidfuzz", "-q"],
        check=False,
    )

import re
import time
import requests
import pandas as pd
from rapidfuzz import fuzz
import string
from IPython.display import display, HTML
import ipywidgets as widgets


# ================= Configuration =================
# Base URLs of the two REST endpoints queried by the lookup functions below.
OPENALEX_BASE = "https://api.openalex.org/works"
CROSSREF_BASE = "https://api.crossref.org/works"

# Headers passed to every `requests.get` call in this file.
# NOTE(review): the mailto address is a placeholder — presumably it should be
# replaced with a real contact address before heavy use; confirm.
HEADERS = {
    "User-Agent": "ReferenceVerificationTool/1.0 (mailto:your_email@example.com)",
}

# Minimum fuzzy title similarity (0-100) for a title-search candidate to be
# accepted as the matching record in process_references.
TITLE_THRESHOLD = 85
# A record whose title similarity falls below this value is treated as a
# DOI/title mismatch in process_references.
TITLE_MISMATCH_THRESHOLD = 70
# Seconds passed to time.sleep after API lookups in process_references.
REQUEST_DELAY = 0.2


# ================= Helper Functions =================

def normalize_doi(doi):
    """Return a bare, lowercase DOI suitable for equality comparison.

    Strips surrounding whitespace, a leading ``http(s)://(dx.)doi.org/`` URL
    prefix or ``doi:`` label (both case-insensitive), and trailing ``.,;)``
    punctuation.

    Args:
        doi: DOI string in any of the forms above, or a falsy value.

    Returns:
        The normalized DOI string, or None when ``doi`` is falsy.
    """
    if not doi:
        return None
    # Strip first: leading whitespace would otherwise stop the ``^``-anchored
    # prefix patterns from matching, and trailing whitespace would shield the
    # trailing punctuation from rstrip().
    doi = doi.strip()
    doi = re.sub(r'^https?://(dx\.)?doi\.org/', '', doi, flags=re.I)
    doi = re.sub(r'^doi:', '', doi, flags=re.I)
    # "doi: 10.x/y" leaves a leading space behind once the label is removed.
    doi = doi.strip().rstrip('.,;)')
    return doi.lower().strip()


def extract_surname(author_name):
    """Return the lowercase surname guessed from a single author string.

    A ``(YYYY)`` year and a leading ``[n]`` list marker are removed first.
    "Surname, Given" yields the text before the first comma. Otherwise the
    last whitespace-separated token is used, unless that token looks like
    initials (one character, or two uppercase characters, ignoring dots), in
    which case the first token is used. Empty input yields "".
    """
    if not author_name:
        return ""

    cleaned = re.sub(r'\(\d{4}\)', '', author_name).strip()
    cleaned = re.sub(r'^\[\d+\]\s*', '', cleaned)

    # "Surname, Given": everything before the first comma is the surname.
    if ',' in cleaned:
        return cleaned.partition(',')[0].strip().lower()

    tokens = cleaned.split()
    if not tokens:
        return ""

    # "Smith J" / "Smith JA" / "Smith J.": trailing initials mean the surname
    # comes first. With a single token both choices are the same token.
    trailing = tokens[-1].replace('.', '').strip()
    looks_like_initials = len(trailing) == 1 or (
        len(trailing) == 2 and trailing.isupper()
    )
    chosen = tokens[0] if looks_like_initials else tokens[-1]
    return chosen.lower()


def normalize_page_range(page_range):
    """Normalize a page range to the form ``start-end`` with an ASCII hyphen.

    Args:
        page_range: Page range (any value; converted with ``str``). Falsy
            values and the placeholder "-" mean "no pages".

    Returns:
        "-" when there are no pages; otherwise the input with en dash,
        em dash and minus sign replaced by "-", whitespace around the hyphen
        removed, and surrounding whitespace stripped.
    """
    if not page_range or page_range == "-":
        return "-"
    normalized = str(page_range)
    # Spell the dash variants as escapes so the function does not depend on
    # how this source file's bytes are decoded.
    for dash in ('\u2013', '\u2014', '\u2212'):  # en dash, em dash, minus
        normalized = normalized.replace(dash, '-')
    normalized = re.sub(r'\s*-\s*', '-', normalized)
    return normalized.strip()


def standardize_title(title):
    """Reduce a title to a canonical form for fuzzy comparison.

    The result is lowercase, has "u.k."/"u.s." collapsed to "uk"/"us", has
    all ASCII punctuation removed, and has whitespace runs squeezed to single
    spaces. Falsy input yields "".
    """
    if not title:
        return ""

    canonical = title.lower()
    # Collapse the dotted abbreviations before punctuation is removed.
    for dotted, plain in (("u.k.", "uk"), ("u.s.", "us")):
        canonical = canonical.replace(dotted, plain)

    drop_punctuation = str.maketrans('', '', string.punctuation)
    canonical = canonical.translate(drop_punctuation)
    return re.sub(r'\s+', ' ', canonical).strip()


# ================= Enhanced Reference Parser =================

def parse_reference(raw_ref):
    """Heuristically split one free-text reference into bibliographic fields.

    The parser tries a series of regular expressions in a fixed order; the
    first one that matches wins for each field. Ordering therefore matters
    and the result is a best guess, not a validated citation.

    Args:
        raw_ref: The reference as one string. A leading list marker such as
            ``[3]``, ``(3)`` or ``3)`` is ignored for everything except the
            first-author guess, which has its own ``[n]`` cleanup.

    Returns:
        A dict with the keys ``raw_reference``, ``ref_title``,
        ``ref_first_author``, ``ref_year`` (int or None), ``ref_journal``,
        ``ref_volume``, ``ref_issue``, ``ref_page_range`` and ``doi`` (str or
        None). Title and journal are "Unknown" when not recognised; volume,
        issue and page range are "-" when not recognised.
    """
    text = re.sub(r'^[\[\(\{]?\d+[\]\)\}]\.?\s*', '', raw_ref.strip())

    # ==================== DOI Extraction ====================
    doi_match = re.search(
        r'(?:https?://)?(?:doi\.org/|DOI:?\s*)?(10\.\d{4,9}/[^\s"\'<>\]]+)',
        text, re.I
    )
    doi = doi_match.group(1).rstrip('.,;)]') if doi_match else None

    # ==================== Year Extraction ====================
    year = None
    year_in_parentheses = False

    # Parenthesised year first (most common); an optional suffix letter as in
    # "(2020a)" is accepted.
    year_match = re.search(r'\((\d{4})[a-z]?\)', text)
    if year_match:
        year = int(year_match.group(1))
        year_in_parentheses = True
    else:
        # Remaining formats, tried in order; the first hit wins.
        fallback_year_patterns = [
            # Chicago/MLA: "Author. 2012. Title"
            (r'\.\s+(\d{4})\.\s+[\u201c\u201d"\'A-Z]', 0),
            # Month-year (IEEE: "Sep. 2021")
            (r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?\s+(\d{4})', re.I),
            # IEEE: ", Year." at the end or before "doi"
            (r',\s*(\d{4})\.?\s*(?:doi|$)', re.I),
            # Last resort: any 4-digit number followed by ; , or .
            (r'[,\s](\d{4})[;,\.]', 0),
        ]
        for pattern, flags in fallback_year_patterns:
            year_match = re.search(pattern, text, flags)
            if year_match:
                year = int(year_match.group(1))
                break

    # ==================== Detect Format Type ====================
    # IEEE-like: "vol. X ... no. Y" OR "Volume(Issue):Pages" OR contains "IEEE"
    is_ieee_style = bool(re.search(r'(?:vol\.\s*\d+.*?no\.\s*\d+|IEEE\s+\w+|\d+\(\d+\):\d+)', text, re.I))
    is_vancouver = bool(re.search(r';\d+\(\d+\):', text))
    # Any straight or typographic quote character (apostrophes included)
    has_quotes = bool(re.search(r'[\u201c\u201d\u2018\u2019"\'"]', text))

    # ==================== Title Extraction ====================
    title = "Unknown"

    # Pattern 1: Quoted title
    if has_quotes:
        quote_patterns = [
            r'\u201c([^\u201d]+)\u201d',   # Typographic double quotes (U+201C, U+201D)
            r'\u2018([^\u2019]+)\u2019',   # Typographic single quotes (U+2018, U+2019)
            r'"([^"]+)"',                  # Straight double quotes
            r"'([^']+)'",                  # Straight single quotes
            r'[\u201c\u201d"\u2018\u2019\'](.+?)[\u201c\u201d"\u2018\u2019\']',  # Any mix
        ]
        for pattern in quote_patterns:
            quote_match = re.search(pattern, text)
            if quote_match:
                title = quote_match.group(1).strip()
                break

    # Pattern 2: IEEE format WITHOUT quotes
    if title == "Unknown" and is_ieee_style:
        # Step 1: find the end of the author list (" and Name. ")
        last_author_match = re.search(r'\band\s+[A-Z][\w\s\.]+?\.\s+', text)

        if last_author_match:
            after_authors = text[last_author_match.end():]

            # Step 2: the title is the text before the journal name
            title_patterns = [
                r'^([A-Z][^\.]+?)\.\s+[A-Z][\w\s&]+?,?\s*vol\.',      # Title. Journal, vol. X
                r'^([A-Z][^\.]+?)\.\s+[A-Z][\w\s&]+?,\s*\d+\(',        # Title. Journal, Vol(Issue)
                r'^([A-Z][^\.]{20,}?)\.\s+IEEE',                       # Title. IEEE ...
            ]

            for pattern in title_patterns:
                title_match = re.search(pattern, after_authors, re.I)
                if title_match:
                    title = title_match.group(1).strip()
                    break

        # Fallback: a long sentence directly before "IEEE" or a journal + volume
        if title == "Unknown":
            fallback_patterns = [
                r'\.\s+([A-Z][a-z][\w\s:,\-]{20,}?)\.\s+IEEE',
                r'\.\s+([A-Z][a-z][\w\s:,\-]{20,}?)\.\s+[A-Z][\w\s&]+?,\s*\d+\(',
                r'\.\s+([A-Z][a-z][\w\s:,\-]{20,}?)\.\s+[A-Z][\w\s&]+?,?\s*vol\.',
            ]

            for pattern in fallback_patterns:
                fallback_match = re.search(pattern, text, re.I)
                if fallback_match:
                    potential_title = fallback_match.group(1).strip()
                    # Reject candidates that start like an author ("LC Rodrigues")
                    if not re.search(r'\b[A-Z]{1,3}\s+[A-Z][a-z]+\b', potential_title[:40]):
                        title = potential_title
                        break

    # Pattern 3: Year WITHOUT parentheses (Chicago/MLA: Author. 2012. Title.)
    if title == "Unknown" and not year_in_parentheses and year:
        # Pattern 3a: . Year. "Title." Source
        title_match = re.search(rf'\.\s+{year}\.\s+[\u201c\u201d"\'"]?(.+?)[\u201c\u201d"\'"]?[\.?!]\s+[A-Z]', text)
        if title_match:
            title = title_match.group(1).strip()
        else:
            # Pattern 3b: . Year. Title. Source 12
            title_match = re.search(rf'\.\s+{year}\.\s+(.+?)\.\s+[A-Z][A-Za-z\s]+\s+\d+', text)
            if title_match:
                title = title_match.group(1).strip()

    # Pattern 4: Year WITH parentheses (APA, Harvard: Author (2012). Title.)
    # The optional suffix letter keeps these in step with the year extractor,
    # which accepts "(2020a)".
    if title == "Unknown" and year_in_parentheses:
        patterns = [
            # Pattern 4a: (Year). Title[.?!] Source
            r'\(\d{4}[a-z]?\)\.\s*(.+?)[\.?!]\s+[A-Z]',
            # Pattern 4b: (Year) Title[.?!] Source (no period after year)
            r'\(\d{4}[a-z]?\)\s+(.+?)[\.?!]\s+[A-Z]',
            # Pattern 4c: Conference: (Year). Title. In Proceedings
            r'\(\d{4}[a-z]?\)\s*\.?\s*(.+?)[\.?!]\s*(?:In\s+)?(?:Proceedings?|Conference)',
        ]

        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                potential_title = match.group(1).strip()
                if len(potential_title) > 10:
                    title = potential_title
                    break

    # Pattern 5: Vancouver style: . Title. Source Year;
    if title == "Unknown" and is_vancouver:
        title_match = re.search(r'\.(.+?)[\.?!]\s+[A-Z][^\.]+\s+\d{4}', text)
        if title_match:
            title = title_match.group(1).strip()

    # Clean title: drop every quote character, typographic and straight
    if title != "Unknown":
        title = title.replace('\u201c', '').replace('\u201d', '')
        title = title.replace('\u2018', '').replace('\u2019', '')
        title = title.replace('"', '').replace("'", '')
        title = title.strip()

    # ==================== Journal/Source Extraction ====================
    journal = "Unknown"

    if is_ieee_style:
        # IEEE style: "Title," Source, vol. X OR Title. Source, Volume(Issue)
        if has_quotes:
            # Look only at the text after the last quote character
            parts = re.split(r'[\u201c\u201d\u2018\u2019"\'"]', text)
            after_quote = parts[-1] if len(parts) > 1 else text
            ieee_match = re.search(r',\s*([^,]+?),\s*(?:vol\.|\d+\()', after_quote, re.I)
            if ieee_match:
                journal = ieee_match.group(1).strip()
        else:
            # Without quotes: Title. Journal, vol. OR Title. Journal, Volume(Issue)
            ieee_patterns = [
                r'\.\s+([A-Z][A-Za-z\s&]+?),\s*vol\.',
                r'\.\s+([A-Z][A-Za-z\s&]+?),\s*\d+\(',
            ]
            for pattern in ieee_patterns:
                ieee_match = re.search(pattern, text, re.I)
                if ieee_match:
                    potential_journal = ieee_match.group(1).strip()
                    # Guard against capturing the title (journal names are usually shorter)
                    if len(potential_journal) < 100:
                        journal = potential_journal
                        break

    elif is_vancouver:
        vanc_match = re.search(r'\.([^\.]+)\.\s*\d{4};', text)
        if vanc_match:
            journal = vanc_match.group(1).strip()

    else:
        # Standard formats
        patterns = [
            # Pattern 1: . Source, Volume( or . "Source" Volume(
            r'[\.?!]\s*[\u201c\u201d"\'"]?([A-Za-z\s&]+?)[\u201c\u201d"\'"]?\s*,?\s*\d+\s*\(',
            # Pattern 2: . Source Volume(
            r'[\.?!]\s+([A-Z][A-Za-z\s&]+?)\s+\d+\s*\(',
        ]

        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                journal = match.group(1).strip()
                break

    # ==================== Volume/Issue/Pages ====================
    volume, issue, page_range = "-", "-", "-"

    # Digits joined by a hyphen, en dash (U+2013) or em dash (U+2014). Written
    # with escapes so the pattern does not depend on the file's encoding.
    pages = r'[\d\u2013\-\u2014]+'

    # Pattern 1: vol. X, no. Y, pp. Z (IEEE traditional style)
    if 'vol.' in text.lower():
        vol_match = re.search(r'vol\.\s*(\d+)', text, re.I)
        issue_match = re.search(r'no\.\s*(\d+)', text, re.I)
        page_match = re.search(r'pp\.\s*(' + pages + r')', text, re.I)

        if vol_match:
            volume = vol_match.group(1)
        if issue_match:
            issue = issue_match.group(1)
        if page_match:
            page_range = normalize_page_range(page_match.group(1))

    # Pattern 2: Volume(Issue): Pages or Volume(Issue), Pages (compact style)
    if page_range == "-":
        patterns = [
            (r'(\d+)\s*\((\d+)\):\s*(' + pages + r')', True),       # Vol(Issue): Pages
            (r',\s*(\d+)\s*\((\d+)\),\s*(' + pages + r')', True),   # , Vol(Issue), Pages
            (r'\d{4};(\d+)\((\d+)\):(' + pages + r')', True),       # Year;Vol(Issue):Pages (Vancouver)
            (r'\s(\d+)\s*\((\d+)\):\s*(' + pages + r')', True),     # Space Vol (Issue): Pages
            (r'\s(\d+):\s*(' + pages + r')', False),                # Vol: Pages (no issue)
        ]

        for pattern, has_issue in patterns:
            match = re.search(pattern, text)
            if match:
                if has_issue:
                    volume, issue, page_range = match.groups()
                else:
                    volume = match.group(1)
                    issue = "-"
                    page_range = match.group(2)
                page_range = normalize_page_range(page_range)
                break

    # Pattern 3: pp. X-Y (conference/book chapter)
    if page_range == "-":
        pp_match = re.search(r'pp\.\s*(' + pages + r')', text, re.I)
        if pp_match:
            page_range = normalize_page_range(pp_match.group(1))

    # ==================== First Author ====================
    if ',' in raw_ref:
        first_author = raw_ref.split(',')[0].strip()
    else:
        if year:
            if year_in_parentheses:
                # Locate "(2020)" or "(2020a)", matching what the year
                # extractor accepted.
                year_marker = re.search(rf'\({year}[a-z]?\)', raw_ref)
                year_pos = year_marker.start() if year_marker else -1
            else:
                year_pos = raw_ref.find(f". {year}.")

            if year_pos > 0:
                first_author = raw_ref[:year_pos].strip()
            else:
                first_author = raw_ref.split('.')[0].strip() if '.' in raw_ref else raw_ref.split()[0]
        else:
            first_author = raw_ref.split('.')[0].strip() if '.' in raw_ref else raw_ref.split()[0]

    # Remove a list marker and any year that leaked into the author string
    first_author = re.sub(r'^\[\d+\]\s*', '', first_author)
    first_author = re.sub(r'\(\d{4}[a-z]?\)', '', first_author).strip()
    first_author = re.sub(r'\.\s*\d{4}\.', '', first_author).strip()

    return {
        "raw_reference": raw_ref,
        "ref_title": title,
        "ref_first_author": first_author,
        "ref_year": year,
        "ref_journal": journal,
        "ref_volume": volume,
        "ref_issue": issue,
        "ref_page_range": page_range,
        "doi": doi
    }


# ================= OpenAlex API Functions =================

def query_openalex_by_doi(doi):
    """Look up a single work in OpenAlex by DOI.

    Returns the decoded JSON body when the response status is 200, and None
    when ``doi`` is falsy, the status is anything else, or the request raises
    a ``requests`` exception (which is reported via ``print``).
    """
    if not doi:
        return None
    try:
        lookup_url = f"{OPENALEX_BASE}/doi:{normalize_doi(doi)}"
        response = requests.get(lookup_url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return None
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"‚ö†Ô∏è OpenAlex DOI lookup failed: {doi} - {str(e)}")
        return None


def query_openalex_by_title(title, max_results=10):
    """Search OpenAlex works by title and return the candidate records.

    The query is built from the lowercased title with ``& : ? , ;`` replaced
    by spaces, whitespace collapsed, and only the first eight words kept.

    Returns the ``results`` list of the JSON response on status 200, and an
    empty list when the title is falsy or "Unknown", the status is anything
    else, or the request raises a ``requests`` exception (reported via
    ``print``).
    """
    if not title or title == "Unknown":
        return []
    try:
        query_text = re.sub(r'[&:?,;]', ' ', title.lower())
        query_text = re.sub(r'\s+', ' ', query_text).strip()
        t_short = " ".join(query_text.split()[:8])

        response = requests.get(
            OPENALEX_BASE,
            headers=HEADERS,
            params={"filter": f"title.search:{t_short}", "per-page": max_results},
            timeout=10,
        )
        if response.status_code != 200:
            return []
        return response.json().get("results", [])
    except requests.exceptions.RequestException as e:
        print(f"‚ö†Ô∏è OpenAlex title search failed: {str(e)}")
        return []


def extract_openalex_metadata(record):
    """Flatten an OpenAlex work record into the ``oa_*`` metadata dict.

    Nested fields that are present but null (JSON ``null`` decodes to None)
    are treated like missing ones. ``dict.get(key, default)`` only applies
    its default when the key is absent, so a null ``primary_location``,
    ``biblio``, ``author``, ``display_name`` or ``type`` would otherwise raise
    when dereferenced.

    Args:
        record: Decoded JSON dict of one OpenAlex work.

    Returns:
        A dict with the keys ``oa_full_author``, ``oa_first_author``,
        ``oa_title``, ``oa_year``, ``oa_journal``, ``oa_volume``,
        ``oa_issue``, ``oa_page_range``, ``openalex_id``, ``oa_doi``,
        ``is_retracted``, ``doc_type`` and ``data_source`` ("OpenAlex").
        ``oa_title`` and ``oa_year`` are passed through as stored, so they
        are None when the record holds a null value.
    """
    authorships = record.get("authorships") or []
    author_names = [
        ((entry or {}).get("author") or {}).get("display_name") or "Unknown"
        for entry in authorships
    ]
    full_author_list = ", ".join(author_names) or "Unknown"
    first_author = author_names[0] if author_names else "Unknown"

    biblio = record.get("biblio") or {}
    primary = record.get("primary_location") or {}

    title = record.get("title", "Unknown")
    year = record.get("publication_year", "Unknown")

    # Prefer the primary location's source name; fall back to the journal
    # name stored in biblio, if any.
    source_name = "Unknown"
    source = primary.get("source")
    if source:
        source_name = source.get("display_name") or "Unknown"
    elif biblio.get("journal_name"):
        source_name = biblio.get("journal_name")

    volume = biblio.get("volume") or "-"
    issue = biblio.get("issue") or "-"

    first_page = biblio.get("first_page")
    last_page = biblio.get("last_page")
    if first_page and last_page:
        # A single-page item is reported as one number rather than "n-n".
        page_range = f"{first_page}-{last_page}" if first_page != last_page else str(first_page)
    else:
        page_range = first_page or "-"

    page_range = normalize_page_range(page_range)

    # Reduce the stored DOI to the bare "10.xxxx/..." part when it has that
    # shape; otherwise keep the stored value unchanged.
    raw_oa_doi = record.get("doi")
    if raw_oa_doi:
        m = re.search(r'(10\.\d{4,9}/[^\s"\'<>]+)', raw_oa_doi, re.I)
        oa_doi_plain = m.group(1).rstrip('.,;)') if m else raw_oa_doi
    else:
        oa_doi_plain = None

    is_retracted = bool(record.get("is_retracted"))

    doc_type_raw = record.get("type") or "unknown"
    doc_type_map = {
        "article": "Journal Article",
        "book-chapter": "Book Chapter",
        "proceedings-article": "Conference Paper",
        "posted-content": "Preprint",
        "dataset": "Dataset",
        "book": "Book",
        "dissertation": "Dissertation",
        "unknown": "Unknown"
    }
    # Unmapped types are shown as title-cased words ("peer-review" -> "Peer Review").
    doc_type = doc_type_map.get(doc_type_raw, doc_type_raw.replace("-", " ").title())

    return {
        "oa_full_author": full_author_list,
        "oa_first_author": first_author,
        "oa_title": title,
        "oa_year": year,
        "oa_journal": source_name,
        "oa_volume": volume,
        "oa_issue": issue,
        "oa_page_range": page_range,
        "openalex_id": record.get("id", "Unknown"),
        "oa_doi": oa_doi_plain,
        "is_retracted": is_retracted,
        "doc_type": doc_type,
        "data_source": "OpenAlex"
    }


def query_crossref_by_doi(doi):
    """Look up a single work in Crossref by DOI.

    Returns the ``message`` member of the JSON body when the response status
    is 200, and None when ``doi`` is falsy, the status is anything else, or
    the request raises a ``requests`` exception (reported via ``print``).
    """
    if not doi:
        return None
    try:
        lookup_url = f"{CROSSREF_BASE}/{normalize_doi(doi)}"
        response = requests.get(lookup_url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            return None
        return response.json().get("message")
    except requests.exceptions.RequestException as e:
        print(f"‚ö†Ô∏è Crossref DOI lookup failed: {doi} - {str(e)}")
        return None


def query_crossref_by_title(title, max_results=5):
    """Search Crossref works by title and return the candidate records.

    Returns the ``message.items`` list of the JSON response on status 200,
    and an empty list when the title is falsy or "Unknown", the status is
    anything else, or the request raises a ``requests`` exception (reported
    via ``print``).
    """
    if not title or title == "Unknown":
        return []
    try:
        response = requests.get(
            CROSSREF_BASE,
            headers=HEADERS,
            params={"query.title": title, "rows": max_results},
            timeout=10,
        )
        if response.status_code != 200:
            return []
        return response.json().get("message", {}).get("items", [])
    except requests.exceptions.RequestException as e:
        print(f"‚ö†Ô∏è Crossref title search failed: {str(e)}")
        return []


def extract_crossref_metadata(record):
    """Flatten a Crossref work record into the ``oa_*`` metadata dict.

    The result uses the same keys as the OpenAlex extractor so both sources
    can be compared with the same code. ``openalex_id`` is the fixed text
    "N/A (Crossref)" and ``is_retracted`` is always False here.
    """
    def _full_name(person):
        # "Given Family", tolerating either part being absent.
        return f"{person.get('given', '')} {person.get('family', '')}".strip()

    authors = record.get("author", [])
    if authors:
        names = [_full_name(person) for person in authors]
        full_author_list = ", ".join(names)
        first_author = names[0]
    else:
        full_author_list = "Unknown"
        first_author = "Unknown"

    titles = record.get("title", [])
    title = titles[0] if titles else "Unknown"

    # The first truthy date field wins: print date, then online date, then
    # the record's creation date.
    date_info = None
    for date_field in ("published-print", "published-online", "created"):
        date_info = record.get(date_field)
        if date_info:
            break

    year = "Unknown"
    if date_info and "date-parts" in date_info:
        first_date = date_info["date-parts"][0]
        if first_date:
            year = first_date[0]

    containers = record.get("container-title", [])
    journal = containers[0] if containers else "Unknown"

    type_labels = {
        "journal-article": "Journal Article",
        "book-chapter": "Book Chapter",
        "proceedings-article": "Conference Paper",
        "posted-content": "Preprint",
        "dataset": "Dataset",
        "book": "Book",
        "dissertation": "Dissertation",
        "unknown": "Unknown"
    }
    raw_type = record.get("type", "unknown")
    if raw_type in type_labels:
        doc_type = type_labels[raw_type]
    else:
        # Unmapped types are shown as title-cased words.
        doc_type = raw_type.replace("-", " ").title()

    return {
        "oa_full_author": full_author_list,
        "oa_first_author": first_author,
        "oa_title": title,
        "oa_year": year,
        "oa_journal": journal,
        "oa_volume": record.get("volume", "-"),
        "oa_issue": record.get("issue", "-"),
        "oa_page_range": normalize_page_range(record.get("page", "-")),
        "openalex_id": "N/A (Crossref)",
        "oa_doi": record.get("DOI", None),
        "is_retracted": False,
        "doc_type": doc_type,
        "data_source": "Crossref"
    }


def compare_metadata(parsed_ref, oa_meta):
    """Compare a parsed reference with database metadata, field by field.

    Args:
        parsed_ref: Dict with the ``ref_*`` keys produced by parse_reference.
        oa_meta: Dict with the ``oa_*`` keys produced by one of the
            metadata extractors.

    Returns:
        A dict holding, for title, full author, year, journal, volume, issue
        and page range, the database value (``oa_<field>``) and a mismatch
        flag (``oa_<field>_diff``). Titles are compared after
        standardize_title, authors by first-author surname, and the other
        fields as strings. ``oa_year_diff`` is False for equal years,
        "minor" for a gap of 1-2 years and True otherwise;
        ``oa_year_delta`` is the absolute gap, or None when either year is
        not a usable integer.
    """
    diff = {}

    ref_title_std = standardize_title(parsed_ref['ref_title'])
    oa_title_std = standardize_title(oa_meta['oa_title'])
    diff['oa_title'] = oa_meta['oa_title']
    diff['oa_title_diff'] = ref_title_std != oa_title_std

    ref_surname = extract_surname(parsed_ref['ref_first_author'])
    oa_surname = extract_surname(oa_meta['oa_first_author'])
    diff['oa_full_author'] = oa_meta['oa_full_author']
    diff['oa_full_author_diff'] = ref_surname != oa_surname

    ref_year = parsed_ref['ref_year']
    oa_year = oa_meta['oa_year']
    diff['oa_year'] = oa_year
    # The database year can be the placeholder string "Unknown" (or None);
    # subtracting that from an int would raise TypeError, so only compute a
    # delta for two real integer years.
    years_comparable = all(
        isinstance(value, int) and value for value in (ref_year, oa_year)
    )
    if years_comparable:
        delta = abs(ref_year - oa_year)
        diff['oa_year_delta'] = delta
        if delta == 0:
            diff['oa_year_diff'] = False
        elif delta <= 2:
            diff['oa_year_diff'] = "minor"
        else:
            diff['oa_year_diff'] = True
    else:
        diff['oa_year_diff'] = True
        diff['oa_year_delta'] = None

    for key in ['journal', 'volume', 'issue', 'page_range']:
        ref_val = str(parsed_ref[f'ref_{key}'])
        oa_val = str(oa_meta[f'oa_{key}'])

        if key == 'page_range':
            # Compare page ranges in normalized form so dash style and
            # spacing do not count as differences.
            ref_val = normalize_page_range(ref_val)
            oa_val = normalize_page_range(oa_val)

        diff[f'oa_{key}'] = oa_val
        diff[f'oa_{key}_diff'] = ref_val != oa_val

    return diff


def verify_status(parsed_ref, oa_meta):
    """Score how well a database record matches a parsed reference.

    Points are awarded as follows:
      * title similarity (``fuzz.token_sort_ratio`` on standardized titles)
        of at least 90 gives 2 points, at least 80 gives 1 point;
      * identical, non-empty first-author surnames give 1 point;
      * publication years no more than 2 apart give 1 point.

    Args:
        parsed_ref: Dict with the ``ref_*`` keys produced by parse_reference.
        oa_meta: Dict with the ``oa_*`` keys produced by one of the
            metadata extractors.

    Returns:
        ``(status, confidence)``: ("verified", "high") for a score of 4,
        ("ambiguous", "medium") for 2-3, otherwise ("unverified", "low").
    """
    score = 0

    ref_title = parsed_ref['ref_title']
    oa_title = oa_meta['oa_title']
    # "Unknown" is the placeholder for a title that could not be determined.
    # Two placeholders must not count as a perfect title match, so they are
    # scored as no match at all.
    if ref_title and oa_title and "Unknown" not in (ref_title, oa_title):
        title_score = fuzz.token_sort_ratio(
            standardize_title(ref_title),
            standardize_title(oa_title)
        )
    else:
        title_score = 0

    if title_score >= 90:
        score += 2
    elif title_score >= 80:
        score += 1

    ref_surname = extract_surname(parsed_ref['ref_first_author'])
    oa_surname = extract_surname(oa_meta['oa_first_author'])
    if ref_surname and oa_surname and ref_surname == oa_surname:
        score += 1

    ref_year = parsed_ref['ref_year']
    oa_year = oa_meta['oa_year']
    # The database year can be the placeholder string "Unknown"; only
    # subtract when both sides are integers to avoid a TypeError.
    if isinstance(ref_year, int) and isinstance(oa_year, int) and abs(ref_year - oa_year) <= 2:
        score += 1

    if score >= 4:
        return "verified", "high"
    elif score >= 2:
        return "ambiguous", "medium"
    else:
        return "unverified", "low"


def process_references(raw_references):
    """Verify each reference against OpenAlex, falling back to Crossref.

    For every reference the function:
      1. parses it with parse_reference;
      2. resolves its DOI (OpenAlex first, then Crossref), if it has one;
      3. searches by title (OpenAlex first, then Crossref) when no DOI record
         was found or the DOI record's title similarity is below
         TITLE_MISMATCH_THRESHOLD, accepting the best candidate only at
         TITLE_THRESHOLD or above;
      4. compares and scores the chosen record and decides which DOI to
         report.

    Progress is printed as it goes, and the function sleeps for
    REQUEST_DELAY seconds after each lookup step.

    Args:
        raw_references: Sequence of reference strings.

    Returns:
        A list with one dict per reference, merging the parsed ``ref_*``
        fields, the ``oa_*`` metadata, the comparison flags, and the keys
        ``filled_doi``, ``doi_fill_status``, ``status`` and ``confidence``.
    """
    results = []
    total = len(raw_references)

    for idx, raw in enumerate(raw_references):
        print(f"\n{'='*80}")
        print(f"[{idx + 1}/{total}] Processing:")
        print(f"{'='*80}")
        print(f"{raw}")
        print(f"{'-'*80}")

        parsed = parse_reference(raw)

        print(f"  üìù Parsed Information:")
        print(f"     Title: {parsed['ref_title']}")
        print(f"     Author: {parsed['ref_first_author']} (surname: {extract_surname(parsed['ref_first_author'])})")
        print(f"     Year: {parsed['ref_year']}")
        print(f"     Journal/Source: {parsed['ref_journal']}")
        print(f"     Volume: {parsed['ref_volume']}, Issue: {parsed['ref_issue']}, Pages: {parsed['ref_page_range']}")
        print(f"     DOI: {parsed['doi'] if parsed['doi'] else 'None'}")

        # ---- Step 1: resolve the DOI (OpenAlex, then Crossref) ----
        oa_record_from_doi = query_openalex_by_doi(parsed['doi'])
        doi_lookup_success = bool(oa_record_from_doi)
        # data_source always names the database that `oa_record` came from,
        # because it selects the metadata extractor further down.
        data_source = None

        if oa_record_from_doi:
            print(f"  ‚úì Found match via OpenAlex DOI")
            data_source = "OpenAlex"
        elif parsed['doi']:
            print(f"  ‚úó DOI not found in OpenAlex")
            print(f"  üîÑ Trying Crossref DOI...")
            crossref_record = query_crossref_by_doi(parsed['doi'])
            if crossref_record:
                print(f"  ‚úì Found match via Crossref DOI")
                oa_record_from_doi = crossref_record
                doi_lookup_success = True
                data_source = "Crossref"
            else:
                print(f"  ‚úó DOI not found in Crossref either")

        time.sleep(REQUEST_DELAY)

        title_similarity_score = 0
        if oa_record_from_doi:
            # OpenAlex stores the title as a string, Crossref as a list.
            if data_source == "OpenAlex":
                oa_title_from_doi = oa_record_from_doi.get("title", "")
            else:
                title_list = oa_record_from_doi.get("title", [])
                oa_title_from_doi = title_list[0] if title_list else ""

            title_similarity_score = fuzz.token_sort_ratio(
                standardize_title(parsed["ref_title"]),
                standardize_title(oa_title_from_doi)
            )
            print(f"  üìä Title similarity ({data_source} record): {title_similarity_score}%")

        oa_record = oa_record_from_doi
        matched_by_title = False

        # ---- Step 2: title search when the DOI gave nothing trustworthy ----
        if not oa_record or title_similarity_score < TITLE_MISMATCH_THRESHOLD:
            if oa_record:
                # Reached only with a below-threshold similarity, including 0.
                print(f"  ‚ö†Ô∏è DOI record exists but title mismatch (similarity: {title_similarity_score}%)")

            if parsed["ref_title"] == "Unknown":
                print(f"  ‚ùå Cannot search by title: Title parsing failed")
            else:
                print(f"  üîç Searching OpenAlex by title: '{parsed['ref_title'][:60]}...'")
                candidates = query_openalex_by_title(parsed["ref_title"])
                # Track the origin of the candidates separately. data_source
                # must keep describing `oa_record`, which stays the DOI record
                # unless a candidate is accepted below.
                candidate_source = "OpenAlex"
                time.sleep(REQUEST_DELAY)

                if not candidates:
                    print(f"  ‚úó No results from OpenAlex title search")
                    print(f"  üîÑ Trying Crossref title search...")
                    candidates = query_crossref_by_title(parsed["ref_title"])
                    candidate_source = "Crossref"
                    time.sleep(REQUEST_DELAY)
                    if not candidates:
                        print(f"  ‚úó No results from Crossref title search either")

                if candidates:
                    best_score = 0
                    best_record = None
                    print(f"  üìã Found {len(candidates)} candidates in {candidate_source}, comparing titles...")

                    for c in candidates:
                        if candidate_source == "OpenAlex":
                            c_title = c.get("title", "")
                        else:
                            c_title_list = c.get("title", [])
                            c_title = c_title_list[0] if c_title_list else ""

                        score = fuzz.token_sort_ratio(
                            standardize_title(parsed["ref_title"]),
                            standardize_title(c_title)
                        )
                        if score > best_score:
                            best_score = score
                            best_record = c

                    if best_score >= TITLE_THRESHOLD and best_record:
                        oa_record = best_record
                        data_source = candidate_source
                        matched_by_title = True
                        print(f"  ‚úì Found match by title in {data_source} (similarity: {best_score}%)")
                    else:
                        print(f"  ‚úó No strong title match (best similarity: {best_score}%)")

        # ---- Step 3: compare, score and decide which DOI to report ----
        if oa_record:
            if data_source == "OpenAlex":
                oa_meta = extract_openalex_metadata(oa_record)
            else:
                oa_meta = extract_crossref_metadata(oa_record)

            print(f"  üìÑ Document Type (from {data_source}): {oa_meta['doc_type']}")

            meta_diff = compare_metadata(parsed, oa_meta)
            status, confidence = verify_status(parsed, oa_meta)

            original_doi = parsed.get("doi")
            oa_doi = oa_meta.get("oa_doi")

            original_doi_norm = normalize_doi(original_doi)
            oa_doi_norm = normalize_doi(oa_doi)

            final_title_similarity = fuzz.token_sort_ratio(
                standardize_title(parsed["ref_title"]),
                standardize_title(oa_meta["oa_title"])
            )

            if original_doi:
                if doi_lookup_success and final_title_similarity < TITLE_MISMATCH_THRESHOLD:
                    # The DOI resolves, but to a record with a different title.
                    filled_doi = None
                    doi_fill_status = "doi_title_mismatch"
                    print(f"  ‚ùå DOI-Title mismatch (similarity: {final_title_similarity}%)")
                elif oa_doi_norm and original_doi_norm == oa_doi_norm and final_title_similarity >= TITLE_MISMATCH_THRESHOLD:
                    filled_doi = original_doi
                    doi_fill_status = "original_correct"
                    print(f"  ‚úì DOI verified: {original_doi}")
                elif matched_by_title and oa_doi:
                    filled_doi = oa_doi
                    doi_fill_status = "title_matched_doi_corrected"
                    print(f"  üîß Matched by title, DOI corrected: {original_doi} ‚Üí {oa_doi}")
                elif oa_doi and original_doi_norm != oa_doi_norm:
                    filled_doi = oa_doi
                    doi_fill_status = "original_wrong_corrected"
                    print(f"  ‚ö†Ô∏è DOI mismatch! Original: {original_doi} ‚Üí Corrected: {oa_doi}")
                else:
                    filled_doi = original_doi
                    doi_fill_status = "original_unverified"
            elif oa_doi:
                filled_doi = oa_doi
                doi_fill_status = "filled_from_database"
                print(f"  ‚ûï DOI added from {data_source}: {oa_doi}")
            else:
                filled_doi = None
                doi_fill_status = "missing"

            is_retracted = oa_meta.get('is_retracted', False)
        else:
            # No record at all: fill the database columns with placeholders.
            oa_meta = {
                k: "Unknown" for k in [
                    "oa_title", "oa_first_author", "oa_year", "oa_journal",
                    "oa_volume", "oa_issue", "oa_page_range", "openalex_id", "oa_doi"
                ]
            }
            oa_meta['is_retracted'] = False
            oa_meta['data_source'] = "None"
            oa_meta['doc_type'] = "Unknown"
            meta_diff = {
                f"{k}_diff": False for k in [
                    "oa_title", "oa_first_author", "oa_year", "oa_journal",
                    "oa_volume", "oa_issue", "oa_page_range", "openalex_id", "oa_doi"
                ]
            }
            meta_diff['oa_year_diff'] = True
            meta_diff['oa_year_delta'] = None
            status, confidence = "unverified", "unverified"
            is_retracted = False

            original_doi = parsed.get("doi")
            if original_doi:
                filled_doi = None
                doi_fill_status = "unverified"
                print(f"  ‚ùå DOI unverified (not found in OpenAlex or Crossref)")
            else:
                filled_doi = None
                doi_fill_status = "missing"
                print(f"  ‚ùì No DOI provided and no match found")

        result = {
            **parsed,
            **oa_meta,
            **meta_diff,
            "filled_doi": filled_doi,
            "doi_fill_status": doi_fill_status,
            "status": status,
            "confidence": confidence
        }
        results.append(result)

    return results


def generate_html_table(df):
    """Render the verification results as a colour-coded, scrollable HTML table.

    Column names and cell values are HTML-escaped before being written into
    the markup, so text such as ``<`` or ``&`` inside a title is shown
    literally instead of being interpreted as HTML.

    Args:
        df: DataFrame with one row per reference. Column labels must be
            strings. Each row must provide ``status``, plus whichever of
            ``doc_type``, ``data_source``, ``is_retracted``,
            ``doi_fill_status`` and ``oa_year_diff`` are needed to style the
            columns that are present.

    Returns:
        str: A ``<style>`` block followed by a ``<div class="container">``
        that wraps the table.
    """
    # Function-scope import so the block does not rely on a file-level import.
    from html import escape as _escape

    doc_type_styles = {
        "Journal Article": 'color: #1976D2; font-weight: bold',
        "Conference Paper": 'color: #FF6F00; font-weight: bold',
        "Book Chapter": 'color: #7B1FA2; font-weight: bold',
        "Preprint": 'color: #00897B; font-weight: bold',
    }
    data_source_styles = {
        "OpenAlex": 'color: #1976D2; font-weight: bold',
        "Crossref": 'color: #FF6F00; font-weight: bold',
        "None": 'color: gray',
    }
    doi_status_styles = {
        "original_correct": 'color: green; font-weight: bold',
        "filled_from_database": 'color: blue; font-weight: bold',
        "title_matched_doi_corrected": 'color: #1976D2; font-weight: bold',
        "original_wrong_corrected": 'background-color: #FFA726; color: white; font-weight: bold',
        "doi_title_mismatch": 'background-color: #E91E63; color: white; font-weight: bold',
        "unverified": 'color: red; font-weight: bold',
        "missing": 'color: gray',
    }
    filled_doi_styles = {
        "filled_from_database": 'color: blue; font-weight: bold',
        "title_matched_doi_corrected": 'color: #1976D2; font-weight: bold',
        "original_wrong_corrected": 'color: orange; font-weight: bold',
        "unverified": 'color: red; font-weight: bold',
        "doi_title_mismatch": 'color: red; font-weight: bold',
    }
    diff_columns = ["oa_title", "oa_full_author", "oa_journal", "oa_volume", "oa_issue", "oa_page_range"]

    def get_cell_style(row, col):
        """Return the inline CSS for one cell ("" when no rule applies)."""
        # Columns with their own dedicated colour scheme.
        if col == "doc_type":
            return doc_type_styles.get(str(row['doc_type']), "")
        if col == "data_source":
            return data_source_styles.get(str(row['data_source']), "")
        if col == "is_retracted":
            if row['is_retracted'] == True:
                return 'background-color: #D32F2F; color: white; font-weight: bold'
            return 'color: green'
        if col == "doi_fill_status":
            return doi_status_styles.get(str(row['doi_fill_status']), "")
        if col == "filled_doi":
            return filled_doi_styles.get(str(row['doi_fill_status']), "")
        if col == "doi":
            if row['doi_fill_status'] in ["unverified", "doi_title_mismatch"]:
                return 'color: red; font-weight: bold'
            return ""
        if col == "oa_year":
            # oa_year_diff is tri-state: False (match), "minor", or True.
            # Keep ``==`` comparisons so NumPy booleans are handled as well.
            year_diff = row['oa_year_diff']
            if year_diff == False:
                return 'color: green'
            if year_diff == "minor":
                return 'background-color: yellow'
            if year_diff == True:
                return 'color: red'
            return ""

        # Every remaining "oa*" column follows the overall verification status.
        # This takes precedence over the per-field diff flags below, which
        # therefore only matter for other statuses (e.g. "ambiguous").
        if col.startswith('oa'):
            if row['status'] == 'verified':
                return 'color: green'
            if row['status'] == 'unverified':
                return 'color: red'

        if col in diff_columns and row.get(f"{col}_diff", False):
            return 'color: red'

        return ""

    # Collect fragments and join once instead of growing a string with "+=".
    parts = ["""
    <style>
        table { border-collapse: collapse; width: 100%; font-size: 11px; }
        th { background-color: #1976D2; color: white; padding: 8px; text-align: left; position: sticky; top: 0; z-index: 10; }
        td { padding: 6px; border: 1px solid #ddd; word-wrap: break-word; max-width: 300px; }
        tr:nth-child(even) { background-color: #f9f9f9; }
        tr:hover { background-color: #f5f5f5; }
        .container { max-height: 600px; overflow: auto; }
    </style>
    <div class="container">
    <table>
        <thead><tr>
    """]

    parts.extend(f"<th>{_escape(str(col))}</th>" for col in df.columns)
    parts.append("</tr></thead><tbody>")

    for _, row in df.iterrows():
        parts.append("<tr>")
        for col in df.columns:
            style = get_cell_style(row, col)
            value = row[col]
            # pd.isna() returns an array for list-like cells, which cannot be
            # used as a truth value; only scalars can be "missing".
            if pd.api.types.is_scalar(value) and pd.isna(value):
                text = "-"
            else:
                text = _escape(f"{value}")
            parts.append(f'<td style="{style}">{text}</td>')
        parts.append("</tr>")

    parts.append("</tbody></table></div>")
    return "".join(parts)


# Startup banner: tool name, usage steps, supported citation styles and the
# reminder to set a real contact e-mail in HEADERS. Emitted with a single
# print call; every entry ends up on its own output line.
print(
    "=" * 80,
    "üìö LLM-Generated Reference Verification Tool - Interactive Version",
    "   ",
    "   Usage:",
    "   1. Run this cell",
    "   2. Paste references into the text box",
    "   3. Click Run Verification button",
    "   4. View results with color highlighting",
    "   ",
    "   Supports: APA, Harvard, Chicago, IEEE, ACM, MLA",
    "=" * 80,
    "\n‚ö†Ô∏è  IMPORTANT: Replace email in HEADERS!",
    "   ",
    "\nüìÑ  Paste references below (one per line)\n",
    sep="\n",
)


# Multi-line input box where the user pastes one reference per line.
text_input = widgets.Textarea(
    value='',
    placeholder=(
        'Paste references...\n\n'
        'Example:\n'
        'Cortes, C., & Vapnik, V. (1995). Support-vector Networks. Machine Learning, 20(3), 273‚Äì297. \n'
        'Hinton, G. E., Osindero, S., & Teh, Y. (2006). A fast learning algorithm for deep belief nets. '
        'Neural Computation, 18(7), 1527‚Äì1554. https://doi.org/10.1162/neco.2006.18.7.1527'
    ),
    description='References:',
    layout=widgets.Layout(width='95%', height='300px'),
    style=dict(description_width='initial'),
)

# Button that starts the verification run.
run_button = widgets.Button(
    description='üöÄ Run Verification',
    button_style='success',
    layout=widgets.Layout(width='200px', height='40px'),
)

# Output area that captures everything the click handler prints or displays.
output = widgets.Output()

def on_button_click(b):
    """Verify the pasted references and show statistics plus the result table.

    Used as the click handler of ``run_button``. Everything printed or
    displayed here is captured by the ``output`` widget, which is cleared at
    the start of each run. The final DataFrame is published as the
    module-level name ``df`` so that the export command printed at the end
    (``df.to_csv(...)``) can be run from a later notebook cell.

    Args:
        b: The clicked button, as passed by the ``on_click`` callback (unused).
    """
    # The closing hint tells the user to call ``df.to_csv(...)``; without this
    # declaration the frame would stay local to the callback and be unreachable.
    global df

    with output:
        output.clear_output()
        content = text_input.value.strip()
        if not content:
            print("‚ùå Please enter at least one reference.")
            return

        # One reference per non-blank line.
        references = [line.strip() for line in content.split('\n') if line.strip()]
        print(f"\n‚úì Loaded {len(references)} references\n")

        results_df = pd.DataFrame(process_references(references))

        status_icon_map = {"verified": "‚úÖ", "ambiguous": "‚ö†Ô∏è", "unverified": "‚ùå"}
        results_df['status_icon'] = results_df['status'].map(status_icon_map)

        # Column order: summary columns, then DOI columns, then everything
        # else except the two diff flags that are not shown.
        front_cols = ["status", "confidence", "status_icon", "is_retracted", "doc_type", "data_source"]
        doi_cols = ["doi", "filled_doi", "doi_fill_status"]
        hidden_cols = ["oa_first_author_diff", "openalex_id_diff"]
        excluded = front_cols + doi_cols + hidden_cols
        other_cols = [c for c in results_df.columns if c not in excluded]
        results_df = results_df[front_cols + doi_cols + other_cols]

        # Publish before rendering so the data is exportable even if the
        # HTML rendering step fails.
        df = results_df

        def count_where(column, value):
            """Number of rows whose ``column`` equals ``value``."""
            return int((results_df[column] == value).sum())

        total = len(results_df)

        print("\n" + "="*80)
        print("üìä Verification Statistics")
        print("="*80)

        verified_count = count_where('status', 'verified')
        ambiguous_count = count_where('status', 'ambiguous')
        unverified_count = count_where('status', 'unverified')
        retracted_count = count_where('is_retracted', True)

        journal_count = count_where('doc_type', 'Journal Article')
        conf_count = count_where('doc_type', 'Conference Paper')
        book_count = count_where('doc_type', 'Book Chapter')
        # Every document type not listed above is reported as "Other".
        other_doc_count = total - journal_count - conf_count - book_count

        openalex_count = count_where('data_source', 'OpenAlex')
        crossref_count = count_where('data_source', 'Crossref')
        none_count = count_where('data_source', 'None')

        # ``total`` is at least 1 here: empty input returned early above.
        print(f"Total references: {total}")
        print(f"‚úÖ Verified: {verified_count} ({verified_count/total*100:.1f}%)")
        print(f"‚ö†Ô∏è  Ambiguous: {ambiguous_count} ({ambiguous_count/total*100:.1f}%)")
        print(f"‚ùå Unverified: {unverified_count} ({unverified_count/total*100:.1f}%)")
        print(f"üö® Retracted: {retracted_count}")

        print(f"\nüìÑ Document Types (from database):")
        print(f"  üìò Journal Articles: {journal_count}")
        print(f"  üìô Conference Papers: {conf_count}")
        print(f"  üìï Book Chapters: {book_count}")
        if other_doc_count > 0:
            print(f"  üìó Other: {other_doc_count}")

        print(f"\nüóÑÔ∏è Data Sources:")
        print(f"  üìò OpenAlex: {openalex_count}")
        print(f"  üìô Crossref: {crossref_count}")
        print(f"  ‚ùå Not found: {none_count}")

        doi_correct = count_where('doi_fill_status', 'original_correct')
        doi_filled = count_where('doi_fill_status', 'filled_from_database')
        doi_corrected = count_where('doi_fill_status', 'original_wrong_corrected')
        doi_title_corrected = count_where('doi_fill_status', 'title_matched_doi_corrected')
        doi_title_mismatch = count_where('doi_fill_status', 'doi_title_mismatch')
        doi_unverified = count_where('doi_fill_status', 'unverified')
        doi_missing = count_where('doi_fill_status', 'missing')

        print(f"\nüìã DOI Status:")
        print(f"  ‚úì Original correct: {doi_correct}")
        print(f"  ‚ûï Filled from database: {doi_filled}")
        print(f"  üîß Corrected (wrong original): {doi_corrected}")
        print(f"  üîÑ Title matched, DOI corrected: {doi_title_corrected}")
        print(f"  ‚ö†Ô∏è  DOI-Title mismatch: {doi_title_mismatch}")
        print(f"  ‚ùå Unverified: {doi_unverified}")
        print(f"  ‚ùì Missing: {doi_missing}")

        print("="*80)

        print("\nüìã Detailed Results (with color highlighting):\n")
        html_table = generate_html_table(results_df)
        display(HTML(html_table))

        print("\n‚úÖ Processing complete!")
        print("\nüíæ To export results, use: df.to_csv('verification_results.csv', index=False)")

# Hook the handler up to the button, then render the input box, the button
# and the output area in that order.
run_button.on_click(on_button_click)

display(text_input, run_button, output)
