In [3]:
import os
import re
import requests
import fitz  # PyMuPDF
import pandas as pd
from IPython.display import Markdown, display
from datetime import datetime

OPENALEX_URL = "https://api.openalex.org/works"
SEMANTIC_SCHOLAR_URL = "https://api.semanticscholar.org/graph/v1/paper/"

def search_openalex(query, n=10, sort_by="cited_by_count"):
    """Search OpenAlex and return sorted results by a field (default: citations)."""
    params = {
        "search": query,
        "per_page": n,
        "sort": f"{sort_by}:desc"  # descending order
    }
    r = requests.get(OPENALEX_URL, params=params, timeout=30)
    r.raise_for_status()
    results = r.json().get("results", [])
    return results

def get_pdf_link(paper):
    arxiv_id = paper.get("ids", {}).get("arxiv")
    if arxiv_id:
        return f"https://arxiv.org/pdf/{arxiv_id.split('/')[-1]}.pdf"
    return paper.get("primary_location", {}).get("landing_page_url")

def download_pdf(url, filename):
    try:
        r = requests.get(url, stream=True, timeout=60)
        r.raise_for_status()
        content_type = r.headers.get('Content-Type', '')
        if 'pdf' not in content_type.lower():
            return None
        with open(filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        return filename
    except Exception:
        return None

def extract_full_text_from_pdf(path):
    try:
        with fitz.open(path) as doc:
            return "".join([page.get_text("text") + "\n" for page in doc])
    except Exception:
        return None

def extract_sections_from_pdf(path):
    text = extract_full_text_from_pdf(path)
    if not text:
        return {}
    sections = {}
    for sec in ["abstract", "introduction", "methods", "materials and methods"]:
        pattern = re.compile(
            rf"(?i)\b{sec}\b[\s:]*([\s\S]*?)(?=\n[A-Z][^\n]{{0,60}}\n|$)"
        )
        match = pattern.search(text)
        if match:
            sections[sec.lower()] = match.group(1).strip()
    return sections

def get_semantic_scholar_abstract(doi_or_title):
    try:
        if doi_or_title.startswith("10."):
            url = SEMANTIC_SCHOLAR_URL + f"DOI:{doi_or_title}?fields=title,abstract"
        else:
            search_url = "https://api.semanticscholar.org/graph/v1/paper/search"
            res = requests.get(search_url, params={"query": doi_or_title, "limit": 1})
            data = res.json().get("data", [])
            if not data:
                return None
            paper_id = data[0]["paperId"]
            url = SEMANTIC_SCHOLAR_URL + f"{paper_id}?fields=title,abstract"
        res = requests.get(url, timeout=15)
        if res.status_code == 200:
            return res.json().get("abstract")
    except Exception:
        pass
    return None

def find_and_extract(query, n=3, mode="sections", print_output=True):
    papers = search_openalex(query, n=n*4, sort_by="cited_by_count")
    results = []
    success_count = 0
    i = 0
    paper_index = 1

    current_year = datetime.now().year

    while success_count < n and i < len(papers):
        paper = papers[i]
        i += 1

        title = paper["display_name"]
        doi = paper.get("doi", "")
        citations = paper.get("cited_by_count", 0)
        pub_date = paper.get("publication_date", "N/A")
        publication_year = pub_date.split("-")[0] if pub_date != "N/A" else "N/A"

        authorships = paper.get("authorships", [])
        first_author_inst = authorships[0]["institutions"][0]["display_name"] if authorships and authorships[0]["institutions"] else "N/A"
        last_author_inst = authorships[-1]["institutions"][0]["display_name"] if authorships and authorships[-1]["institutions"] else "N/A"
        pdf_url = get_pdf_link(paper)

        # Extract last 4 years of citations
        counts_by_year = {int(c["year"]): c["cited_by_count"] for c in paper.get("counts_by_year", [])}
        last_4_years_citations = {str(year): counts_by_year.get(year, 0) for year in range(current_year-3, current_year+1)}

        text_data = {"abstract": None, "introduction": None, "methods": None, "full_text": None}
        extracted_something = False

        if mode != "notext" and pdf_url:
            filename = f"paper_{paper_index}.pdf"
            downloaded = download_pdf(pdf_url, filename)
            if downloaded and os.path.exists(filename):
                if mode == "full":
                    text_data["full_text"] = extract_full_text_from_pdf(filename)
                    extracted_something = True
                elif mode == "sections":
                    sections = extract_sections_from_pdf(filename)
                    if sections:
                        text_data.update({k: sections.get(k) for k in ["abstract", "introduction", "methods"]})
                        extracted_something = True
                try:
                    os.remove(filename)
                except Exception:
                    pass

        if not extracted_something and mode != "notext":
            abstract = get_semantic_scholar_abstract(doi or title)
            if abstract:
                text_data["abstract"] = abstract
                extracted_something = True

        result = {
            "title": title,
            "doi": doi,
            "citations_total": citations,
            "publication_date": pub_date,
            "publication_year": publication_year,
            "first_author_institution": first_author_inst,
            "last_author_institution": last_author_inst,
            "pdf_url": pdf_url,
            **last_4_years_citations,  # Add citations per year
            **text_data
        }

        results.append(result)
        success_count += 1
        paper_index += 1

        if print_output:
            print(f"[{paper_index-1}] {title} | Total Citations: {citations} | DOI: {doi or 'N/A'} | Year: {publication_year}")
            print("-" * 100)

    df = pd.DataFrame(results)
    return df

def render_md_dataframe(df):
    # Convert all values to string
    str_df = df.astype(str)
    
    # Get the max width of each column
    col_widths = [max(len(str_df[col][i]) for i in range(len(df))) for col in df.columns]
    col_widths = [max(len(col), w) for col, w in zip(df.columns, col_widths)]
    
    # Build header
    header = "| " + " | ".join(col.ljust(col_widths[i]) for i, col in enumerate(df.columns)) + " |"
    separator = "| " + " | ".join("-" * col_widths[i] for i in range(len(df.columns))) + " |"
    
    # Build rows
    rows = []
    for i in range(len(df)):
        row = "| " + " | ".join(str_df.iloc[i, j].ljust(col_widths[j]) for j in range(len(df.columns))) + " |"
        rows.append(row)
    
    # Combine everything
    md_table = "\n".join([header, separator] + rows)
    display(Markdown(md_table))


In [4]:
if __name__ == "__main__":
    query = input("Enter your topic or paper title: ")
    mode = input("Enter mode ('full' for full text, 'sections' for abstract+intro+methods, 'notext' for metadata only): ").strip().lower()
    if mode not in ["full", "sections", "notext"]:
        mode = "sections"
    df = find_and_extract(query, n=10, mode=mode)
   
    render_md_dataframe(df)

[1] Complex brain networks: graph theoretical analysis of structural and functional systems | Total Citations: 11369 | DOI: https://doi.org/10.1038/nrn2575 | Year: 2009
----------------------------------------------------------------------------------------------------
[2] FSL | Total Citations: 10549 | DOI: https://doi.org/10.1016/j.neuroimage.2011.09.015 | Year: 2011
----------------------------------------------------------------------------------------------------
[3] The WU-Minn Human Connectome Project: An overview | Total Citations: 5663 | DOI: https://doi.org/10.1016/j.neuroimage.2013.05.041 | Year: 2013
----------------------------------------------------------------------------------------------------
[4] The minimal preprocessing pipelines for the Human Connectome Project | Total Citations: 5357 | DOI: https://doi.org/10.1016/j.neuroimage.2013.04.127 | Year: 2013
----------------------------------------------------------------------------------------------------
[5] A multi-

| title                                                                                      | doi                                              | citations_total | publication_date | publication_year | first_author_institution           | last_author_institution                      | pdf_url                                          | 2022 | 2023 | 2024 | 2025 | abstract | introduction | methods | full_text |
| ------------------------------------------------------------------------------------------ | ------------------------------------------------ | --------------- | ---------------- | ---------------- | ---------------------------------- | -------------------------------------------- | ------------------------------------------------ | ---- | ---- | ---- | ---- | -------- | ------------ | ------- | --------- |
| Complex brain networks: graph theoretical analysis of structural and functional systems    | https://doi.org/10.1038/nrn2575                  | 11369           | 2009-02-04       | 2009             | Addenbrooke's Hospital             | Indiana University Bloomington               | https://doi.org/10.1038/nrn2575                  | 836  | 849  | 751  | 451  | None     | None         | None    | None      |
| FSL                                                                                        | https://doi.org/10.1016/j.neuroimage.2011.09.015 | 10549           | 2011-09-16       | 2011             | University of Oxford               | University of Oxford                         | https://doi.org/10.1016/j.neuroimage.2011.09.015 | 1153 | 1220 | 1228 | 838  | None     | None         | None    | None      |
| The WU-Minn Human Connectome Project: An overview                                          | https://doi.org/10.1016/j.neuroimage.2013.05.041 | 5663            | 2013-05-16       | 2013             | Washington University in St. Louis | University of Minnesota                      | https://doi.org/10.1016/j.neuroimage.2013.05.041 | 696  | 681  | 727  | 411  | None     | None         | None    | None      |
| The minimal preprocessing pipelines for the Human Connectome Project                       | https://doi.org/10.1016/j.neuroimage.2013.04.127 | 5357            | 2013-05-10       | 2013             | Washington University in St. Louis | Wellcome Centre for Integrative Neuroimaging | https://doi.org/10.1016/j.neuroimage.2013.04.127 | 699  | 707  | 756  | 402  | None     | None         | None    | None      |
| A multi-modal parcellation of human cerebral cortex                                        | https://doi.org/10.1038/nature18933              | 4927            | 2016-07-19       | 2016             | Washington University in St. Louis | Washington University in St. Louis           | https://doi.org/10.1038/nature18933              | 621  | 701  | 719  | 396  | None     | None         | None    | None      |
| Mapping the Structural Core of Human Cerebral Cortex                                       | https://doi.org/10.1371/journal.pbio.0060159     | 4230            | 2008-06-24       | 2008             | University of Lausanne             | Indiana University Bloomington               | https://doi.org/10.1371/journal.pbio.0060159     | 247  | 211  | 192  | 113  | None     | None         | None    | None      |
| BrainNet Viewer: A Network Visualization Tool for Human Brain Connectomics                 | https://doi.org/10.1371/journal.pone.0068910     | 3908            | 2013-07-04       | 2013             | Beijing Normal University          | Beijing Normal University                    | https://doi.org/10.1371/journal.pone.0068910     | 463  | 442  | 407  | 262  | None     | None         | None    | None      |
| Cluster failure: Why fMRI inferences for spatial extent have inflated false-positive rates | https://doi.org/10.1073/pnas.1602413113          | 3442            | 2016-06-28       | 2016             | Linköping University               | Linköping University                         | https://doi.org/10.1073/pnas.1602413113          | 269  | 250  | 190  | 120  | None     | None         | None    | None      |
| DPARSF: a MATLAB toolbox for “pipeline” data analysis of resting-state fMRI                | https://doi.org/10.3389/fnsys.2010.00013         | 3408            | 2010-01-01       | 2010             | Beijing Normal University          | Beijing Normal University                    | https://doi.org/10.3389/fnsys.2010.00013         | 335  | 271  | 247  | 159  | None     | None         | None    | None      |
| fMRIPrep: a robust preprocessing pipeline for functional MRI                               | https://doi.org/10.1038/s41592-018-0235-4        | 3316            | 2018-12-04       | 2018             | Stanford University                | Stanford University                          | https://doi.org/10.1038/s41592-018-0235-4        | 508  | 660  | 747  | 534  | None     | None         | None    | None      |