In [13]:
import requests
import fitz  # PyMuPDF for PDF text extraction
import pandas as pd
from google.colab import files
import os
import re

# Ensure PyMuPDF is installed
!pip install pymupdf

def upload_pdf():
    """Ask the user for a PDF through Colab's upload dialog.

    Returns:
        The first key of the mapping returned by ``files.upload()``, which is
        used by the rest of this file as the path of the uploaded PDF.

    Raises:
        IndexError: If the returned mapping is empty (nothing was uploaded).
    """
    uploaded_files = files.upload()  # Blocks until the dialog is completed
    file_names = list(uploaded_files.keys())
    return file_names[0]

def extract_authors_from_first_page(pdf_path):
    """Extract author names from the first page of a PDF.

    Heuristic: the first non-empty line of the page is assumed to be the
    title; the first later line that yields at least one non-blank name in
    front of an affiliation marker (``^``, ``†``, ``‡``, ``*`` or a digit) is
    taken as the author list.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of stripped author-name strings (digits that are part of a
        match are left in place), or an empty list when the document has no
        pages or no line matches.
    """
    doc = fitz.open(pdf_path)
    try:
        if len(doc) == 0:  # Nothing to read from an empty document
            return []
        first_page = doc[0].get_text("text")  # Plain text of the first page
    finally:
        doc.close()  # Always release the file handle, even on errors

    # Capture runs of word characters / whitespace / hyphens that are
    # immediately followed by a superscript-style marker.
    author_pattern = re.compile(r"([\w\s-]+)(?=\s*[\^†‡*\d])")

    title_seen = False
    for line in first_page.split("\n"):
        if not line.strip():
            continue  # Blank lines are neither the title nor an author list

        if not title_seen:
            title_seen = True  # First non-empty line is assumed to be the title
            continue

        names = [match.strip() for match in author_pattern.findall(line)]
        names = [name for name in names if name]
        if names:
            # Stop at the first line that actually produced names; a line
            # whose matches are all whitespace must not end the search.
            return names

    return []

def upload_excel():
    """Ask the user for an Excel file through Colab's upload dialog.

    Returns:
        The first key of the mapping returned by ``files.upload()``, which is
        used by the rest of this file as the path of the uploaded workbook.

    Raises:
        IndexError: If the returned mapping is empty (nothing was uploaded).
    """
    uploaded_files = files.upload()
    file_names = list(uploaded_files.keys())
    return file_names[0]

def get_excel_path():
    """Locate the retraction spreadsheet, asking for an upload if needed.

    Looks for 'retraction_watch.xlsx' relative to the current working
    directory. When it exists its name is returned; otherwise the user is
    prompted to upload a file and that upload's path is returned.
    """
    excel_file = 'retraction_watch.xlsx'

    # Guard clause: fall back to an upload when the file is not present.
    if not os.path.exists(excel_file):
        print(f"'{excel_file}' not found. Please upload the Excel file.")
        return upload_excel()

    print(f"Using existing Excel file: {excel_file}")
    return excel_file

def remove_numbers_from_author(author):
    """Return ``author`` with every run of digits deleted and outer whitespace trimmed.

    Only leading/trailing whitespace is stripped; whitespace that surrounded
    a removed digit run inside the name is left untouched.
    """
    without_digits = re.sub(r'\d+', '', author)
    return without_digits.strip()

def search_retractions_by_author_excel(authors, excel_path):
    """Look up each author in the retraction spreadsheet.

    The workbook must contain the columns 'Author' (names separated by ';'),
    'RetractionDOI' and 'Reason'. Matching is an exact, case-sensitive
    comparison against the individual names of each row, after digits have
    been removed from the queried author. For an author that appears in
    several rows, the first row (in sheet order) is reported.

    Args:
        authors: Iterable of author-name strings to check.
        excel_path: Path of the Excel workbook to read.

    Returns:
        One list per author, in input order:
        ``[author, "YES" | "NO", doi_link, reason]``. ``doi_link`` and
        ``reason`` are empty strings when there is no match or when the
        corresponding cell is empty in the sheet.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = pd.read_excel(excel_path)

    # Build the name -> (doi, reason) lookup once instead of re-splitting every
    # row for every author. Iterating the columns with zip is positional, so
    # the result does not depend on the frame's index labels.
    first_hit = {}
    for names_cell, doi, reason in zip(df['Author'], df['RetractionDOI'], df['Reason']):
        if not isinstance(names_cell, str):  # Skip empty / non-text cells (e.g. NaN)
            continue
        for name in names_cell.split(';'):
            name = name.strip()
            # Ignore empty entries (e.g. from a trailing ';') so that an empty
            # query can never produce a false "YES".
            if name:
                first_hit.setdefault(name, (doi, reason))  # Keep the first row only

    results = []
    for author in authors:
        hit = first_hit.get(remove_numbers_from_author(author))
        if hit is None:
            results.append([author, "NO", "", ""])
            continue

        doi, reason = hit
        # Empty cells are read as NaN, which is truthy; normalise them to ""
        # so we never emit a bogus "https://doi.org/nan" link.
        doi = "" if pd.isna(doi) else str(doi).strip()
        reason = "" if pd.isna(reason) else reason
        doi_link = f"https://doi.org/{doi}" if doi else ""
        results.append([author, "YES", doi_link, reason])

    return results

def save_results_to_csv(results, pdf_path):
    """Write the lookup results next to the PDF name and trigger a download.

    The output file is named after ``pdf_path`` with its extension replaced
    by '.authorretractions.csv'. Each entry of ``results`` becomes one row
    with the columns Author / Found in Excel Author / RetractionDOI / Reason.
    """
    stem, _extension = os.path.splitext(pdf_path)
    output_filename = f"{stem}.authorretractions.csv"

    column_names = ["Author", "Found in Excel Author", "RetractionDOI", "Reason"]
    pd.DataFrame(results, columns=column_names).to_csv(output_filename, index=False)

    print(f"Verification completed. Downloading {output_filename}...")
    files.download(output_filename)

def main():
    """Run the full workflow: upload a PDF, extract its authors, look them up
    in the retraction spreadsheet and download the results as CSV."""
    print("Please upload a PDF file containing author information.")
    pdf_path = upload_pdf()  # User uploads a file

    extracted = extract_authors_from_first_page(pdf_path)
    # Strip affiliation digits, then drop names that became empty (a match
    # consisting only of digits) so they are neither searched nor reported.
    cleaned = (remove_numbers_from_author(name) for name in extracted)
    authors = [name for name in cleaned if name]

    if not authors:
        print("No authors found in the document.")
        return

    excel_path = get_excel_path()  # Existing file or a fresh upload

    results = search_retractions_by_author_excel(authors, excel_path)
    save_results_to_csv(results, pdf_path)

# Run the workflow only when this code executes as the main module
# (which is the case when the cell is run), not when it is imported.
if __name__ == "__main__":
    main()


Please upload a PDF file containing author information.


Saving POF24-AR-00230_MS.pdf to POF24-AR-00230_MS.pdf
Using existing Excel file: retraction_watch.xlsx
Verification completed. Downloading POF24-AR-00230_MS.authorretractions.csv...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Mount Google Drive at /content/drive via Colab's `drive` helper.
# NOTE(review): nothing in the cell above reads from this mount point —
# confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')