In [2]:
import os
import time
import io
import json
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseUpload
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

In [3]:

# Google Drive authentication setup.
# OAuth scope list handed to both the cached-token loader and the consent flow.
SCOPES = ["https://www.googleapis.com/auth/drive.file"]

def authenticate_google_drive():
    """Return an authenticated Google Drive v3 client.

    Credentials are loaded from ``token.json`` when that file exists. If they
    are missing or not valid, they are refreshed (when expired and a refresh
    token is available) or obtained through the installed-app flow configured
    by ``credentials.json``; in both cases the resulting credentials are
    written back to ``token.json``.
    """
    token_path = 'token.json'
    credentials = (
        Credentials.from_authorized_user_file(token_path, SCOPES)
        if os.path.exists(token_path)
        else None
    )

    if not (credentials and credentials.valid):
        refreshable = credentials and credentials.expired and credentials.refresh_token
        if refreshable:
            credentials.refresh(Request())
        else:
            oauth_flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            credentials = oauth_flow.run_local_server(port=0)
        # Persist whatever we ended up with so the next run can reuse it.
        with open(token_path, 'w') as token_file:
            token_file.write(credentials.to_json())

    return build('drive', 'v3', credentials=credentials)
# Initialize Drive service.
# This runs the whole authentication path at cell-execution time (it may start
# the local-server OAuth flow) and binds the client to the module-level name
# `service`. Note that main() calls authenticate_google_drive() again and keeps
# the result in its own local `service`, so it does not read this global.
service = authenticate_google_drive()

In [4]:
def setup_selenium(headless=True):
    """Set up Selenium with headless Chrome.

    The previous implementation documented a headless browser but never passed
    a headless flag, so a visible Chrome window was opened. The flag is now
    applied by default; pass ``headless=False`` to watch the browser while
    debugging.

    Args:
        headless: When True (default), start Chrome without a visible window.

    Returns:
        A ``webdriver.Chrome`` instance with GPU, sandbox and /dev/shm usage
        disabled and image loading turned off.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Content setting 2 == "block": pages load faster without images, and only
    # the PDF link's href is needed.
    prefs = {"profile.default_content_setting_values": {"images": 2}}
    chrome_options.add_experimental_option("prefs", prefs)
    return webdriver.Chrome(options=chrome_options)

In [5]:
# Fetch PDF link using Selenium
def fetch_pdf_link(article_url, driver):
    """Load an article page and return the href of an anchor pointing at a PDF.

    A button whose text contains "Accept" is clicked if it becomes clickable
    within 5 seconds; any failure at that step is treated as "no cookie
    dialog". An anchor whose ``href`` contains ``.pdf`` is then awaited for up
    to 10 seconds.

    Args:
        article_url: Address of the article landing page.
        driver: Selenium WebDriver used to load the page.

    Returns:
        The anchor's ``href`` attribute, or ``None`` when loading the page or
        locating the link raised (the error is printed, not propagated).
    """
    try:
        driver.get(article_url)

        # Best-effort dismissal of the cookie banner.
        consent_locator = (By.XPATH, "//button[contains(text(), 'Accept')]")
        try:
            consent_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable(consent_locator)
            )
            consent_button.click()
            print("Cookies accepted.")
        except Exception:
            print("No cookies dialog or already handled.")

        # Wait for the PDF download anchor to be present in the DOM.
        pdf_locator = (By.XPATH, "//a[contains(@href, '.pdf')]")
        anchor = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(pdf_locator)
        )
        href = anchor.get_attribute("href")
        print(f"PDF link found: {href}")
        return href
    except Exception as err:
        print(f"Error fetching PDF link from {article_url}: {err}")
        return None


In [6]:
# Download the PDF and upload to Google Drive
def _safe_pdf_filename(title, max_length=100):
    """Build a local ``<title>.pdf`` file name that is valid on common filesystems."""
    unsafe_chars = '<>:"/\\|?*'
    cleaned = "".join(
        "_" if (ch in unsafe_chars or ord(ch) < 32) else ch
        for ch in title[:max_length]
    ).strip(" .")  # trailing dots/spaces are rejected on Windows
    return f"{cleaned or 'untitled'}.pdf"


def download_and_upload_pdf(pdf_link, title, folder_id, service):
    """Download a PDF, save it under ``./pdfs`` and upload it to Google Drive.

    Args:
        pdf_link: URL of the PDF to fetch.
        title: Article title; its first 100 characters name the file.
        folder_id: ID of the Drive folder that receives the upload.
        service: Google Drive v3 client exposing ``files().create(...)``.

    Returns:
        ``True`` when both the local save and the Drive upload succeeded,
        ``False`` when any step raised (the error is printed, not propagated).
    """
    try:
        # Download the PDF. A timeout prevents one stalled server from hanging
        # the whole run; the body is read in full because it is needed twice.
        response = requests.get(pdf_link, timeout=60)
        response.raise_for_status()
        pdf_bytes = response.content

        # The Drive name keeps the original title, while the local name is
        # sanitised: titles containing "/" or similar used to make open() fail
        # or point outside ./pdfs.
        drive_name = f"{title[:100]}.pdf"
        os.makedirs("./pdfs", exist_ok=True)
        filepath = os.path.join("./pdfs", _safe_pdf_filename(title))

        with open(filepath, "wb") as f:
            f.write(pdf_bytes)
        print(f"Downloaded PDF: {filepath}")

        # Upload to Google Drive
        file_metadata = {'name': drive_name, 'parents': [folder_id]}
        media = MediaIoBaseUpload(io.BytesIO(pdf_bytes), mimetype='application/pdf')
        uploaded_file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        print(f"Uploaded '{drive_name}' to Google Drive (File ID: {uploaded_file['id']})")
        return True

    except Exception as e:
        print(f"Error downloading or uploading PDF: {e}")
        return False

In [7]:
# Update the checkpoint file
def update_checkpoint(checkpoint_file, doi):
    """Append ``doi`` followed by a newline to the end of ``checkpoint_file``."""
    with open(checkpoint_file, mode="a") as checkpoint_handle:
        checkpoint_handle.writelines((doi, "\n"))

'1v0NkebMgXHzSslTKf6GL62Jcvib7RA8O'

In [8]:
# Main workflow
def main():
    """Fetch open-access Springer articles for a query and mirror their PDFs to Drive.

    Pages through the Springer Nature open-access JSON API. For every record
    whose ``contentType`` is ``"Article"`` and whose DOI is not yet listed in
    the checkpoint file, the article page is opened with Selenium to locate
    the PDF link, the PDF is downloaded and uploaded, and the DOI is appended
    to the checkpoint so later runs skip it.

    The loop stops when a page returns no records or when an unexpected error
    is raised (the error is printed). The Selenium driver is closed on every
    exit path, including interruption.
    """
    # FIXME(security): the literal below is a credential committed to the
    # notebook. SPRINGER_API_KEY takes precedence when set; the fallback is
    # kept only so existing runs keep working and should be rotated and removed.
    api_key = os.environ.get("SPRINGER_API_KEY", 'a098f016b3dd3963ecb73d756fd38227')  # Springer API key
    base_url = "https://api.springernature.com/openaccess/json"
    query = "MOSFET"
    folder_id = "1v0NkebMgXHzSslTKf6GL62Jcvib7RA8O"  # Replace with your folder ID
    checkpoint_file = "checkpoint.txt"
    os.makedirs("./pdfs", exist_ok=True)

    # Load checkpoint: one already-processed DOI per line.
    checkpoint = set()
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r") as f:
            checkpoint = set(f.read().splitlines())

    # Authenticate Google Drive
    service = authenticate_google_drive()

    # Set up Selenium driver
    driver = setup_selenium()

    # Pagination parameters: "s" is the 1-based start index, "p" the page size.
    start = 1
    step = 10

    try:
        while True:
            try:
                # Fetch articles from Springer API
                params = {
                    "q": query,
                    "s": start,
                    "p": step,
                    "api_key": api_key,
                }
                # A timeout keeps an unresponsive API from hanging the run.
                response = requests.get(base_url, params=params, timeout=30)
                response.raise_for_status()
                data = response.json()
                records = data.get("records", [])

                if not records:
                    print("No more records found. Exiting.")
                    break

                for record in records:
                    title = record.get("title", "Unknown Title")

                    # Skip non-articles
                    if record.get("contentType") != "Article":
                        print(f"Skipping non-article record: {title}")
                        continue

                    # The identifier looks like "doi:<DOI>". A missing or
                    # prefix-less identifier used to raise IndexError and abort
                    # the whole crawl; such records are now skipped.
                    identifier = record.get("identifier") or ""
                    doi = identifier.partition(":")[2]
                    if not doi:
                        print(f"Skipping record without a DOI: {title}")
                        continue

                    # Skip already processed articles
                    if doi in checkpoint:
                        print(f"Skipping already processed article: {title} (DOI: {doi})")
                        continue

                    # "url" may be absent or an empty list; both mean no page to visit.
                    url_entries = record.get("url") or [{}]
                    article_url = url_entries[0].get("value", "")
                    if not article_url:
                        print(f"Skipping article without a URL: {title} (DOI: {doi})")
                        continue

                    print(f"Processing article: {title} (DOI: {doi})")
                    print(f"Navigating to: {article_url}")

                    # Fetch the PDF link
                    pdf_link = fetch_pdf_link(article_url, driver)
                    if not pdf_link:
                        continue

                    # Download and upload the PDF. An explicit False return
                    # signals failure, and the DOI is then left out of the
                    # checkpoint so the next run retries it. A None return (no
                    # status reported) is treated as success.
                    outcome = download_and_upload_pdf(pdf_link, title, folder_id, service)
                    if outcome is False:
                        print(f"Not checkpointing failed article: {title} (DOI: {doi})")
                        continue

                    # Update checkpoint
                    update_checkpoint(checkpoint_file, doi)
                    checkpoint.add(doi)

                # Increment start for pagination
                start += step

            except Exception as e:
                print(f"Error: {e}")
                break
    finally:
        # Close Selenium driver even on KeyboardInterrupt, so no browser
        # process is left behind.
        driver.quit()

if __name__ == "__main__":
    main()

Skipping non-article record: Introduction to Single-Event Effects
Skipping non-article record: Radiation Hardening
Skipping already processed article: Progress on mechanical and tribological characterization of 2D materials by AFM force spectroscopy (DOI: 10.1007/s40544-024-0864-9)
Skipping already processed article: High-voltage FinFET with floating poly and high-k material for enhanced intrinsic gain and safe operating area (DOI: 10.1038/s41598-024-79881-3)
Skipping already processed article: Manufacturing carbon nanotube transistors using lift-off process: limitations and prospects (DOI: 10.1007/s44275-024-00016-x)
Processing article: Brayton–Moser passivity based controller for constant power load with interleaved boost converter (DOI: 10.1038/s41598-024-79405-z)
Navigating to: http://dx.doi.org/10.1038/s41598-024-79405-z
Cookies accepted.
PDF link found: https://www.nature.com/articles/s41598-024-79405-z.pdf
Downloaded PDF: ./pdfs/Brayton–Moser passivity based controller for const