In [1]:
import os
import time
import requests
from selenium import webdriver
from concurrent.futures import ThreadPoolExecutor
import concurrent
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC #expilicit wait

In [2]:
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.http import MediaIoBaseUpload
from selenium.common.exceptions import StaleElementReferenceException
import io

Google Drive Authentication

In [3]:
# OAuth scopes requested when authenticating against the Google Drive API.
# NOTE(review): `drive.file` is documented by Google as per-file access (files created
# or opened by this app) — confirm the target parent folder is reachable under it.
SCOPES = ['https://www.googleapis.com/auth/drive.file']

In [4]:
def authenticate_google_drive():
    """Return an authorized Google Drive v3 service client.

    Cached user credentials are read from ``token.json`` when that file exists.
    If they are missing or not valid, they are refreshed (when expired and a
    refresh token is available) or obtained through the installed-app OAuth
    flow configured by ``credentials.json``. In both of those cases the
    resulting credentials are written back to ``token.json``.
    """
    token_path = 'token.json'

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    # Fast path: cached credentials are usable as they are.
    if creds and creds.valid:
        return build('drive', 'v3', credentials=creds)

    can_refresh = creds and creds.expired and creds.refresh_token
    if can_refresh:
        creds.refresh(Request())
    else:
        oauth_flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        creds = oauth_flow.run_local_server(port=0)

    # Persist the new/refreshed credentials for the next run.
    with open(token_path, 'w') as token_file:
        token_file.write(creds.to_json())

    return build('drive', 'v3', credentials=creds)

In [5]:
# Build the Drive client once for the whole notebook (may trigger the OAuth flow).
drive_service = authenticate_google_drive()
# ID of the Drive folder under which the per-event folders are looked up / created.
parent_folder_id = '1WZ2l6ZT3Op4X7P95wrLt2P2asVxd6rMz'

Selenium Configuration

In [6]:
# Speed-oriented browser setup: images and JavaScript are blocked.
# If a required link or button turns out to need JavaScript, remove the
# "javascript" entry from the prefs below.
options = Options()
prefs = {"profile.default_content_setting_values": {"images": 2, "javascript": 2}}  # Block images and JS
# Request headless mode through an explicit Chrome argument: the
# `options.headless = True` setter is deprecated/removed in Selenium 4, where
# assigning it can silently have no effect.
options.add_argument("--headless=new")
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)

In [7]:
# Open the ACL Anthology landing page.
driver.get("https://aclanthology.org")
# Implicit wait applied to the element lookups that follow.
# NOTE(review): later code also uses WebDriverWait (explicit waits); Selenium's docs
# advise against mixing the two — confirm the combined timeouts are intended.
driver.implicitly_wait(15)

In [None]:
# Sanity check: confirm the "ACL Events" table is present on the loaded page.
# The XPath selects a table with class `table-hover` that is a following sibling
# of an <h6> whose text is exactly 'ACL Events'.
try:
    table = driver.find_element(By.XPATH, "//h6[text()='ACL Events']/following-sibling::table[contains(@class, 'table-hover')]")
    print("ACL Events table found successfully!")
except Exception as e:
    # Any lookup failure is only reported; the notebook keeps running.
    print(f"Error: Table not found. {e}")

Google Drive main operations

In [282]:
# Function to check if a folder already exists in Google Drive
def check_folder_exists(service, folder_name, parent_folder_id=None):
    """Return the ID of an existing, non-trashed Drive folder, or ``None``.

    Args:
        service: Drive v3 service object (anything exposing ``files().list``).
        folder_name: Exact folder name to look for.
        parent_folder_id: Optional ID of the folder that must contain it.

    Returns:
        The ID of the first matching folder, or ``None`` when nothing matches.
    """
    def _escape(value):
        # Drive query strings are single-quoted; backslashes and single quotes
        # inside a value must be backslash-escaped or the query is malformed.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    # `trashed=false` keeps folders sitting in the trash from being reused.
    query = (
        f"name='{_escape(folder_name)}'"
        " and mimeType='application/vnd.google-apps.folder'"
        " and trashed=false"
    )
    if parent_folder_id:
        query += f" and '{_escape(parent_folder_id)}' in parents"

    results = service.files().list(q=query, spaces='drive').execute()
    files = results.get('files', [])
    if files:
        return files[0]['id']  # Return the existing folder ID
    return None

# Function to create a folder in Google Drive
def create_folder_in_drive(service, folder_name, parent_folder_id=None):
    """Create a Drive folder and return its ID.

    Args:
        service: Drive v3 service object (anything exposing ``files().create``).
        folder_name: Name given to the new folder.
        parent_folder_id: Optional ID of the folder to create it in; when
            falsy, no ``parents`` entry is sent.

    Returns:
        The ``id`` field of the create response.
    """
    parents_entry = {'parents': [parent_folder_id]} if parent_folder_id else {}
    body = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
        **parents_entry,
    }
    created = service.files().create(body=body, fields='id').execute()
    new_folder_id = created.get('id')
    print(f"Folder '{folder_name}' created with ID: {new_folder_id}")
    return new_folder_id

# Function to upload a file to Google Drive
def upload_file_to_drive(service, file_name, folder_id, file_content):
    """Upload in-memory bytes to Drive as a PDF inside ``folder_id``.

    Args:
        service: Drive v3 service object.
        file_name: Name of the file to create in Drive.
        folder_id: ID of the folder that will contain the file.
        file_content: Raw bytes of the file (wrapped in ``io.BytesIO``).

    Returns:
        None. The new file's ID is only printed.
    """
    # The payload is always declared as application/pdf.
    pdf_media = MediaIoBaseUpload(io.BytesIO(file_content), mimetype='application/pdf')
    create_request = service.files().create(
        body={'name': file_name, 'parents': [folder_id]},
        media_body=pdf_media,
        fields='id',
    )
    created = create_request.execute()

    print(f"Uploaded {file_name} to Google Drive with ID: {created.get('id')}")

Checkpoint operations

In [283]:
def load_checkpoint_from_drive(service, folder_id):
    """Load the set of already-processed paper titles for one Drive folder.

    Looks for a non-trashed file named ``checkpoint.txt`` directly inside
    ``folder_id`` and reads it as UTF-8 text with one title per line.

    Args:
        service: Drive v3 service object.
        folder_id: ID of the folder holding the checkpoint file.

    Returns:
        A set of non-empty, whitespace-stripped lines; an empty set when no
        checkpoint file exists.
    """
    # `trashed=false` keeps a deleted checkpoint from being picked up.
    query = f"name='checkpoint.txt' and '{folder_id}' in parents and trashed=false"
    results = service.files().list(q=query, spaces='drive').execute()
    files = results.get('files', [])

    if not files:
        print("No checkpoint file found. Starting fresh.")
        return set()

    checkpoint_file_id = files[0]['id']
    checkpoint_content = service.files().get_media(fileId=checkpoint_file_id).execute().decode('utf-8')

    # The file is written with a trailing newline after every title, so a plain
    # split('\n') yields a spurious '' entry that inflates the processed count.
    # Blank lines are dropped (and stray '\r' stripped) to avoid that.
    processed_papers = set()
    for raw_line in checkpoint_content.split('\n'):
        title = raw_line.strip()
        if title:
            processed_papers.add(title)

    print(f"Loaded {len(processed_papers)} processed papers from checkpoint.")
    return processed_papers

# Function to update the checkpoint.txt file in Google Drive
def update_checkpoint_in_drive(service, folder_id, paper_title):
    """Append ``paper_title`` as a new line to ``checkpoint.txt`` in ``folder_id``.

    If a non-trashed ``checkpoint.txt`` exists in the folder, its content is
    downloaded, the title is appended, and the file is re-uploaded in full.
    Otherwise a new checkpoint file containing just this title is created.

    Args:
        service: Drive v3 service object.
        folder_id: ID of the folder holding (or to hold) the checkpoint file.
        paper_title: Line to append (a trailing newline is added).

    Returns:
        None.
    """
    # `trashed=false` avoids appending to a checkpoint that was deleted.
    query = f"name='checkpoint.txt' and '{folder_id}' in parents and trashed=false"
    results = service.files().list(q=query, spaces='drive').execute()
    files = results.get('files', [])

    if files:
        checkpoint_file_id = files[0]['id']
        # Get the existing content
        existing_content = service.files().get_media(fileId=checkpoint_file_id).execute().decode('utf-8')
        # Guard against content lacking a final newline (e.g. a hand-edited
        # file); otherwise the new title would be glued onto the last entry.
        if existing_content and not existing_content.endswith("\n"):
            existing_content += "\n"
        new_content = existing_content + paper_title + "\n"
        media = MediaIoBaseUpload(io.BytesIO(new_content.encode('utf-8')), mimetype='text/plain')
        service.files().update(fileId=checkpoint_file_id, media_body=media).execute()
    else:
        # Create a new checkpoint file if it doesn't exist
        file_metadata = {
            'name': 'checkpoint.txt',
            'parents': [folder_id],
            'mimeType': 'text/plain'
        }
        media = MediaIoBaseUpload(io.BytesIO((paper_title + "\n").encode('utf-8')), mimetype='text/plain')
        service.files().create(body=file_metadata, media_body=media).execute()

    print(f"Updated checkpoint with {paper_title}.")

In [284]:
# Function to download and upload the actual PDF from the paper page
def download_and_upload_paper_pdf(driver, drive_service, volume_folder_id, paper_title, processed_papers):
    """Download the PDF of the currently open paper page and upload it to Drive.

    Args:
        driver: Selenium WebDriver currently showing the paper details page.
        drive_service: Drive v3 service object.
        volume_folder_id: ID of the Drive folder receiving the PDF and checkpoint.
        paper_title: Title used for the file name (``<title>.pdf``) and as the
            checkpoint entry.
        processed_papers: Container of titles already handled; a title found
            in it is skipped.

    Returns:
        None. Failures are reported with ``print`` and never raised.
    """
    if paper_title in processed_papers:
        print(f"Skipping already processed paper: {paper_title}")
        return  # Skip this paper if it is already processed

    try:
        # Locate the PDF link inside the paper details page
        pdf_link = driver.find_element(By.XPATH, "//a[@class='btn btn-primary' and contains(@href, '.pdf')]")
        pdf_url = pdf_link.get_attribute("href")
        paper_file_name = paper_title + ".pdf"

        # (connect, read) timeouts: without them a stalled server would block
        # the whole crawl indefinitely.
        response = requests.get(pdf_url, timeout=(10, 60))
        if response.status_code == 200:
            print(f"Downloaded paper: {paper_file_name}")
            # Upload the downloaded PDF to Google Drive
            upload_file_to_drive(drive_service, paper_file_name, volume_folder_id, response.content)
            # Update the checkpoint after successful upload
            update_checkpoint_in_drive(drive_service, volume_folder_id, paper_title)
        else:
            print(f"Failed to download paper PDF: {pdf_url}")
    except Exception as e:
        # This handler also catches download/upload/checkpoint failures, so the
        # message must not claim the PDF link was the problem.
        print(f"Error while processing paper '{paper_title}': {e}")


def navigate_and_download_papers_from_volume(driver, drive_service, volume_folder_id):
    """Download every not-yet-processed paper of the currently open volume page.

    The paper links (address and title) are read once from the volume page.
    Each paper whose normalized title is absent from the folder's checkpoint is
    then opened, its PDF is downloaded and uploaded to ``volume_folder_id``,
    and the checkpoint is updated. After each paper the browser navigates back,
    so the driver ends on the volume page it started on.

    The volume is traversed exactly once. Papers that fail (no PDF link,
    non-200 response, upload error) are reported and left out of the
    checkpoint so that a later run retries them. The previous implementation
    looped until the processed *count* reached the link count, which never
    terminated when a paper kept failing or two titles normalized identically.

    Args:
        driver: Selenium WebDriver currently showing the volume page.
        drive_service: Drive v3 service object.
        volume_folder_id: ID of the Drive folder for this volume.

    Returns:
        None.
    """
    paper_link_xpath = "//a[@class='align-middle' and contains(@href, '/')]"
    pdf_link_xpath = "//a[@class='btn btn-primary' and contains(@href, '.pdf')]"
    max_collect_attempts = 3

    def normalize_title(title):
        """Trim, replace spaces with '_', drop ':' and ',', and lowercase."""
        return title.strip().replace(" ", "_").replace(":", "").replace(",", "").lower()

    # Load the checkpoint for this volume from Google Drive
    processed_papers = load_checkpoint_from_drive(drive_service, volume_folder_id)
    processed_papers_normalized = {normalize_title(paper) for paper in processed_papers}
    # A blank checkpoint line must never count as a processed paper.
    processed_papers_normalized.discard("")

    # Snapshot (href, title) pairs up front so later navigation cannot
    # invalidate the elements we iterate over.
    papers = None
    for _ in range(max_collect_attempts):
        try:
            papers = [
                (link.get_attribute('href'), link.text)
                for link in driver.find_elements(By.XPATH, paper_link_xpath)
            ]
            break
        except StaleElementReferenceException as stale_e:
            print(f"Stale element encountered while navigating volume: {stale_e}. Refetching volume and retrying...")
    if papers is None:
        print(f"Could not read the paper links of volume '{volume_folder_id}'. Skipping volume.")
        return

    print(f"Total papers in volume: {len(papers)}")
    print(f"Processed papers in checkpoint: {len(processed_papers_normalized)}")

    # Decide per paper (by membership, not by count) what still needs work.
    pending = []
    for paper_href, paper_title in papers:
        normalized_title = normalize_title(paper_title)
        if not paper_href or not normalized_title:
            print(f"Skipping link without a usable address or title: {paper_href!r}")
        elif normalized_title in processed_papers_normalized:
            print(f"Skipping already processed paper: {paper_title} (Normalized: {normalized_title})")
        else:
            pending.append((paper_href, paper_title, normalized_title))

    if not pending:
        print(f"All papers in volume '{volume_folder_id}' are already processed. Skipping volume.")
        return

    failed_titles = []
    for paper_href, paper_title, normalized_title in pending:
        # Two links can normalize to the same title; the first success wins.
        if normalized_title in processed_papers_normalized:
            print(f"Skipping already processed paper: {paper_title} (Normalized: {normalized_title})")
            continue

        print(f"Navigating to paper: {paper_href}")
        driver.get(paper_href)
        time.sleep(2)  # Wait for the paper details page to load

        try:
            # Locate the PDF link inside the paper details page
            pdf_link = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, pdf_link_xpath))
            )
            pdf_url = pdf_link.get_attribute("href")
            paper_file_name = normalized_title + ".pdf"

            # (connect, read) timeouts so a stalled download cannot hang the crawl.
            response = requests.get(pdf_url, timeout=(10, 60))
            if response.status_code == 200:
                print(f"Downloaded paper: {paper_file_name}")
                # Upload the downloaded PDF to Google Drive
                upload_file_to_drive(drive_service, paper_file_name, volume_folder_id, response.content)
                # Record success both in Drive and in memory.
                update_checkpoint_in_drive(drive_service, volume_folder_id, normalized_title)
                processed_papers_normalized.add(normalized_title)
            else:
                print(f"Failed to download paper PDF: {pdf_url}")
                failed_titles.append(paper_title)
        except Exception as e:
            # Covers a missing PDF link as well as download/upload errors.
            print(f"Error while processing paper '{paper_title}': {e}")
            failed_titles.append(paper_title)

        # Go back to the volume page after processing the paper
        driver.back()
        time.sleep(2)  # Wait for the volume page to reload

    if failed_titles:
        print(f"{len(failed_titles)} paper(s) in volume '{volume_folder_id}' could not be processed: {failed_titles}")


In [285]:
# Function to handle downloading papers from all volumes in an event
def download_papers_from_volumes(event_folder_id, driver, drive_service, skip_volumes=("2024.eacl-long",)):
    """Process every volume linked from the currently open event page.

    The volume addresses are collected once (duplicates removed, order kept).
    For each volume the browser opens the page, a Drive folder named after the
    volume is found or created inside ``event_folder_id``, the papers are
    downloaded, and the browser navigates back to the event page.

    Collecting addresses up front, and always navigating back after a volume
    page was opened, keeps one failing volume from leaving the browser on the
    wrong page for the following ones.

    Args:
        event_folder_id: ID of the Drive folder for this event.
        driver: Selenium WebDriver currently showing the event page.
        drive_service: Drive v3 service object.
        skip_volumes: Collection (tuple/list/set) of volume names to skip.
            The default keeps the skip that was previously hard-coded.

    Returns:
        None.
    """
    volume_link_xpath = "//a[contains(@href, '/volumes/')]"
    max_collect_attempts = 3

    volume_hrefs = None
    for _ in range(max_collect_attempts):
        try:
            raw_hrefs = [
                link.get_attribute('href')
                for link in driver.find_elements(By.XPATH, volume_link_xpath)
            ]
            # Drop missing hrefs and repeated links while preserving page order.
            volume_hrefs = list(dict.fromkeys(href for href in raw_hrefs if href))
            break
        except StaleElementReferenceException as stale_e:
            print(f"Stale element encountered while fetching volumes: {stale_e}. Refetching volume links and retrying...")
    if volume_hrefs is None:
        print("Could not read the volume links. Skipping event.")
        return

    print(f"Found {len(volume_hrefs)} volumes.")

    for volume_href in volume_hrefs:
        # Last path segment, with or without a trailing slash.
        volume_name = volume_href.rstrip('/').rsplit('/', 1)[-1]

        if volume_name in skip_volumes:
            print(f"Skipping volume: {volume_name}")
            continue

        print(f"Navigating to volume: {volume_name}")

        navigated = False
        try:
            # Navigate to the volume link
            driver.get(volume_href)
            navigated = True
            time.sleep(2)  # Wait for the volume page to load

            # Reuse the volume's Drive folder if present, otherwise create it
            volume_folder_id = check_folder_exists(drive_service, volume_name, event_folder_id)
            if not volume_folder_id:
                volume_folder_id = create_folder_in_drive(drive_service, volume_name, event_folder_id)

            # Navigate to each paper in the volume and download PDFs
            navigate_and_download_papers_from_volume(driver, drive_service, volume_folder_id)
        except Exception as e:
            print(f"Error processing volume: {e}")

        # Return to the event page exactly once per opened volume, including
        # after an error, so the browser history stays consistent.
        try:
            if navigated:
                driver.back()
                time.sleep(2)  # Wait for the event page to reload
        except Exception as e:
            print(f"Error processing volume: {e}")


In [286]:
# Function to traverse through all events and download papers
def traverse_events_and_download_papers(driver, drive_service, parent_folder_id,
                                        skip_events=("aacl", "acl", "anlp", "cl", "findings", "conll")):
    """Visit every event linked from the open landing page and download its papers.

    After confirming the "ACL Events" table exists, the event addresses are
    collected once (duplicates removed, order kept). For each event the browser
    opens the page, a Drive folder named after the event is found or created
    inside ``parent_folder_id``, its volumes are processed, and the browser
    navigates back to the landing page.

    Collecting addresses up front, and always navigating back after an event
    page was opened, keeps one failing event from leaving the browser on the
    wrong page for the following ones.

    NOTE(review): the event-link XPath is evaluated on the whole page, not
    inside the located table — confirm whether that is intended.

    Args:
        driver: Selenium WebDriver currently showing the landing page.
        drive_service: Drive v3 service object.
        parent_folder_id: ID of the Drive folder that holds the event folders.
        skip_events: Collection (tuple/list/set) of event names to skip.
            The default keeps the list that was previously hard-coded.

    Returns:
        None. Errors are reported with ``print`` and never raised.
    """
    table_xpath = "//h6[text()='ACL Events']/following-sibling::table[contains(@class, 'table-hover')]"
    event_link_xpath = "//th/a[contains(@href, '/venues/')]"
    max_collect_attempts = 3

    try:
        # Locate the ACL Events table (presence check only)
        driver.find_element(By.XPATH, table_xpath)
        print("ACL Events table found successfully!")
    except Exception as e:
        print(f"Error: Table not found. {e}")
        return

    event_hrefs = None
    for _ in range(max_collect_attempts):
        try:
            raw_hrefs = [
                link.get_attribute("href")
                for link in driver.find_elements(By.XPATH, event_link_xpath)
            ]
            # Drop missing hrefs and repeated links while preserving page order.
            event_hrefs = list(dict.fromkeys(href for href in raw_hrefs if href))
            break
        except StaleElementReferenceException as stale_e:
            print(f"Stale element encountered while fetching events: {stale_e}. Refetching event links and retrying...")
        except Exception as e:
            print(f"Error processing event: {e}")
            return
    if event_hrefs is None:
        print("Could not read the event links. Nothing processed.")
        return

    for href in event_hrefs:
        # Last path segment, with or without a trailing slash (e.g. "acl", "conll").
        event_name = href.rstrip("/").rsplit("/", 1)[-1]

        if event_name in skip_events:
            print(f"Skipping event: {event_name}")
            continue

        print(f"Processing event: {event_name}")

        navigated = False
        try:
            # Navigate to the event link
            driver.get(href)
            navigated = True
            time.sleep(2)  # Wait for the event page to load

            # Reuse the event's Drive folder if present, otherwise create it
            event_folder_id = check_folder_exists(drive_service, event_name, parent_folder_id)
            if not event_folder_id:
                event_folder_id = create_folder_in_drive(drive_service, event_name, parent_folder_id)

            # Traverse volumes and download papers within the event
            download_papers_from_volumes(event_folder_id, driver, drive_service)
        except Exception as e:
            print(f"Error processing event: {e}")

        # Return to the landing page exactly once per opened event, including
        # after an error, so the browser history stays consistent.
        try:
            if navigated:
                driver.back()
                time.sleep(2)  # Wait for the landing page to reload
        except Exception as e:
            print(f"Error processing event: {e}")


In [None]:
# Run the full crawl: events -> volumes -> papers, uploading PDFs under `parent_folder_id`.
traverse_events_and_download_papers(driver, drive_service, parent_folder_id)