In [90]:
import os
import time
import requests
from selenium import webdriver
from concurrent.futures import ThreadPoolExecutor
import concurrent
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC #expilicit wait

In [91]:
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.http import MediaIoBaseUpload
from selenium.common.exceptions import StaleElementReferenceException
import io

Google Drive Authentication

In [92]:
# OAuth scopes requested for the Google Drive API. The list is passed both
# when loading the cached token and when running the interactive login flow
# in authenticate_google_drive().
SCOPES = ['https://www.googleapis.com/auth/drive.file']

In [93]:
# Authenticate with Google Drive
def authenticate_google_drive(token_path='token.json', credentials_path='credentials.json'):
    """Build an authenticated Google Drive v3 service client.

    Cached user credentials are loaded from ``token_path`` when that file
    exists. If they are missing or invalid they are refreshed (when a refresh
    token is available) or obtained through the interactive local-server
    OAuth flow, and the resulting credentials are written back to
    ``token_path``.

    Args:
        token_path: File used to cache the authorized user credentials.
        credentials_path: OAuth client-secrets file used for the interactive flow.

    Returns:
        The Drive service object returned by ``build('drive', 'v3', ...)``.
    """
    # Local import keeps this block self-contained; google-auth is already a
    # dependency of this notebook (see google.auth.transport.requests above).
    from google.auth.exceptions import RefreshError

    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    if not creds or not creds.valid:
        refreshed = False
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
                refreshed = True
            except RefreshError as e:
                # A revoked or expired refresh token must not be fatal:
                # fall through to the interactive flow instead of crashing.
                print(f"Token refresh failed ({e}); falling back to interactive login.")
        if not refreshed:
            flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
            creds = flow.run_local_server(port=0)
        # Persist the new/refreshed credentials for the next run.
        with open(token_path, 'w') as token:
            token.write(creds.to_json())

    return build('drive', 'v3', credentials=creds)

In [94]:
# Build the Drive client once; the helper functions below receive it as an argument.
drive_service = authenticate_google_drive()
# ID of the Drive folder under which the per-event folders are looked up and created.
parent_folder_id = '1WZ2l6ZT3Op4X7P95wrLt2P2asVxd6rMz'

Selenium Configuration

In [95]:
# Speed-up: block images and JavaScript. If a needed link or button turns out
# to be rendered by JavaScript, remove the "javascript": 2 entry below.
options = Options()
prefs = {"profile.default_content_setting_values": {"images": 2, "javascript": 2}}  # Block images and JS
# Request headless mode through the command-line switch. The `options.headless`
# attribute was deprecated and later removed in Selenium 4; on versions without
# it, assigning `options.headless = True` has no effect and a visible browser
# window is launched.
options.add_argument("--headless")
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)

In [96]:
# Element lookups poll for up to 15 seconds before giving up.
driver.implicitly_wait(15)
# Open the ACL Anthology landing page.
driver.get("https://aclanthology.org")

In [97]:
# Locate the table containing the event links.
# The XPath selects the 'table-hover' table that follows the "ACL Events" <h6>
# heading. A failure is printed rather than raised, in which case `table` is
# left unassigned. traverse_events_and_download_papers() locates the table
# again itself, so this cell serves as a check that the page loaded as expected.
try:
    table = driver.find_element(By.XPATH, "//h6[text()='ACL Events']/following-sibling::table[contains(@class, 'table-hover')]")
    print("ACL Events table found successfully!")
except Exception as e:
    print(f"Error: Table not found. {e}")

ACL Events table found successfully!


In [99]:
# Function to check if a folder already exists in Google Drive
def check_folder_exists(service, folder_name, parent_folder_id=None):
    """Return the ID of an existing, non-trashed Drive folder, or ``None``.

    Args:
        service: Drive service object exposing ``files().list(...).execute()``.
        folder_name: Exact folder name to look for.
        parent_folder_id: When given, only folders directly inside this
            parent are considered.

    Returns:
        The ID of the first matching folder, or ``None`` when nothing matches.
    """
    def _quote(value):
        # Drive query strings are single-quoted; backslashes and single quotes
        # inside the value must be escaped or the query becomes invalid
        # (e.g. for a name such as "Valentine's Day").
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    query = (
        f"name='{_quote(folder_name)}' and mimeType='application/vnd.google-apps.folder'"
        # Folders sitting in the trash must not count as "already processed".
        " and trashed=false"
    )
    if parent_folder_id:
        query += f" and '{_quote(parent_folder_id)}' in parents"

    results = service.files().list(q=query, spaces='drive').execute()
    files = results.get('files', [])
    if files:
        return files[0]['id']  # Return the existing folder ID
    return None

# Function to create a folder in Google Drive
def create_folder_in_drive(service, folder_name, parent_folder_id=None):
    """Create a Drive folder and return its ID.

    Args:
        service: Drive service object exposing ``files().create(...).execute()``.
        folder_name: Name given to the new folder.
        parent_folder_id: Optional ID of the folder to create it in; when
            omitted no ``parents`` entry is sent.

    Returns:
        The ``id`` field of the created folder.
    """
    parent_entry = {'parents': [parent_folder_id]} if parent_folder_id else {}
    body = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
        **parent_entry,
    }
    created = service.files().create(body=body, fields='id').execute()
    new_id = created.get('id')
    print(f"Folder '{folder_name}' created with ID: {new_id}")
    return new_id

# Function to upload a file to Google Drive
def upload_file_to_drive(service, file_name, folder_id, file_content):
    """Upload in-memory bytes to Drive as a PDF inside ``folder_id``.

    Args:
        service: Drive service object exposing ``files().create(...).execute()``.
        file_name: Name given to the uploaded file.
        folder_id: ID of the Drive folder that becomes the file's parent.
        file_content: Raw bytes of the file; sent with MIME type ``application/pdf``.

    Returns:
        None. The ID of the uploaded file is only printed.
    """
    # Wrap the bytes in a stream so they can be sent without touching disk.
    pdf_stream = io.BytesIO(file_content)
    pdf_media = MediaIoBaseUpload(pdf_stream, mimetype='application/pdf')

    request = service.files().create(
        body={'name': file_name, 'parents': [folder_id]},
        media_body=pdf_media,
        fields='id',
    )
    created = request.execute()

    print(f"Uploaded {file_name} to Google Drive with ID: {created.get('id')}")

# Function to read the checkpoint file for a volume
def get_checkpoint(drive_service, folder_id):
    """Return the set of entries listed in a folder's ``checkpoint.txt``.

    The file is looked up by name inside ``folder_id``; its content is decoded
    as UTF-8 and split into lines, one entry per line. Blank lines are
    ignored, so an empty file or a trailing newline never produces an
    empty-string entry, and CRLF line endings are handled.

    This function only reads: it does not create the checkpoint file.
    NOTE(review): nothing in the visible notebook writes ``checkpoint.txt``,
    so unless it is produced elsewhere the result is always empty.

    Args:
        drive_service: Drive service object exposing ``files().list`` and
            ``files().get_media``.
        folder_id: ID of the Drive folder to search.

    Returns:
        A set of non-empty, whitespace-stripped lines; an empty set when the
        folder has no ``checkpoint.txt``.
    """
    query = f"name='checkpoint.txt' and '{folder_id}' in parents"
    results = drive_service.files().list(q=query).execute()
    files = results.get('files', [])
    if not files:
        return set()

    checkpoint_file = files[0]
    checkpoint_content = drive_service.files().get_media(fileId=checkpoint_file['id']).execute().decode('utf-8')
    # splitlines() copes with "\n" and "\r\n"; the filter drops blank lines.
    return {line.strip() for line in checkpoint_content.splitlines() if line.strip()}

# Function to download and upload the actual PDF from the paper page
def download_and_upload_paper_pdf(driver, drive_service, volume_folder_id, paper_title, request_timeout=60):
    """Download the PDF linked from the current paper page and upload it to Drive.

    The driver must already be on the paper's detail page. Failures are
    printed, not raised, so that one bad paper does not abort the crawl.

    Args:
        driver: Selenium WebDriver currently showing the paper page.
        drive_service: Drive service object passed on to ``upload_file_to_drive``.
        volume_folder_id: ID of the Drive folder that receives the PDF.
        paper_title: Used as the file name, with ``.pdf`` appended.
        request_timeout: Seconds to wait for the PDF download before giving
            up; without a timeout a stalled server would hang the crawl.

    Returns:
        None.
    """
    # Step 1: locate the PDF link. Kept in its own try block so that the
    # "link not found" message is only printed when that is what went wrong.
    try:
        pdf_link = driver.find_element(By.XPATH, "//a[@class='btn btn-primary' and contains(@href, '.pdf')]")
        pdf_url = pdf_link.get_attribute("href")
    except Exception as e:
        print(f"Error: Could not find PDF link on paper page. {e}")
        return

    paper_file_name = paper_title + ".pdf"

    # Step 2: download the PDF and upload it to Google Drive.
    try:
        response = requests.get(pdf_url, timeout=request_timeout)
        if response.status_code == 200:
            print(f"Downloaded paper: {paper_file_name}")
            upload_file_to_drive(drive_service, paper_file_name, volume_folder_id, response.content)
        else:
            print(f"Failed to download paper PDF: {pdf_url}")
    except Exception as e:
        print(f"Error: Could not download or upload {paper_file_name}. {e}")

# Function to navigate each paper page from the volume and extract PDF link
def navigate_and_download_papers_from_volume(driver, drive_service, volume_folder_id):
    """Visit every paper linked from the current volume page and store its PDF.

    The driver must already be on the volume page. Paper links are first
    snapshotted as plain ``(href, title)`` strings, so later navigation cannot
    invalidate them. Each paper is then opened, handed to
    ``download_and_upload_paper_pdf`` and the driver steps back to the volume
    page, which leaves the browser history where the caller expects it.

    Papers are processed sequentially on purpose: the download step reads the
    PDF link through ``driver``, so running it on a worker thread while this
    function navigates away races with ``driver.back()``. The Drive client is
    built on httplib2, which is not thread-safe either.

    Args:
        driver: Selenium WebDriver currently showing the volume page.
        drive_service: Drive service object used for checkpoint and uploads.
        volume_folder_id: ID of the Drive folder for this volume.

    Returns:
        None.
    """
    existing_papers = get_checkpoint(drive_service, volume_folder_id)

    # Snapshot the links as strings; retry a bounded number of times if the
    # page is still changing underneath us.
    papers = None
    for _ in range(3):
        try:
            # Get all the paper links in the volume (e.g., /2023.ijcnlp-main.1)
            paper_links = driver.find_elements(By.XPATH, "//a[@class='align-middle' and contains(@href, '/')]")
            papers = [
                (paper_link.get_attribute('href'), paper_link.text.replace(" ", "_"))
                for paper_link in paper_links
            ]
            break
        except StaleElementReferenceException as stale_e:
            print(f"Stale element encountered while navigating paper links: {stale_e}. Refetching paper links and retrying...")

    if papers is None:
        print("Giving up on this volume: paper links kept going stale.")
        return

    for paper_href, paper_title in papers:
        if not paper_href:
            continue  # Anchor without a usable target
        if paper_title in existing_papers:
            print(f"Skipping already processed paper: {paper_title}")
            continue

        print(f"Navigating to paper: {paper_href}")
        driver.get(paper_href)
        time.sleep(2)  # Wait for the paper details page to load

        # Must run while the driver is still on the paper page.
        download_and_upload_paper_pdf(driver, drive_service, volume_folder_id, paper_title)

        driver.back()
        time.sleep(2)  # Wait for the volume page to reload


In [104]:
def download_papers_from_volumes(event_folder_id, driver, drive_service):
    """Process every volume linked from the current event page.

    The driver must already be on the event page. Volume URLs are snapshotted
    as plain strings before any navigation, so each volume is opened directly
    by URL. An error while handling one volume therefore cannot shift or
    invalidate the remaining ones.

    For each volume a Drive folder named after the URL segment following
    ``/volumes/`` is created inside ``event_folder_id`` and the papers are
    downloaded into it. A volume whose folder already exists is skipped
    entirely, even if an earlier run was interrupted part-way through it.

    Args:
        event_folder_id: ID of the Drive folder for the current event.
        driver: Selenium WebDriver currently showing the event page.
        drive_service: Drive service object.

    Returns:
        None. Per-volume errors are printed and the loop moves on.
    """
    volume_xpath = "//a[contains(@href, '/volumes/')]"

    # Snapshot the hrefs; bounded retry in case the page is still settling.
    volume_hrefs = None
    for _ in range(3):
        try:
            volume_links = driver.find_elements(By.XPATH, volume_xpath)
            volume_hrefs = [volume_link.get_attribute('href') for volume_link in volume_links]
            break
        except StaleElementReferenceException as stale_e:
            print(f"Stale element encountered while fetching volumes: {stale_e}. Refetching volume links and retrying...")

    if volume_hrefs is None:
        print("Giving up on this event: volume links kept going stale.")
        return

    # Drop empty hrefs and repeated links to the same volume, keeping page order.
    volume_hrefs = list(dict.fromkeys(href for href in volume_hrefs if href))
    print(f"Found {len(volume_hrefs)} volumes.")

    for volume_href in volume_hrefs:
        try:
            # Segment right after "/volumes/"; works with or without a trailing slash.
            volume_name = volume_href.partition('/volumes/')[2].split('/')[0]
            if not volume_name:
                continue

            # Check if the folder for this volume already exists
            existing_folder_id = check_folder_exists(drive_service, volume_name, event_folder_id)
            if existing_folder_id:
                print(f"Skipping already processed volume: {volume_name}")
                continue

            print(f"Navigating to volume: {volume_name}")
            driver.get(volume_href)
            time.sleep(2)  # Wait for the volume page to load

            # Create folder for this volume in Google Drive
            volume_folder_id = create_folder_in_drive(drive_service, volume_name, event_folder_id)

            # Navigate to each paper in the volume and download PDFs
            navigate_and_download_papers_from_volume(driver, drive_service, volume_folder_id)

            # Go back to the event page
            driver.back()
            time.sleep(2)  # Wait for the event page to reload

        except Exception as e:
            # Best effort: report and carry on with the next volume.
            print(f"Error processing volume: {e}")
            continue

In [None]:
def traverse_events_and_download_papers(driver, drive_service, parent_folder_id):
    """Crawl every event in the "ACL Events" table and archive its papers.

    The driver must be on the page that contains the table. Event URLs are
    snapshotted as plain strings before any navigation, so the table element
    is never reused after the driver has left the page. Reusing it would
    raise stale-element errors and skip every remaining event.

    Only links whose URL ends with ``/`` and whose second-to-last ``/``-separated
    segment contains no hyphen are treated as events. For each one a Drive
    folder is created under ``parent_folder_id`` and
    ``download_papers_from_volumes`` is run. An event whose folder already
    exists is skipped entirely.

    Args:
        driver: Selenium WebDriver showing the page with the events table.
        drive_service: Drive service object.
        parent_folder_id: ID of the Drive folder that holds the event folders.

    Returns:
        None. Errors are printed; the function does not raise for them.
    """
    table_xpath = "//h6[text()='ACL Events']/following-sibling::table[contains(@class, 'table-hover')]"

    # Locate the table and snapshot its links; bounded retry on stale elements.
    event_hrefs = None
    for _ in range(3):
        try:
            table = driver.find_element(By.XPATH, table_xpath)
            print("ACL Events table found successfully!")
            event_hrefs = [link.get_attribute("href") for link in table.find_elements(By.TAG_NAME, "a")]
            break
        except StaleElementReferenceException as stale_e:
            print(f"Stale element encountered while fetching events: {stale_e}. Refetching event links and retrying...")
        except Exception as e:
            print(f"Error: Table not found. {e}")
            return

    if event_hrefs is None:
        print("Giving up: event links kept going stale.")
        return

    for href in event_hrefs:
        if not href:
            continue  # Anchor without a target
        try:
            event_name = href.split("/")[-2]  # Extract the event name (e.g., "aacl")

            # Process event names without a hyphen and with trailing "/"
            if '-' in event_name or not href.endswith('/'):
                continue
            print(f"Processing event: {event_name}")

            # Check if the event folder exists in Google Drive
            existing_folder_id = check_folder_exists(drive_service, event_name, parent_folder_id)
            if existing_folder_id:
                print(f"Skipping already processed event: {event_name}")
                continue  # Skip this event if the folder already exists

            # Navigate to the event link
            driver.get(href)
            time.sleep(2)  # Wait for the event page to load

            # Create a folder for this event in Google Drive
            event_folder_id = create_folder_in_drive(drive_service, event_name, parent_folder_id)

            # Traverse volumes and download papers within the event
            download_papers_from_volumes(event_folder_id, driver, drive_service)

            # Go back to the main event table page
            driver.back()
            time.sleep(2)  # Wait for the event page to reload

        except Exception as e:
            # Best effort: report and carry on with the next event.
            print(f"Error processing event: {e}")
            continue

In [103]:
# Start the full crawl: every event in the ACL Events table, then each of its
# volumes and papers. Events whose Drive folder already exists are skipped.
traverse_events_and_download_papers(driver, drive_service, parent_folder_id)


ACL Events table found successfully!
Skipping event: aacl
Processing event: acl
Skipping already processed event: acl
Processing event: anlp
Skipping already processed event: anlp
Processing event: cl
Skipping already processed event: cl
Processing event: conll
Folder 'conll' created with ID: 1qzjR-J2jDPwMXTthFwOwTyUDrP8EOHGQ
Navigating to volume: 2023.conll-1
Folder '2023.conll-1' created with ID: 1LwCWYKgw5zYv6Va0EQkjJrcSQ6-u-EVF
Navigating to paper: https://aclanthology.org/2023.conll-1.0/
Downloaded paper: 'Proceedings_of_the_27th_Conference_on_Computational_Natural_Language_Learning_(CoNLL)'.pdf
Uploaded 'Proceedings_of_the_27th_Conference_on_Computational_Natural_Language_Learning_(CoNLL)'.pdf to Google Drive with ID: 1pTZAuPpC5gP70WfUxUkGTGD3dgYREzww
Navigating to paper: https://aclanthology.org/2023.conll-1.1/
Downloaded paper: 'Can_Language_Models_Be_Tricked_by_Language_Illusions?_Easier_with_Syntax,_Harder_with_Semantics'.pdf
Uploaded 'Can_Language_Models_Be_Tricked_by_Langua

KeyboardInterrupt: 