In [1]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC #expilicit wait

In [6]:
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

Google Drive Authentication

In [3]:
# OAuth scopes requested when authenticating against the Google Drive API
SCOPES = ["https://www.googleapis.com/auth/drive.file"]

In [16]:
# Authenticate with Google Drive
def authenticate_google_drive(token_path='token.json',
                              credentials_path='credentials.json',
                              scopes=None):
    """Return an authorized Google Drive v3 API client.

    Cached user credentials are loaded from ``token_path`` when that file
    exists. If they are missing or not valid, they are either refreshed
    (expired credentials that carry a refresh token) or obtained through the
    installed-app OAuth flow started from ``credentials_path``; the resulting
    credentials are then written back to ``token_path``.

    Args:
        token_path: File used to cache the user's tokens between runs.
        credentials_path: OAuth client-secrets file used for the login flow.
        scopes: OAuth scopes to request; defaults to the module-level SCOPES.

    Returns:
        The client object produced by ``build('drive', 'v3', ...)``.
    """
    if scopes is None:
        scopes = SCOPES

    creds = None
    # Reuse the tokens cached by a previous run, if any
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, scopes)

    # No usable credentials: refresh silently when possible, otherwise log in
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(credentials_path, scopes)
            # port=0 is passed through to run_local_server unchanged
            creds = flow.run_local_server(port=0)
        # Save the credentials for future use
        with open(token_path, 'w') as token:
            token.write(creds.to_json())

    # Build the Google Drive API client
    return build('drive', 'v3', credentials=creds)

# Function to create a folder in Google Drive
def create_folder_in_drive(service, folder_name, parent_folder_id=None):
    """Create a Drive folder named ``folder_name`` and return its ID.

    Args:
        service: Drive API client exposing ``files().create(...).execute()``.
        folder_name: Name given to the new folder.
        parent_folder_id: Optional ID of the folder to create it under; when
            falsy, no ``parents`` entry is sent.

    Returns:
        The ``id`` field of the response returned by the create request.
    """
    metadata = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
    }
    if parent_folder_id:
        metadata['parents'] = [parent_folder_id]

    # Only the new folder's ID is requested back from the API
    create_request = service.files().create(body=metadata, fields='id')
    created = create_request.execute()

    new_folder_id = created.get('id')
    print(f"Folder '{folder_name}' created with ID: {new_folder_id}")
    return new_folder_id

# Function to download a file
def download_file(download_url, save_path, timeout=30):
    """Download ``download_url`` and write its content to ``save_path``.

    The whole body is read before ``save_path`` is opened, so a failed
    download does not leave an empty file behind. Failures (missing URL,
    connection or HTTP errors, a non-200 status) are reported with a printed
    message instead of an exception.

    Args:
        download_url: URL of the file to fetch.
        save_path: Local path the content is written to (binary mode).
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        True when the file was written, False when the download failed.
    """
    # Imported here because the notebook's import cells bring in no HTTP client
    import http.client
    import urllib.request

    if not download_url:
        print(f"Failed to download: {download_url}")
        return False

    try:
        with urllib.request.urlopen(download_url, timeout=timeout) as response:
            # Non-HTTP responses (e.g. file:// URLs) carry no status code
            status = getattr(response, "status", None)
            if status is not None and status != 200:
                print(f"Failed to download: {download_url}")
                return False
            content = response.read()
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError and timeouts; ValueError covers malformed URLs
        print(f"Failed to download: {download_url}")
        return False

    with open(save_path, 'wb') as file:
        file.write(content)
    print(f"Downloaded: {save_path}")
    return True

# Helper: make link text usable as a single file or folder name
def _sanitize_path_component(text, fallback):
    """Replace path separators and other filesystem-reserved characters with '_'.

    Returns ``fallback`` when nothing usable is left after cleaning.
    """
    import re

    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", text or "").strip(" .")
    return cleaned or fallback


# Helper: pick a PDF file name that has not been used yet in the target folder
def _paper_file_name(title, url, title_is_ambiguous, used_names):
    """Build ``<title>.pdf`` and record it in ``used_names``.

    When the title is empty, the file stem of ``url`` is used instead; when
    several links share the same title, that stem is appended to the title.
    A numeric suffix is added as a last resort to keep names unique.
    """
    import posixpath
    from urllib.parse import urlsplit

    url_stem = posixpath.basename(urlsplit(url).path)
    if url_stem.lower().endswith(".pdf"):
        url_stem = url_stem[:-4]
    url_stem = _sanitize_path_component(url_stem, "")

    stem = _sanitize_path_component((title or "").replace(" ", "_"), "")
    if not stem:
        stem = url_stem or "paper"
    elif title_is_ambiguous and url_stem:
        stem = f"{stem}_{url_stem}"

    candidate = f"{stem}.pdf"
    suffix = 2
    while candidate in used_names:
        candidate = f"{stem}_{suffix}.pdf"
        suffix += 1
    used_names.add(candidate)
    return candidate


# Function to navigate through volumes and download papers
def download_papers_from_event(event_folder, driver):
    """Download every linked paper of every volume listed on the current page.

    The driver is expected to be on an event page. Links whose text contains
    'Volume' are treated as volumes; on each volume page, links whose text
    contains 'PDF' are treated as papers (adjust the XPaths if the site's
    markup differs). Papers are saved under ``event_folder/<volume name>/``.

    Names and URLs are read from the link elements before any navigation,
    because element handles found on one page cannot be reused after the
    driver has moved to another page. Links without an ``href`` are skipped.

    Args:
        event_folder: Local directory that receives one sub-folder per volume.
        driver: Selenium WebDriver currently showing the event page.
    """
    event_url = driver.current_url

    # Snapshot (name, url) pairs now; the elements go stale once we navigate away
    volumes = []
    for link in driver.find_elements(By.XPATH, "//a[contains(text(), 'Volume')]"):
        volume_url = link.get_attribute("href")
        if volume_url:
            volumes.append((link.text, volume_url))

    used_names_by_folder = {}
    for index, (volume_name, volume_url) in enumerate(volumes, start=1):
        print(f"Navigating to volume: {volume_name}")

        driver.get(volume_url)
        time.sleep(2)  # Wait for the volume page to load

        # Get all the paper links (adjust the XPath to fit the paper links)
        papers = []
        for link in driver.find_elements(By.XPATH, "//a[contains(text(), 'PDF')]"):
            paper_url = link.get_attribute("href")
            if paper_url:
                papers.append((link.text, paper_url))

        # Create folder for this volume
        folder_name = _sanitize_path_component(volume_name, f"volume_{index}")
        volume_folder = os.path.join(event_folder, folder_name)
        os.makedirs(volume_folder, exist_ok=True)

        # Titles shared by several links would otherwise map to the same file
        title_counts = {}
        for title, _ in papers:
            title_counts[title] = title_counts.get(title, 0) + 1

        used_names = used_names_by_folder.setdefault(volume_folder, set())
        for title, paper_url in papers:
            file_name = _paper_file_name(
                title, paper_url, title_counts[title] > 1, used_names
            )
            download_file(paper_url, os.path.join(volume_folder, file_name))

    # Leave the driver on the event page, as callers found it
    if volumes:
        driver.get(event_url)
        time.sleep(2)  # Wait for the event page to reload

In [9]:
# Authorize against Google Drive and keep the resulting API client
drive_service = authenticate_google_drive()
# Drive folder ID kept for later use as a parent folder
# NOTE(review): presumably the folder that per-event folders are created under - confirm
parent_folder_id = "1WZ2l6ZT3Op4X7P95wrLt2P2asVxd6rMz"

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=474666803493-g96b49gl8mm8d77u8sg5sf72js5q5f56.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A53074%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.file&state=QG68lTyDN09954Z2a61nwUg6P1GNoQ&access_type=offline


KeyboardInterrupt: 

Selenium Configuration

In [12]:
# Speed-up: block images and JavaScript. If the pages turn out to rely on
# JS-driven links or buttons, remove the "javascript" entry below.
options = Options()
prefs = {"profile.default_content_setting_values": {"images": 2, "javascript": 2}}  # Block images and JS
options.add_experimental_option("prefs", prefs)
# Run in headless mode. Passed as a Chrome switch because the
# `Options.headless` setter is deprecated and has been removed in recent
# Selenium 4 releases, where assigning the attribute has no effect.
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

In [13]:
# Open the ACL Anthology landing page, then set the driver's implicit wait
ACL_ANTHOLOGY_URL = "https://aclanthology.org"
IMPLICIT_WAIT_SECONDS = 15

driver.get(ACL_ANTHOLOGY_URL)
driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)

In [14]:
# Locate the table containing the event links
# Bind the name up front so later `table` checks see None instead of raising
# NameError when the lookup below fails.
table = None
try:
    table = driver.find_element(By.XPATH, "//h6[text()='ACL Events']/following-sibling::table[contains(@class, 'table-hover')]")
    print("ACL Events table found successfully!")
except Exception as e:
    print(f"Error: Table not found. {e}")

ACL Events table found successfully!


In [15]:
# If the table is found, find all <a> tags (links) within the table
if table is not None:
    links = table.find_elements(By.TAG_NAME, "a")

    # Loop through each link in the table
    for link in links:
        # Get the href attribute (the event URL)
        href = link.get_attribute("href")

        # Skip anchors without an href, and only keep URLs with a trailing "/"
        if not href or not href.endswith('/'):
            continue

        # With a trailing "/", the second-to-last piece is the last path
        # segment, i.e. the event name (e.g., "aacl")
        event_name = href.split("/")[-2]

        # Only process event names without a hyphen
        if '-' in event_name:
            continue

        # Preview the folder name without creating it
        print(f"Folder name that would be created: {event_name}")

        # Optionally, preview the href links as well
        print(f"Link found: {href}")

        # Dry run only. To create the folders for real, uncomment:
        # create_folder_in_drive(drive_service, event_name, parent_folder_id)

Folder name that would be created: aacl
Link found: https://aclanthology.org/venues/aacl/
Folder name that would be created: acl
Link found: https://aclanthology.org/venues/acl/
Folder name that would be created: anlp
Link found: https://aclanthology.org/venues/anlp/
Folder name that would be created: cl
Link found: https://aclanthology.org/venues/cl/
Folder name that would be created: conll
Link found: https://aclanthology.org/venues/conll/
Folder name that would be created: eacl
Link found: https://aclanthology.org/venues/eacl/
Folder name that would be created: emnlp
Link found: https://aclanthology.org/venues/emnlp/
Folder name that would be created: findings
Link found: https://aclanthology.org/venues/findings/
Folder name that would be created: iwslt
Link found: https://aclanthology.org/venues/iwslt/
Folder name that would be created: naacl
Link found: https://aclanthology.org/venues/naacl/
Folder name that would be created: semeval
Link found: https://aclanthology.org/venues/sem