In [1]:
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.http import MediaIoBaseUpload
from selenium.common.exceptions import StaleElementReferenceException
import io
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload

In [2]:
import os

In [3]:
SCOPES = ['https://www.googleapis.com/auth/drive.file']

In [4]:
# Authenticate with Google Drive
def authenticate_google_drive():
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
    drive_service = build('drive', 'v3', credentials=creds)
    return drive_service

In [5]:
drive_service = authenticate_google_drive()
parent_folder_id = ''

In [6]:
# Function to create a log file in Google Drive
def create_log_file(drive_service, folder_id, log_filename='checkpoint.txt'):
    log_file_metadata = {
        'name': log_filename,
        'parents': [folder_id],
        'mimeType': 'text/plain'
    }
    log_file = drive_service.files().create(body=log_file_metadata, fields='id').execute()
    print(f"Created log file: {log_filename} with ID: {log_file.get('id')}")
    return log_file.get('id')

# Function to append to the log file in Google Drive
def append_to_log_file(drive_service, log_file_id, text):
    # Get the log file content (if any)
    request = drive_service.files().get_media(fileId=log_file_id)
    log_content = io.BytesIO()
    downloader = MediaIoBaseDownload(log_content, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
    log_content.seek(0)
    
    # Append the new log content
    new_content = log_content.read().decode('utf-8') + text + '\n'
    
    # Write the new content back to the log file
    media_body = MediaFileUpload(io.BytesIO(new_content.encode('utf-8')), mimetype='text/plain')
    drive_service.files().update(fileId=log_file_id, media_body=media_body).execute()
    print(f"Updated log file {log_file_id} with new entry: {text}")


In [7]:
# Function to list files in a specific folder
def list_files_in_folder(drive_service, folder_id):
    query = f"'{folder_id}' in parents and trashed=false"
    result = drive_service.files().list(q=query).execute()
    return result.get('files', [])

# Function to create or update the checkpoint.txt file in each folder
def create_or_update_checkpoint_file(drive_service, folder_id, files):
    log_filename = 'checkpoint.txt'
    
    # Check if the checkpoint file already exists in this folder
    existing_log_files = list_files_in_folder(drive_service, folder_id)
    log_file_id = None
    for file in existing_log_files:
        if file['name'] == log_filename:
            log_file_id = file['id']
            break

    # If the log file doesn't exist, create it
    if log_file_id is None:
        log_file_id = create_log_file(drive_service, folder_id, log_filename)

    # Prepare the content of the checkpoint file (file names)
    log_content = '\n'.join([f['name'] for f in files])

    # Update the checkpoint file with the list of files
    media_body = MediaFileUpload(io.BytesIO(log_content.encode('utf-8')), mimetype='text/plain')
    drive_service.files().update(fileId=log_file_id, media_body=media_body).execute()
    print(f"Checkpoint updated for folder {folder_id}")

# Function to recursively traverse folders and log files
def traverse_and_log_files(drive_service, parent_folder_id):
    # List all event folders (subfolders of ACLanthology)
    event_folders = list_files_in_folder(drive_service, parent_folder_id)

    for event_folder in event_folders:
        event_folder_id = event_folder['id']
        event_name = event_folder['name']
        print(f"Processing event: {event_name}")

        # List all volumes within the event folder
        volume_folders = list_files_in_folder(drive_service, event_folder_id)
        
        for volume_folder in volume_folders:
            volume_folder_id = volume_folder['id']
            volume_name = volume_folder['name']
            print(f"Processing volume: {volume_name}")

            # List all papers (files) within the volume folder
            paper_files = list_files_in_folder(drive_service, volume_folder_id)

            # Create or update the checkpoint.txt for this volume
            create_or_update_checkpoint_file(drive_service, volume_folder_id, paper_files)

In [8]:
# Function to list files in a specific folder
def list_files_in_folder(drive_service, folder_id):
    query = f"'{folder_id}' in parents and trashed=false"
    result = drive_service.files().list(q=query).execute()
    return result.get('files', [])

# Function to create or update the checkpoint.txt file in each folder
def create_or_update_checkpoint_file(drive_service, folder_id, files):
    log_filename = 'checkpoint.txt'
    
    # Check if the checkpoint file already exists in this folder
    existing_log_files = list_files_in_folder(drive_service, folder_id)
    log_file_id = None
    for file in existing_log_files:
        if file['name'] == log_filename:
            log_file_id = file['id']
            break

    # Prepare the content of the checkpoint file (file names)
    log_content = '\n'.join([f['name'] for f in files])

    # Write the content to a temporary file
    with open(log_filename, 'w') as temp_log_file:
        temp_log_file.write(log_content)

    # Upload or update the checkpoint file
    media_body = MediaFileUpload(log_filename, mimetype='text/plain')
    
    if log_file_id:
        # Update the existing log file
        drive_service.files().update(fileId=log_file_id, media_body=media_body).execute()
    else:
        # Create a new log file in the folder
        file_metadata = {'name': log_filename, 'parents': [folder_id]}
        drive_service.files().create(body=file_metadata, media_body=media_body).execute()

    # Remove the temporary local file after uploading
    os.remove(log_filename)

    print(f"Checkpoint updated for folder {folder_id}")

# Function to recursively traverse folders and log files
def traverse_and_log_files(drive_service, parent_folder_id):
    # List all event folders (subfolders of ACLanthology)
    event_folders = list_files_in_folder(drive_service, parent_folder_id)

    for event_folder in event_folders:
        event_folder_id = event_folder['id']
        event_name = event_folder['name']
        print(f"Processing event: {event_name}")

        # List all volumes within the event folder
        volume_folders = list_files_in_folder(drive_service, event_folder_id)
        
        for volume_folder in volume_folders:
            volume_folder_id = volume_folder['id']
            volume_name = volume_folder['name']
            print(f"Processing volume: {volume_name}")

            # List all papers (files) within the volume folder
            paper_files = list_files_in_folder(drive_service, volume_folder_id)

            # Create or update the checkpoint.txt for this volume
            create_or_update_checkpoint_file(drive_service, volume_folder_id, paper_files)

In [24]:
traverse_and_log_files(drive_service, parent_folder_id)

Processing event: conll
Processing volume: 2023.conll-1
Checkpoint updated for folder 1LwCWYKgw5zYv6Va0EQkjJrcSQ6-u-EVF
Processing event: cl
Processing volume: 2024.cl-1
Checkpoint updated for folder 1fQPv5aivNulmduEedwzWkKWLsF19MtJ6
Processing event: acl
Processing volume: 2024.acl-long
Checkpoint updated for folder 1X-OaGgPSH76t2dHLicjSQLECMG63Uflg
Processing event: anlp
Processing volume: A00-1
Checkpoint updated for folder 1wRSZ9tRIo3QN8fo4hBX9qyfWR5GLueje
Processing event: aacl
Processing volume: 2020.aacl-demo
Checkpoint updated for folder 1r72V_mkJ29yLhJJCL-0k6xZ5KqkPeF0u
Processing volume: 2020.aacl-srw
Checkpoint updated for folder 1JCeaLyLXLhV43lGZJHAy4C_R3R9S7O2g
Processing volume: 2020.aacl-main
Checkpoint updated for folder 1oPX2tp3_AoFlL8GMuKw1TbqtYBYeT0oO
Processing volume: 2022.aacl-tutorials
Checkpoint updated for folder 1c4RxX4XKNQQNuRRHGftOBvepqzXvsixW
Processing volume: 2022.aacl-demo
Checkpoint updated for folder 1ANib5a05yzlynBr8cYbL2fluS0uDMpqZ
Processing volume:

In [2]:
import pandas as pd
df = pd.read_csv("acl_anthology_dataset_with_pages.csv")
df.head()
df.columns
df.shape
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5872 entries, 0 to 5871
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   location     5872 non-null   object 
 1   paper_name   5872 non-null   object 
 2   paper_pages  3071 non-null   float64
dtypes: float64(1), object(2)
memory usage: 137.8+ KB


Unnamed: 0,paper_pages
count,3071.0
mean,10.834582
std,6.989754
min,0.0
25%,7.0
50%,10.0
75%,12.0
max,121.0


In [4]:
df_repaeted= pd.read_csv("acl_anthology_repeated_papers.csv")

In [6]:
df_original= pd.read_csv("acl_anthology_dataset.csv")

In [7]:
# Load the original dataset
df_original = pd.read_csv("acl_anthology_dataset.csv")

# Print initial info
print(f"Original dataset size: {len(df_original)}")

# Remove duplicates based on paper_name (or any other relevant columns)
df_original_clean = df_original.drop_duplicates(subset=['paper_name'])

# Print results
print(f"Dataset size after removing duplicates: {len(df_original_clean)}")
print(f"Removed {len(df_original) - len(df_original_clean)} duplicate entries")

# Save the cleaned dataset
df_original_clean.to_csv("acl_anthology_dataset_clean.csv", index=False)

Original dataset size: 5872
Dataset size after removing duplicates: 5787
Removed 85 duplicate entries
