In [1]:
# Step 1: Mount Google Drive so its contents are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Step 2: Install dependencies.
# Use the %pip magic rather than !pip so packages land in the running kernel's environment.
%pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client PyMuPDF pandas

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.13


In [3]:
# Install the fuzzy-matching dependency (with its C speedups).
# Use the %pip magic rather than !pip so packages land in the running kernel's environment.
%pip install PyMuPDF pandas fuzzywuzzy[speedup]

Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Do

In [4]:
# Step 3: Import libraries
import os
import io
import pandas as pd
import fitz  # PyMuPDF for PDF processing
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [5]:
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from google.colab import auth

In [6]:
from google.auth import default

In [7]:
# Step 4: Declare the OAuth scope(s) to request for the Google Drive API.
# (Authentication itself is performed in a separate cell.)
SCOPES = ['https://www.googleapis.com/auth/drive']

In [8]:
# Authenticate the Colab user, then obtain default credentials limited to SCOPES
auth.authenticate_user()
creds, _ = default(scopes=SCOPES)

# Drive v3 client used by every helper below
drive_service = build('drive', 'v3', credentials=creds)


In [None]:
# Step 5: Helper function to count pages in a PDF file
def count_pdf_pages(service, file_id, max_retries=3):
    """Download a Drive file and return the number of pages in the PDF.

    Args:
        service: Drive API client; must expose ``files().get_media(fileId=...)``.
        file_id: ID of the Drive file to download.
        max_retries: Maximum number of download-and-parse attempts.

    Returns:
        The page count of the PDF, or ``None`` when every attempt failed
        (or when ``max_retries`` is zero or negative).
    """
    for attempt in range(1, max_retries + 1):
        try:
            request = service.files().get_media(fileId=file_id)
            file_content = request.execute(num_retries=3)
            # Parse from memory; nothing is written to disk
            with io.BytesIO(file_content) as pdf_stream:
                with fitz.open("pdf", pdf_stream) as pdf:
                    return pdf.page_count
        except Exception as e:
            # Deliberately broad: any download or parse failure is reported and retried
            print(f"Attempt {attempt} for file {file_id} failed: {e}")
            # Only pause when another attempt will follow; no point sleeping before giving up
            if attempt < max_retries:
                time.sleep(1)
    print(f"Failed to retrieve page count for file {file_id} after {max_retries} attempts.")
    return None


In [11]:
# Helper function to normalize text as in the dataset
def normalize_text(text):
    """Turn a file name into the lowercase, underscore-separated key used in the dataset.

    Steps, in order: drop a trailing ``.pdf`` (any case), trim surrounding
    single/double quotes, lowercase, replace spaces with underscores, and
    finally discard every character that is not a letter, digit or underscore.
    """
    without_extension = re.sub(r'\.pdf$', '', text, flags=re.IGNORECASE)
    unquoted = without_extension.strip('\'"')
    snake_cased = unquoted.lower().replace(" ", "_")
    return re.sub(r'[^a-zA-Z0-9_]', '', snake_cased)


In [10]:
import os
import io
import pandas as pd
import fitz  # PyMuPDF for PDF processing
from google.colab import auth
from google.auth import default
from googleapiclient.discovery import build
from fuzzywuzzy import fuzz
import re
import time

In [14]:
# Step 7: Main function to process all files in the folder
def process_google_drive_folder(service, folder_id, output_csv='papers_dataset.csv'):
    """Count the pages of every PDF directly inside a Drive folder and save them to CSV.

    NOTE: a later cell defines a function with this same name; once that cell
    has run, it replaces this definition.

    Args:
        service: Drive API client exposing ``files().list(...)``.
        folder_id: ID of the Drive folder whose PDF children are processed.
        output_csv: Path of the CSV file that is rewritten after each file.

    Returns:
        A DataFrame with one row per PDF and the columns ``normalized_name``,
        ``page_count`` (``None`` when counting failed) and ``file_id``.
    """
    # Initialize the dataset
    dataset = []

    # List all files in the folder. The API returns results one page at a time,
    # so keep following nextPageToken until it is absent; a single call would
    # silently drop everything past the first page.
    query = f"'{folder_id}' in parents and mimeType='application/pdf'"
    files = []
    page_token = None
    while True:
        results = service.files().list(
            q=query,
            spaces='drive',
            fields="nextPageToken, files(id, name)",
            pageToken=page_token
        ).execute()
        files.extend(results.get('files', []))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    print(f"Found {len(files)} PDF files in the folder.")

    for file in files:
        file_id = file['id']
        original_name = file['name']
        normalized_name = normalize_text(original_name)

        print(f"Processing file: {original_name} (ID: {file_id})")
        page_count = count_pdf_pages(service, file_id)

        dataset.append({
            'normalized_name': normalized_name,
            'page_count': page_count,
            'file_id': file_id
        })

        # Save progress to CSV after processing each file
        pd.DataFrame(dataset).to_csv(output_csv, index=False)
        print(f"Saved progress to {output_csv}")

        # Introduce delay to avoid rate-limiting
        time.sleep(2)

    print(f"Processing complete. Dataset saved to {output_csv}")
    return pd.DataFrame(dataset)

In [27]:
def process_google_drive_folder(service, folder_id, output_csv='papers_dataset.csv'):
    """Incrementally build a CSV of page counts for the PDFs in a Drive folder.

    Files whose ``file_id`` already appears in ``output_csv`` are skipped, so the
    function can be re-run to pick up only newly added PDFs.

    NOTE(review): a file whose page count could not be determined is still
    recorded (with an empty ``page_count``) and is therefore skipped on later
    runs as well.

    Args:
        service: Drive API client exposing ``files().list(...)``.
        folder_id: ID of the Drive folder whose PDF children are processed.
        output_csv: CSV path used both to resume from and to save results to.

    Returns:
        A DataFrame holding the previously saved records followed by any new
        ones. When nothing new was found, the previously saved records are
        returned unchanged (an empty frame if there were none).
    """
    # Resume from an earlier run when its CSV is available
    if os.path.exists(output_csv):
        # Load existing records
        existing_df = pd.read_csv(output_csv)
        existing_file_ids = set(existing_df['file_id'])
        print(f"Loaded existing progress: {len(existing_df)} records found.")
    else:
        # Create an empty DataFrame if no existing file
        existing_df = pd.DataFrame(columns=['normalized_name', 'page_count', 'file_id'])
        existing_file_ids = set()
        print("No existing progress found. Starting fresh.")

    # Function to list all files with pagination
    def list_all_files_in_folder(service, folder_id):
        all_files = []
        page_token = None
        query = f"'{folder_id}' in parents and mimeType='application/pdf'"

        while True:
            results = service.files().list(
                q=query,
                spaces='drive',
                fields="nextPageToken, files(id, name)",
                pageToken=page_token
            ).execute()

            all_files.extend(results.get('files', []))  # Add current batch of files to the list

            page_token = results.get('nextPageToken')
            if not page_token:
                break  # No more pages

        print(f"Total files retrieved: {len(all_files)}")
        return all_files

    # Retrieve all files in the folder
    files = list_all_files_in_folder(service, folder_id)
    print(f"Found {len(files)} PDF files in the folder.")

    # Process the files
    new_records = []
    for file in files:
        file_id = file['id']
        original_name = file['name']

        # Skip files already processed
        if file_id in existing_file_ids:
            print(f"Skipping already processed file: {original_name} (ID: {file_id})")
            continue

        normalized_name = normalize_text(original_name)
        print(f"Processing file: {original_name} (ID: {file_id})")
        page_count = count_pdf_pages(service, file_id)

        new_records.append({
            'normalized_name': normalized_name,
            'page_count': page_count,
            'file_id': file_id
        })

    # Nothing new: hand back what was already on disk. (Previously this path fell
    # through to `return updated_df`, which was never assigned -> UnboundLocalError.)
    if not new_records:
        print("No new files to process.")
        return existing_df

    # Append new records to the DataFrame and save
    new_df = pd.DataFrame(new_records)
    updated_df = pd.concat([existing_df, new_df], ignore_index=True)
    updated_df.to_csv(output_csv, index=False)
    print(f"Saved {len(new_records)} new records to {output_csv}. Total records: {len(updated_df)}")

    return updated_df


In [32]:
# Step 8: Run the folder scan and keep the resulting DataFrame
parent_folder_id = '1v0NkebMgXHzSslTKf6GL62Jcvib7RA8O'  # Replace with your folder ID
papers_dataset = process_google_drive_folder(drive_service, parent_folder_id)

# Step 9: Preview the first rows of the dataset (last expression, so it renders as the cell output)
papers_dataset.head()

Loaded existing progress: 862 records found.
Total files retrieved: 875
Found 875 PDF files in the folder.
Processing file: Design of ultra-low noise amplifier for quantum applications (QLNA).pdf (ID: 17EBZ2827mLh-qcW8BKhWTWvsgcpXgJQD)
Processing file: Development of a solar powered multirotor micro aerial vehicle.pdf (ID: 1JpX6W1wwrHCUDp7LZgWh9pddGIhy8yVX)
Processing file: Fractional order memcapacitive neuromorphic elements reproduce and predict neuronal function.pdf (ID: 1oN_-4GO-eSNPNVWJC-RkA2h_1W8Lq1Fq)
Processing file: Penning micro-trap for quantum computing.pdf (ID: 12iOSdFdfCjtQx63FqOtyXU4UARhWYkUD)
Processing file: Pipeline quantum processor architecture for silicon spin qubits.pdf (ID: 1VLhA4V4w0n5ju8IOKVQDK9favtLbsK6n)
Processing file: A cost-effective approach to measurements of fluorophore temperature sensitivity and temperature cha.pdf (ID: 1WKsJlwMhH3vGDGbBKvThkjr89q35x7fR)
Processing file: Low disorder and high valley splitting in silicon.pdf (ID: 19p2OA2_tNiX8PPozw0u8

Unnamed: 0,normalized_name,page_count,file_id
0,emitted_current_selfbalancing_for_spacecraft_c...,18.0,1nar5NacYT4G3R52lGEHV8iLJ8WNIXOpq
1,reliability_research_of_thyristors_for_hvdc_tr...,12.0,1Rk6UwitxKKo94R9BOvptw72MBoxpX-KC
2,high_performance_simos_2_heterogeneous_embedde...,9.0,13GBnvysEZOGjD6UjvwtdN6x2aV5xaF4d
3,recent_advances_in_artificial_sensory_neurons_...,49.0,14sjZl21DnLq-FudkIkenIvuMRyVilzNG
4,achieving_nearly_barrier_free_transport_in_hig...,9.0,1FlSDj-bVL2GWAeMvXy7ffpn0sdYMf4lp


In [29]:
import chardet

# Read the saved CSV as raw bytes, then let chardet guess its encoding
with open('papers_dataset.csv', 'rb') as f:
    raw_bytes = f.read()

result = chardet.detect(raw_bytes)
print(result)


{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


In [30]:
import pandas as pd

# Load the existing CSV file (replace with your actual file name).
# Input and output are the same path, so the file is rewritten in place.
input_file = 'papers_dataset.csv'
output_file = 'papers_dataset.csv'

# Try strict UTF-8 first; fall back to latin-1 if the bytes are not valid UTF-8
try:
    df = pd.read_csv(input_file, encoding='utf-8')
    print(f"File '{input_file}' is already UTF-8 encoded.")
except UnicodeDecodeError:
    print(f"File '{input_file}' is not UTF-8 encoded. Converting to UTF-8.")
    # read_csv has no `errors` keyword (passing it raises TypeError);
    # the decoding-error policy is set with `encoding_errors`.
    df = pd.read_csv(input_file, encoding='latin-1', encoding_errors='replace')

# Save the DataFrame as a UTF-8 encoded CSV
df.to_csv(output_file, index=False, encoding='utf-8')
print(f"File has been successfully converted to UTF-8 and saved as '{output_file}'.")


File 'papers_dataset.csv' is already UTF-8 encoded.
File has been successfully converted to UTF-8 and saved as 'papers_dataset.csv'.
