In [None]:
!pip install boxsdk>=10


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import os
from box_sdk_gen import BoxClient, BoxDeveloperTokenAuth

# Fail fast with a clear message when the token is missing: os.getenv() would
# otherwise return None and hand an unusable token to the SDK.
developer_token = os.getenv('BOX_DEVELOPER_TOKEN')
if not developer_token:
    raise RuntimeError("BOX_DEVELOPER_TOKEN environment variable is not set.")

auth = BoxDeveloperTokenAuth(token=developer_token)
client = BoxClient(auth=auth)

In [62]:
# List the root folder ('0') and keep only the IDs of actual files. A folder
# listing can also contain sub-folders and web links, which cannot be fetched
# through the /files/{id}/content endpoint used by the download cells.
# NOTE(review): only the first page of results is read here — confirm whether
# the folder can hold more items than a single page returns.
root_items = client.folders.get_folder_items('0').entries or []
file_ids = [
    item.id
    for item in root_items
    # `type` may be an enum or a plain string; compare on the underlying value.
    if getattr(item.type, 'value', item.type) == 'file'
]

print(f"Found {len(file_ids)} files.")

Found 10 files.


In [65]:
import requests

def download_file(folder_path: str, item_id: str) -> int:
    """Download one Box file and save it as ``<folder_path>/<item_id>.pdf``.

    Args:
        folder_path: Directory the PDF is written to; created if missing.
        item_id: Box file ID to download.

    Returns:
        The size in bytes of the downloaded content written to disk.

    Raises:
        Exception: If the request fails, the response status is not 200, or
            the file cannot be written.
    """
    try:
        # Make a direct API call to download the file
        download_url = f"https://api.box.com/2.0/files/{item_id}/content"
        headers = {
            "Authorization": f"Bearer {os.getenv('BOX_DEVELOPER_TOKEN')}"
        }

        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(download_url, headers=headers, timeout=60)

        # A non-200 response used to fall through and return None, which only
        # surfaced later as a TypeError in sum(); fail here instead.
        if response.status_code != 200:
            raise Exception(f"Download failed with status code: {response.status_code}")

        # Save the PDF (create the target directory on first use)
        os.makedirs(folder_path, exist_ok=True)
        filename = f"{folder_path}/{item_id}.pdf"
        with open(filename, 'wb') as f:
            f.write(response.content)

        return len(response.content)
    except Exception as e:
        raise Exception(f"Failed to download file: {e}") from e

def bytes_to_mb(bytes: int) -> float:
    """Convert a byte count to megabytes, using 1 MB = 1024 * 1024 bytes."""
    kib = bytes / 1024
    return kib / 1024

import time

# Download every file one after another and time the whole run.
folder_path = "ir_papers"
# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
file_sizes = [download_file(folder_path, file_id) for file_id in file_ids]
end = time.perf_counter()

total_bytes = sum(file_sizes)
# Guard the average so an empty file list does not raise ZeroDivisionError.
avg_bytes = total_bytes / len(file_sizes) if file_sizes else 0.0

print(f"Downloaded {len(file_ids)} files in {end - start} seconds.")
print(f"Total file sizes: {total_bytes} bytes ({bytes_to_mb(total_bytes):.2f} MB).")
print(f"Average file size: {avg_bytes} bytes ({bytes_to_mb(avg_bytes):.2f} MB).")

Downloaded 10 files in 9.185985803604126 seconds.
Total file sizes: 8310725 bytes (7.93 MB).
Average file size: 831072.5 bytes (0.79 MB).


In [68]:
import requests
import concurrent.futures
import threading

def download_file(folder_path: str, item_id: str, semaphore: threading.Semaphore) -> int:
    """Download one Box file while holding ``semaphore`` and save it as a PDF.

    The file is written to ``<folder_path>/<item_id>.pdf``.

    Args:
        folder_path: Directory the PDF is written to; created if missing.
        item_id: Box file ID to download.
        semaphore: Held for the duration of the download to cap how many
            downloads run at the same time.

    Returns:
        The size in bytes of the downloaded content written to disk.

    Raises:
        Exception: If the request fails, the response status is not 200, or
            the file cannot be written. The message includes ``item_id``.
    """
    with semaphore:
        try:
            # Make a direct API call to download the file
            download_url = f"https://api.box.com/2.0/files/{item_id}/content"
            headers = {
                "Authorization": f"Bearer {os.getenv('BOX_DEVELOPER_TOKEN')}"
            }

            # A timeout keeps a stalled connection from tying up a worker
            # (and its semaphore slot) indefinitely.
            response = requests.get(download_url, headers=headers, timeout=60)

            if response.status_code != 200:
                raise Exception(f"Download failed with status code: {response.status_code}")

            # Save the PDF (create the target directory on first use;
            # exist_ok makes this safe when several workers race here)
            os.makedirs(folder_path, exist_ok=True)
            filename = f"{folder_path}/{item_id}.pdf"
            with open(filename, 'wb') as f:
                f.write(response.content)

            return len(response.content)
        except Exception as e:
            raise Exception(f"Failed to download file {item_id}: {e}") from e

def bytes_to_mb(bytes: int) -> float:
    """Return ``bytes`` expressed in megabytes (1 MB = 1024 * 1024 bytes)."""
    kibibytes = bytes / 1024
    mebibytes = kibibytes / 1024
    return mebibytes

import time

# Download the files concurrently and time the whole run.
folder_path = "ir_papers"
# One limit shared by the thread pool and the semaphore download_file() takes.
max_workers = 5
semaphore = threading.Semaphore(max_workers)

def download_worker(file_id):
    """Download one file into ``folder_path`` using the shared semaphore."""
    return download_file(folder_path, file_id, semaphore)

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # map() yields results in the order of file_ids; an exception raised in a
    # worker is re-raised here when its result is consumed.
    file_sizes = list(executor.map(download_worker, file_ids))
end = time.perf_counter()

total_bytes = sum(file_sizes)
# Guard the average so an empty file list does not raise ZeroDivisionError.
avg_bytes = total_bytes / len(file_sizes) if file_sizes else 0.0

print(f"Downloaded {len(file_ids)} files in {end - start} seconds.")
print(f"Total file sizes: {total_bytes} bytes ({bytes_to_mb(total_bytes):.2f} MB).")
print(f"Average file size: {avg_bytes} bytes ({bytes_to_mb(avg_bytes):.2f} MB).")

Downloaded 10 files in 1.8027098178863525 seconds.
Total file sizes: 8310725 bytes (7.93 MB).
Average file size: 831072.5 bytes (0.79 MB).


In [None]:
# Upload PDFs to Weaviate
# TODO: work in progress — this step is not implemented yet.