In [6]:
import os
import json

filtered_documents_json_path = "/Volumes/MyDataDrive/thesis/code-2/src/fireworks/filtering/pii_filtering_data.json"
existing_downloads_dir = "/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/downloaded_images"


with open(filtered_documents_json_path, 'r') as f:
    filtered_documents = json.load(f)

# Get list of already downloaded documents
existing_docs = set()
if os.path.exists(existing_downloads_dir):
    existing_docs = set(os.listdir(existing_downloads_dir))
    print(f"Found {len(existing_docs)} already downloaded documents")

print(len(filtered_documents['documents_with_pii']))

# Create a set of documents to download (excluding already downloaded ones)
docs_to_download = []
for doc in filtered_documents['documents_with_pii']:
    doc_id = doc['file_path'].split('/')[-1].split('.')[0]
    if doc_id not in existing_docs:
        docs_to_download.append(doc)

print(f"Found {len(docs_to_download)} new documents to download")
print(f"Skipping {len(existing_docs)} already downloaded documents")



Found 5229 already downloaded documents
5253
Found 24 new documents to download
Skipping 5229 already downloaded documents


In [2]:
# Here we will download the images from the filtered documents, this the sample url for download "this is the sample url for a download "@https://download.industrydocuments.ucsf.edu/f/j/m/g/fjmg0021/fjmg0021.tif ""

import requests
import os
from tqdm import tqdm
from urllib.parse import urljoin
import time
from PIL import Image
import io

# Create a directory to store downloaded images if it doesn't exist
output_dir = "downloaded_images"
os.makedirs(output_dir, exist_ok=True)

# Base URL for downloads
base_url = "https://download.industrydocuments.ucsf.edu/"

# Function to construct download URL from file path
def get_download_url(file_path):
    # Extract the document ID from the file path (assuming format like 'fjmg0021')
    doc_id = os.path.splitext(os.path.basename(file_path))[0]
    # Construct path segments (f/j/m/g/fjmg0021/fjmg0021.tif)
    segments = [doc_id[i] for i in range(4)]  # First 4 characters as segments
    url_path = '/'.join(segments + [doc_id, f"{doc_id}.tif"])
    return urljoin(base_url, url_path)

# Function to save individual pages from a TIFF file
def save_tiff_pages(tiff_data, doc_id, output_dir):
    try:
        # Open the TIFF data as an image
        with Image.open(io.BytesIO(tiff_data)) as img:
            # Create directory for this document if it doesn't exist
            doc_dir = os.path.join(output_dir, doc_id)
            os.makedirs(doc_dir, exist_ok=True)
            
            # Get the number of frames/pages in the TIFF
            n_frames = 1
            try:
                while True:
                    # Save the current frame (1-based indexing)
                    output_path = os.path.join(doc_dir, f"{doc_id}_page{n_frames}.tif")
                    img.save(output_path)
                    n_frames += 1
                    img.seek(img.tell() + 1)
            except EOFError:
                pass  # We've reached the end of the frames
            
            return True, n_frames - 1
    except Exception as e:
        print(f"Error processing TIFF for {doc_id}: {str(e)}")
        return False, 0

# Function to download and process a single document with retry logic
def download_and_process_document(url, doc_id, max_retries=3):
    for attempt in range(max_retries):
        try:
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()
            
            # Read the entire response content
            content = response.content
            
            # Process and save individual pages
            success, num_pages = save_tiff_pages(content, doc_id, output_dir)
            if success:
                return True, num_pages
            else:
                return False, 0
                
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Failed to download {url} after {max_retries} attempts: {str(e)}")
                return False, 0
            time.sleep(1)  # Wait before retrying

# Download and process images with progress bar
failed_downloads = []
successful_downloads = []

for doc in tqdm(docs_to_download, desc="Processing documents"):
    file_path = doc['file_path']
    doc_id = os.path.splitext(os.path.basename(file_path))[0]
    url = get_download_url(file_path)
    
    # Skip if document directory already exists
    doc_dir = os.path.join(output_dir, doc_id)
    if os.path.exists(doc_dir):
        continue
    
    # Download and process the document
    success, num_pages = download_and_process_document(url, doc_id)
    
    if success:
        successful_downloads.append((doc_id, num_pages))
    else:
        failed_downloads.append((doc_id, url))
    
    # Add a small delay between downloads to be nice to the server
    time.sleep(0.5)

# Print summary
print("\nDownload Summary:")
print(f"Total documents processed: {len(filtered_documents['documents_with_pii'])}")
print(f"Successful downloads: {len(successful_downloads)}")
print(f"Failed downloads: {len(failed_downloads)}")

if successful_downloads:
    print("\nSuccessful Downloads:")
    for doc_id, num_pages in successful_downloads:
        print(f"- {doc_id}: {num_pages} pages")

if failed_downloads:
    print("\nFailed Downloads:")
    for doc_id, url in failed_downloads:
        print(f"- {doc_id}: {url}")



Processing documents:   0%|          | 0/1553 [00:00<?, ?it/s]

Failed to download https://download.industrydocuments.ucsf.edu/f/m/b/g/fmbg0252/fmbg0252.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/m/b/g/fmbg0252/fmbg0252.tif


Processing documents:   0%|          | 1/1553 [00:04<2:03:26,  4.77s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/j/n/c/fjnc0242/fjnc0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/j/n/c/fjnc0242/fjnc0242.tif


Processing documents:   0%|          | 2/1553 [00:09<2:00:42,  4.67s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/h/j/n/fhjn0234/fhjn0234.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/h/j/n/fhjn0234/fhjn0234.tif


Processing documents:   0%|          | 3/1553 [00:13<1:56:19,  4.50s/it]

Error processing TIFF for fhbc0223: Image size (286720000 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.


Processing documents:   0%|          | 4/1553 [00:15<1:25:48,  3.32s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/k/k/m/fkkm0249/fkkm0249.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/k/k/m/fkkm0249/fkkm0249.tif


Processing documents:   0%|          | 5/1553 [00:19<1:35:56,  3.72s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/h/p/m/fhpm0237/fhpm0237.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/h/p/m/fhpm0237/fhpm0237.tif


Processing documents:   0%|          | 6/1553 [00:24<1:41:55,  3.95s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/j/c/b/fjcb0242/fjcb0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/j/c/b/fjcb0242/fjcb0242.tif


Processing documents:   0%|          | 7/1553 [00:28<1:44:46,  4.07s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/j/f/y/fjfy0253/fjfy0253.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/j/f/y/fjfy0253/fjfy0253.tif


Processing documents:   1%|          | 8/1553 [00:32<1:46:12,  4.12s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/j/c/l/fjcl0243/fjcl0243.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/j/c/l/fjcl0243/fjcl0243.tif


Processing documents:   1%|          | 9/1553 [00:37<1:48:48,  4.23s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/l/c/v/flcv0238/flcv0238.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/l/c/v/flcv0238/flcv0238.tif


Processing documents:   1%|          | 10/1553 [00:41<1:49:21,  4.25s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/h/x/d/fhxd0242/fhxd0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/h/x/d/fhxd0242/fhxd0242.tif


Processing documents:   1%|          | 11/1553 [00:45<1:50:28,  4.30s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/g/w/f/fgwf0242/fgwf0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/g/w/f/fgwf0242/fgwf0242.tif


Processing documents:   1%|          | 12/1553 [00:50<1:50:33,  4.30s/it]

Error processing TIFF for fkff0223: Image size (286720000 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.


Processing documents:   1%|          | 13/1553 [00:51<1:30:07,  3.51s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/h/b/p/fhbp0237/fhbp0237.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/h/b/p/fhbp0237/fhbp0237.tif


Processing documents:   1%|          | 14/1553 [00:55<1:35:39,  3.73s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/f/d/h/ffdh0240/ffdh0240.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/f/d/h/ffdh0240/ffdh0240.tif


Processing documents:   1%|          | 15/1553 [01:00<1:39:26,  3.88s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/k/m/c/fkmc0242/fkmc0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/k/m/c/fkmc0242/fkmc0242.tif


Processing documents:   1%|          | 16/1553 [01:04<1:41:47,  3.97s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/k/x/w/fkxw0242/fkxw0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/k/x/w/fkxw0242/fkxw0242.tif


Processing documents:   1%|          | 17/1553 [01:08<1:42:23,  4.00s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/l/h/h/flhh0235/flhh0235.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/l/h/h/flhh0235/flhh0235.tif


Processing documents:   1%|          | 18/1553 [01:12<1:44:07,  4.07s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/k/y/f/fkyf0246/fkyf0246.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/k/y/f/fkyf0246/fkyf0246.tif


Processing documents:   6%|▋         | 98/1553 [03:21<45:48,  1.89s/it]  _TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
Processing documents:   7%|▋         | 110/1553 [03:35<28:08,  1.17s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/j/g/k/fjgk0245/fjgk0245.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/j/g/k/fjgk0245/fjgk0245.tif


Processing documents:  18%|█▊        | 286/1553 [08:14<29:56,  1.42s/it]  

Failed to download https://download.industrydocuments.ucsf.edu/f/k/m/g/fkmg0242/fkmg0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/k/m/g/fkmg0242/fkmg0242.tif


Processing documents:  20%|█▉        | 310/1553 [08:55<28:19,  1.37s/it]

Failed to download https://download.industrydocuments.ucsf.edu/f/g/c/x/fgcx0242/fgcx0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/g/c/x/fgcx0242/fgcx0242.tif


Processing documents:  29%|██▉       | 450/1553 [12:27<25:31,  1.39s/it]_TIFFVSetField: downloaded_images/flwp0087/flwp0087_page1.tif: Bad value 0 for "ResolutionUnit" tag.


Error processing TIFF for flwp0087: Error setting from dictionary


Processing documents:  38%|███▊      | 583/1553 [15:48<23:33,  1.46s/it]_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
Processing documents:  41%|████▏     | 642/1553 [17:17<32:39,  2.15s/it]_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
Processing documents:  48%|████▊     | 744/1553 [19:49<17:34,  1.30s/it]_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
Processing documents:  53%|█████▎    | 827/1553 [21:53<16:49,  1.39s/it]_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count for "T

Failed to download https://download.industrydocuments.ucsf.edu/f/g/m/g/fgmg0242/fgmg0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/g/m/g/fgmg0242/fgmg0242.tif


Processing documents:  59%|█████▉    | 921/1553 [24:26<22:44,  2.16s/it]_TIFFVSetField: downloaded_images/fjhf0093/fjhf0093_page1.tif: Bad value 0 for "ResolutionUnit" tag.


Error processing TIFF for fjhf0093: Error setting from dictionary


Processing documents:  72%|███████▏  | 1116/1553 [29:11<10:54,  1.50s/it]_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
Processing documents:  72%|███████▏  | 1121/1553 [29:18<11:00,  1.53s/it]_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
Processing documents:  74%|███████▍  | 1154/1553 [30:07<09:54,  1.49s/it]_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
Processing documents:  79%|███████▉  | 1224/1553 [31:53<09:29,  1.73s/it]_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count fo

Failed to download https://download.industrydocuments.ucsf.edu/f/k/j/f/fkjf0242/fkjf0242.tif after 3 attempts: 403 Client Error: Forbidden for url: https://download.industrydocuments.ucsf.edu/f/k/j/f/fkjf0242/fkjf0242.tif


Processing documents:  90%|█████████ | 1402/1553 [36:03<04:00,  1.59s/it]_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
_TIFFVSetField: tempfile.tif: Null count for "Tag 32934" (type 4, writecount -3, passcount 1).
Processing documents: 100%|██████████| 1553/1553 [40:21<00:00,  1.56s/it]


Download Summary:
Total documents processed: 5253
Successful downloads: 1527
Failed downloads: 26

Successful Downloads:
- fgvc0000: 1 pages
- fgdj0003: 3 pages
- ffjf0074: 1 pages
- fjwn0219: 23 pages
- ffpj0115: 2 pages
- fjyj0212: 3 pages
- fgwj0109: 1 pages
- flph0136: 1 pages
- fljn0147: 1 pages
- fhhl0239: 49 pages
- ffmv0002: 1 pages
- ffvn0164: 3 pages
- fjll0127: 1 pages
- flcb0134: 1 pages
- fjmx0024: 1 pages
- flbj0122: 3 pages
- fhyl0026: 1 pages
- fhlf0142: 3 pages
- fkpp0006: 1 pages
- fkdp0209: 4 pages
- ffcv0201: 1 pages
- flnp0225: 52 pages
- flhm0071: 1 pages
- fjwd0007: 1 pages
- fgbj0029: 1 pages
- fghm0039: 1 pages
- fldl0189: 1 pages
- flhn0028: 124 pages
- flgw0215: 13 pages
- ffxm0090: 1 pages
- flcy0081: 2 pages
- ffxp0050: 3 pages
- fhmn0184: 1 pages
- fkxb0047: 1 pages
- fjdc0115: 1 pages
- fhjk0112: 20 pages
- fjkv0205: 5 pages
- flyh0225: 3 pages
- flfh0021: 1 pages
- fllp0005: 1 pages
- fkgx0100: 8 pages
- ffhk0052: 2 pages
- fgmc0028: 1 pages
- fhlc0202:


