In [2]:
import os
import requests

In [3]:
# Here we convert the tif images to png images and get their dimension and orientation using pytesseract
import os
import json
from PIL import Image
from tqdm import tqdm
import glob
import pytesseract

# Directory containing the downloaded TIFF images
input_dir = "downloaded_images"
output_dir = "png_images"

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load existing metadata if it exists
metadata_path = "image_metadata.json"
if os.path.exists(metadata_path):
    with open(metadata_path, 'r') as f:
        image_metadata = json.load(f)
    # Create set of already processed documents
    processed_docs = {img["doc_id"] for img in image_metadata["images"]}
else:
    image_metadata = {"images": []}
    processed_docs = set()

# Function to determine orientation based on dimensions
def get_orientation_confidence(image):
    """
    Get orientation and confidence score using pytesseract OSD (Orientation and Script Detection)
    Returns:
        tuple: (orientation_angle, confidence, detected_orientation)
        orientation_angle: 0, 90, 180, or 270 degrees
        confidence: confidence score from pytesseract
    """
    try:
        # Get OSD data from pytesseract
        osd = pytesseract.image_to_osd(image, output_type=pytesseract.Output.DICT)

        confidence = osd['orientation_conf']
        rotation_angle = osd['rotate']
            
        return rotation_angle, confidence
        
    except Exception as e:
        print(f"Error in OCR orientation detection: {str(e)}")
        return 0, 0

# Process all TIFF files in subdirectories
for doc_dir in tqdm(glob.glob(os.path.join(input_dir, "*")), desc="Processing documents"):
    doc_id = os.path.basename(doc_dir)
    
    # Skip if document has already been processed
    if doc_id in processed_docs:
        continue
        
    # Create corresponding output directory
    doc_output_dir = os.path.join(output_dir, doc_id)
    os.makedirs(doc_output_dir, exist_ok=True)
    
    # Process each TIFF file in the document directory
    for tiff_path in glob.glob(os.path.join(doc_dir, "*.tif")):
        try:
            # Get the page number from the filename
            page_num = int(os.path.splitext(os.path.basename(tiff_path))[0].split('_page')[1])
            
            # Open and process the image
            with Image.open(tiff_path) as img:
                # Get image dimensions
                width, height = img.size

                # Convert to RGB for PNG
                rgb_img = img.convert('RGB')
                
                # Determine orientation
                angle, confidence = get_orientation_confidence(rgb_img)
                
                # Create PNG filename
                png_filename = f"{doc_id}_page{page_num}.png"
                png_path = os.path.join(doc_output_dir, png_filename)
                
                # Convert and save as PNG
                img.convert('RGB').save(png_path, 'PNG')
                
                # Store metadata
                image_metadata["images"].append({
                    "doc_id": doc_id,
                    "page": doc_id+"_page"+str(page_num),
                    "page_number": page_num,
                    "original_path": tiff_path,
                    "png_path": png_path,
                    "width": width,
                    "height": height,
                    "rotation_angle": angle,
                    "orientation_confidence": confidence,
                    "aspect_ratio": round(width / height, 3)
                })
                
        except Exception as e:
            print(f"Error processing {tiff_path}: {str(e)}")

# Sort images by doc_id and page_number
image_metadata["images"].sort(key=lambda x: (x["doc_id"], x["page_number"]))

# Save metadata to JSON file
metadata_path = "image_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(image_metadata, f, indent=2)

# Print summary
print("\nProcessing Summary:")
print(f"Total images processed: {len(image_metadata['images'])}")

# Print dimension ranges
widths = [img["width"] for img in image_metadata["images"]]
heights = [img["height"] for img in image_metadata["images"]]

print("\nDimension Ranges:")
print(f"Width: {min(widths)} to {max(widths)} pixels")
print(f"Height: {min(heights)} to {max(heights)} pixels")

Processing documents:   0%|          | 3/5229 [00:13<8:04:55,  5.57s/it]



Processing documents:   1%|          | 47/5229 [00:47<51:37,  1.67it/s]  



Processing documents:   5%|▍         | 243/5229 [03:28<1:49:21,  1.32s/it]



Processing documents:   5%|▍         | 248/5229 [03:30<1:22:35,  1.01it/s]



Processing documents:   5%|▍         | 260/5229 [03:46<1:13:25,  1.13it/s]



Processing documents:   6%|▌         | 308/5229 [04:14<36:46,  2.23it/s]  



Processing documents:   6%|▌         | 316/5229 [04:18<39:43,  2.06it/s]



Processing documents:   8%|▊         | 441/5229 [11:58<2:17:40,  1.73s/it] 



Processing documents:  11%|█         | 557/5229 [23:43<1:51:20,  1.43s/it] 



Processing documents:  11%|█▏        | 596/5229 [26:59<4:20:41,  3.38s/it] 



Processing documents:  12%|█▏        | 609/5229 [27:03<1:21:37,  1.06s/it]



Processing documents:  12%|█▏        | 625/5229 [27:14<56:51,  1.35it/s]  



Processing documents:  14%|█▎        | 710/5229 [29:04<46:24,  1.62it/s]   



Processing documents:  14%|█▍        | 756/5229 [30:50<58:10,  1.28it/s]   



Processing documents:  15%|█▌        | 800/5229 [31:18<32:02,  2.30it/s]  



Processing documents:  17%|█▋        | 908/5229 [37:08<15:36:20, 13.00s/it]



Processing documents:  18%|█▊        | 918/5229 [37:12<6:07:22,  5.11s/it] 



Processing documents:  18%|█▊        | 967/5229 [38:50<2:48:04,  2.37s/it]



Processing documents:  20%|██        | 1051/5229 [41:18<1:06:55,  1.04it/s] 



Processing documents:  21%|██        | 1080/5229 [42:40<1:08:28,  1.01it/s]



Processing documents:  21%|██        | 1105/5229 [44:53<9:43:17,  8.49s/it]



Processing documents:  21%|██▏       | 1123/5229 [45:29<1:54:32,  1.67s/it]



Processing documents:  21%|██▏       | 1124/5229 [46:02<5:46:13,  5.06s/it]



Processing documents:  22%|██▏       | 1129/5229 [46:59<7:31:16,  6.60s/it] 



Processing documents:  23%|██▎       | 1198/5229 [47:48<48:39,  1.38it/s]  



Processing documents:  26%|██▌       | 1350/5229 [50:26<45:16,  1.43it/s]  



Processing documents:  26%|██▌       | 1371/5229 [50:38<38:26,  1.67it/s]  



Processing documents:  27%|██▋       | 1430/5229 [51:24<14:05,  4.49it/s]  



Processing documents:  28%|██▊       | 1450/5229 [51:32<19:46,  3.19it/s]



Processing documents:  30%|██▉       | 1548/5229 [54:49<2:14:17,  2.19s/it] 



Processing documents:  31%|███       | 1612/5229 [55:13<21:49,  2.76it/s]  



Processing documents:  32%|███▏      | 1674/5229 [59:53<2:48:16,  2.84s/it] 



Processing documents:  33%|███▎      | 1702/5229 [1:00:26<58:45,  1.00it/s]  



Processing documents:  34%|███▍      | 1801/5229 [1:01:42<41:27,  1.38it/s]  



Processing documents:  35%|███▍      | 1811/5229 [1:01:51<45:42,  1.25it/s]



Processing documents:  35%|███▍      | 1821/5229 [1:02:03<1:22:55,  1.46s/it]



Processing documents:  36%|███▌      | 1861/5229 [1:07:05<15:18:01, 16.35s/it]



Processing documents:  38%|███▊      | 2000/5229 [1:12:23<50:05,  1.07it/s]   



Processing documents:  38%|███▊      | 2012/5229 [1:12:48<1:25:37,  1.60s/it]



Processing documents:  40%|███▉      | 2070/5229 [1:15:15<56:48,  1.08s/it]  



Processing documents:  40%|███▉      | 2073/5229 [1:15:35<2:36:57,  2.98s/it]



Processing documents:  43%|████▎     | 2234/5229 [1:18:35<57:36,  1.15s/it]  



Processing documents:  43%|████▎     | 2258/5229 [1:20:48<1:40:50,  2.04s/it]



Processing documents:  44%|████▎     | 2276/5229 [1:31:06<15:28:13, 18.86s/it]



Processing documents:  44%|████▍     | 2288/5229 [1:31:07<6:23:03,  7.81s/it] 



Processing documents:  44%|████▍     | 2317/5229 [1:31:26<1:21:34,  1.68s/it]



Processing documents:  45%|████▍     | 2346/5229 [1:32:00<38:20,  1.25it/s]  



Processing documents:  45%|████▍     | 2347/5229 [1:33:25<11:51:41, 14.82s/it]



Processing documents:  46%|████▌     | 2394/5229 [1:33:59<30:53,  1.53it/s]   



Processing documents:  46%|████▋     | 2428/5229 [1:34:56<43:03,  1.08it/s]  



Processing documents:  48%|████▊     | 2521/5229 [1:49:01<24:41,  1.83it/s]   



Processing documents:  51%|█████     | 2642/5229 [1:51:12<56:01,  1.30s/it]  



Processing documents:  52%|█████▏    | 2736/5229 [1:52:28<23:48,  1.75it/s]  



Processing documents:  53%|█████▎    | 2752/5229 [1:52:50<40:34,  1.02it/s]  



Processing documents:  54%|█████▍    | 2821/5229 [1:54:37<14:35,  2.75it/s]  



Processing documents:  56%|█████▌    | 2932/5229 [1:57:34<37:03,  1.03it/s]  



Processing documents:  57%|█████▋    | 2964/5229 [1:58:15<55:31,  1.47s/it]  



Processing documents:  57%|█████▋    | 2965/5229 [1:58:40<2:03:18,  3.27s/it]



Processing documents:  57%|█████▋    | 2966/5229 [1:59:06<3:27:41,  5.51s/it]



Processing documents:  58%|█████▊    | 3057/5229 [2:01:37<1:32:35,  2.56s/it]



Processing documents:  60%|█████▉    | 3117/5229 [2:05:01<3:41:20,  6.29s/it]



Processing documents:  61%|██████    | 3186/5229 [2:06:24<34:27,  1.01s/it]  



Processing documents:  62%|██████▏   | 3238/5229 [2:10:24<21:10,  1.57it/s]  



Processing documents:  64%|██████▎   | 3331/5229 [2:14:16<26:37,  1.19it/s]  



Processing documents:  66%|██████▌   | 3436/5229 [2:18:56<1:49:28,  3.66s/it]



Processing documents:  67%|██████▋   | 3523/5229 [2:19:52<16:05,  1.77it/s]  



Processing documents:  68%|██████▊   | 3570/5229 [2:20:35<28:34,  1.03s/it]



Processing documents:  71%|███████   | 3704/5229 [2:24:26<09:45,  2.61it/s]  



Processing documents:  74%|███████▍  | 3884/5229 [2:30:54<10:36,  2.11it/s]  



Processing documents:  75%|███████▍  | 3901/5229 [2:32:12<2:25:52,  6.59s/it]



Processing documents:  75%|███████▍  | 3913/5229 [2:36:21<4:35:44, 12.57s/it] 



Processing documents:  75%|███████▌  | 3927/5229 [2:36:38<1:11:58,  3.32s/it]



Processing documents:  76%|███████▌  | 3973/5229 [2:37:09<11:45,  1.78it/s]  



Processing documents:  76%|███████▌  | 3977/5229 [2:38:08<2:07:10,  6.09s/it]



Processing documents:  76%|███████▋  | 3988/5229 [2:38:10<50:59,  2.47s/it]  



Processing documents:  77%|███████▋  | 4047/5229 [2:41:30<08:10,  2.41it/s]  



Processing documents:  77%|███████▋  | 4048/5229 [2:41:34<13:22,  1.47it/s]



Processing documents:  79%|███████▉  | 4135/5229 [2:43:22<12:26,  1.46it/s]  



Processing documents:  79%|███████▉  | 4147/5229 [2:43:38<17:35,  1.03it/s]



Processing documents:  80%|████████  | 4202/5229 [2:44:21<10:02,  1.71it/s]



Processing documents:  87%|████████▋ | 4555/5229 [2:50:06<04:06,  2.74it/s]  



Processing documents:  87%|████████▋ | 4556/5229 [2:50:12<08:20,  1.35it/s]



Processing documents:  88%|████████▊ | 4596/5229 [2:51:18<22:58,  2.18s/it]



Processing documents:  89%|████████▉ | 4645/5229 [2:52:54<09:24,  1.03it/s]  



Processing documents:  89%|████████▉ | 4648/5229 [2:52:56<08:32,  1.13it/s]



Processing documents:  90%|█████████ | 4709/5229 [2:54:10<06:18,  1.37it/s]



Processing documents:  90%|█████████ | 4732/5229 [2:54:28<05:59,  1.38it/s]



Processing documents:  93%|█████████▎| 4873/5229 [3:00:57<05:22,  1.11it/s]  



Processing documents:  94%|█████████▍| 4921/5229 [3:01:37<03:11,  1.61it/s]



Processing documents:  94%|█████████▍| 4922/5229 [3:01:38<03:48,  1.34it/s]



Processing documents:  99%|█████████▊| 5153/5229 [3:05:16<00:20,  3.66it/s]



Processing documents:  99%|█████████▊| 5162/5229 [3:06:42<04:06,  3.68s/it]



Processing documents:  99%|█████████▉| 5164/5229 [3:06:46<03:37,  3.34s/it]



Processing documents: 100%|██████████| 5229/5229 [3:07:42<00:00,  2.15s/it]



Processing Summary:
Total images processed: 36950

Dimension Ranges:
Width: 817 to 8000 pixels
Height: 891 to 7221 pixels


In [5]:
# Here we use the image metadata along with the pii filtering data and Donut Model for image classification to weight each images
import re
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch


processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")

device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device) 

task_prompt = "<s_rvlcdip>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

In [10]:
def classify_document(image_path):
    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values
    outputs = model.generate(
    pixel_values.to(device),
    decoder_input_ids=decoder_input_ids.to(device),
    max_length=model.decoder.config.max_position_embeddings,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
    )
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    return processor.token2json(sequence)


def classify_document_cloud(image_path):
    with open(image_path, "rb") as f:
        response = requests.post("http://0.0.0.0:8000/predict", files={"file": f})
        res = response.json()
        return res['prediction']['class']

In [13]:
allowed_document_types = ["letter", "form", "email", "specification", "budget", "invoice", "presentation", "questionnaire", "resume", "memo"]
import shutil


# Load PII filtering data and image metadata
filtered_documents_json_path = "/Volumes/MyDataDrive/thesis/code-2/src/fireworks/filtering/pii_filtering_data.json"
image_metadata_json_path = "/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/image_metadata.json"
filtered_output_dir = "filtered_images"  # New directory for PII-containing images

# Create output directory
os.makedirs(filtered_output_dir, exist_ok=True)

# Load the required data files
with open(filtered_documents_json_path, 'r') as f:
    filtered_documents = json.load(f)

with open(image_metadata_json_path, 'r') as f:
    image_metadata = json.load(f)

# Create a dictionary mapping document IDs to their PII pages with additional metadata
pii_pages = {}
for doc in filtered_documents['documents_with_pii']:
    # Extract doc_id from the file path
    doc_id = doc['file_path'].split('/')[-1].split('.')[0]
    
    # Store PII information for each page
    if doc_id not in pii_pages:
        pii_pages[doc_id] = {}
    
    # Process each page with PII
    for page in doc['pages_with_pii']:
        # Extract page number from page_id
        page_id = page['page_id']
        page_num = int(page_id.split('_page')[-1])
        
        # Store page metadata
        pii_pages[doc_id][page_num] = {
            'pii_types': page['pii_types'],
            'main_topic': page.get('main_topic', ''),
            'page_type': page.get('page_type', ''),
            'confidence': page.get('confidence', 0.0)
        }

# Initialize or load metadata for filtered images
filtered_metadata_path = "filtered_images_metadata.json"
processed_pages = set()
if os.path.exists(filtered_metadata_path):
    print("Loading existing filtered metadata...")
    with open(filtered_metadata_path, 'r') as f:
        filtered_metadata = json.load(f)
    processed_pages = {(img["doc_id"], img["page_number"]) for img in filtered_metadata.get("images", [])}
    print(f"Found {len(processed_pages)} already processed pages")
    total_pages = sum(len(pages) for pages in pii_pages.values())
    remaining = total_pages - len(processed_pages)
    print(f"Need to process {remaining} more pages")
else:
    filtered_metadata = {
        "images": [],
        "pii_statistics": {
            "total_documents": len(pii_pages),
            "total_pages": 0,
            "document_types": {},
            "pii_types_distribution": {}
        }
    }
    print(f"Need to process {sum(len(pages) for pages in pii_pages.values())} pages")

print(f"Found {sum(len(pages) for pages in pii_pages.values())} pages with PII across {len(pii_pages)} documents")

# Process and copy only the pages that contain PII
print("Processing images...")
for img_data in tqdm(image_metadata["images"]):
    doc_id = img_data["doc_id"]
    page_num = img_data["page_number"]
    
    # Only process if this specific page has PII and hasn't been processed before
    if doc_id in pii_pages and page_num in pii_pages[doc_id] and (doc_id, page_num) not in processed_pages:
        # Create document directory in filtered output
        doc_output_dir = os.path.join(filtered_output_dir, doc_id)
        os.makedirs(doc_output_dir, exist_ok=True)
        
        # Copy the PNG file to filtered directory
        src_path = img_data["png_path"]
        dst_filename = f"{doc_id}_page{page_num}.png"
        dst_path = os.path.join(doc_output_dir, dst_filename)
        
        try:
            shutil.copy2(src_path, dst_path)
            
            # Classify the document
            doc_type = classify_document_cloud(dst_path)
            
            # Get PII metadata for this page
            pii_metadata = pii_pages[doc_id][page_num]
            
            # Create new metadata entry
            new_metadata = {
                **img_data,  # Keep existing metadata
                "filtered_png_path": dst_path,
                "document_type": doc_type,
                "is_allowed_type": doc_type.lower() in [t.lower() for t in allowed_document_types],
                "pii_types": pii_metadata['pii_types'],
                "main_topic": pii_metadata['main_topic'],
                "page_type": pii_metadata['page_type'],
                "pii_confidence": pii_metadata['confidence']
            }
            
            filtered_metadata["images"].append(new_metadata)
            
            # Update statistics
            filtered_metadata["pii_statistics"]["total_pages"] += 1
            filtered_metadata["pii_statistics"]["document_types"][doc_type] = \
                filtered_metadata["pii_statistics"]["document_types"].get(doc_type, 0) + 1
            
            # Update PII type statistics
            for pii_type in pii_metadata['pii_types']:
                filtered_metadata["pii_statistics"]["pii_types_distribution"][pii_type] = \
                    filtered_metadata["pii_statistics"]["pii_types_distribution"].get(pii_type, 0) + 1
            
        except Exception as e:
            print(f"Error processing {src_path}: {str(e)}")

# Sort images by doc_id and page_number
filtered_metadata["images"].sort(key=lambda x: (x["doc_id"], x["page_number"]))

# Save the new metadata
with open(filtered_metadata_path, 'w') as f:
    json.dump(filtered_metadata, f, indent=2)

# Print summary statistics
print("\nProcessing Summary:")
print(f"Total documents with PII pages: {len(pii_pages)}")
print(f"Total PII pages processed: {filtered_metadata['pii_statistics']['total_pages']}")

print("\nDocument Type Distribution:")
doc_types = filtered_metadata["pii_statistics"]["document_types"]
total_pages = filtered_metadata["pii_statistics"]["total_pages"]
for doc_type, count in sorted(doc_types.items(), key=lambda x: x[1], reverse=True):
    print(f"{doc_type}: {count} pages ({count/total_pages*100:.1f}%)")


Loading existing filtered metadata...
Found 14956 already processed pages
Need to process 6699 more pages
Found 21655 pages with PII across 5253 documents
Processing images...


100%|██████████| 36950/36950 [1:30:55<00:00,  6.77it/s]   



Processing Summary:
Total documents with PII pages: 5253
Total PII pages processed: 21088

Document Type Distribution:
scientific_report: 3400 pages (16.1%)
questionnaire: 2376 pages (11.3%)
budget: 2266 pages (10.7%)
presentation: 1953 pages (9.3%)
letter: 1910 pages (9.1%)
news_article: 1489 pages (7.1%)
form: 1459 pages (6.9%)
email: 1320 pages (6.3%)
memo: 1111 pages (5.3%)
specification: 1097 pages (5.2%)
scientific_publication: 862 pages (4.1%)
handwritten: 561 pages (2.7%)
advertisement: 480 pages (2.3%)
resume: 418 pages (2.0%)
invoice: 279 pages (1.3%)
file_folder: 107 pages (0.5%)


In [1]:
# Here we get the stat on the filtered_images_metadata.json file
import json

with open('filtered_images_metadata.json', 'r') as f:
    data = json.load(f)

print(len(data['images']))

# Now, we want to get the statistics of the filtered_images_metadata.json file

# First, we want to get the total number of documents
total_documents = len(data['images'])
print(f"Total documents: {total_documents}")


pages_with_pii_and_allowed_document_types = 0
pii_types_distribution = {}
pii_type_combinations = {}

new_allowed_document_types = ["letter", "form", "email", "budget", "invoice", "resume", "memo"]
allowed_combo = [
    ('location', 'person_name', 'phone'),
    ('email', 'location', 'person_name', 'phone'),
    ('person_name', 'phone'),
    ('email', 'person_name'),
    ('location', 'phone'),
    ('email', 'location', 'person_name'),
    ('email', 'location', 'phone'),
    ('email', 'person_name', 'phone'),
    ('email',),
    ('email', 'location'),
    ('email', 'phone')
]

top_files = []

for image in data['images']:
    if image['document_type'].lower() in new_allowed_document_types:
        pii_type_combination = tuple(sorted(image['pii_types']))
        existing_combination = pii_type_combinations.get(pii_type_combination, None)
        doc_type = image['document_type']
        if existing_combination:
            if doc_type not in existing_combination['document_types']: 
                existing_combination['document_types'].append(doc_type)
        pii_type_combinations[pii_type_combination] = {
            "count": existing_combination['count'] + 1 if existing_combination else 1,
            "document_types": existing_combination['document_types'] if existing_combination else [doc_type]
        }

        if pii_type_combination in allowed_combo:
            top_files.append(image)

        pages_with_pii_and_allowed_document_types += 1
        for pii_type in image['pii_types']:
            pii_types_distribution[pii_type] = pii_types_distribution.get(pii_type, 0) + 1



open("top_files.json", "w").write(json.dumps(top_files, indent=4))


print(f"Pages with PII and allowed document types: {pages_with_pii_and_allowed_document_types}")
print(f"PII types distribution: {pii_types_distribution}")
print(f"PII type combinations:")
for combination, value in pii_type_combinations.items():
    print(f"{combination}: {value['count']} pages")
    print(f"Document types: {value['document_types']}")
    print("-" * 100)    

21088
Total documents: 21088
Pages with PII and allowed document types: 8763
PII types distribution: {'person_name': 6082, 'location': 6277, 'phone': 2250, 'email': 490}
PII type combinations:
('person_name',): 2048 pages
Document types: ['memo', 'email', 'resume', 'form', 'letter', 'invoice', 'budget']
----------------------------------------------------------------------------------------------------
('location', 'person_name', 'phone'): 1186 pages
Document types: ['memo', 'invoice', 'form', 'letter', 'email', 'budget', 'resume']
----------------------------------------------------------------------------------------------------
('location', 'person_name'): 2195 pages
Document types: ['memo', 'resume', 'letter', 'form', 'budget', 'invoice', 'email']
----------------------------------------------------------------------------------------------------
('location',): 2106 pages
Document types: ['memo', 'budget', 'invoice', 'resume', 'form', 'letter', 'email']
----------------------------

In [9]:
import json
import os
new_allowed_document_types = ["letter", "form", "email", "budget", "invoice", "resume", "memo"]
pages_with_pii_and_allowed_document_types = 0
pii_types_distribution = {}
pii_type_combinations = {}
allowed_combo = [
    ('location', 'person_name', 'phone'),
    ('email', 'location', 'person_name', 'phone'),
    ('person_name', 'phone'),
    ('email', 'person_name'),
    ('location', 'phone'),
    ('email', 'location', 'person_name'),
    ('email', 'location', 'phone'),
    ('email', 'person_name', 'phone'),
    ('email',),
    ('email', 'location'),
    ('email', 'phone')
]

top_files = []


with open("/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/top_files-train-v1.json", "r") as f:
    data = json.load(f)

train_data_labels_dir = "/Volumes/MyDataDrive/thesis/code-2/new-data/IIT-CDIP/train/labels/"

# Get list of existing label files for faster lookup
existing_label_files = set(os.listdir(train_data_labels_dir))

for image in data:
    # Check if corresponding label file exists
    label_filename = f"votes_{image['doc_id']}_page{image['page_number']}.json"
    
    # Only process if label file exists
    if label_filename in existing_label_files and image['document_type'].lower() in new_allowed_document_types:
        pii_type_combination = tuple(sorted(image['pii_types']))
        existing_combination = pii_type_combinations.get(pii_type_combination, None)
        doc_type = image['document_type']
        if existing_combination:
            if doc_type not in existing_combination['document_types']: 
                existing_combination['document_types'].append(doc_type)
        pii_type_combinations[pii_type_combination] = {
            "count": existing_combination['count'] + 1 if existing_combination else 1,
            "document_types": existing_combination['document_types'] if existing_combination else [doc_type]
        }

        if pii_type_combination in allowed_combo:
            top_files.append(image)

        pages_with_pii_and_allowed_document_types += 1
        for pii_type in image['pii_types']:
            pii_types_distribution[pii_type] = pii_types_distribution.get(pii_type, 0) + 1


print(f"Pages with PII and allowed document types (with labels): {pages_with_pii_and_allowed_document_types}")
print(f"PII types distribution: {pii_types_distribution}")
print(f"PII type combinations:")
for combination, value in pii_type_combinations.items():
    print(f"{combination}: {value['count']} pages")
    print(f"Document types: {value['document_types']}")
    print("-" * 100)

Pages with PII and allowed document types (with labels): 1958
PII types distribution: {'person_name': 1529, 'location': 1635, 'phone': 1812, 'email': 418}
PII type combinations:
('location', 'person_name', 'phone'): 967 pages
Document types: ['memo', 'invoice', 'form', 'letter', 'email', 'budget', 'resume']
----------------------------------------------------------------------------------------------------
('email', 'location', 'person_name', 'phone'): 209 pages
Document types: ['email', 'letter', 'form', 'resume', 'invoice', 'memo', 'budget']
----------------------------------------------------------------------------------------------------
('person_name', 'phone'): 161 pages
Document types: ['email', 'letter', 'memo', 'form', 'resume', 'budget', 'invoice']
----------------------------------------------------------------------------------------------------
('email', 'person_name'): 95 pages
Document types: ['email', 'letter', 'form']
--------------------------------------------------

In [31]:
count = 0
import os
script_dir = "/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/"
target_combination = ('email', 'person_name')
for image in data['images']:
    if image['document_type'].lower() in new_allowed_document_types:
        pii_type_combination = tuple(sorted(image['pii_types']))
        if pii_type_combination == target_combination:
            # Convert absolute path to relative path
            rel_path =  script_dir + image['png_path']
            print(rel_path, image['document_type'], image['page_type'], "Main Topic: " + image['main_topic'])
            count += 1

            if count > 20:
                break

/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/png_images/ffbl0235/ffbl0235_page1.png email Internal Email Correspondence Main Topic: Corporate internal communications strategy
/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/png_images/ffcd0004/ffcd0004_page1.png email Email Message Main Topic: Business Communication
/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/png_images/fffh0251/fffh0251_page1.png email Email Communication Main Topic: Expense Report Notification
/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/png_images/ffgm0238/ffgm0238_page1.png email Email Communication Main Topic: Corporate document destruction and internal communication
/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/png_images/ffjm0236/ffjm0236_page1.png email Email Correspondence - Packaging Design Discussion Main Topic: Medication Packaging Design and Dimensions
/Volumes/MyDataDrive/thesis/code-2/src/fireworks/image_assets/png_images/ff