##### Extract PDFs from Folders
----------------------------
v.3.0.0

## Imports

In [85]:
import re
import os
import numpy as np
import pandas as pd

from os import listdir
from shutil import copyfile

## Input / Output Path

In [40]:
# Set to the directory containing the pdfs. This can be a directory of
# sub-folders that each hold pdfs, or a single folder of pdfs.
input_dir = "/Users/nicolebrye/Desktop/HGC/Data_Management/vf-dicom"

# Set to the directory where the pdfs will be copied to. Each source folder
# gets its own "<folder>_subset" directory inside it.
output_dir = "/Users/nicolebrye/Desktop"

# Set to the path of the csv file that holds the subset table (it is read with
# pd.read_csv and must have a "maskedID" column). Type False (without quotes)
# after the equal sign if there is no subset being used.
subset = "/Users/nicolebrye/Desktop/HGC/Data_Management/vf-dicom/test1.csv"

### Creating a Faster Lookup Table

In [105]:
# Every immediate sub-directory of input_dir is treated as a folder of pdfs
folders = [folder for folder in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, folder))]

# No sub-directories: input_dir itself is the folder of pdfs, so split it into
# its parent directory (the new input_dir) and its own name (the only folder).
# normpath drops any trailing separator first, and os.path.split keeps
# relative paths relative instead of forcing a leading "/".
if not folders:
    input_dir, folder_name = os.path.split(os.path.normpath(input_dir))
    folders = [folder_name]

In [108]:
# Load the subset table. Only the read itself is guarded: a ValueError here
# (the expected outcome when subset is set to False) means "no subset", and
# every pdf is copied. Errors raised while preparing the table below are real
# problems with the table and are left to propagate instead of silently
# disabling the subset.
try:
    table = pd.read_csv(subset)
except ValueError:
    table   = None
    missing = None

if table is not None:

    # Fix dates: normalise to YYYYMMDD strings (dashes removed) so they can be
    # compared with the 8-digit date in the file names; unparsable/missing
    # dates become NaN. The column is optional.
    if "Date" in table.columns:
        table["Date"] = pd.to_datetime(table["Date"]).astype(str).str.replace("-", "")
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0
        table["Date"] = table["Date"].replace("NaT", np.nan)

    # Create sets for IDs and a dictionary of sets for other attributes
    IDS  = set(table["maskedID"])
    cols = [col for col in table.columns if col != "maskedID"]

    if not cols:
        # The table only lists IDs: attrs is the ID set itself and missing
        # tracks the IDs that have not been matched to a pdf yet
        attrs   = IDS
        missing = IDS.copy()
    else:
        # attrs:   row index -> set of the non-ID values of that row
        # missing: row index -> full row as a list, removed once a pdf matches
        attrs   = table[cols].apply(set, axis=1).to_dict()
        missing = table.apply(list, axis=1).to_dict()

### Loop through files and copy to output directory

In [109]:
# Patterns for the information encoded in each file name. Raw strings avoid
# invalid-escape warnings for "\d"; compiled once since they are reused for
# every pdf.
id_pattern   = re.compile(r'^[A-Z]{2}\d{4}')
eye_pattern  = re.compile(r'_(OS|OD)_')
date_pattern = re.compile(r'\d{8}')

for f in folders:

    # Extract pdfs from folders
    fp    = os.path.join(input_dir, f)
    files = [file for file in listdir(fp) if file.endswith(".pdf")]

    # Create the output location if it does not exist yet; files with the same
    # name that are already there get overwritten by copyfile
    out_path = os.path.join(output_dir, f + "_subset")
    os.makedirs(out_path, exist_ok=True)

    # Loop through each file to determine whether it should be copied
    for x in files:

        # Source and destination paths
        src = os.path.join(fp, x)
        dst = os.path.join(out_path, x)

        # No subset table: every pdf is copied
        if not isinstance(table, pd.DataFrame):
            copyfile(src, dst)
            continue

        # Extract ID, eye, and date from file name. A name without an ID can
        # never match the table, so it is skipped instead of crashing on a
        # failed regex search. Eye and date are optional and left as None.
        id_match = id_pattern.search(x)
        if id_match is None:
            print(f"Skipping {x}: the file name does not start with an ID")
            continue

        eye_match  = eye_pattern.search(x)
        date_match = date_pattern.search(x)

        ID   = id_match.group(0)
        eye  = eye_match.group(1) if eye_match else None
        date = date_match.group(0) if date_match else None

        # Check if the ID is in the input table
        if ID not in IDS:
            continue

        # The table only lists IDs: any pdf with a listed ID is copied
        if isinstance(attrs, set):
            copyfile(src, dst)
            missing.discard(ID)
            continue

        # Check the file against every table row that has this ID
        idx     = table.loc[table["maskedID"] == ID].index
        matched = False

        for j in idx:

            sub = attrs[j]

            # Split the row values into real values and "NaN present".
            # pd.isnull is used instead of "np.nan in sub" because set
            # membership of NaN only works when it is the very same object.
            known   = {value for value in sub if not pd.isnull(value)}
            has_nan = any(pd.isnull(value) for value in sub)

            eye_ok  = eye in known
            date_ok = date in known

            # A row matches when:
            #  - it holds nothing but NaN (only the ID was given), or
            #  - both the eye and the date of the file are in the row, or
            #  - one of them is in the row and the rest of the row is either
            #    NaN or absent (a single-valued row).
            row_matches = (
                not known
                or (eye_ok and date_ok)
                or ((eye_ok or date_ok) and (has_nan or len(known) == 1))
            )

            if row_matches:
                matched = True
                # This table row has been satisfied by at least one pdf
                missing.pop(j, None)

        # Copy once, no matter how many rows matched the file
        if matched:
            copyfile(src, dst)

    # Remove the folder if there are no pdfs
    if not listdir(out_path):
        os.rmdir(out_path)

### Message for any file in the subset that could not be moved

In [110]:
# Message for if no subset was used
if not subset:
    print("All files were copied to the output directory successfully :D")

# A subset path was given but the table could not be loaded (missing is None),
# in which case the copy loop copied every pdf without filtering
elif missing is None:
    print("The subset table could not be read, so all files were copied to the output directory without filtering")

# Messages for if a subset was used
elif not missing:
    print("All files in the subset were copied to the output directory successfully :D")

# The table only listed IDs: missing is the set of IDs without any pdf
elif isinstance(missing, set):

    # str() so that non-string IDs cannot break the join; sorted for a
    # reproducible message
    info = ", ".join(sorted(str(x) for x in missing))
    print(f"Could not find any pdfs for the following ID(s): {info}")

# The table had extra columns: missing maps row index -> row values
else:
    for row in missing.values():

        # Leave out empty cells and stringify the rest so that numeric
        # values cannot break the join
        info = ", ".join(str(x) for x in row if not pd.isnull(x))

        print(f"Could not find any pdfs for the following information: {info}")

Could not find any pdfs for the following ID(s): NY8236
