### Extract a Subset of PDFs

v.4.0.0

----------------------------

## Input / Output Path

Set to the directory containing the pdfs

In [1]:
# Root folder searched for PDFs: its top level and each immediate subfolder are scanned.
# NOTE(review): absolute, machine-specific path - edit before running on another machine.
input_dir = "/Users/nicolebrye/Desktop/HGC/Data_Management/vf-dicom/20210816_10-2"

Set to the directory that the matching PDFs will be copied into (a `<source folder>_subset` subfolder is created inside it)

In [2]:
# Parent folder for the results: matched PDFs are copied into "<source folder>_subset"
# directories that are created inside it.
# NOTE(review): absolute, machine-specific path - edit before running on another machine.
output_dir = "/Users/nicolebrye/Desktop"

Set to the path of the CSV file that contains the subset table

In [3]:
# CSV file loaded with pd.read_csv to build the lookup. Rows are keyed by a "maskedID"
# column; the "Date" and "Eye" columns are optional.
# NOTE(review): absolute, machine-specific path - edit before running on another machine.
subset = "/Users/nicolebrye/Desktop/HGC/Data_Management/vf-dicom/test4.csv"

------------------------------------

## Imports

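The imports below load everything the notebook needs. `re`, `os`, and `shutil` ship with Python itself; `numpy` and `pandas` are third-party packages and must already be installed in the kernel's environment (for example with `%pip install numpy pandas`).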

In [4]:
import re
import os
import numpy as np
import pandas as pd

# shutil is part of the Python standard library, so it is always available;
# there is no "shutil" package to install and no fallback is needed.
from shutil import copyfile

---

## Creating a Lookup Table

Function for formatting the lookup structure

In [5]:
def nestValues(x):
    '''
    Registers one row of the subset table in the global ``SUB`` lookup.

    ``SUB`` is a nested dictionary of the form
    ``{maskedID: {Eye: {Date: count}}}``. Every entry registered here starts
    with a count of 0. When the row has no "Eye" or no "Date" field, the
    entry is stored under the placeholder key "null" for that level.

    ``SUB`` must already exist as a dictionary before this is called.

    :param x: A row of the table; must provide "maskedID", while "Eye" and
              "Date" are optional
    :returns: None (``SUB`` is updated in place)
    '''

    # .get falls back to the placeholder when the table lacks the column
    date = x.get("Date", "null")
    eye  = x.get("Eye", "null")

    # setdefault creates the per-ID and per-Eye dictionaries the first time
    # they are seen, so no exception handling is needed to build the nesting
    SUB.setdefault(x["maskedID"], {}).setdefault(eye, {})[date] = 0

    return None


<br>
Function for extracting maskedID, Eye, and Date from a filename

In [6]:
def parseValues(x):
    '''
    Given a pdf filename, extracts the maskedID, Eye, and Date

    The maskedID is two capital letters followed by four digits at the very
    start of the name, the Eye is the first "_OD_" or "_OS_" marker, and the
    Date is the first run of eight consecutive digits.

    :param x: A filename as a string
    :returns: A list of the desired elements: [maskedID, Eye, Date]
    :raises ValueError: If any of the three elements is missing from the name
    '''

    id_match   = re.search("^[A-Z]{2}[0-9]{4}", x)
    eye_match  = re.search("_(OD|OS)_", x)
    date_match = re.search("[0-9]{8}", x)

    # re.search returns None on failure; report the offending file instead of
    # failing later with an AttributeError on None
    if id_match is None or eye_match is None or date_match is None:
        raise ValueError(
            "Cannot parse maskedID, Eye and Date from filename: {!r}".format(x)
        )

    # Group 1 of the Eye pattern is the code without the surrounding "_"
    return [id_match.group(0), eye_match.group(1), date_match.group(0)]

<br>
Function for looking up desired information in the nested dictionary lookup structure

In [7]:
def lookupValues(x):
    '''
    Given the parsed information of one pdf, checks whether it matches an
    entry of the global ``SUB`` lookup table and, if so, counts the hit.

    Candidates are tried from most to least specific: the file's own Eye
    before the "null" (unspecified) Eye, and under each of those the file's
    own Date before the "null" (unspecified) Date. The counter of the first
    matching entry is incremented by one, so entries that are still 0
    afterwards were never matched by any file.

    :param x: A row of the filenames table ("maskedID", "Eye", "Date")
    :returns: A boolean stating whether the file is in the lookup table
    '''

    # Only a missing key means "not in the table"; any other error (for
    # example SUB not being defined yet) should surface instead of being
    # silently reported as "no match"
    try:
        by_eye = SUB[x["maskedID"]]
    except KeyError:
        return False

    # A row without an Eye can only match entries with an unspecified Eye
    try:
        eye = x["Eye"]
    except KeyError:
        eye = "null"

    # A row without a Date cannot be matched at all
    try:
        date = x["Date"]
    except KeyError:
        return False

    # Fall through to the unspecified-Eye entry even when the specific Eye
    # exists but none of its dates match
    for eye_key in (eye, "null"):
        by_date = by_eye.get(eye_key)
        if by_date is None:
            continue

        for date_key in (date, "null"):
            if date_key in by_date:
                # Keep track of missing values
                by_date[date_key] += 1
                return True

    return False


---

## Listing all of the Files

Creating the table

In [8]:
table = pd.read_csv(subset)

# Fix dates: rewrite them as digit-only strings (dashes removed) so they can
# be compared with the eight-digit dates taken from the pdf filenames.
# The Date column is optional, so it is only converted when present.
if "Date" in table.columns:
    table["Date"] = pd.to_datetime(table["Date"]).astype(str).str.replace("-", "")
    # Unparseable / empty dates come out as the text "NaT"; turn them back
    # into real missing values (np.nan - the np.NaN alias no longer exists
    # in NumPy 2.0)
    table["Date"] = table["Date"].replace("NaT", np.nan)

# Every missing value becomes the "null" placeholder used by the lookup
table = table.replace(np.nan, "null")

# Create the nested dictionary structure (rebuilt from scratch on every run)
SUB = {}
table.apply(nestValues, axis = 1)
SUB

{'SD4560': {'null': {'20180621': 0}}}

<br>
Looping through files and folders to ensure everything is in the correct format

In [9]:

# Expected filename: <2 capital letters><4 digits>_<OD|OS>_<8 digits>...pdf
# Raw string so that "\." reaches the regex engine without an invalid-escape
# warning; the Eye part is limited to OD/OS because those are the only codes
# parseValues can extract.
pattern = r"^[A-Z]{2}[0-9]{4}_(OD|OS)_[0-9]{8}.*\.pdf$"

# PDFs located directly inside the input directory
PDFS  = [f for f in os.listdir(input_dir) if re.match(pattern, f)]
outer = [input_dir] * len(PDFS)

# Now loop through the folders and add additional pdfs on
# (only the immediate subfolders are visited, not deeper levels)
folders = [f for f in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, f))]

files = []
inner = []

for folder in folders:
    folder_path = os.path.join(input_dir, folder)
    temp = [f for f in os.listdir(folder_path) if re.match(pattern, f)]

    files += temp
    # One parent-directory entry per file so both lists stay aligned
    inner += [folder_path] * len(temp)


# Add the files together: s1 holds each file's directory, s2 its filename
s1 = outer + inner
s2 = PDFS + files

# Full path of every pdf, in the same order as s2
fnames = pd.Series([os.path.join(d, f) for d, f in zip(s1, s2)], dtype = object)

---

## Extracting all information from the file names

Looping through each of the files to extract information using the `parseValues` function defined above

In [10]:

# Create a DataFrame of information: one row per pdf, in the order of s2
DF = pd.DataFrame([parseValues(name) for name in s2],
                  columns = ["maskedID", "Eye", "Date"])

# Assign file names
DF = DF.assign(fp = fnames)

# Flag the files that match the lookup table. Built as an explicit boolean
# Series so it is a valid mask even when no pdfs were found.
DF = DF.assign(status = pd.Series([lookupValues(row) for _, row in DF.iterrows()],
                                  index = DF.index, dtype = bool))

SUBSET = DF.loc[DF["status"]]

# Loop through the desired fps and copy to another directory
for fp in SUBSET["fp"]:

    # os.path splits on the platform's own separator, unlike a hard-coded "/"
    fold = os.path.basename(os.path.dirname(fp))
    file = os.path.basename(fp)

    src = fp
    out = os.path.join(output_dir, fold + "_subset")
    dst = os.path.join(out, file)

    # Creates the folder when needed and does nothing if it already exists
    os.makedirs(out, exist_ok = True)

    copyfile(src, dst)

---

## Display Information about missing files

In [11]:
# Walk the three levels of SUB (maskedID -> Eye -> Date) and keep every entry
# whose counter is still 0, i.e. no pdf was matched to it
MVALS = [
    [masked_id, eye, date]
    for masked_id, by_eye in SUB.items()
    for eye, by_date in by_eye.items()
    for date, hits in by_date.items()
    if hits == 0
]

# Print out the results
if not MVALS:
    print("All files were copied to the output directory successfully :D")

else:
    print("Could not find any pdfs for the following information:\n")

    # Show the "null" placeholder as readable text in the report
    OUT = (
        pd.DataFrame(MVALS, columns = ["maskedID", "Eye", "Date"])
        .replace("null", "Not Specified")
    )

    print(OUT)

All files were copied to the output directory successfully :D
