# Finding the predicate...

In [77]:
import pandas as pd
from typing import List
import tempfile
import urllib.request
from os import path
import PyPDF2 as ppdf
import re
import json
from collections import defaultdict

In [97]:
def find_predicate(pcode: str) -> List[str]:

    # Extract year from product code
    if pcode[0] == 'K': yr = pcode[1:3]
    else: return None

    url = f'https://www.accessdata.fda.gov/cdrh_docs/pdf{yr}/{pcode}.pdf'
    print(url)

    # Pull PDF data
    response = urllib.request.urlopen(url)
    pg_data = response.read()

    # Write to tempfile object
    temp_file = path.join(tempfile.gettempdir(), 'temp.pdf')
    pdf_file = open(temp_file, 'wb')
    pdf_file.write(pg_data)

    found = []
    doc = ppdf.PdfFileReader(open(temp_file, 'rb'))
    for pg in doc.pages:
        try:
            pg_content = (pg.extract_text() + '\n').upper()
            regex = re.search('PREDICATE', pg_content)

            # Find all product codes after each occurrence of 'predicate' sorted in order of appearance
            if regex is not None:
                pcodes = re.findall('(P|DEN|K)([0-9]{6})', pg_content[regex.span()[1]:])
                if len(pcodes): found.append(pcodes[0][0]+pcodes[0][1])

        except Exception as e:
            print(f'issue finding predicate for {pcode}: {e}')

    # Return most occuring element in found, just an extra layer of assurance
    return max(set(found), key = found.count) if len(found) else None


In [91]:
def get_nums(pcode: str, limit: int) -> List[str]:
    url = f'https://api.fda.gov/device/510k.json?search={pcode}&limit={limit}'

    response = urllib.request.urlopen(url)
    db = json.loads(response.read())

    return [d["k_number"] for d in db["results"]]

In [41]:
def to_root(src: str): 
    res = [src]

    while res[-1][0] == 'K':
        pred = find_predicate(res[-1])
        if pred is None: break
        res.append(pred)

    return res

In [None]:
# MYN
# ├────P000041
# │  ├────K201560
# │  └────K210666
# └────P980025
#    ├────K212519
#    ├────K213795
#    └────K210365

In [None]:
# QAS
# └────DEN170073
#      ├────K182177
#      │    ├────K203260
#      │    ├────K201310
#      │    └────K190424
#      ├────K200873
#      │    └────K211788
#      ├────K212261
#      ├────K180647
#      │    ├────K193298
#      │    ├────K190896
#      │    ├────K193351
#      │    ├────K200855
#      │    ├────K203508
#      │    │    ├────K221240
#      │    │    ├────K213886
#      │    │    ├────K213721
#      │    │    └────K221314
#      │    ├────K192167
#      │    ├────K200921
#      │    ├────K192383
#      │    ├────K182875
#      │    ├────K190072
#      │    │    ├────K210237
#      │    │    ├────K220499
#      │    │    └────K201020
#      │    ├────K211179
#      │    └────K193087
#      │         └────K200941
#      │              └────K221248
#      └────K193658
#           └────K210209

In [92]:
print(get_nums('QAS', 100))

['K192383', 'K193658', 'K203260', 'K200855', 'K791354', 'K190896', 'K221314', 'K190072', 'K890414', 'K221240', 'K213886', 'K193087', 'K193298', 'K200941', 'K192167', 'K790840', 'K936002', 'K220499', 'K800657', 'K190424', 'K211788', 'K213721', 'K212261', 'K201020', 'K201310', 'K182177', 'K193351', 'K182875', 'K210209', 'K200873', 'DEN170073', 'K200921', 'K800708', 'K203508', 'K221248', 'K210237', 'K180647', 'K211179']


In [105]:
def form_tree(pcode: str):
    nums = get_nums(pcode, 100)

    adj_list = defaultdict(list)
    degree = defaultdict(int)

    for num in nums:
        print(f'STARTING NEW PATH AT {num}')
        path = to_root(num)
        print(path, '\n')

    print(adj_list)


In [106]:
form_tree('QAS')

STARTING NEW PATH AT K192383
https://www.accessdata.fda.gov/cdrh_docs/pdf19/K192383.pdf
issue finding predicate for K192383: '/W'
https://www.accessdata.fda.gov/cdrh_docs/pdf18/K180647.pdf
issue finding predicate for K180647: '/W'
['K192383', 'K180647', 'DEN170073'] 

STARTING NEW PATH AT K193658
https://www.accessdata.fda.gov/cdrh_docs/pdf19/K193658.pdf
issue finding predicate for K193658: '/W'
['K193658', 'DEN170073'] 

STARTING NEW PATH AT K203260
https://www.accessdata.fda.gov/cdrh_docs/pdf20/K203260.pdf
https://www.accessdata.fda.gov/cdrh_docs/pdf18/K182177.pdf
issue finding predicate for K182177: '/W'
['K203260', 'K182177', 'DEN170073'] 

STARTING NEW PATH AT K200855
https://www.accessdata.fda.gov/cdrh_docs/pdf20/K200855.pdf
issue finding predicate for K200855: '/W'
https://www.accessdata.fda.gov/cdrh_docs/pdf18/K180647.pdf
issue finding predicate for K180647: '/W'
['K200855', 'K180647', 'DEN170073'] 

STARTING NEW PATH AT K791354
https://www.accessdata.fda.gov/cdrh_docs/pdf79/K7

PdfReadError: PDF starts with '﻿<!', but '%PDF-' expected