# Finding the predicate...

In [2]:
import pandas as pd
from typing import List
import tempfile
import urllib.request
from os import path
import PyPDF2 as ppdf
import re
import json
from collections import defaultdict
from tqdm import tqdm
from collections import deque

In [3]:
def find_predicate(pcode: str) -> str:
    """Return the most likely predicate device for a 510(k) submission.

    Downloads the submission's PDF from the FDA ``cdrh_docs`` site. On every
    page whose text contains the word "predicate", the first submission
    number (``P``, ``DEN`` or ``K`` followed by six digits) that appears
    after the first "predicate" and differs from ``pcode`` is recorded.
    The number recorded on the most pages is returned; ties go to the one
    recorded first.

    Args:
        pcode: Submission number such as ``'K190072'``. Only numbers that
            start with ``'K'`` are looked up.

    Returns:
        The predicate's submission number, or ``None`` when ``pcode`` does
        not start with ``'K'``, the PDF cannot be opened by PyPDF2, or no
        candidate number is found.

    Raises:
        urllib.error.URLError: If the download fails (this includes
            ``HTTPError``). Download errors are not caught here.
    """
    # startswith() also copes with an empty string, unlike pcode[0].
    if not pcode.startswith('K'):
        return None

    # The two characters after the leading 'K' select the pdf<yy> directory.
    yr = pcode[1:3]
    url = f'https://www.accessdata.fda.gov/cdrh_docs/pdf{yr}/{pcode}.pdf'

    # Pull PDF data; the response is closed as soon as it has been read.
    with urllib.request.urlopen(url) as response:
        pg_data = response.read()

    code_pattern = re.compile(r'(P|DEN|K)([0-9]{6})')
    marker = 'PREDICATE'
    found = []

    # An anonymous temporary file replaces the shared '<tmp>/temp.pdf':
    # it cannot collide with another run, is fully written before it is
    # parsed, and is removed when the block exits. It must stay open while
    # pages are read because PyPDF2 pulls page data from the stream lazily.
    with tempfile.TemporaryFile() as pdf_file:
        pdf_file.write(pg_data)
        pdf_file.seek(0)

        # Newer PyPDF2 releases name the reader PdfReader and reject the old
        # PdfFileReader name; fall back to the old name when it is all we have.
        reader_cls = getattr(ppdf, 'PdfReader', None) or ppdf.PdfFileReader

        # In case the document is too old
        try:
            doc = reader_cls(pdf_file)
        except Exception:
            print(f'could not open pdf for {pcode}')
            return None

        for pg in doc.pages:
            # Text extraction is best-effort: a page PyPDF2 cannot decode is
            # skipped rather than aborting the whole document.
            try:
                pg_content = (pg.extract_text() + '\n').upper()
            except Exception:
                continue

            start = pg_content.find(marker)
            if start == -1:
                continue

            # Only the text after the first 'PREDICATE' on the page is
            # searched, and only the first foreign number there is kept.
            tail = pg_content[start + len(marker):]
            for match in code_pattern.finditer(tail):
                cand = match.group(0)
                if cand != pcode:
                    found.append(cand)
                    break

    # Most frequent candidate wins. Iterating the list (not a set) makes ties
    # resolve to the earliest candidate instead of an arbitrary one.
    return max(found, key=found.count) if found else None


In [4]:
def get_nums(pcode: str, limit: int) -> List[str]:
    """Return the ``k_number`` of every openFDA 510(k) record matching ``pcode``.

    Args:
        pcode: Search term placed verbatim in the ``search=`` query parameter.
        limit: Value placed in the ``limit=`` query parameter.

    Returns:
        The ``k_number`` field of each entry in the response's ``results``
        list, in the order the API returned them.

    Raises:
        urllib.error.URLError: If the request fails (this includes
            ``HTTPError``).
        KeyError: If the response has no ``results`` key or an entry has no
            ``k_number``.
    """
    url = f'https://api.fda.gov/device/510k.json?search={pcode}&limit={limit}'

    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url) as response:
        db = json.loads(response.read())

    return [d["k_number"] for d in db["results"]]

In [5]:
def to_root(src: str):
    """Follow predicate links from ``src`` up to the oldest reachable ancestor.

    Starting at ``src``, repeatedly asks ``find_predicate`` for the current
    number's predicate and appends it to the chain. The walk stops when the
    current number does not start with ``'K'``, when no predicate is found,
    or when the predicate is already in the chain (which would be a cycle).

    Returns:
        The chain as a list, ``src`` first and the oldest ancestor last.
    """
    chain = [src]
    current = src

    while current[0] == 'K':
        parent = find_predicate(current)
        if parent is None:
            break
        if parent in chain:
            # Already visited: stop instead of looping forever.
            break
        chain.append(parent)
        current = parent

    return chain

In [6]:
def form_tree(pcode, verbose = False):
    """Build a parent -> children adjacency list for a product code.

    Fetches up to 500 submission numbers for ``pcode`` with ``get_nums``,
    traces each one to its oldest ancestor with ``to_root``, and merges the
    resulting chains into one adjacency list. Every node that never appears
    as a child is then attached directly under ``pcode``.

    Submissions whose chain has a single element (no predicate found)
    contribute no edges and therefore do not appear in the result.

    Args:
        pcode: Product code used both as the search term and as the root key.
        verbose: When true, print progress and each traced chain.

    Returns:
        A ``defaultdict(list)`` mapping each parent to its list of children.
    """
    k_numbers = get_nums(pcode, 500)
    total = len(k_numbers)

    adj_list = defaultdict(list)
    in_degree = defaultdict(int)
    visited = set()

    for position, k_number in enumerate(k_numbers, start=1):
        lineage = to_root(k_number)

        if verbose:
            print(f'Starting new path at {k_number}, iter {position}/{total}')
            print(lineage, '\n')

        # zip() pairs each element with its predecessor (parent, child) from
        # the leaf upwards; reversing walks the chain oldest ancestor first.
        for parent, child in reversed(list(zip(lineage[1:], lineage))):
            visited |= {parent, child}
            in_degree[child] += 1
            siblings = adj_list[parent]
            if child not in siblings:
                siblings.append(child)

    # Nodes with no recorded parent hang directly under the product code.
    for node in visited:
        if in_degree[node] != 0:
            continue
        adj_list[pcode].append(node)

    return adj_list


In [39]:
def convert_root_path(root_path):
    """Turn a root path into an adjacency list that also holds its descendants.

    Looks up the product code of ``root_path[0]`` through the openFDA 510(k)
    endpoint, links the path's elements parent -> child, hangs the last
    element of the path under the product code, and then copies in every
    descendant of ``root_path[0]`` found in the full tree for that code.

    Args:
        root_path: Non-empty list as returned by ``to_root``: the queried
            submission number first, its oldest ancestor last.

    Returns:
        A tuple ``(adj_list, pcode)`` where ``adj_list`` is a
        ``defaultdict(list)`` mapping parent -> children and ``pcode`` is the
        ``product_code`` of the first search result.

    Raises:
        urllib.error.URLError: If an HTTP request fails (this includes
            ``HTTPError``).
    """
    url = f'https://api.fda.gov/device/510k.json?search={root_path[0]}'
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(url) as response:
        db = json.loads(response.read())
    pcode = db['results'][0]['product_code']

    # Each element's parent is the element that follows it in root_path.
    adj_list = defaultdict(list)
    for child, parent in zip(root_path, root_path[1:]):
        adj_list[parent].append(child)
    adj_list[pcode].append(root_path[-1])

    # Now we need to add all descendants of the queried knumber.
    # Ultimately, instead of rescraping the entire product code for any given
    # submission number, you would just pull the tree from some sort of
    # cache/database.
    entire_tree = form_tree(pcode)
    q = deque([root_path[0]])
    seen = {root_path[0]}

    # Simple BFS starting from root_path[0]
    while q:
        cur = q.popleft()
        adj_list[cur] = entire_tree[cur]
        for child in adj_list[cur]:
            if child not in seen:
                seen.add(child)
                q.append(child)

    return adj_list, pcode

In [9]:
def get_node_val(adj_list, pcode):
    """Walk the tree depth-first from ``pcode`` and record each node's level.

    Args:
        adj_list: Mapping of parent -> list of children. Every visited node
            is looked up with ``adj_list[node]``, so a plain ``dict`` must
            contain a key for every node, leaves included. On a
            ``defaultdict(list)`` the lookup inserts an empty list for each
            leaf.
        pcode: Root node at which the walk starts; it gets level 0.

    Returns:
        A tuple ``(nodes, values)``: ``nodes`` lists each reachable node once,
        in visiting order, and ``values`` maps each node to the level at
        which it was first visited.
    """
    nodes = []
    values = {}
    stk = [(pcode, 0)]

    while stk:
        node, level = stk.pop()
        # ``values`` doubles as the visited set, giving O(1) membership
        # instead of scanning the ``nodes`` list on every pop.
        if node in values:
            continue

        nodes.append(node)
        values[node] = level

        # Plain indexing is deliberate: see the note on ``adj_list`` above.
        stk.extend((child, level + 1) for child in adj_list[node])

    return nodes, values

# JSON Output

In [8]:
def to_json(pcode, adj_list, nodes, values):
    """Serialise a tree and per-node metadata to a JSON string.

    The result has two top-level keys: ``'tree'`` (the adjacency list as
    given) and ``'info'`` (one entry per node). Metadata comes from a single
    openFDA 510(k) query for ``pcode`` limited to 500 records; any node not
    found in that response gets an entry holding only its ``GENERATION``.

    Args:
        pcode: Search term for the openFDA query.
        adj_list: Mapping of parent -> list of children.
        nodes: Iterable of node identifiers to describe.
        values: Mapping of node -> generation (level in the tree).

    Returns:
        The JSON document as a string.

    Raises:
        urllib.error.URLError: If the request fails (this includes
            ``HTTPError``).
    """
    info = {}
    res = {'tree': adj_list, 'info': info}

    # Pull relevant information from openFDA API for each node
    url = f'https://api.fda.gov/device/510k.json?search={pcode}&limit=500'

    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(url) as response:
        db = json.loads(response.read())

    # Set membership keeps the scan over the API results linear.
    wanted = set(nodes)
    for doc in db['results']:
        k_number = doc['k_number']
        if k_number in wanted:
            info[k_number] = {
                'DECISION_DATE': doc['decision_date'],
                'PRODUCT_CODES': doc['product_code'],
                'DEVICE_TRADE_NAME': doc['device_name'],
                'GENERATION': values[k_number]
            }

    # Nodes the query did not return still get their generation.
    for n in nodes:
        if n not in info:
            info[n] = {'GENERATION': values[n]}

    return json.dumps(res)

### From product code to JSON
1. Generate adjacency list with form_tree, all we need to pass in is the product code
2. Get the list of nodes and their generation values from get_node_val, a simple DFS through the adj_list that extracts the relevant k numbers
3. Pass the product code, adj_list, node list, and generation values to to_json

In [10]:
def scrape_tree(pcode, verbose = False):
    """Scrape the full predicate tree of a product code and return it as JSON.

    Args:
        pcode: Product code to scrape.
        verbose: Forwarded to ``form_tree`` to print progress.

    Returns:
        The JSON string produced by ``to_json``.
    """
    tree = form_tree(pcode, verbose=verbose)
    node_order, generations = get_node_val(tree, pcode)
    return to_json(pcode, tree, node_order, generations)

### From knumber to JSON
1. Generate root path with to_root, only passing in the source knumber
2. Convert root path to adj_list and product code with convert_root_path
3. Rest is the same as with scrape_tree...

In [40]:
def scrape_branch(knumber, verbose = False):
    """Return, as JSON, the branch of the predicate tree through ``knumber``.

    The branch holds the ancestors of ``knumber`` (from ``to_root``) and its
    descendants (added by ``convert_root_path``).

    Args:
        knumber: Submission number whose branch is wanted.
        verbose: Accepted for symmetry with ``scrape_tree``; this function
            does not use it.

    Returns:
        The JSON string produced by ``to_json``.
    """
    tree, pcode = convert_root_path(to_root(knumber))
    node_order, generations = get_node_val(tree, pcode)
    return to_json(pcode, tree, node_order, generations)

In [43]:
# Build the JSON branch for submission K171422; this downloads from the FDA
# sites, so it needs network access.
res = scrape_branch('K171422', verbose = False)

adjusting tree
could not open pdf for K082457
could not open pdf for K951921
could not open pdf for K081911
could not open pdf for K091851
could not open pdf for K093696
could not open pdf for K143247
could not open pdf for K973903
could not open pdf for K974183
could not open pdf for K063627
could not open pdf for K090336
could not open pdf for K000139
could not open pdf for K040176
could not open pdf for K061505
could not open pdf for K090464
could not open pdf for K081089
could not open pdf for K980419
could not open pdf for K082125
could not open pdf for K091965
could not open pdf for K984615
could not open pdf for K011482
could not open pdf for K090464
could not open pdf for K992830
could not open pdf for K090568
could not open pdf for K060733
could not open pdf for K002819
could not open pdf for K030529
could not open pdf for K990420
could not open pdf for K080536
could not open pdf for K031944
could not open pdf for K030075
could not open pdf for K000138
could not open pdf for K

URLError: <urlopen error [Errno 51] Network is unreachable>

In [42]:
# Display the value currently bound to `res`.
res

'{"tree": {"K180647": ["K190072"], "DEN170073": ["K180647"], "QAS": ["DEN170073"], "K190072": ["K220499", "K201020", "K210237"], "K220499": [], "K201020": [], "K210237": []}, "info": {"K190072": {"DECISION_DATE": "2019-04-15", "PRODUCT_CODES": "QAS", "DEVICE_TRADE_NAME": "BriefCase", "GENERATION": 3}, "K220499": {"DECISION_DATE": "2022-05-17", "PRODUCT_CODES": "QAS", "DEVICE_TRADE_NAME": "Rapid PE Triage and Notification (PETN)", "GENERATION": 4}, "K201020": {"DECISION_DATE": "2020-08-26", "PRODUCT_CODES": "QAS", "DEVICE_TRADE_NAME": "BriefCase", "GENERATION": 4}, "DEN170073": {"DECISION_DATE": "2018-02-13", "PRODUCT_CODES": "QAS", "DEVICE_TRADE_NAME": "ContaCT", "GENERATION": 1}, "K210237": {"DECISION_DATE": "2021-05-19", "PRODUCT_CODES": "QAS", "DEVICE_TRADE_NAME": "CINA CHEST", "GENERATION": 4}, "K180647": {"DECISION_DATE": "2018-08-01", "PRODUCT_CODES": "QAS", "DEVICE_TRADE_NAME": "BriefCase", "GENERATION": 2}, "QAS": {"GENERATION": 0}}}'