# Finding the predicate...

In [55]:
import io
import json
import re
import tempfile
import urllib.request
from collections import Counter, defaultdict
from os import path
from typing import List, Optional

import pandas as pd
import PyPDF2 as ppdf
from tqdm import tqdm

In [78]:
def find_predicate(pcode: str) -> Optional[str]:
    """Guess the predicate device cited in a submission's summary PDF.

    Downloads the PDF for ``pcode`` from accessdata.fda.gov. On every page
    that mentions "PREDICATE", it takes the first submission number
    (``P``/``DEN``/``K`` followed by six digits) that appears after that word
    and differs from ``pcode``. The candidate chosen on the most pages wins.

    Args:
        pcode: Submission number such as ``'K203508'``. Only numbers that
            start with ``'K'`` are looked up.

    Returns:
        The most frequently selected candidate, or ``None`` when ``pcode``
        does not start with ``'K'``, the PDF cannot be parsed, or no page
        yields a candidate. Ties go to the candidate that was seen first.

    Raises:
        Whatever ``urllib.request.urlopen`` raises if the download fails;
        download errors are deliberately not swallowed here.
    """
    # startswith() also rejects an empty string instead of raising IndexError.
    if not pcode.startswith('K'):
        return None

    # Characters 1-2 of a K number are used as the directory suffix.
    # NOTE(review): assumes the two-digit value maps directly onto the
    # `pdfYY` directory for every year -- confirm for older submissions.
    yr = pcode[1:3]
    url = f'https://www.accessdata.fda.gov/cdrh_docs/pdf{yr}/{pcode}.pdf'

    # Close the HTTP response as soon as the payload has been read.
    with urllib.request.urlopen(url) as response:
        pg_data = response.read()

    # Parse from memory: no shared temp file, no unflushed write handle, and
    # no file descriptors left open after the call.
    try:
        doc = ppdf.PdfFileReader(io.BytesIO(pg_data))
    except Exception:
        # In case the document is too old (or is not a readable PDF at all)
        print(f'could not open pdf for {pcode}')
        return None

    code_pattern = re.compile(r'(P|DEN|K)([0-9]{6})')
    keyword = 'PREDICATE'
    found = []

    for pg in doc.pages:
        try:
            pg_content = (pg.extract_text() + '\n').upper()
        except Exception:
            # Best effort by design: a page whose text cannot be extracted
            # is skipped so the remaining pages can still vote.
            continue

        start = pg_content.find(keyword)
        if start == -1:
            continue

        # Scan the text after the first 'PREDICATE' in order of appearance
        # and keep the first code that is not the submission itself.
        for match in code_pattern.finditer(pg_content, start + len(keyword)):
            cand = match.group(0)
            if cand != pcode:
                found.append(cand)
                break

    # Return the most frequently occurring candidate as an extra layer of
    # assurance; Counter keeps first-seen order, so ties are deterministic.
    return Counter(found).most_common(1)[0][0] if found else None


In [4]:
def get_nums(pcode: str, limit: int) -> List[str]:
    """Fetch the ``k_number`` of every record the openFDA 510(k) API returns.

    Args:
        pcode: Value interpolated verbatim into the ``search`` query
            parameter (callers in this file pass a product code such as
            ``'QAS'``).
        limit: Value interpolated into the ``limit`` query parameter.

    Returns:
        The ``"k_number"`` field of each entry under ``"results"``, in the
        order the API returned them.

    Raises:
        Whatever ``urllib.request.urlopen`` raises if the request fails, and
        ``KeyError`` if the response lacks ``"results"`` / ``"k_number"``.
    """
    url = f'https://api.fda.gov/device/510k.json?search={pcode}&limit={limit}'

    # The context manager closes the HTTP connection once the body is read.
    with urllib.request.urlopen(url) as response:
        db = json.loads(response.read())

    return [d["k_number"] for d in db["results"]]

In [47]:
def to_root(src: str):
    """Follow predicate links from ``src`` until the chain cannot be extended.

    Starting at ``src``, repeatedly asks ``find_predicate`` for the current
    device's predicate. The walk stops when the current entry does not start
    with ``'K'``, when no predicate is found, or when the predicate is
    already in the chain (which would otherwise loop forever).

    Returns:
        The chain as a list, beginning with ``src`` and ending with the last
        device reached.
    """
    chain = [src]
    current = src

    while current[0] == 'K':
        predecessor = find_predicate(current)

        # Stop on a dead end or on a cycle back into the chain.
        if predecessor is None or predecessor in chain:
            break

        chain.append(predecessor)
        current = predecessor

    return chain

In [None]:
# MYN
# ├────P000041
# │  ├────K201560
# │  └────K210666
# └────P980025
#    ├────K212519
#    ├────K213795
#    └────K210365

In [None]:
# QAS
# └────DEN170073
#      ├────K182177
#      │    ├────K203260
#      │    ├────K201310
#      │    └────K190424
#      ├────K200873
#      │    └────K211788
#      ├────K212261
#      ├────K180647
#      │    ├────K193298
#      │    ├────K190896
#      │    ├────K193351
#      │    ├────K200855
#      │    ├────K203508
#      │    │    ├────K221240
#      │    │    ├────K213886
#      │    │    ├────K213721
#      │    │    └────K221314
#      │    ├────K192167
#      │    ├────K200921
#      │    ├────K192383
#      │    ├────K182875
#      │    ├────K190072
#      │    │    ├────K210237
#      │    │    ├────K220499
#      │    │    └────K201020
#      │    ├────K211179
#      │    └────K193087
#      │         └────K200941
#      │              └────K221248
#      └────K193658
#           └────K210209

In [32]:
# Sanity check: how many k_numbers does the API return for 'QAS' with limit=100?
print(len(get_nums('QAS', 100)))

38


In [88]:
def form_tree(pcode: str, limit: int = 100):
    """Build a parent -> children adjacency list of predicate relationships.

    For every number returned by ``get_nums(pcode, limit)`` the predicate
    chain is traced with ``to_root``. Each consecutive pair in a chain becomes
    a parent -> child edge, where the parent is the predicate. Nodes that
    never appear as a child are finally attached beneath ``pcode`` itself.

    Chains of length one (no predicate found) contribute no edges and so do
    not appear in the result.

    Args:
        pcode: Product code used both for the lookup and as the root key.
        limit: Maximum number of records to request; defaults to 100, the
            value that was previously hard-coded.

    Returns:
        A ``defaultdict(list)`` mapping each parent to its children in
        first-seen order.
    """
    nums = get_nums(pcode, limit)

    adj_list = defaultdict(list)
    # A dict serves as an insertion-ordered set, so the roots attached under
    # `pcode` come out in a reproducible order (set iteration order of
    # strings varies between runs).
    seen = {}
    has_parent = set()

    total = len(nums)
    for idx, num in enumerate(nums, start=1):
        print(f'Starting new path at {num}, iter {idx}/{total}')
        chain = to_root(num)
        print(chain, '\n')

        # Walk from the root end of the chain back toward the starting device.
        for pos in range(len(chain) - 1, 0, -1):
            parent, child = chain[pos], chain[pos - 1]
            seen[parent] = None
            seen[child] = None
            has_parent.add(child)
            if child not in adj_list[parent]:
                adj_list[parent].append(child)

    # Anything that was never a child is a top-level root for this code.
    for node in seen:
        if node not in has_parent:
            adj_list[pcode].append(node)

    return adj_list


In [89]:
# Build the predicate tree for product code 'MYN'; form_tree prints each traced path.
adj_list = form_tree('MYN')

Starting new path at K213795, iter 1/5
['K213795', 'P980025'] 

Starting new path at K210666, iter 2/5
['K210666', 'P000041'] 

Starting new path at K212519, iter 3/5
['K212519', 'P980025'] 

Starting new path at K201560, iter 4/5
['K201560', 'P000041'] 

Starting new path at K210365, iter 5/5
['K210365', 'P980025'] 



In [91]:
# Rebuild adj_list for product code 'QAS' (replaces the 'MYN' result above).
adj_list = form_tree('QAS')

Starting new path at K192383, iter 1/38
['K192383', 'K180647', 'DEN170073'] 

Starting new path at K193658, iter 2/38
['K193658', 'DEN170073'] 

Starting new path at K203260, iter 3/38
['K203260', 'K182177', 'DEN170073'] 

Starting new path at K200855, iter 4/38
['K200855', 'K180647', 'DEN170073'] 

Starting new path at K791354, iter 5/38
could not open pdf for K791354
['K791354'] 

Starting new path at K190896, iter 6/38
['K190896', 'K180647', 'DEN170073'] 

Starting new path at K221314, iter 7/38
['K221314', 'K203508', 'K190072', 'K180647', 'DEN170073'] 

Starting new path at K190072, iter 8/38
['K190072', 'K180647', 'DEN170073'] 

Starting new path at K890414, iter 9/38
could not open pdf for K890414
['K890414'] 

Starting new path at K221240, iter 10/38
['K221240', 'K203508', 'K190072', 'K180647', 'DEN170073'] 

Starting new path at K213886, iter 11/38
['K213886', 'K203508', 'K190072', 'K180647', 'DEN170073'] 

Starting new path at K193087, iter 12/38
['K193087', 'K180647', 'DEN170

In [92]:
# Display the parent -> children mapping currently held in adj_list.
adj_list

defaultdict(list,
            {'DEN170073': ['K180647',
              'K193658',
              'K182177',
              'K212261',
              'K182875',
              'K200873'],
             'K180647': ['K192383',
              'K200855',
              'K190896',
              'K190072',
              'K193087',
              'K193298',
              'K193351',
              'K200921',
              'K211179'],
             'K182177': ['K203260', 'K192167', 'K190424', 'K201310'],
             'K190072': ['K203508', 'K220499', 'K201020', 'K210237'],
             'K203508': ['K221314', 'K221240', 'K213886', 'K213721'],
             'K193087': ['K200941'],
             'K193658': ['K210209'],
             'K200941': ['K221248'],
             'QAS': ['DEN170073']})

In [118]:
import networkx as nx
from pyvis.network import Network

In [136]:
# Node size used for any depth that has no entry in SIZE_DICT.
DEFAULT = 50
# Node size by depth below the starting node (0 = the starting node itself).
SIZE_DICT = {0: 400, 1: 200, 2: 100}


def graph_info(adj_list, pcode):
    """List every node reachable from ``pcode`` together with a display size.

    Performs an iterative depth-first walk using an explicit stack. Each node
    is reported once, at the depth at which it is first popped. Its size comes
    from ``SIZE_DICT`` for depths 0-2 and is ``DEFAULT`` for anything deeper.

    Args:
        adj_list: Mapping of parent -> list of children. It is only read:
            leaves are looked up with ``.get`` so a ``defaultdict`` does not
            gain empty entries and a plain ``dict`` does not raise.
        pcode: Node at which the walk starts (depth 0).

    Returns:
        ``(nodes, values)``: two parallel lists holding the nodes in visit
        order and their sizes.
    """
    nodes = []
    values = []
    seen = set()  # O(1) membership instead of scanning `nodes`

    stk = [(pcode, 0)]
    while stk:
        node, level = stk.pop()
        if node in seen:
            continue

        seen.add(node)
        nodes.append(node)
        values.append(SIZE_DICT.get(level, DEFAULT))

        # Children are pushed in order, so the last child is visited first.
        stk.extend((child, level + 1) for child in adj_list.get(node, ()))

    return nodes, values

In [140]:
# Visit order and per-node sizes for the current adj_list, starting from 'QAS'.
nodes, values = graph_info(adj_list, 'QAS')

In [127]:
class GraphVisual:
    """Accumulates edges and renders them as an HTML graph through pyvis."""

    def __init__(self, pcode):
        # `name` is reused as the stem of the HTML file name in visualize().
        self.name = pcode
        self.edges = []

    def add_edge(self, a, b):
        """Record an edge between ``a`` and ``b`` as a two-item list."""
        pair = [a, b]
        self.edges.append(pair)

    def visualize(self):
        """Build an undirected networkx graph from the edges and show it.

        The graph is handed to a pyvis ``Network`` created with
        ``notebook=True`` and shown under the file name ``'<name>.html'``.
        """
        graph = nx.Graph()
        graph.add_edges_from(self.edges)

        target = f'{self.name}.html'
        viewer = Network(notebook=True)
        viewer.from_nx(graph)
        viewer.show(target)


In [129]:
# Feed every parent -> child pair from adj_list into the visualiser, then render it.
G = GraphVisual('QAS')

for parent, children in adj_list.items():
    for child in children:
        G.add_edge(parent, child)

G.visualize()