# 510(k) database

In [5]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from typing import List
import tempfile
import urllib.request
from os import path
import PyPDF2 as ppdf
import re

In [6]:
def get_sub_numbers(pcodes: List[str]) -> List[str]:
    """
    Collects the 510(k) premarket submission numbers listed for each product code

    Runs one search against the FDA 510(k) database per product code and reads
    the submission number out of the third cell of every result row.

    Params:
        pcodes :: list of interested product codes

    Returns:
        list[str] :: scraped submission document numbers, in page order,
                     concatenated across all product codes
    """

    sub_numbers = []
    for pcode in pcodes:
        # TODO: Might need to add more options to this URL
        search_url = f'https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?start_search=1&Center=CDRH&Panel=&ProductCode={pcode}&KNumber=&Applicant=&DeviceName=&Type=Traditional&ThirdPartyReviewed=&ClinicalTrials=&Decision=&DecisionDateFrom=&DecisionDateTo=07%2F13%2F2022&IVDProducts=&Redact510K=&CombinationProducts=&ZNumber=&PAGENUM=500'
        page = requests.get(search_url).text

        # The fourth table on the page is the table of devices
        device_table = BeautifulSoup(page, 'lxml').find_all('table')[3]

        # The first four rows are boilerplate, not device entries
        device_rows = device_table.tbody.find_all('tr')[4:]

        sub_numbers.extend(row.find_all('td')[2].text for row in device_rows)

    return sub_numbers

In [23]:
def get_labels(labels: List[str], sub_numbers: List[str]) -> pd.DataFrame:
    """
    Scrapes the detail page of every submission and tabulates the requested fields

    Params:
        labels :: categories from the 510(k) database (e.g. Product Codes, Applicant);
                  matched exactly against the row header text of the detail table
        sub_numbers :: a list of submission document numbers

    Returns:
        DataFrame :: one row per document, with 'Submission Number' and the labels
                     as columns (None where a label was not found on the page)
    """
    records = []
    for sub_num in sub_numbers:
        page = requests.get(
            f'https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID={sub_num}').text

        # The ninth table on the page is the central table holding the details
        central_table = BeautifulSoup(page, 'lxml').find_all('table')[8]

        record = {'Submission Number': sub_num}

        # Keep only the rows whose header is one of the requested labels
        for row in central_table.find_all('tr'):
            header = row.th
            if header is None or header.text not in labels:
                continue
            record[header.text] = row.td.text

        # Labels missing from the page still get a (None) entry
        for lab in labels:
            record.setdefault(lab, None)

        records.append(record)

    return pd.DataFrame(records)

In [59]:
# Form dataframe of submission documents for product code 'QAS'.
# More labels can be added to the list passed to get_labels to scrape extra columns.
snums = get_sub_numbers(['QAS'])  # submission numbers scraped from the search results
df = get_labels(['Device Name', 'Applicant',
                'Date Received', 'Decision Date'], snums)

In [60]:
def clean_cols(df: pd.DataFrame):
    """
    Cleans the scraped columns in place. For now only the applicant column is
    handled. Should add kwargs ??

    The applicant cell is split on newlines and replaced by its third piece;
    cells with fewer than three pieces become NaN.

    Params:
        df :: DataFrame with an 'Applicant' column of strings (modified in place)
    """
    applicant_pieces = df['Applicant'].str.split('\n')
    df['Applicant'] = applicant_pieces.str.get(2)

# clean_cols overwrites df['Applicant'] in place and returns nothing
clean_cols(df)

In [61]:
df

Unnamed: 0,Submission Number,Device Name,Applicant,Date Received,Decision Date
0,K220499,Rapid PE Triage and Notification (PETN),iSchemaView Inc.,02/22/2022,05/17/2022
1,K213886,BriefCase,"Aidoc Medical, Ltd.",12/13/2021,04/26/2022
2,K213721,BriefCase,"Aidoc Medical, Ltd.",11/24/2021,03/21/2022
3,K203260,syngo.CT Brain Hemorrhage,"Siemens Medical Solutions USA, Inc.",11/05/2020,01/28/2022
4,K212261,StrokeSENS LVO,"Circle Neurovascular Imaging, Inc",07/20/2021,10/14/2021
5,K211179,InferRead CT Stroke.AI,"Infervision Medical Technology Co., Ltd.",04/20/2021,08/12/2021
6,K210237,CINA CHEST,Avicenna.AI,01/28/2021,05/19/2021
7,K210209,Viz ICH,"Viz.ai, Inc.",01/26/2021,03/23/2021
8,K200873,HALO,NICo-Lab B.V.,04/01/2020,11/20/2020
9,K201020,BriefCase,"Aidoc Medical, Ltd.",04/17/2020,08/26/2020


In [62]:
def form_pdf_url(df: pd.DataFrame):
    """
    Adds a 'PDF URL' column with the summary-PDF link of each submission document

    The folder in the URL is 'pdf' followed by the last two characters of
    'Date Received' (the two-digit year), and the file is named after the
    submission number.

    Params:
        df :: DataFrame with 'Date Received' and 'Submission Number' string
              columns (modified in place)
    """

    year_suffix = df['Date Received'].str[-2:]
    file_name = df['Submission Number'] + '.pdf'

    df['PDF URL'] = 'https://www.accessdata.fda.gov/cdrh_docs/pdf' + year_suffix + '/' + file_name

In [63]:
# Add the 'PDF URL' column to df in place
form_pdf_url(df)

# Remove the column-width limit so the URLs are displayed in full
pd.set_option('max_colwidth', None)
df['PDF URL']

0     https://www.accessdata.fda.gov/cdrh_docs/pdf22/K220499.pdf
1     https://www.accessdata.fda.gov/cdrh_docs/pdf21/K213886.pdf
2     https://www.accessdata.fda.gov/cdrh_docs/pdf21/K213721.pdf
3     https://www.accessdata.fda.gov/cdrh_docs/pdf20/K203260.pdf
4     https://www.accessdata.fda.gov/cdrh_docs/pdf21/K212261.pdf
5     https://www.accessdata.fda.gov/cdrh_docs/pdf21/K211179.pdf
6     https://www.accessdata.fda.gov/cdrh_docs/pdf21/K210237.pdf
7     https://www.accessdata.fda.gov/cdrh_docs/pdf21/K210209.pdf
8     https://www.accessdata.fda.gov/cdrh_docs/pdf20/K200873.pdf
9     https://www.accessdata.fda.gov/cdrh_docs/pdf20/K201020.pdf
10    https://www.accessdata.fda.gov/cdrh_docs/pdf20/K201310.pdf
11    https://www.accessdata.fda.gov/cdrh_docs/pdf20/K200941.pdf
12    https://www.accessdata.fda.gov/cdrh_docs/pdf20/K200855.pdf
13    https://www.accessdata.fda.gov/cdrh_docs/pdf19/K193298.pdf
14    https://www.accessdata.fda.gov/cdrh_docs/pdf20/K200921.pdf
15    https://www.accessd

In [72]:
def scan_pdf(df: pd.DataFrame, keyword: str, sz=50):
    """
    Naive keyword parsing. Downloads every PDF in the 'PDF URL' column and, on each
    page where the keyword matches, grabs `sz` characters starting at the first match.

    Page text is lower-cased before searching, so the returned fragments are
    lower-case; the keyword itself is matched case-insensitively.

    Params:
        df :: DataFrame with a PDF URL column
        keyword :: regular expression to look for on each page
        sz :: maximum length of the fragment grabbed at each match (default 50)

    Returns:
        list[list[str]] :: one list per URL (in column order) holding one fragment
                           for every page on which the keyword was found.
                           The same structure is also printed.
    """
    # Compile once; IGNORECASE because the page text is lower-cased below and a
    # keyword containing capitals could otherwise never match.
    pattern = re.compile(keyword, re.IGNORECASE)

    kw_frags = []
    for url in df['PDF URL']:
        print(url)
        frags = []

        # Pull PDF data from url (the response is closed as soon as it is read)
        with urllib.request.urlopen(url) as response:
            pg_data = response.read()

        # Stage the data in an anonymous temp file: it is rewound before reading,
        # closed and removed automatically, and not shared between downloads.
        with tempfile.TemporaryFile() as pdf_file:
            pdf_file.write(pg_data)
            pdf_file.seek(0)

            # Pages are read from the stream on demand, so the file must stay open
            pdf_doc = ppdf.PdfFileReader(pdf_file)
            for pg in pdf_doc.pages:
                try:
                    pg_text = pg.extract_text()
                except KeyError as e:
                    # Text extraction raises KeyError on some pages of real
                    # documents; skip the page instead of aborting the scan.
                    print('found error ', e)
                    continue

                pg_content = (pg_text + '\n').lower()

                # regex match for keyword
                match = pattern.search(pg_content)
                if match is not None:
                    start = match.start()
                    # slicing clamps at the end of the page text
                    frags.append(pg_content[start:start + sz])

        kw_frags.append(frags)

    print(kw_frags)
    return kw_frags

In [2]:
# scan_pdf(df, 'hello')

# TODO: Figure out what goes wrong here

# CASE: syngo.CT Brain Hemorrhage

# Submission number: K203260

In [53]:
def validate_pcode(pcode: str) -> bool:
    """
    Checks that the whole string is 'K' or 'DEN' followed by exactly six digits

    Params:
        pcode :: candidate number, e.g. K203260 or DEN170073

    Returns:
        bool :: True only if the entire string has that form
    """

    return re.fullmatch('(K|DEN)[0-9]{6}', pcode) is not None

In [54]:
def find_pcodes(pcode: str) -> List[str]:
    """
    Downloads the PDF of a submission document and returns the other
    K/DEN numbers mentioned in it

    Only the first K/DEN number on each page is considered. Progress (the URL,
    the page being scanned and the number found on it) is printed along the way.

    Params:
        pcode :: number of the document to scan, e.g. KXXXXXX, DENXXXXXX

    Returns:
        list[str] :: first number found on each page, in page order, leaving out
                     pages that only repeat `pcode` itself (may contain repeats)
    """

    found_pcodes = []

    # The two digits after the 'K' / 'DEN' prefix are used as the year folder
    yr = pcode[1:3] if pcode.startswith('K') else pcode[3:5]

    # Form URL
    url = f'https://www.accessdata.fda.gov/cdrh_docs/pdf{yr}/{pcode}.pdf'

    print(url)

    # Pull PDF data (the response is closed as soon as it is read)
    with urllib.request.urlopen(url) as response:
        pg_data = response.read()

    # Stage the data in an anonymous temp file: it is rewound before reading,
    # closed and removed automatically, and not shared between calls.
    with tempfile.TemporaryFile() as pdf_file:
        pdf_file.write(pg_data)
        pdf_file.seek(0)

        # Pages are read from the stream on demand, so the file must stay open
        doc = ppdf.PdfFileReader(pdf_file)
        for pg_num, pg in enumerate(doc.pages):
            print(f'currently scanning page {pg_num}')

            try:
                pg_content = pg.extract_text() + '\n'
            except KeyError as e:
                # Text extraction fails on some pages; skip them and keep going
                print('found error ', e)
                continue

            match = re.search('(K|DEN)[0-9]{6}', pg_content)
            if match is None:
                continue

            found = match.group(0)
            print(found)

            # Weed out the document's own number; sanity-check the number that
            # was found (not the input argument) before keeping it
            if found != pcode and validate_pcode(found):
                found_pcodes.append(found)

    return found_pcodes
    


In [55]:
# Scan the document from the case noted above (syngo.CT Brain Hemorrhage)
pcode = 'K203260'

found = find_pcodes(pcode)

https://www.accessdata.fda.gov/cdrh_docs/pdf20/K203260.pdf
currently scanning page 0
K203260
currently scanning page 1
K203260
currently scanning page 2
currently scanning page 3
K203260
currently scanning page 4
K182177
currently scanning page 5
currently scanning page 6


In [56]:
found

['K182177']

In [57]:
# Follow the chain: scan the first number found in the previous document
pcode = found[0]

found2 = find_pcodes(pcode)

https://www.accessdata.fda.gov/cdrh_docs/pdf18/K182177.pdf
currently scanning page 0
K182177
currently scanning page 1
K182177
currently scanning page 2
found error  '/W'
currently scanning page 3
DEN170073
currently scanning page 4
currently scanning page 5


In [58]:
found2

['DEN170073']

In [59]:
# One step further: scan the first number found in the second document
pcode = found2[0]

found3 = find_pcodes(pcode)

https://www.accessdata.fda.gov/cdrh_docs/pdf17/DEN170073.pdf
currently scanning page 0
DEN170073
currently scanning page 1
DEN170073
currently scanning page 2
DEN170073
currently scanning page 3
DEN170073
currently scanning page 4
DEN170073


In [60]:
found3

[]

# Putting together the pieces

In [65]:
def form_predicate_tree(source: str) -> List[str]:
    """
    Naive way to find predicate devices starting from any source submission number

    Repeatedly scans the PDF of the newest entry in the chain and follows the
    first number found there that is not in the chain yet. Stops once a DEN
    number is reached or a document yields no new number.

    Params:
        source: the starting submission number

    Returns:
        List[str]: the chain of numbers, starting with `source` and followed by
                   each predicate in the order it was discovered.
                   None (after printing a message) if `source` is not a valid number.
    """

    if not validate_pcode(source):
        print("Invalid product code found")
        return None

    res = [source]

    while res[-1][0:3] != 'DEN':
        next_pcodes = find_pcodes(res[-1])

        # Grab the first number found in the document that is not in the chain yet
        next_pcode = next((c for c in next_pcodes if c not in res), None)
        if next_pcode is None:
            break  # covers both the nothing-found case and the no-unique case

        res.append(next_pcode)

    return res

In [66]:
# Build the whole chain in one call, starting from the same submission as above
tree = form_predicate_tree('K203260')

https://www.accessdata.fda.gov/cdrh_docs/pdf20/K203260.pdf
currently scanning page 0
K203260
currently scanning page 1
K203260
currently scanning page 2
currently scanning page 3
K203260
currently scanning page 4
K182177
currently scanning page 5
currently scanning page 6
https://www.accessdata.fda.gov/cdrh_docs/pdf18/K182177.pdf
currently scanning page 0
K182177
currently scanning page 1
K182177
currently scanning page 2
found error  '/W'
currently scanning page 3
DEN170073
currently scanning page 4
currently scanning page 5


In [67]:
tree

# success?

['K203260', 'K182177', 'DEN170073']