#FDA Orange Book Products - drug list

###Scope
- Build drug trade-name lists from the downloaded FDA Orange Book product list

###Data Sources:
- FDA Orange Book
  - Schema: http://www.fda.gov/Drugs/InformationOnDrugs/ucm129689.htm
  - Download: http://www.fda.gov/downloads/Drugs/InformationOnDrugs/UCM163762.zip
    - products.txt contains information on individual drugs

#Imports

In [1]:
from pprint import pprint
import pickle

#Constants

In [2]:
# Path to the Orange Book products.txt file that this notebook parses.
orangebookLocation = './EOBZIP_2015_04/products.txt'

In [3]:
# Separators used by parseOrangebookProducts when splitting the products file.
OB_NEWLINE = '\r\n'  # separates the records (lines) of the file contents
OB_SEP = '~'  # separates the fields within one record

#Code

In [4]:
def parseOrangebookProducts(file, drug_type='ALL', limit=-1):
    """Return the parsed contents of an Orange Book product list.

    The first non-blank line of the file is used as the header. Every
    following non-blank line becomes a dict mapping header field -> value.
    Columns whose header contains ';' (e.g. 'DF;Route') are additionally
    split into their component fields; the compound key itself is kept.

    Args:
        file: path of the products file. Records are separated by
            OB_NEWLINE and fields by OB_SEP.
        drug_type: case-insensitive filter on the 'Type' field. One of
            ALL, RX, OTC, CURRENT (rx+otc), DISCN (discontinued).
        limit: stop after this many lines have been read (the header counts
            as a line). The default of -1 reads the whole file.

    Returns:
        List of product dicts, restricted to the requested drug type(s).

    Raises:
        Exception: if drug_type is not one of the supported values.
        AssertionError: if a compound column does not hold as many
            ';'-separated values as its header has fields.
        IndexError: if a record has more fields than the header.
    """
    # Resolve the filter first so an unsupported drug_type fails before any
    # file I/O or parsing work is done.
    wanted = drug_type.upper()
    if wanted == 'ALL':
        dt_list = None  # no filtering
    elif wanted in ('RX', 'OTC', 'DISCN'):
        dt_list = [wanted]
    elif wanted == 'CURRENT':
        dt_list = ['RX', 'OTC']
    else:
        raise Exception('parseOrangebookProducts: Unvalid drug type')

    results = []
    header = None
    # NOTE(review): splitting on OB_NEWLINE relies on open() handing back the
    # line endings untranslated; Python 3 text mode converts '\r\n' to '\n'
    # by default -- confirm before running this under Python 3.
    with open(file) as f:
        for lineno, line in enumerate(f.read().split(OB_NEWLINE), 1):
            # Blank lines (e.g. after a trailing record separator) carry no
            # record; skipping them avoids bogus one-field entries.
            if line:
                contents = line.split(OB_SEP)
                if header is None:
                    header = contents
                else:
                    entry = {header[i]: val for i, val in enumerate(contents)}
                    # Special case for complex entries: further parse them.
                    # The compound keys are snapshotted first because the
                    # dict grows while they are being expanded.
                    for key in [k for k in entry if ';' in k]:
                        fields = key.split(';')
                        values = entry[key].split(';')
                        assert len(fields) == len(values)
                        entry.update(zip(fields, values))
                    results.append(entry)
            if lineno == limit:
                break

    if dt_list is None:
        return results
    # Filter the records parsed by this call, not a module-level variable.
    return [x for x in results if x['Type'] in dt_list]

In [5]:
def getUniqueTradeNames(products):
    """Return the distinct drug trade names found in products.

    Each product's 'Trade_Name' is lower-cased before de-duplication; the
    result is a sorted list.
    """
    seen = set()
    for product in products:
        seen.add(product['Trade_Name'].lower())
    unique_names = list(seen)
    unique_names.sort()
    return unique_names

#FDA Orangebook Product list
(previously downloaded)

In [6]:
# Parse the complete product list (no drug-type filter) and report its size.
orangebook_products = parseOrangebookProducts(orangebookLocation)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(len(orangebook_products))

30457


In [7]:
# Show the first parsed product record to inspect the available fields.
orangebook_products[0]

{'Appl_No': '205613',
 'Appl_Type': 'N',
 'Applicant': 'VALEANT PHARMS INTL',
 'Applicant_Full_Name': 'VALEANT PHARMACEUTICALS INTERNATIONAL',
 'Approval_Date': 'Oct 7, 2014',
 'DF': 'AEROSOL, FOAM',
 'DF;Route': 'AEROSOL, FOAM;RECTAL',
 'Ingredient': 'BUDESONIDE',
 'Product_No': '001',
 'RLD': 'Yes',
 'Route': 'RECTAL',
 'Strength': '2MG/ACTUATION',
 'TE_Code': '',
 'Trade_Name': 'UCERIS',
 'Type': 'RX'}

In [8]:
# Distinct values of the 'Type' field across all parsed products.
{product['Type'] for product in orangebook_products}

{'DISCN', 'OTC', 'RX'}

In [9]:
# Unique trade names across every product, regardless of drug type.
all_drugs = getUniqueTradeNames(parseOrangebookProducts(orangebookLocation, drug_type='ALL'))
# print() with a single argument behaves the same on Python 2 and Python 3.
print('Total drugs:{}'.format(len(all_drugs)))
all_drugs[:20]

Total drugs:6317


['"clopra-""yellow"""',
 '"germa-medica ""mg"""',
 '"hy-pam ""25"""',
 '"hydro-serp ""25"""',
 '"hydro-serp ""50"""',
 '8-hour bayer',
 '8-mop',
 'a-hydrocort',
 'a-methapred',
 'a-n stannous aggregated albumin',
 'a-poxide',
 'a.p.l.',
 'a/t/s',
 'abacavir sulfate',
 'abacavir sulfate, lamivudine and zidovudine',
 'abelcet',
 'abilify',
 'abilify maintena kit',
 'abitrexate',
 'ablavar']

In [10]:
# Persist the list; the with-block closes (and flushes) the file handle.
with open("all_drugs.p", "wb") as pickle_file:
    pickle.dump(all_drugs, pickle_file)

In [11]:
# Unique trade names of currently marketed products (RX + OTC).
current_drugs = getUniqueTradeNames(parseOrangebookProducts(orangebookLocation,
                                                            drug_type='CURRENT'))
# print() with a single argument behaves the same on Python 2 and Python 3.
print('Total drugs:{}'.format(len(current_drugs)))
current_drugs[:20]

Total drugs:3785


['8-mop',
 'a-hydrocort',
 'a-methapred',
 'abacavir sulfate',
 'abacavir sulfate, lamivudine and zidovudine',
 'abelcet',
 'abilify',
 'abilify maintena kit',
 'ablavar',
 'abraxane',
 'abreva',
 'absorica',
 'abstral',
 'acamprosate calcium',
 'acanya',
 'acarbose',
 'accolate',
 'accuneb',
 'accupril',
 'accuretic']

In [12]:
# Persist the list; the with-block closes (and flushes) the file handle.
with open("current_drugs.p", "wb") as pickle_file:
    pickle.dump(current_drugs, pickle_file)

In [13]:
# Unique trade names of products whose 'Type' is RX.
rx_drugs = getUniqueTradeNames(parseOrangebookProducts(orangebookLocation,
                                                       drug_type='RX'))
# print() with a single argument behaves the same on Python 2 and Python 3.
print('Total drugs:{}'.format(len(rx_drugs)))
rx_drugs[:20]

Total drugs:3568


['8-mop',
 'a-hydrocort',
 'a-methapred',
 'abacavir sulfate',
 'abacavir sulfate, lamivudine and zidovudine',
 'abelcet',
 'abilify',
 'abilify maintena kit',
 'ablavar',
 'abraxane',
 'absorica',
 'abstral',
 'acamprosate calcium',
 'acanya',
 'acarbose',
 'accolate',
 'accuneb',
 'accupril',
 'accuretic',
 'acebutolol hydrochloride']

In [14]:
# Persist the list; the with-block closes (and flushes) the file handle.
with open("rx_drugs.p", "wb") as pickle_file:
    pickle.dump(rx_drugs, pickle_file)

In [15]:
# Unique trade names of products whose 'Type' is OTC.
otc_drugs = getUniqueTradeNames(parseOrangebookProducts(orangebookLocation,
                                                        drug_type='OTC'))
# print() with a single argument behaves the same on Python 2 and Python 3.
print('Total drugs:{}'.format(len(otc_drugs)))
otc_drugs[:20]

Total drugs:242


['abreva',
 'acephen',
 'acetaminophen',
 'acetaminophen, aspirin and caffeine',
 'advil',
 'advil allergy and congestion relief',
 'advil allergy sinus',
 'advil cold and sinus',
 'advil congestion relief',
 'advil liqui-gels',
 'advil migraine liqui-gels',
 'advil pm',
 'afrinol',
 'alavert',
 'alaway',
 'aleve',
 'aleve pm',
 'aleve-d sinus & cold',
 'allegra allergy',
 'allegra hives']

In [16]:
# Persist the list; the with-block closes (and flushes) the file handle.
with open("otc_drugs.p", "wb") as pickle_file:
    pickle.dump(otc_drugs, pickle_file)

In [17]:
# Unique trade names of discontinued products ('Type' is DISCN).
discontinued_drugs = getUniqueTradeNames(parseOrangebookProducts(orangebookLocation,
                                                                 drug_type='DISCN'))
# print() with a single argument behaves the same on Python 2 and Python 3.
print('Total drugs:{}'.format(len(discontinued_drugs)))
discontinued_drugs[:20]

Total drugs:3599


['"clopra-""yellow"""',
 '"germa-medica ""mg"""',
 '"hy-pam ""25"""',
 '"hydro-serp ""25"""',
 '"hydro-serp ""50"""',
 '8-hour bayer',
 'a-hydrocort',
 'a-methapred',
 'a-n stannous aggregated albumin',
 'a-poxide',
 'a.p.l.',
 'a/t/s',
 'abilify',
 'abitrexate',
 'accretropin',
 'accurbron',
 'accutane',
 'acebutolol hydrochloride',
 'acephen',
 'acetaminophen']

In [18]:
# Persist the list; the with-block closes (and flushes) the file handle.
with open("discontinued_drugs.p", "wb") as pickle_file:
    pickle.dump(discontinued_drugs, pickle_file)