In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm

CDER Website:
https://www.fda.gov/drugs/development-approval-process-drugs/new-drugs-fda-cders-new-molecular-entities-and-new-therapeutic-biological-products

NME Biological Approvals:
https://www.fda.gov/vaccines-blood-biologics/development-approval-process-cber/biological-approvals-year

PDF:
https://www.fda.gov/drugs/nda-and-bla-approvals/new-molecular-entity-nme-drug-and-new-biologic-approvals

In [21]:
def get_pdf_links(url):
    """
    Collects the document download links found on a web page.

    Parameters:
    - url (str): Address of the page to scan.

    Returns:
    - pdf_links (list): Absolute URLs of every hyperlink whose href contains 'download'.

    Raises:
    - requests.exceptions.RequestException: If the request fails, times out,
      or the server answers with an HTTP error status.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    # A timeout stops the notebook from hanging forever on a stalled connection.
    site = requests.get(url, timeout=30)
    # Fail loudly instead of silently parsing an error page into an empty list.
    site.raise_for_status()
    soup = BeautifulSoup(site.content, 'html.parser')

    # Find all links in the webpage
    links = soup.find_all('a', href=True)

    # Keep only download links. The hrefs are matched on the 'download'
    # substring (e.g. /media/<id>/download), not on a '.pdf' suffix.
    pages = [link['href'] for link in links if 'download' in link['href']]

    # Resolve relative, root-relative and protocol-relative hrefs against the
    # page URL; absolute hrefs are returned unchanged.
    pdf_links = [urljoin(url, href) for href in pages]

    return pdf_links

In [24]:
# Index page that lists the yearly NME / new biologic approval reports.
url = (
    'https://www.fda.gov/drugs/nda-and-bla-approvals/'
    'new-molecular-entity-nme-drug-and-new-biologic-approvals'
)

# Gather the download links from that page and display them.
links = get_pdf_links(url)
links

['https://www.fda.gov/media/177083/download?attachment',
 'https://www.fda.gov/media/165828/download?attachment',
 'https://www.fda.gov/media/158152/download?attachment',
 'https://www.fda.gov/media/147400/download?attachment',
 'https://www.fda.gov/media/147414/download?attachment',
 'https://www.fda.gov/media/124809/download?attachment',
 'https://www.fda.gov/media/110746/download',
 'https://www.fda.gov/media/102967/download',
 'https://www.fda.gov/media/93424/download']

In [25]:
# TODO: decide what should happen with each report (e.g. download or parse it).
# Until then the loop only lists the links so the cell runs on a fresh kernel.
for link in links:
    print(link)

In [2]:
def extract_links_from_fda_drugname(table_provided):
    """
    Pulls the first hyperlink (target and text) out of every data row of a table.

    Parameters:
    - table_provided (BeautifulSoup): HTML table containing drug information.

    Returns:
    - links (list): One href per data row; '' when the row has no hyperlink
      or the hyperlink has no href.
    - names (list): One link text per data row; '' when the row has no hyperlink.
    """
    hrefs = []
    labels = []

    # The first <tr> is the header row, so it is left out.
    data_rows = table_provided.select("tr")[1:]

    for row in data_rows:
        # Defaults used when the row carries no hyperlink.
        href, label = '', ''
        try:
            anchor = row.find("a")
            if anchor is not None:
                href, label = anchor.get('href', ''), anchor.get_text()
        except (AttributeError, IndexError):
            # A malformed row yields blanks rather than aborting the scrape.
            href, label = '', ''

        hrefs.append(href)
        labels.append(label)

    return hrefs, labels

def scrape_fda_drug_approvals(start_year, end_year):
    """
    Scrapes FDA drug approvals data from specified years.

    Years whose page cannot be retrieved (non-200 status) or whose page has no
    table are reported with a printed message and skipped.

    Parameters:
    - start_year (int): The starting year for scraping.
    - end_year (int): The ending year for scraping (inclusive).

    Returns:
    - df_final (DataFrame): Pandas DataFrame containing drug approval information,
      with the extra columns 'links' and 'check_names' taken from the first
      hyperlink of each table row. An empty DataFrame is returned when no year
      produced a table (including when start_year > end_year).

    Raises:
    - requests.exceptions.RequestException: If a request fails or times out.
    - ValueError: If the number of scraped links differs from the number of
      rows pandas parsed for a year's table.
    """
    # Local import keeps this cell self-contained.
    from io import BytesIO

    # Initialize an empty list to store DataFrames
    tables = []

    # Iterate through each year in the specified range
    for year in range(start_year, end_year + 1):
        print(f"Scraping data for year {year}")

        # Construct the URL for the FDA drug approvals page for the current year
        url = f'https://www.fda.gov/drugs/new-drugs-fda-cders-new-molecular-entities-and-new-therapeutic-biological-products/novel-drug-approvals-{year}'

        # A timeout stops the notebook from hanging forever on a stalled connection.
        response = requests.get(url, timeout=30)

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve content for year {year}. Status code: {response.status_code}")
            continue  # Skip to the next iteration

        # pandas signals "no tables" by raising ValueError rather than returning
        # an empty list. Raw markup is wrapped in a file-like object because
        # passing literal HTML to read_html is deprecated.
        try:
            df_list = pd.read_html(BytesIO(response.content))
        except ValueError:
            df_list = []

        # Check if any tables were found
        if not df_list:
            print(f"No tables found for year {year}.")
            continue  # Skip to the next iteration

        # Use the first table found and rename columns for consistency
        # (rename returns a new frame; nothing is mutated in place).
        df = df_list[0].rename(columns={'Date': 'Approval Date', 'Drug  Name': 'Drug Name'})

        # Extract links and names from the drug names in the table
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')

        # Check if the table is found
        if table is None:
            print(f"No table found for year {year}.")
            continue  # Skip to the next iteration

        links, names = extract_links_from_fda_drugname(table)

        # Add links and names as new columns in the DataFrame
        df['links'], df['check_names'] = links, names

        # Append the DataFrame to the list of tables
        tables.append(df)

    # pd.concat raises on an empty list, so handle "nothing scraped" explicitly.
    if not tables:
        print("No approval tables were retrieved; returning an empty DataFrame.")
        return pd.DataFrame()

    df_final = pd.concat(tables, ignore_index=True)
    return df_final

# Years to cover, both ends inclusive
start_year, end_year = 2015, 2024

# Run the scraper over that range and display the combined table
df_result = scrape_fda_drug_approvals(start_year, end_year)
df_result

Scraping data for year 2015
Failed to retrieve content for year 2015. Status code: 404
Scraping data for year 2016
Failed to retrieve content for year 2016. Status code: 404
Scraping data for year 2017
Failed to retrieve content for year 2017. Status code: 404
Scraping data for year 2018
Failed to retrieve content for year 2018. Status code: 404
Scraping data for year 2019
Failed to retrieve content for year 2019. Status code: 404
Scraping data for year 2020
Failed to retrieve content for year 2020. Status code: 404
Scraping data for year 2021
Scraping data for year 2022
Scraping data for year 2023
Scraping data for year 2024


Unnamed: 0,No.,Drug Name,Active Ingredient,Approval Date,FDA-approved use on approval date*,links,check_names
0,50.0,Adbry,tralokinumab-ldrm,12/27/2021,To treat moderate-to-severe atopic dermatitis ...,http://www.accessdata.fda.gov/scripts/cder/daf...,Adbry
1,49.0,Leqvio,inclisiran,12/22/2021,To treat heterozygous familial hypercholestero...,http://www.accessdata.fda.gov/scripts/cder/daf...,Leqvio
2,48.0,Vyvgart,efgartigimod alfa-fcab,12/17/2021,To treat generalized myasthenia gravis Press R...,http://www.accessdata.fda.gov/scripts/cder/daf...,Vyvgart
3,47.0,Tezspire,tezepelumab-ekko,12/17/2021,To treat severe asthma as an add-on maintenanc...,http://www.accessdata.fda.gov/scripts/cder/daf...,Tezspire
4,46.0,Cytalux,pafolacianine,11/29/2021,To help identify ovarian cancer lesions Press ...,http://www.accessdata.fda.gov/scripts/cder/daf...,Cytalux
...,...,...,...,...,...,...,...
159,5.0,Rezdiffra,resmetirom,3/14/2024,To treat noncirrhotic non-alcoholic steatohepa...,https://www.accessdata.fda.gov/drugsatfda_docs...,Rezdiffra
160,4.0,Tevimbra,tislelizumab-jsgr,3/13/2024,To treat unresectable or metastatic esophageal...,https://www.accessdata.fda.gov/drugsatfda_docs...,Tevimbra
161,3.0,Letybo,letibotulinumtoxinA-wlbg,2/29/2024,To temporarily improve the appearance of moder...,https://www.accessdata.fda.gov/drugsatfda_docs...,Letybo
162,2.0,Exblifep,"cefepime, enmetazobactam",2/22/2024,To treat complicated urinary tract infections ...,https://www.accessdata.fda.gov/drugsatfda_docs...,Exblifep


In [3]:
# A hyperlink counts as a label PDF when it contains this prefix and '.pdf'.
LABEL_PDF_PREFIX = 'https://www.accessdata.fda.gov/drugsatfda_docs/label/'
# Seconds to wait for each drug page before giving up.
REQUEST_TIMEOUT = 30


def _first_label_pdf(soup):
    """
    Returns the first label-PDF link found in a parsed page.

    Parameters:
    - soup (BeautifulSoup): Parsed HTML of a drug page.

    Returns:
    - (str): The href of the first matching hyperlink in document order, with
      any '#fragment' removed, or '' when the page has no matching hyperlink.
    """
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is not None and LABEL_PDF_PREFIX in href and '.pdf' in href:
            # Drop the fragment so the value is the bare PDF address.
            return href.split('#', 1)[0]
    return ''


all_main_label_pdf_links = []

for each_url in tqdm(df_result['links'], desc='Obtaining Drug Labels'):
    # Exactly one entry is appended per row; '' marks "no label PDF found".
    main_label_pdf = ''

    # Only fetch well-formed URLs; blanks and non-string values are skipped.
    if isinstance(each_url, str) and each_url.startswith(('http://', 'https://')):
        try:
            html = requests.get(each_url, timeout=REQUEST_TIMEOUT).content
            # Take the first match in document order. Picking element 0 of a
            # set could give a different PDF from one run to the next.
            main_label_pdf = _first_label_pdf(BeautifulSoup(html, 'html5lib'))
        except requests.exceptions.RequestException as e:
            print(f"Error fetching content for {each_url}: {e}")

    all_main_label_pdf_links.append(main_label_pdf)

# Check if the final lists have the same number of items as the number of rows in the DataFrame
if len(all_main_label_pdf_links) != len(df_result):
    print("The lengths of the lists do not match the number of rows in the DataFrame.")

df_result['main_label_pdf'] = all_main_label_pdf_links
df_result.head()

Obtaining Drug Labels: 100%|██████████| 164/164 [03:52<00:00,  1.42s/it]


Unnamed: 0,No.,Drug Name,Active Ingredient,Approval Date,FDA-approved use on approval date*,links,check_names,main_label_pdf
0,50.0,Adbry,tralokinumab-ldrm,12/27/2021,To treat moderate-to-severe atopic dermatitis ...,http://www.accessdata.fda.gov/scripts/cder/daf...,Adbry,https://www.accessdata.fda.gov/drugsatfda_docs...
1,49.0,Leqvio,inclisiran,12/22/2021,To treat heterozygous familial hypercholestero...,http://www.accessdata.fda.gov/scripts/cder/daf...,Leqvio,https://www.accessdata.fda.gov/drugsatfda_docs...
2,48.0,Vyvgart,efgartigimod alfa-fcab,12/17/2021,To treat generalized myasthenia gravis Press R...,http://www.accessdata.fda.gov/scripts/cder/daf...,Vyvgart,https://www.accessdata.fda.gov/drugsatfda_docs...
3,47.0,Tezspire,tezepelumab-ekko,12/17/2021,To treat severe asthma as an add-on maintenanc...,http://www.accessdata.fda.gov/scripts/cder/daf...,Tezspire,https://www.accessdata.fda.gov/drugsatfda_docs...
4,46.0,Cytalux,pafolacianine,11/29/2021,To help identify ovarian cancer lesions Press ...,http://www.accessdata.fda.gov/scripts/cder/daf...,Cytalux,https://www.accessdata.fda.gov/drugsatfda_docs...


In [4]:
# Rows where the name parsed by pandas differs from the scraped hyperlink text
name_mismatch = df_result['Drug Name'].ne(df_result['check_names'])
df_result[name_mismatch]

Unnamed: 0,No.,Drug Name,Active Ingredient,Approval Date,FDA-approved use on approval date*,links,check_names,main_label_pdf
143,21.0,Ohtuvayre,ensifentrine,6/26/2024,To treat chronic obstructive pulmonary disease,,,


In [5]:
# errors='ignore' makes this cell safe to re-run: on a second run 'No.' is
# already gone and a plain drop would raise KeyError.
df_result = df_result.drop(columns=['No.'], errors='ignore')
df_result.to_csv(f'fda_approved_drugs_{start_year}_{end_year}.csv', index=False)