In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [None]:
def extract_links_from_fda_drugname(table_provided):
    """
    Collects the first hyperlink of every data row in an HTML table.

    The first <tr> is treated as the header and skipped. Each remaining row
    contributes exactly one entry to both returned lists, so the lists stay
    aligned with the table's data rows.

    Parameters:
    - table_provided (BeautifulSoup): HTML table containing drug information.

    Returns:
    - links (list): 'href' of the first <a> in each row ('' when the row has
      no <a> or the <a> has no 'href').
    - names (list): Text of that same <a> ('' when the row has no <a>).
    """
    hrefs = []
    labels = []

    # Everything after the header row
    data_rows = table_provided.select("tr")[1:]

    for row in data_rows:
        # Start from the "nothing found" defaults and overwrite on success
        href, label = '', ''
        try:
            anchor = row.find("a")
            if anchor is not None:
                href, label = anchor.get('href', ''), anchor.get_text()
        except (AttributeError, IndexError):
            # A malformed row falls back to empty values instead of aborting
            href, label = '', ''

        hrefs.append(href)
        labels.append(label)

    return hrefs, labels

def scrape_fda_drug_approvals(start_year, end_year, timeout=30):
    """
    Scrapes FDA drug approvals data from specified years.

    For every year in the inclusive range the FDA "novel drug approvals" page
    is downloaded, its first HTML table is parsed into a DataFrame, and the
    first hyperlink of each table row is attached as two extra columns
    ('links' and 'check_names'). A year that cannot be downloaded or parsed
    is reported with a printed message and skipped, so one bad year does not
    abort the whole run.

    Parameters:
    - start_year (int): The starting year for scraping.
    - end_year (int): The ending year for scraping (inclusive).
    - timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
    - df_final (DataFrame): Pandas DataFrame containing drug approval
      information for all successfully scraped years. An empty DataFrame is
      returned when no year produced a table (including an empty year range).
    """
    # Local import keeps this function self-contained; BytesIO lets pandas
    # read the downloaded bytes as a file-like object.
    from io import BytesIO

    # One DataFrame per successfully scraped year
    tables = []

    for year in range(start_year, end_year + 1):
        print(f"Scraping data for year {year}")

        # Construct the URL for the FDA drug approvals page for the current year
        url = f'https://www.fda.gov/drugs/new-drugs-fda-cders-new-molecular-entities-and-new-therapeutic-biological-products/novel-drug-approvals-{year}'

        # A timeout prevents one stalled connection from hanging the run;
        # connection-level failures are treated like a bad status code.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            print(f"Failed to retrieve content for year {year}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve content for year {year}. Status code: {response.status_code}")
            continue  # Skip to the next iteration

        # pd.read_html raises ValueError (rather than returning an empty list)
        # when the page contains no tables.
        try:
            df_list = pd.read_html(BytesIO(response.content))
        except ValueError:
            df_list = []

        if not df_list:
            print(f"No tables found for year {year}.")
            continue  # Skip to the next iteration

        # Use the first table found
        df = df_list[0]

        # Rename columns for consistency
        df.rename(columns={'Date': 'Approval Date', 'Drug  Name': 'Drug Name'}, inplace=True)

        # Parse the page a second time to recover the hyperlinks, which
        # pd.read_html does not keep.
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')

        if table is None:
            print(f"No table found for year {year}.")
            continue  # Skip to the next iteration

        links, names = extract_links_from_fda_drugname(table)

        # The two parsers can disagree on the number of data rows. Assigning
        # lists of the wrong length would raise, and truncating/padding could
        # silently attach links to the wrong drug, so leave them empty.
        if len(links) != len(df):
            print(
                f"Row count mismatch for year {year}: table has {len(df)} rows "
                f"but {len(links)} link rows were extracted; leaving links empty."
            )
            links = [''] * len(df)
            names = [''] * len(df)

        # Add links and names as new columns in the DataFrame
        df['links'] = links
        df['check_names'] = names

        tables.append(df)

    # pd.concat raises on an empty list, so handle "nothing scraped" explicitly
    if not tables:
        return pd.DataFrame()

    df_final = pd.concat(tables, ignore_index=True)
    return df_final

# Inclusive range of approval years to scrape
start_year, end_year = 2015, 2023

# Scrape every year in the range and preview the combined table
df_result = scrape_fda_drug_approvals(start_year=start_year, end_year=end_year)
df_result.head()

In [None]:
# Substrings a hyperlink must contain to count as an FDA label PDF.
LABEL_PDF_PATTERN = ('https://www.accessdata.fda.gov/drugsatfda_docs/label/', '.pdf')


def _find_main_label_pdf(page_url, timeout=30):
    """
    Returns the first label-PDF link found on a page, or '' if there is none.

    Parameters:
    - page_url (str): Absolute URL of the page to search.
    - timeout (float): Seconds to wait for the HTTP request.

    Returns:
    - (str): The first matching link in document order with any '#fragment'
      removed, or '' when no link matches.

    Raises:
    - requests.exceptions.RequestException: If the page cannot be fetched.
    """
    html = requests.get(page_url, timeout=timeout).content
    soup = BeautifulSoup(html, 'html5lib')

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        if all(part in href for part in LABEL_PDF_PATTERN):
            # Taking the first match in document order keeps the result
            # reproducible between runs (set ordering of strings is not).
            return href.partition('#')[0]

    return ''


# One entry per row of df_result, in row order ('' when nothing was found)
all_main_label_pdf_links = []

for each_url in df_result['links']:
    main_label_pdf = ''

    # Only absolute http(s) URLs can be fetched; anything else (empty string,
    # relative link, missing value) is left as ''.
    if isinstance(each_url, str) and each_url.startswith(('http://', 'https://')):
        try:
            main_label_pdf = _find_main_label_pdf(each_url)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching content for {each_url}: {e}")

    # Exactly one append per row keeps the list aligned with the DataFrame
    all_main_label_pdf_links.append(main_label_pdf)

df_result['main_label_pdf'] = all_main_label_pdf_links
df_result.head()

In [None]:
# Show the rows whose table 'Drug Name' differs from the hyperlink text
# stored in 'check_names'
df_result.loc[df_result['Drug Name'].ne(df_result['check_names'])]

In [None]:
# Drop the 'No.' column before export. errors='ignore' makes this cell safe
# to re-run and tolerant of the column being absent.
df_result = df_result.drop(columns=['No.'], errors='ignore')

# Write the combined table to a CSV named after the scraped year range
df_result.to_csv(f'fda_approved_drugs_{start_year}_{end_year}.csv', index=False)