### Medical GDD Document Collection

In [1]:
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fetch the FDA drug database index page; its first-letter links are collected below.
url = "https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# The letter links are the <a> tags inside <div id="drugname">.
drugname_div = soup.find('div', id='drugname')
if drugname_div is None:
    # Without this check the failure would be an opaque AttributeError on None.
    raise RuntimeError(f"No <div id='drugname'> found on {url}; the page layout may have changed")
name_cap_letters = drugname_div.find_all('a')


### FDA

#### 按首字母顺序获取全部药品分类详细页面url

In [2]:
# Build one absolute URL per first-letter link found on the index page.
# Anchors without an href are skipped instead of raising KeyError, and the
# comprehension does not leak a loop variable that would overwrite the
# index-page `url` defined in the first cell.
medical_category_urls = [
    "https://www.accessdata.fda.gov" + anchor['href']
    for anchor in name_cap_letters
    if anchor.get('href')
]
print("总计获取到", len(medical_category_urls), "个药品分类详细页面url")


总计获取到 27 个药品分类详细页面url


#### 获取每个药品分类详细页面中的药品名和药品url


In [3]:
def get_drug_name_and_url(category_url: str, timeout: float = 30.0) -> list:
    """
    Get the drug name and url of each drug in the category page

    Args:
        category_url: The url of the category page
        timeout: Seconds to wait for the server before the request is aborted

    Returns:
        A list of dictionaries, one per drug link, each with the keys
        "drug_name", "application_number", "dosage", "manufacturer" and "url".
        A field that the listing text does not provide is an empty string.

    Raises:
        requests.HTTPError: If the category page answers with an error status
        requests.RequestException: If the request fails or times out
    """
    ret = []
    # Get the category page; fail loudly rather than parsing an error page
    response = requests.get(category_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each drug group is a <ul> whose id is "drugName" followed by digits.
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    drug_name_uls = soup.find_all('ul', id=re.compile(r'^drugName\d+$'))
    for drug_name_ul in drug_name_uls:
        # Individual drug type (same type of drug)
        for drug_name_li in drug_name_ul.find_all('li'):
            # The item text is split on "|" into
            # name | application number | dosage | manufacturer
            fields = [field.strip() for field in drug_name_li.text.split("|")]
            # Pad short listings so a malformed item cannot abort the whole crawl
            fields += [""] * (4 - len(fields))
            drug_name, application_number, dosage, manufacturer = fields[:4]

            # Get url of each drug (Normally there is only one);
            # href=True skips anchors that carry no link target
            for drug_link in drug_name_li.find_all('a', href=True):
                ret.append({
                    "drug_name": drug_name,
                    "application_number": application_number,
                    "dosage": dosage,
                    "manufacturer": manufacturer,
                    "url": "https://www.accessdata.fda.gov" + drug_link['href'],
                })
    return ret


#### 获取药品列表并存储到csv文件



In [4]:
# Crawl every category page and flatten the per-page results into one table.
data_list = []
# `category_url` rather than `url`, so the loop does not overwrite the
# module-level `url` that other cells also assign.
for category_url in medical_category_urls:
    data_list.extend(get_drug_name_and_url(category_url))
data_table = pd.DataFrame(data_list)

# exist_ok=True makes folder creation idempotent, so no separate existence
# check is needed before writing the CSV.
os.makedirs('./docs/fda', exist_ok=True)
data_table.to_csv('./docs/fda/drug_list.csv', index=False, encoding='utf-8')

In [5]:
# Number of (drug_name, application_number) pairs that have more than one
# non-null url; 0 means no pair is listed twice.
url_count_per_pair = data_table.groupby(
    by=['drug_name', 'application_number']
)["url"].count()
(url_count_per_pair > 1).sum()

0

#### 说明书下载



获取全部下载链接

In [6]:
def get_download_list(url: str, timeout: float = 30.0) -> list:
    """
    Collect the document links from the first row of every document table on a page

    Args:
        url: The url of the page to scan
        timeout: Seconds to wait for the server before the request is aborted

    Returns:
        A list of dictionaries, one per table whose first row holds at least one
        link, each with the keys 'Action Date', 'Submission', 'Category' and
        'Latest URL' (a list of href strings). Tables without a usable first row
        are skipped, so the list may be empty.

    Raises:
        requests.HTTPError: If the page answers with an error status
        requests.RequestException: If the request fails or times out
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extracted entries, in page order
    download_list = []

    # Find every table that holds file information
    tables = soup.find_all('table', class_='table table-bordered')

    for table in tables:
        # Skip tables with no <tbody> or no rows instead of raising
        tbody = table.find('tbody')
        rows = tbody.find_all('tr') if tbody is not None else []
        if not rows:
            continue

        # The first row is treated as the latest document
        cells = rows[0].find_all('td')
        # Rows with five or fewer cells are ignored
        if len(cells) <= 5:
            continue

        # The links are read from the third-from-last cell
        links = cells[-3].find_all('a')
        urls = [link['href'] for link in links if 'href' in link.attrs]
        if not urls:
            continue

        download_list.append({
            'Action Date': cells[0].get_text(strip=True),
            'Submission': cells[1].get_text(strip=True),
            'Category': cells[2].get_text(strip=True),
            'Latest URL': urls
        })

    return download_list


##### 下载文档

In [7]:
def download_file(file_url: str, saving_path: str, timeout: float = 60.0) -> bool:
    """
    Stream a remote file to disk

    The data is first written to "<saving_path>.part" and only moved to
    `saving_path` once the download completed, so a failed download never
    leaves a truncated file under the final name.

    Args:
        file_url: The url of the file to download
        saving_path: Where to store the file; missing parent folders are created
        timeout: Seconds to wait for the server before the request is aborted

    Returns:
        True if the file was saved, False if any error occurred (the error is
        printed, never raised)
    """
    partial_path = f"{saving_path}.part"
    try:
        # The context manager releases the streamed connection even on errors
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # dirname is '' for a bare file name, and os.makedirs('') raises
            target_dir = os.path.dirname(saving_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)

        # Move into place only after the whole body was written
        os.replace(partial_path, saving_path)
        return True

    except Exception as e:
        print(f"下载文件时发生错误: {str(e)}")
        # Best-effort cleanup: the partial file may not exist at all
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False

In [11]:
# TODO:
# - list and download label files from CDE
# - find and parse a source for clinical trials

Function test -- FDA downloading

In [12]:
def _safe_path_part(value, fallback: str = "unknown") -> str:
    """Turn a table value into text that is safe as a single path component.

    Every character outside [a-zA-Z0-9] is replaced by "_" and leading/trailing
    "_" are stripped. `fallback` is returned when the value is missing or
    nothing usable is left, so the resulting path never has an empty component.
    """
    text = "" if pd.isna(value) else str(value)
    return re.sub(r'[^a-zA-Z0-9]', '_', text).strip("_") or fallback


# dtype=str + keep_default_na=False: keep every cell as the text that was
# written. With the defaults, values such as "NA" or empty fields are parsed
# as float NaN, which would make the regex clean-up below raise TypeError.
data_table = pd.read_csv('./docs/fda/drug_list.csv', dtype=str, keep_default_na=False)

# Row of the drug used for this function test
idx = 32

page_url = data_table.loc[idx, 'url']
drug_name = _safe_path_part(data_table.loc[idx, 'drug_name'])
manufacturer = _safe_path_part(data_table.loc[idx, 'manufacturer'])
download_list = get_download_list(page_url)
download_list

[{'Action Date': '10/23/2014',
  'Submission': 'ORIG-1',
  'Category': 'Tentative Approval',
  'Latest URL': ['https://www.accessdata.fda.gov/drugsatfda_docs/appletter/2014/204915Orig1s000TAltr.pdf']}]

In [13]:
# Download the files of the first entry in `download_list` only.
if not download_list:
    # Indexing [0] on an empty list would raise IndexError
    print("No downloadable documents found for this drug page")
else:
    latest_entry = download_list[0]
    download_urls = latest_entry.get("Latest URL")
    # Submission and category are free text from the page; keep only characters
    # that are safe inside a file name (letters, digits, "-" and "_").
    name_prefix = re.sub(
        r'[^a-zA-Z0-9_-]', '_',
        latest_entry.get("Submission") + '_' + latest_entry.get("Category")
    )
    # `file_idx` rather than `idx`, so the row index picked in the previous
    # cell is not overwritten by this loop.
    # NOTE(review): the extension is always ".pdf", whatever the link points to - confirm.
    for file_idx, download_url in enumerate(download_urls):
        file_name = f'{name_prefix}_{file_idx}.pdf'
        download_file(
            download_url,
            f"docs/fda/{manufacturer}/{drug_name}/{file_name}"
        )
        