### Medical GDD Document Collection

In [44]:
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fetch the Drugs@FDA landing page; the anchors inside <div id="drugname">
# are used below as the links to the per-letter category pages.
url = "https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

drugname_div = soup.find('div', id='drugname')
if drugname_div is None:
    # find() returns None when the element is absent; report that explicitly
    # rather than failing with an opaque AttributeError on the next call.
    raise RuntimeError(f"No <div id='drugname'> found at {url}; the page layout may have changed.")
name_cap_letters = drugname_div.find_all('a')


#### 按首字母顺序获取全部药品分类详细页面url

In [46]:
# Build the category page URLs by prefixing the site host to each anchor's href.
# A comprehension avoids overwriting the module-level `url` on every iteration,
# and anchors without an href are skipped instead of raising KeyError.
medical_category_urls = [
    "https://www.accessdata.fda.gov" + letter['href']
    for letter in name_cap_letters
    if letter.get('href')
]
print("总计获取到", len(medical_category_urls), "个药品分类详细页面url")


总计获取到 27 个药品分类详细页面url


#### 获取每个药品分类详细页面中的药品名和药品url


In [63]:
def get_drug_name_and_url(category_url):
    """
    Collect every drug entry listed on a category page.

    Args:
        category_url: The url of the category page

    Returns:
        A list of dictionaries, one per link found inside a drug list item,
        with the keys "drug_name", "application_number", "dosage",
        "manufacturer" and "url". Fields missing from the item's
        "|"-separated text are returned as empty strings.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    base_url = "https://www.accessdata.fda.gov"

    # Timeout so a stalled connection cannot hang the whole crawl; fail loudly
    # on an HTTP error instead of silently parsing an error page into [].
    response = requests.get(category_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    ret = []
    # Drug lists are the <ul> elements whose id is "drugName" followed by digits.
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    for drug_name_ul in soup.find_all('ul', id=re.compile(r'^drugName\d+$')):
        # Each <li> is one drug entry of the same drug group.
        for drug_name_li in drug_name_ul.find_all('li'):
            # The item text is split on "|" into name, application number,
            # dosage and manufacturer. Parse it once per item and pad the list
            # so a short entry cannot raise IndexError.
            fields = [part.strip() for part in drug_name_li.text.split("|")]
            fields += [""] * (4 - len(fields))
            drug_name, application_number, dosage, manufacturer = fields[:4]

            # Normally there is only one link per item; emit one record per
            # link that actually carries an href.
            for anchor in drug_name_li.find_all('a', href=True):
                ret.append({
                    "drug_name": drug_name,
                    "application_number": application_number,
                    "dosage": dosage,
                    "manufacturer": manufacturer,
                    "url": base_url + anchor['href'],
                })
    return ret


#### 获取药品列表并存储到csv文件



In [None]:
# Crawl every category page and flatten the per-page records into one list.
data_list = [
    record
    for category_url in medical_category_urls
    for record in get_drug_name_and_url(category_url)
]
data_table = pd.DataFrame(data_list)

# exist_ok=True creates the folder when it is missing and is a no-op otherwise,
# which replaces the separate exists() check.
os.makedirs('./docs/fda', exist_ok=True)
data_table.to_csv('./docs/fda/drug_list.csv', index=False, encoding='utf-8')

In [79]:
# Number of (drug_name, application_number) pairs that occur in more than one row.
(
    data_table
    .groupby(['drug_name', 'application_number'])["url"]
    .count()
    .gt(1)
    .sum()
)

np.int64(0)

#### 说明书下载



In [95]:
# Overview page of a single application (ApplNo 206843), parsed by the cells below.
url = 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=206843'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')





In [106]:
from io import StringIO

# Find every <table> element in the parsed page.
tables = soup.find_all('table')

# `table` and `df` remain bound to the last iteration's values after the loop.
for table in tables:
    # Parse the table's HTML with pandas. `df` holds the result of read_html
    # and is indexed with [0] to get the parsed table as a DataFrame.
    df = pd.read_html(StringIO(table.prettify()))
    print(df[0])


  Drug Name           Active Ingredients      Strength Dosage Form/Route  \
0  DAKLINZA  DACLATASVIR DIHYDROCHLORIDE  EQ 30MG BASE       TABLET;ORAL   
1  DAKLINZA  DACLATASVIR DIHYDROCHLORIDE  EQ 60MG BASE       TABLET;ORAL   
2  DAKLINZA  DACLATASVIR DIHYDROCHLORIDE  EQ 90MG BASE       TABLET;ORAL   

  Marketing Status  TE Code  RLD  RS  
0     Discontinued      NaN  Yes  No  
1     Discontinued      NaN  Yes  No  
2     Discontinued      NaN  Yes  No  
  Action Date Submission Action Type      Submission Classification  \
0  07/24/2015     ORIG-1    Approval  Type 1 - New Molecular Entity   

  Review Priority; Orphan Status  \
0                       PRIORITY   

  Letters, Reviews, Labels, Patient Package Insert  Notes  \
0                Label (PDF)  Letter (PDF)  Review    NaN   

                                                 Url  
0  https://www.accessdata.fda.gov/drugsatfda_docs...  
  Action Date Submission             Supplement Categories or Approval Type  \
0  10/16/20

In [111]:
# Parse the last table explicitly instead of relying on the `table` loop
# variable left over from another cell, then show its column names.
df = pd.read_html(StringIO(tables[-1].prettify()))
df[0].columns

Index(['Action Date', 'Submission', 'Supplement Categories or Approval Type',
       'Letters, Reviews, Labels,  Patient Package Insert', 'Note', 'Url'],
      dtype='object')

In [117]:
# Records extracted from the first body row of each matching table.
data = []

# Select the tables whose class attribute is 'table table-bordered'.
tables = soup.find_all('table', class_='table table-bordered')

for table in tables:
    tbody = table.find('tbody')
    rows = tbody.find_all('tr') if tbody is not None else []
    if not rows:
        # A table without a body or without rows has nothing to extract;
        # skip it instead of raising AttributeError / IndexError.
        continue

    # The first body row is taken as the latest entry.
    cells = rows[0].find_all('td')
    if len(cells) <= 5:
        # Too few cells to contain the fields read below.
        continue

    # The document links are read from the third-from-last cell; only
    # anchors that actually carry an href are kept.
    urls = [link['href'] for link in cells[-3].find_all('a', href=True)]
    if not urls:
        continue

    data.append({
        'Action Date': cells[0].get_text(strip=True),
        'Submission': cells[1].get_text(strip=True),
        'Category': cells[2].get_text(strip=True),
        'Latest URL': urls
    })

# Turn the collected records into a DataFrame.
df = pd.DataFrame(data)

# Print the result (it could be saved to CSV instead).
print(df)

  Action Date Submission                 Category  \
0  07/24/2015     ORIG-1                 Approval   
1  10/16/2019    SUPPL-8  Labeling-Package Insert   
2  10/16/2019    SUPPL-8  Labeling-Package Insert   

                                          Latest URL  
0  [https://www.accessdata.fda.gov/drugsatfda_doc...  
1  [https://www.accessdata.fda.gov/drugsatfda_doc...  
2  [https://www.accessdata.fda.gov/drugsatfda_doc...  


In [1]:
# NOTE(review): `download_links` is not assigned anywhere in the visible notebook.
# If it is undefined, this bare expression raises NameError before the functions
# below get defined. As it is not the last expression of the cell it presumably
# displays nothing either — confirm and remove.
download_links

def fetch_fda_articles(url="https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm"):
    """
    Scrape article entries (title, link, date) from an FDA listing page.

    Args:
        url: Page to scrape. The default is only an example URL; pass the
            real listing page as needed.

    Returns:
        A list of dicts with the keys 'title', 'link' and 'date', one per
        <div class="views-row"> element. Rows without an <h2> or without a
        link carrying an href are skipped; 'date' is '' when the row has no
        <span class="date-display-single">.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Timeout so a stalled connection cannot hang the notebook; fail loudly on
    # an HTTP error instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = []
    for item in soup.find_all('div', class_='views-row'):
        title_tag = item.find('h2')
        link_tag = item.find('a', href=True)
        if title_tag is None or link_tag is None:
            # find() returns None for a missing element; one incomplete row
            # should not abort the whole scrape.
            continue
        date_tag = item.find('span', class_='date-display-single')
        articles.append({
            'title': title_tag.get_text(strip=True),
            'link': link_tag['href'],
            'date': date_tag.get_text(strip=True) if date_tag is not None else '',
        })

    return articles

def fetch_nmpa_articles(url="https://www.nmpa.gov.cn/xxgk/"):
    """
    Scrape article entries (title, link, date) from an NMPA listing page.

    Args:
        url: Page to scrape. The default is only an example URL; pass the
            real listing page as needed.

    Returns:
        A list of dicts with the keys 'title', 'link' and 'date', one per
        <li> element that contains a link with an href. 'date' is the text of
        the first <span> in the item, or '' when there is none.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Timeout so a stalled connection cannot hang the notebook; fail loudly on
    # an HTTP error instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # When the Content-Type header carries no charset, requests falls back to
    # ISO-8859-1 for text responses, which would garble non-Latin text.
    # Use the encoding detected from the body in that case.
    if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = []
    for item in soup.find_all('li'):
        # href=True avoids a KeyError on anchors without an href attribute.
        title_tag = item.find('a', href=True)
        if title_tag is None:
            continue
        # Look the date element up once instead of twice.
        date_tag = item.find('span')
        articles.append({
            'title': title_tag.get_text(strip=True),
            'link': title_tag['href'],
            'date': date_tag.get_text(strip=True) if date_tag is not None else '',
        })

    return articles

def save_to_csv(fda_articles, nmpa_articles):
    """
    Write the scraped FDA and NMPA article records to CSV files.

    Args:
        fda_articles: Records written to './docs/fda/fda_articles.csv'.
        nmpa_articles: Records written to './docs/nmpa/nmpa_articles.csv'.

    Both arguments are passed to pd.DataFrame as-is. The target directories
    are created when they do not exist yet.
    """
    # Build both frames first so invalid input fails before anything is written.
    fda_df = pd.DataFrame(fda_articles)
    nmpa_df = pd.DataFrame(nmpa_articles)

    outputs = (
        (fda_df, './docs/fda/fda_articles.csv'),
        (nmpa_df, './docs/nmpa/nmpa_articles.csv'),
    )
    for frame, csv_path in outputs:
        # to_csv does not create missing parent folders; make sure they exist.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        frame.to_csv(csv_path, index=False, encoding='utf-8')

    

ModuleNotFoundError: No module named 'bs4'

In [None]:
# Scrape both sources first ...
fda_articles = fetch_fda_articles()
nmpa_articles = fetch_nmpa_articles()

# ... then persist them and report completion.
save_to_csv(fda_articles=fda_articles, nmpa_articles=nmpa_articles)
print("文章已成功抓取并保存。")