In [5]:
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
#引入时间
import time

# Build browser-like request headers with a randomly chosen User-Agent string.
# NOTE(review): 'br' is advertised in Accept-Encoding; requests can only decode
# Brotli bodies when a brotli package is installed - confirm for this environment.
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive'
}
url = "https://www.fda.gov/consumers/womens-health/womens-health-topics"
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page as if it
# were the topic index.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# The topic links are read from the first <div class="item-list">. find() returns
# None when no such element exists, so check explicitly and report a clear error
# rather than failing with an AttributeError on None.
item_list = soup.find('div', class_='item-list')
if item_list is None:
    raise RuntimeError(
        f"No <div class='item-list'> found at {url}; "
        "the page layout may have changed or the request was blocked."
    )
linkcontent = item_list.find_all('a')


In [18]:
# Turn every anchor from the index page into a {'url', 'text'} record.
links_of_all_pages = []
for link in linkcontent:
    href = link.get('href')
    # Anchors without an href carry no destination and are skipped.
    if not href:
        continue
    # Hrefs that do not start with 'http' are prefixed with the FDA site root.
    if not href.startswith('http'):
        href = f"https://www.fda.gov{href}"
    # The stripped link text is stored alongside the URL.
    links_of_all_pages.append({'url': href, 'text': link.text.strip()})

In [24]:
# Split the link records into parallel lists of topic labels and bare URLs.
topics = [entry.get('text') for entry in links_of_all_pages]
links_solo = [entry.get('url') for entry in links_of_all_pages]

# Quick sanity check of what was collected from the index page.
print('len of the links:', len(links_of_all_pages))
print('example of the links:', links_of_all_pages[:2])
print('all topics:', topics)

len of the links: 27
example of the links: [{'url': 'https://www.fda.gov/consumers/womens-health-topics/5-healthy-aging-tips-women', 'text': 'Aging'}, {'url': 'https://www.fda.gov/consumers/womens-health-topics/women-and-anxiety', 'text': 'Anxiety'}]
all topics: ['Aging', 'Anxiety', 'Birth control', 'Caregiving', 'Cholesterol medicines', 'Clinical trials', 'Contact lens care', 'Cosmetics', 'Depression', 'Diabetes', 'Dietary supplements', 'Food safety', 'Heart health', 'High blood pressure', 'HIV', 'HPV', 'Mammography', 'Medication safety', 'Medicine and pregnancy', 'Menopause', 'Osteoporosis', 'Pain medicines', 'Pregnancy', 'Sleep problems', 'Smoking', 'Tattoos and permanent makeup', 'Uterine fibroids']


In [44]:
# Rebuild the module-level request headers read by the page helpers defined
# below; a new random User-Agent string is drawn here.
ua = UserAgent()
headers = {'User-Agent': ua.random}
headers.update({
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
})

def extract_main_content(url):
    """Download a page and return the text of its main content area.

    Args:
        url: Address of the page to download.

    Returns:
        The text of the ``<div class="col-md-8 col-md-push-2" role="main">``
        element as produced by ``get_text()``, or ``None`` when the page has
        no such element.
    """
    page = requests.get(url, headers=headers, timeout=30)
    parsed = BeautifulSoup(page.text, 'html.parser')

    # The main content area is identified by its class string and role.
    container = parsed.find('div', class_='col-md-8 col-md-push-2', role='main')

    # Nothing to extract when the container is missing.
    if not container:
        return None
    return container.get_text()

def extract_main_url(url):
    """Collect the site-relative links found in a page's main content area.

    Args:
        url: Address of the page to download.

    Returns:
        tuple: ``(url_data, url_list)``. ``url_data`` is a list of
        ``{'url': ..., 'text': ...}`` dicts, one per kept link in document
        order; ``url_list`` holds just the URLs in the same order. Both lists
        are empty when the page has no main content area or no usable links.
    """
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the main content area
    main_content = soup.find('div', class_='col-md-8 col-md-push-2', role='main')

    # Accumulates one record per kept link.
    url_data = []

    if main_content:
        for link in main_content.find_all('a'):
            href = link.get('href')
            text = link.text.strip()
            # Keep a link only when its href is site-relative (starts with '/')
            # and it has non-empty text; every other anchor is skipped.
            if not href or not href.startswith('/') or not text:
                continue
            url_data.append({
                'url': f"https://www.fda.gov{href}",
                'text': text
            })

    # Built once after the loop so the name is always bound, including when the
    # main content area is missing or contains no anchors.
    url_list = [item['url'] for item in url_data]
    return url_data, url_list

In [52]:
import time

def get_unique_links(links_of_all_pages, second_layer_links):
    """
    Merge both link layers, drop duplicate URLs and remove download links.

    Args:
        links_of_all_pages: list of first-layer link dicts (each with a 'url' key)
        second_layer_links: list of second-layer link lists; empty or None
            entries are ignored

    Returns:
        list: link dicts with unique URLs, excluding any URL containing 'download'
    """
    # Flatten the second layer, skipping falsy sublists.
    flattened = []
    for group in second_layer_links:
        if group:
            flattened.extend(group)
    combined = links_of_all_pages + flattened

    # Key by URL: a URL keeps the position of its first occurrence, while the
    # stored record is the one from its last occurrence.
    by_url = {}
    for entry in combined:
        by_url[entry['url']] = entry

    # Discard every link whose URL contains 'download'.
    kept = []
    for entry in by_url.values():
        if 'download' in entry['url']:
            continue
        kept.append(entry)
    return kept

def generate_content_list(unique_links):
    """
    Fetch the main content for every link in the de-duplicated list.

    Args:
        unique_links: list of link dicts with 'url' and 'text' keys

    Returns:
        list: dicts with 'topic' and 'content' keys, one per link that has
        both a non-empty topic and non-empty content
    """
    total = len(unique_links)
    collected = []

    position = 0
    for entry in unique_links:
        position += 1
        # Progress line for the current link.
        print(f"Processing {position}/{total}: {entry.get('url', 'Unknown URL')}")

        label = entry.get('text', '').strip()
        body = extract_main_content(entry.get('url', ''))

        # Keep the page only when both the topic and the page text are non-empty.
        if label and body:
            collected.append(dict(topic=label, content=body.strip()))

        # Pause after every request, kept or not, to avoid sending too many requests.
        time.sleep(5)

    # Write the string form of the result list to a text file.
    with open('content_list.txt', 'w', encoding='utf-8') as out_file:
        out_file.write(str(collected))

    return collected

# Example usage:
# assumes links_solo has already been defined
second_layer_links = []
for page_url in links_solo:
    # Only the first element of the returned pair (the link records) is kept.
    second_layer_links.append(extract_main_url(page_url)[0])

# Get the de-duplicated links first
unique_links = get_unique_links(links_of_all_pages, second_layer_links)


In [57]:
# Number of links returned by get_unique_links (after de-duplication and filtering).
len(unique_links)

228

In [58]:
# Then build the content list from the unique links.
content_list = generate_content_list(unique_links)

Processing 1/228: https://www.fda.gov/consumers/womens-health-topics/5-healthy-aging-tips-women
Processing 2/228: https://www.fda.gov/consumers/womens-health-topics/women-and-anxiety
Processing 3/228: https://www.fda.gov/consumers/womens-health-topics/birth-control
Processing 4/228: https://www.fda.gov/consumers/womens-health-topics/caring-others-resources-help-you
Processing 5/228: https://www.fda.gov/consumers/womens-health-topics/cholesterol-medicines-guide
Processing 6/228: https://www.fda.gov/consumers/womens-health-topics/women-clinical-trials
Processing 7/228: https://www.fda.gov/consumers/womens-health-topics/contact-lens-care
Processing 8/228: https://www.fda.gov/consumers/womens-health-topics/cosmetics-tips-women
Processing 9/228: https://www.fda.gov/consumers/womens-health-topics/depression-medicines
Processing 10/228: https://www.fda.gov/consumers/womens-health-topics/women-and-diabetes
Processing 11/228: https://www.fda.gov/consumers/womens-health-topics/dietary-supplement

In [62]:
import json

# Save content_list as a JSON file (4-space indentation, non-ASCII characters
# written as-is rather than escaped).
serialized = json.dumps(content_list, ensure_ascii=False, indent=4)
with open('content_list.json', 'w', encoding='utf-8') as f:
    f.write(serialized)

In [63]:
# Display the second record (index 1) of content_list.
content_list[1]

{'topic': 'Anxiety',