# Cancer数据获取(患病例数-Prevalence)

代码注释参考inc_mort_data.ipynb文件

In [None]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.options import Options as EdgeOptions

# Page that is opened: the GCO "bars-prevalence" view in population mode.
PREVALENCE_URL = 'https://gco.iarc.who.int/today/en/dataviz/bars-prevalence?mode=population&key=total'

# Fixed pause (seconds) after navigation so the page can finish loading.
PAGE_LOAD_WAIT_SECONDS = 35

# Edge is started with a fixed 1920x1080 window
# (alternative that was tried: the "start-maximized" argument).
edge_options = EdgeOptions()
edge_options.add_argument("--window-size=1920,1080")

# Start the Edge WebDriver session with the options above.
driver = webdriver.Edge(options=edge_options)

# Open the page, then wait for it to load.
driver.get(PREVALENCE_URL)
time.sleep(PAGE_LOAD_WAIT_SECONDS)

In [None]:
# Click the page body (an empty area of the page).
page_body = driver.find_element(By.XPATH, '/html/body')
page_body.click()

# Click the <label> element associated with "mode_population".
mode_population = driver.find_element(
    By.XPATH,
    "//label[@for='mode_population']",
)
mode_population.click()

In [None]:
# Cancer type selection

# Cancer names that are typed into the page's cancer search box.
cancer_sort = [
    'Melanoma of skin',
    'Non-melanoma skin cancer',
]

def select_cancer(cancer_name):
    """Pick a cancer type in the page's cancer multiselect.

    Clicks the selector element, types ``cancer_name`` into the
    ``multiselect-cancer`` input, then sends Arrow-Down and Enter to that
    input. Fixed pauses (1 s, 1 s, 3 s) separate the steps.

    Args:
        cancer_name: Text typed into the search input.
    """
    # Presumably this span opens the cancer dropdown — confirm against the page.
    driver.find_element(
        By.XPATH, "//*[@id='tab_1']/fieldset[3]/div/div[2]/span"
    ).click()
    time.sleep(1)

    search_box = driver.find_element(By.ID, "multiselect-cancer")
    search_box.send_keys(cancer_name)
    time.sleep(1)

    # One key per call: Arrow-Down first, then Enter.
    for key in (Keys.DOWN, Keys.ENTER):
        search_box.send_keys(key)

    time.sleep(3)

# select_cancer(cancer_sort[0])

In [None]:
# Age range selection

# Numeric suffixes of the option ids (the N in li[@id='null-N']) for the
# lower-bound and upper-bound lists; one entry per age index.
left_sort = [0, 2, 4, 6, 8, 10, 12, 14]
right_sort = [1, 3, 5, 7, 9, 11, 13, 15]

def select_age(index):
    """Set the lower and upper age bounds for the given age index.

    Picks option ``left_sort[index]`` in the first age selector and
    ``right_sort[index]`` in the second, then clicks the first selector and
    its chosen option once more before pausing 10 s.

    Args:
        index: Position into ``left_sort`` / ``right_sort``.
    """
    age_fieldset = "//*[@id='tab_1']/fieldset[6]"

    def open_and_pick(column, option_ids):
        # Click the selector span of the given column, wait 1 s, then click
        # the option whose id suffix is option_ids[index].
        toggle = driver.find_element(
            By.XPATH, f"{age_fieldset}/div[{column}]/div[2]/span"
        )
        toggle.click()
        time.sleep(1)

        option_xpath = (
            f"{age_fieldset}/div[{column}]/div[3]/ul[1]"
            f"/li[@id='null-{option_ids[index]}']"
        )
        option = driver.find_element(By.XPATH, option_xpath)
        option.click()
        return toggle, option

    # Lower bound
    lower_toggle, lower_option = open_and_pick(1, left_sort)
    time.sleep(1)

    # Upper bound
    open_and_pick(2, right_sort)

    # Lower bound again, reusing the elements located above.
    # NOTE(review): presumably needed so the lower bound sticks after the
    # upper bound changes — confirm against the page behavior.
    lower_toggle.click()
    time.sleep(1)
    lower_option.click()

    time.sleep(10)

# select_age(3)

In [None]:
# Time span selection

def _prev_time_label(years):
    # Locate the <label> bound to the input "input_prev_time_<years>".
    return driver.find_element(By.XPATH, f"//label[@for='input_prev_time_{years}']")

# Labels are located in the order 1, 3, 5 before any click happens.
one_year, three_year, five_year = [_prev_time_label(years) for years in (1, 3, 5)]

# Only the 3-year and 5-year labels are clicked; the 1-year label is left alone.
three_year.click()
five_year.click()

In [None]:
# Select "download"

def click_download():
    """Click the page element at a fixed XPath, then pause for 3 seconds.

    NOTE(review): the element is presumably the "download" tab of the chart
    panel; the XPath is positional, so confirm it if the page layout changes.
    """
    download_xpath = "//*[@id='app']/div/div[2]/div[2]/div/div[2]/div[2]/div[1]/ul[1]/li[2]"
    driver.find_element(By.XPATH, download_xpath).click()

    time.sleep(3)

# click_download()

In [None]:
# Download the JSON file under a chosen name

# Cancer name as shown on the page -> short tag used in the file name.
cancer_map = {
    'Melanoma of skin': 'melanoma',
    'Non-melanoma skin cancer': 'nomelanoma'
}

# File-name suffix for each age index (same indexing as select_age).
age_map = [10, 20, 30, 40, 50, 60, 70, 80]

def download_json(cancer, age_index):
    """Set the download name on the JSON link and click it.

    The link's ``download`` attribute is set to
    ``prevalence_<cancer tag>_<age suffix>`` (no extension) before clicking.

    Args:
        cancer: Key of ``cancer_map``.
        age_index: Position into ``age_map``.

    Raises:
        KeyError: If ``cancer`` is not in ``cancer_map``.
        IndexError: If ``age_index`` is outside ``age_map``.
    """
    json_label = driver.find_element(By.XPATH, "//*[@id='tab_downloads']/div/div/div[2]/ul[3]/li/a")

    filename = f"prevalence_{cancer_map[cancer]}_{age_map[age_index]}"
    # Pass the name as a script argument instead of splicing it into the
    # JavaScript source, so quotes or backslashes in it cannot break the script.
    driver.execute_script(
        "arguments[0].setAttribute('download', arguments[1])",
        json_label,
        filename,
    )
    time.sleep(2)
    json_label.click()
    # Fixed pause for the browser to save the file; completion is not verified.
    time.sleep(3)
    print(f"success: {filename}.json")

# download_json(cancer_sort[0], 3)

In [None]:
# Main routine

# Folder checked for already-downloaded files.
# Named download_dir (not dir) so the builtin dir() is not shadowed.
download_dir = "C:\\Users\\zcj\\Downloads"

def main(cancer, age_index):
    """Download one (cancer, age index) file unless it already exists.

    Returns immediately when ``prevalence_<tag>_<age>.json`` is already in
    ``download_dir``; otherwise selects the age range, clicks the download
    control and triggers the JSON download.

    Args:
        cancer: Key of ``cancer_map``.
        age_index: Position into ``age_map``.
    """
    filename = f"prevalence_{cancer_map[cancer]}_{age_map[age_index]}.json"
    file_path = os.path.join(download_dir, filename)
    if os.path.exists(file_path):
        return

    select_age(age_index)
    click_download()
    download_json(cancer, age_index)

# For every cancer type, walk through every age index defined in age_map.
for cancer in cancer_sort:
    select_cancer(cancer)
    for index in range(len(age_map)):
        main(cancer, index)

In [None]:
# End the WebDriver session started at the top of the notebook.
driver.quit()