In [1]:
import re

import requests
from bs4 import BeautifulSoup
from lxml import etree
import pandas as pd

# Download the html file and create a BeautifulSoup object
web_url = 'https://ncdrisc.org/data-downloads.html'
html_text = requests.get(web_url).text
soup = BeautifulSoup(html_text, 'html.parser')

# Data pages consist of all the boxes in `https://ncdrisc.org/data-downloads.html`
# There are totally 5 boxes, accessed on Jul 19.
dataset_pages = soup.find_all('a', attrs={'href':re.compile('^data-downloads.+\.html$')})
dataset_pages_url = []
for page in dataset_pages:
    if page['href'] not in dataset_pages_url:
        dataset_pages_url.append(page['href'])  
print(f'{len(dataset_pages_url)} download pages found')
dataset_pages_url

10 download pages found


['data-downloads-adiposity-ado.html',
 'data-downloads-adiposity.html',
 'data-downloads-adiposity-urban-rural-ado.html',
 'data-downloads-adiposity-urban-rural.html',
 'data-downloads-height.html',
 'data-downloads-height-urban-rural-ado.html',
 'data-downloads-diabetes.html',
 'data-downloads-blood-pressure.html',
 'data-downloads-hypertension.html',
 'data-downloads-cholesterol.html']

In [2]:
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager 

def get_dynamic_page(url):
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install())) 
    driver.get(url) 
    return driver.page_source


In [3]:
import os
import time

host = 'https://ncdrisc.org/'
failed = [] # Recording the failed links

output_dir = 'ncdrisc_datasets'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

all_datasets = {'FirstClass':[], 'SecondClass':[], 'ThirdClass':[], 'Link':[], 'File':[]}
for dataset_page_url in dataset_pages_url:
    
    # Download and parse the html file
    html_text = get_dynamic_page('{}{}'.format(host, dataset_page_url))
    soup = BeautifulSoup(html_text, 'html.parser')

    # Find the downloading links for csv and zip files
    data_files = soup.find_all('a', attrs={'href':re.compile(r'\.(csv|zip)$')})
    countries = [x.text for x in soup.find_all('option')]
    print('country:', len(countries))
    print(countries[:10])
    
    for page in data_files:
    
        first_class = dataset_page_url.replace('.html', '').replace('data-downloads-', '')
        second_class = page.find_previous(name='h5', attrs={'class':'uppercase'}).text.replace('Download ', '')
        third_class = page.text
        
        link = '{}{}'.format(host, page['href'])
        output_file_path = '{}/{}'.format(output_dir, link.replace('https://ncdrisc.org/downloads', '').replace('/', '_'))

        if link not in all_datasets['Link']:
            all_datasets['FirstClass'].append(first_class)
            all_datasets['SecondClass'].append(second_class)
            all_datasets['ThirdClass'].append(third_class)
            all_datasets['Link'].append(link) 
            all_datasets['File'].append(output_file_path) 
            
            # Process individual countries
            base_country = 'United Kingdom'
            if base_country in link:
                for country in countries:
                    country_link = link.replace(base_country, country)
                    output_file_path = '{}/{}'.format(output_dir, country_link.replace('https://ncdrisc.org/downloads', '').replace('/', '_'))
                    all_datasets['FirstClass'].append(first_class)
                    all_datasets['SecondClass'].append(second_class)
                    all_datasets['ThirdClass'].append(third_class)
                    all_datasets['Link'].append(country_link) 
                    all_datasets['File'].append(output_file_path) 
            
all_datasets_df = pd.DataFrame(all_datasets)
all_datasets_df.to_csv('summary_all_countries.csv', index=False)
print(f'In total {len(all_datasets_df)} datasets')
all_datasets_df

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 8.29M/8.29M [00:01<00:00, 6.53MB/s]


country: 200
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia']
country: 200
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia']
country: 200
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia']
country: 200
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia']
country: 200
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia']
country: 200
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia']
country: 200
['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Antigua a

Unnamed: 0,FirstClass,SecondClass,ThirdClass,Link,File
0,adiposity-ado,Global Data,Age-specific,https://ncdrisc.org/downloads/bmi_height/bmi/g...,ncdrisc_datasets/_bmi_height_bmi_global_NCD_Ri...
1,adiposity-ado,Country-specific Data,Age-specific,https://ncdrisc.org/downloads/bmi_height/bmi/a...,ncdrisc_datasets/_bmi_height_bmi_all_countries...
2,adiposity-ado,Country-specific Data,Age-standardised - Female,https://ncdrisc.org/downloads/bmi/NCD_RisC_Lan...,ncdrisc_datasets/_bmi_NCD_RisC_Lancet_2020_BMI...
3,adiposity-ado,Country-specific Data,Age-standardised - Male,https://ncdrisc.org/downloads/bmi/NCD_RisC_Lan...,ncdrisc_datasets/_bmi_NCD_RisC_Lancet_2020_BMI...
4,adiposity-ado,Region-specific Data,Age-specific,https://ncdrisc.org/downloads/bmi_height/bmi/r...,ncdrisc_datasets/_bmi_height_bmi_regional_NCD_...
...,...,...,...,...,...
2059,cholesterol,Individual country Data,Country Data,https://ncdrisc.org/downloads/chol/individual-...,ncdrisc_datasets/_chol_individual-countries_Ve...
2060,cholesterol,Individual country Data,Country Data,https://ncdrisc.org/downloads/chol/individual-...,ncdrisc_datasets/_chol_individual-countries_Vi...
2061,cholesterol,Individual country Data,Country Data,https://ncdrisc.org/downloads/chol/individual-...,ncdrisc_datasets/_chol_individual-countries_Ye...
2062,cholesterol,Individual country Data,Country Data,https://ncdrisc.org/downloads/chol/individual-...,ncdrisc_datasets/_chol_individual-countries_Za...


In [None]:
import pandas as pd
import time
host = 'https://ncdrisc.org/'
failed = [] # Recording the failed links

output_dir = 'ncdrisc_datasets'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

all_datasets_df = pd.read_csv('summary.csv')
for i, row in all_datasets_df.iterrows():
    data_download_link = row['Link']
    output_file_path = row['File']
    max_retry_count = 3
    trycnt = 1
    while trycnt > 0 and trycnt <= max_retry_count:
        try:
            print('Downloading', data_download_link)
            r = requests.get(data_download_link, allow_redirects=True)
            with open(output_file_path, 'wb') as f:
                f.write(r.content)
            print('Success!')
            trycnt = 0

        except:
            trycnt += 1
            wait = 0.1*trycnt
            print(f'retry {trycnt}')
            time.sleep(wait)

    if trycnt > max_retry_count:
        failed.append(data_download_link)

if len(failed) > 0:
    print('Failed\n{}'.format('\n'.join(failed)))