This notebook serves to scrape a large number of neurology-related PDF files from PubMed. 

In [58]:
import os 
import requests 
import time
from bs4 import BeautifulSoup 
import xml.etree.ElementTree as ET
from tqdm import tqdm
import tarfile

In [42]:
tree = ET.parse('pmcquery.xml')
root = tree.getroot() 

In [43]:
id_list = []

for id_element in root.findall(".//IdList/Id"): 
    id_list.append(id_element.text)

print(f'Number of neurology-related PMCIDs from the previous year: {len(id_list)}')

Number of neurology-related PMCIDs from the previous year: 10000


In [54]:
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        else:
            print(f"Failed to download: {url} (Status code: {response.status_code})")
    except Exception as e:
        print(f"Error downloading {url}: {e}")


def extract_tar_gz(file_path, extract_to_folder):
    try:
        with tarfile.open(file_path, "r:gz") as tar:
            tar.extractall(path=extract_to_folder)
    except Exception as e:
        print(f"Error extracting {file_path}: {e}")        

In [59]:
base_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC' # We need only to append the id to this url and get all availible data for the article.
save_folder = 'dfolder'

for id in tqdm(id_list[532:]):

    document_base_url = base_url + id
    time.sleep(5)
    try:
        response = requests.get(url=document_base_url)

        if response.status_code == 200:
            root = ET.fromstring(response.text)

            tar_gz_url = None
            
            for link in root.findall(".//link"):
                format_type = link.attrib.get("format")
                href = link.attrib.get("href")
                if format_type == "tgz":
                    tar_gz_url = href[6:]
                    tar_gz_url = "https://" + tar_gz_url
                    break

            if tar_gz_url:

                    file_name = f"{id}.tar.gz"
                    file_path = os.path.join(save_folder, file_name)
                    
                    download_file(tar_gz_url, file_path)
                    extract_tar_gz(file_path, save_folder)

            else:
                print(f"No tar.gz link found for {id}")
    
    except Exception as e:
        print(f"Failed to fetch data for {id} (Status code: {response.status_code})") 


  tar.extractall(path=extract_to_folder)
  0%|          | 5/9468 [00:33<17:01:39,  6.48s/it]

No tar.gz link found for 11407594


  0%|          | 19/9468 [02:21<18:44:02,  7.14s/it]

No tar.gz link found for 11406461


  0%|          | 45/9468 [06:49<20:32:49,  7.85s/it]

No tar.gz link found for 11404723


  1%|          | 49/9468 [07:20<18:33:04,  7.09s/it]

No tar.gz link found for 11404524


  1%|          | 92/9468 [13:53<23:35:33,  9.06s/it]


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [27]:
sample = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC11461032"
res = requests.get(sample)

In [40]:
download_file(sample, './dfolder')
pass

Error downloading https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC11461032


KeyboardInterrupt: 