This notebook serves to scrape a large number of neurology-related PDF files from PubMed. 

In [1]:
import os 
import requests 
import time
from bs4 import BeautifulSoup 
import xml.etree.ElementTree as ET
from tqdm import tqdm
import tarfile

In [2]:
tree = ET.parse('pmcquery.xml')
root = tree.getroot() 

In [3]:
id_list = []

for id_element in root.findall(".//IdList/Id"): 
    id_list.append(id_element.text)

print(f'Number of neurology-related PMCIDs from the previous year: {len(id_list)}')

Number of neurology-related PMCIDs from the previous year: 6801


In [4]:
def download_file(url, save_path):
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        else:
            print(f"Failed to download: {url} (Status code: {response.status_code})")
    except Exception as e:
        print(f"Error downloading {url}: {e}")


def extract_tar_gz(file_path, extract_to_folder):
    try:
        with tarfile.open(file_path, "r:gz") as tar:
            tar.extractall(path=extract_to_folder)
    except Exception as e:
        print(f"Error extracting {file_path}: {e}")        

In [5]:
base_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC' # We need only to append the id to this url and get all availible data for the article.
save_folder = 'dfolder'

for id in tqdm(id_list):

    document_base_url = base_url + id
    time.sleep(5)
    try:
        response = requests.get(url=document_base_url)

        if response.status_code == 200:
            root = ET.fromstring(response.text)

            tar_gz_url = None
            
            for link in root.findall(".//link"):
                format_type = link.attrib.get("format")
                href = link.attrib.get("href")
                if format_type == "tgz":
                    tar_gz_url = href[6:]
                    tar_gz_url = "https://" + tar_gz_url
                    break

            if tar_gz_url:

                    file_name = f"{id}.tar.gz"
                    file_path = os.path.join(save_folder, file_name)
                    
                    download_file(tar_gz_url, file_path)
                    extract_tar_gz(file_path, save_folder)

            else:
                print(f"No tar.gz link found for {id}")
    
    except Exception as e:
        print(f"Failed to fetch data for {id} (Status code: {response.status_code})") 


  tar.extractall(path=extract_to_folder)
  0%|          | 30/6801 [04:34<30:47:31, 16.37s/it]

Failed to fetch data for 11488616 (Status code: 200)


  2%|▏         | 110/6801 [17:12<29:56:09, 16.11s/it]

Failed to fetch data for 11397496 (Status code: 200)


  2%|▏         | 127/6801 [19:46<29:45:54, 16.06s/it]

Failed to fetch data for 11502371 (Status code: 200)


  2%|▏         | 170/6801 [25:39<30:39:51, 16.65s/it]

Failed to fetch data for 11473404 (Status code: 200)


  4%|▎         | 252/6801 [36:48<29:22:38, 16.15s/it]

Failed to fetch data for 11347546 (Status code: 200)


  5%|▍         | 307/6801 [44:16<30:29:03, 16.90s/it]

Failed to fetch data for 10849735 (Status code: 200)


  5%|▌         | 350/6801 [50:30<29:11:07, 16.29s/it]

Failed to fetch data for 11264392 (Status code: 200)


  5%|▌         | 367/6801 [53:13<30:04:14, 16.83s/it]

Failed to fetch data for 11329216 (Status code: 200)


  6%|▌         | 383/6801 [55:49<29:15:25, 16.41s/it]

Failed to fetch data for 11245205 (Status code: 200)


  6%|▌         | 386/6801 [57:51<68:40:06, 38.54s/it]

Error downloading https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/e5/3b/PMC11275047.tar.gz: ('Connection broken: IncompleteRead(409600 bytes read, 8853622 more expected)', IncompleteRead(409600 bytes read, 8853622 more expected))
Error extracting dfolder/11275047.tar.gz: Compressed file ended before the end-of-stream marker was reached


  6%|▌         | 422/6801 [1:03:02<29:47:21, 16.81s/it]

Failed to fetch data for 11220971 (Status code: 200)


  7%|▋         | 447/6801 [1:06:37<28:03:44, 15.90s/it]

Failed to fetch data for 11233742 (Status code: 200)


  7%|▋         | 503/6801 [1:14:21<28:30:35, 16.30s/it]

Failed to fetch data for 11178805 (Status code: 200)


  9%|▉         | 622/6801 [1:30:10<29:24:43, 17.14s/it]

Failed to fetch data for 11101449 (Status code: 200)


 10%|▉         | 651/6801 [1:34:27<27:24:43, 16.05s/it]

Failed to fetch data for 11233287 (Status code: 200)


 10%|▉         | 666/6801 [1:36:51<28:10:09, 16.53s/it]

Failed to fetch data for 11126858 (Status code: 200)


 10%|█         | 687/6801 [1:40:13<28:03:01, 16.52s/it]

Failed to fetch data for 11066097 (Status code: 200)


 10%|█         | 689/6801 [1:40:59<35:48:04, 21.09s/it]

Failed to fetch data for 11064305 (Status code: 200)


 11%|█         | 730/6801 [1:46:57<26:59:37, 16.01s/it]

Failed to fetch data for 11180938 (Status code: 200)


 11%|█         | 745/6801 [1:49:31<28:24:04, 16.88s/it]

Failed to fetch data for 11029622 (Status code: 200)


 11%|█▏        | 771/6801 [1:53:13<27:31:29, 16.43s/it]

Failed to fetch data for 11016819 (Status code: 200)


 12%|█▏        | 847/6801 [2:03:24<27:28:58, 16.62s/it]

Failed to fetch data for 10981243 (Status code: 200)


 13%|█▎        | 860/6801 [2:05:26<26:35:21, 16.11s/it]

Failed to fetch data for 10971429 (Status code: 200)


 13%|█▎        | 888/6801 [2:09:35<27:15:47, 16.60s/it]

Failed to fetch data for 10969335 (Status code: 200)


 13%|█▎        | 890/6801 [2:10:18<33:50:56, 20.62s/it]

Failed to fetch data for 10987717 (Status code: 200)


 14%|█▍        | 967/6801 [2:21:12<29:35:18, 18.26s/it]

Failed to fetch data for 10957179 (Status code: 200)


 15%|█▍        | 997/6801 [2:25:39<26:49:10, 16.64s/it]

Failed to fetch data for 11132561 (Status code: 200)


 15%|█▍        | 1005/6801 [2:27:07<26:50:40, 16.67s/it]

Failed to fetch data for 11032569 (Status code: 200)


 15%|█▌        | 1037/6801 [2:31:33<25:55:15, 16.19s/it]

Failed to fetch data for 10916973 (Status code: 200)


 16%|█▌        | 1085/6801 [2:39:01<26:31:32, 16.71s/it]

Failed to fetch data for 10853890 (Status code: 200)


 17%|█▋        | 1148/6801 [2:47:36<25:38:23, 16.33s/it]

Failed to fetch data for 10811308 (Status code: 200)


 17%|█▋        | 1167/6801 [2:50:35<25:45:46, 16.46s/it]

Failed to fetch data for 10819946 (Status code: 200)


 17%|█▋        | 1184/6801 [2:53:10<26:21:54, 16.90s/it]

Failed to fetch data for 10863921 (Status code: 200)


 18%|█▊        | 1250/6801 [3:02:00<26:15:38, 17.03s/it]

Failed to fetch data for 10764327 (Status code: 200)


 19%|█▉        | 1312/6801 [3:10:16<24:55:03, 16.34s/it]

Failed to fetch data for 10757935 (Status code: 200)


 20%|█▉        | 1347/6801 [3:15:17<25:21:04, 16.73s/it]

Failed to fetch data for 10770482 (Status code: 200)


 20%|█▉        | 1358/6801 [3:17:52<26:23:24, 17.45s/it]

Failed to fetch data for 10742652 (Status code: 200)


 20%|██        | 1374/6801 [3:20:30<25:07:24, 16.67s/it]

Failed to fetch data for 10701950 (Status code: 200)


 21%|██        | 1407/6801 [3:25:57<24:11:34, 16.15s/it]

Failed to fetch data for 10689816 (Status code: 200)


 22%|██▏       | 1467/6801 [3:34:12<25:00:22, 16.88s/it]

Failed to fetch data for 10761431 (Status code: 200)


 23%|██▎       | 1567/6801 [3:47:26<23:48:46, 16.38s/it]

Failed to fetch data for 10619744 (Status code: 200)


 24%|██▎       | 1605/6801 [3:52:47<25:08:12, 17.42s/it]

Failed to fetch data for 10899537 (Status code: 200)


 24%|██▍       | 1629/6801 [3:56:14<23:15:05, 16.18s/it]

Failed to fetch data for 10604962 (Status code: 200)


 24%|██▍       | 1636/6801 [3:57:33<24:06:04, 16.80s/it]

Failed to fetch data for 10605073 (Status code: 200)


 25%|██▍       | 1668/6801 [4:02:00<24:38:41, 17.28s/it]

Failed to fetch data for 10564412 (Status code: 200)


 25%|██▍       | 1670/6801 [4:02:45<30:26:29, 21.36s/it]

Failed to fetch data for 10604050 (Status code: 200)


 25%|██▍       | 1696/6801 [4:06:33<23:07:07, 16.30s/it]

Failed to fetch data for 10637578 (Status code: 200)


 25%|██▌       | 1710/6801 [4:08:59<23:07:03, 16.35s/it]

Failed to fetch data for 10499811 (Status code: 200)


 26%|██▌       | 1736/6801 [4:12:36<22:36:34, 16.07s/it]

Failed to fetch data for 10503865 (Status code: 200)


 26%|██▌       | 1742/6801 [4:13:51<24:08:31, 17.18s/it]

Failed to fetch data for 10843695 (Status code: 200)


 26%|██▌       | 1760/6801 [4:17:04<24:53:55, 17.78s/it]

Failed to fetch data for 10471690 (Status code: 200)


 26%|██▌       | 1780/6801 [4:20:08<23:08:23, 16.59s/it]

Failed to fetch data for 10916939 (Status code: 200)


 26%|██▋       | 1793/6801 [4:22:15<23:18:28, 16.75s/it]

Failed to fetch data for 10686329 (Status code: 200)


 26%|██▋       | 1798/6801 [4:23:28<25:45:41, 18.54s/it]

Failed to fetch data for 10445398 (Status code: 200)


 27%|██▋       | 1829/6801 [4:28:21<23:55:51, 17.33s/it]

Failed to fetch data for 10567584 (Status code: 200)


 27%|██▋       | 1830/6801 [4:28:58<32:01:18, 23.19s/it]

Failed to fetch data for 10461810 (Status code: 200)


 28%|██▊       | 1876/6801 [4:35:01<22:01:43, 16.10s/it]

Failed to fetch data for 10436105 (Status code: 200)


 43%|████▎     | 2919/6801 [6:50:18<7:33:29,  7.01s/it] 

No tar.gz link found for 9877942


 44%|████▍     | 2984/6801 [6:58:37<17:16:50, 16.30s/it]

Failed to fetch data for 9695753 (Status code: 200)


 44%|████▍     | 3025/6801 [7:04:25<17:18:06, 16.50s/it]

Failed to fetch data for 9693722 (Status code: 200)


 46%|████▌     | 3107/6801 [7:15:08<17:17:46, 16.86s/it]

Failed to fetch data for 9658881 (Status code: 200)


 46%|████▋     | 3151/6801 [7:21:52<16:09:06, 15.93s/it]

Failed to fetch data for 9581650 (Status code: 200)


 47%|████▋     | 3168/6801 [7:24:26<16:44:23, 16.59s/it]

Failed to fetch data for 9543950 (Status code: 200)


 47%|████▋     | 3186/6801 [7:27:05<16:10:16, 16.10s/it]

Failed to fetch data for 9576849 (Status code: 200)


 48%|████▊     | 3250/6801 [7:35:18<16:06:44, 16.33s/it]

Failed to fetch data for 10115177 (Status code: 200)


 48%|████▊     | 3272/6801 [7:38:34<15:45:26, 16.07s/it]

Failed to fetch data for 9476413 (Status code: 200)


 49%|████▉     | 3358/6801 [7:49:32<15:16:36, 15.97s/it]

Failed to fetch data for 10425321 (Status code: 200)


 50%|████▉     | 3376/6801 [7:52:11<15:26:42, 16.23s/it]

Failed to fetch data for 9869390 (Status code: 200)


 50%|█████     | 3416/6801 [7:57:36<15:13:54, 16.20s/it]

Failed to fetch data for 9473453 (Status code: 200)


 52%|█████▏    | 3556/6801 [8:16:50<14:33:15, 16.15s/it]

Failed to fetch data for 9281365 (Status code: 200)


 53%|█████▎    | 3578/6801 [8:20:04<14:47:35, 16.52s/it]

Failed to fetch data for 9606092 (Status code: 200)


 53%|█████▎    | 3595/6801 [8:22:38<14:35:01, 16.38s/it]

Failed to fetch data for 9257520 (Status code: 200)


 53%|█████▎    | 3626/6801 [8:26:53<14:11:11, 16.09s/it]

Failed to fetch data for 9342860 (Status code: 200)


 54%|█████▎    | 3646/6801 [8:29:48<13:55:55, 15.90s/it]

Failed to fetch data for 9272064 (Status code: 200)


 55%|█████▍    | 3711/6801 [8:38:17<13:38:49, 15.90s/it]

Failed to fetch data for 9358410 (Status code: 200)


 55%|█████▌    | 3756/6801 [8:44:14<13:35:01, 16.06s/it]

Failed to fetch data for 9468130 (Status code: 200)


 56%|█████▌    | 3793/6801 [8:49:13<13:28:22, 16.12s/it]

Failed to fetch data for 9152128 (Status code: 200)


 57%|█████▋    | 3854/6801 [8:57:06<13:20:56, 16.31s/it]

Failed to fetch data for 9338189 (Status code: 200)


 58%|█████▊    | 3952/6801 [9:09:29<12:44:30, 16.10s/it]

Failed to fetch data for 9033019 (Status code: 200)


 59%|█████▉    | 4017/6801 [9:17:53<12:35:39, 16.29s/it]

Failed to fetch data for 9031324 (Status code: 200)


 61%|██████    | 4119/6801 [9:31:00<12:01:49, 16.15s/it]

Failed to fetch data for 8896101 (Status code: 200)


 61%|██████    | 4144/6801 [9:34:33<11:50:56, 16.05s/it]

Failed to fetch data for 8914109 (Status code: 200)


 61%|██████    | 4164/6801 [9:37:50<12:01:55, 16.43s/it]

Failed to fetch data for 9825569 (Status code: 200)


 62%|██████▏   | 4185/6801 [9:40:47<11:42:59, 16.12s/it]

Failed to fetch data for 8879457 (Status code: 200)


 63%|██████▎   | 4284/6801 [9:53:14<11:14:31, 16.08s/it]

Failed to fetch data for 8796646 (Status code: 200)


 64%|██████▍   | 4367/6801 [10:04:21<11:35:38, 17.15s/it]

Failed to fetch data for 8742368 (Status code: 200)


 65%|██████▍   | 4389/6801 [10:07:33<10:44:22, 16.03s/it]

Failed to fetch data for 8761855 (Status code: 200)


 65%|██████▌   | 4421/6801 [10:11:55<10:30:30, 15.90s/it]

Failed to fetch data for 8884854 (Status code: 200)


 65%|██████▌   | 4442/6801 [10:14:59<10:25:41, 15.91s/it]

Failed to fetch data for 8886653 (Status code: 200)


 66%|██████▌   | 4477/6801 [10:20:19<10:52:22, 16.84s/it]

Failed to fetch data for 8649917 (Status code: 200)


 66%|██████▌   | 4490/6801 [10:22:27<10:49:24, 16.86s/it]

Failed to fetch data for 8637062 (Status code: 200)


 68%|██████▊   | 4593/6801 [10:35:25<9:49:09, 16.01s/it] 

Failed to fetch data for 8649868 (Status code: 200)


 68%|██████▊   | 4644/6801 [10:42:13<9:42:29, 16.20s/it]

Failed to fetch data for 8528306 (Status code: 200)


 69%|██████▉   | 4681/6801 [10:48:44<10:27:18, 17.75s/it]

Failed to fetch data for 8507194 (Status code: 200)


 69%|██████▉   | 4708/6801 [10:53:25<9:13:50, 15.88s/it] 

Failed to fetch data for 8505759 (Status code: 200)


 69%|██████▉   | 4719/6801 [10:55:13<9:24:21, 16.26s/it]

Failed to fetch data for 8873001 (Status code: 200)


 70%|███████   | 4784/6801 [11:03:37<8:48:45, 15.73s/it]

Failed to fetch data for 8429427 (Status code: 200)


 70%|███████   | 4787/6801 [11:04:28<10:37:27, 18.99s/it]

Failed to fetch data for 8558882 (Status code: 200)


 71%|███████   | 4813/6801 [11:08:21<9:03:31, 16.40s/it] 

Failed to fetch data for 8461705 (Status code: 200)


 71%|███████   | 4831/6801 [11:10:58<8:46:54, 16.05s/it]

Failed to fetch data for 8466155 (Status code: 200)


 72%|███████▏  | 4882/6801 [11:17:40<8:32:59, 16.04s/it]

Failed to fetch data for 8358233 (Status code: 200)


 72%|███████▏  | 4900/6801 [11:20:17<8:21:35, 15.83s/it]

Failed to fetch data for 8374868 (Status code: 200)


 73%|███████▎  | 4932/6801 [11:24:36<8:14:59, 15.89s/it]

Failed to fetch data for 8391540 (Status code: 200)


 73%|███████▎  | 4933/6801 [11:25:12<11:24:34, 21.99s/it]

Failed to fetch data for 8359814 (Status code: 200)


 73%|███████▎  | 4950/6801 [11:27:48<8:27:24, 16.45s/it] 

Failed to fetch data for 8328733 (Status code: 200)


 73%|███████▎  | 4961/6801 [11:29:36<8:08:15, 15.92s/it]

Failed to fetch data for 8303126 (Status code: 200)


 73%|███████▎  | 4978/6801 [11:32:10<8:22:15, 16.53s/it]

Failed to fetch data for 8315271 (Status code: 200)


 74%|███████▎  | 4999/6801 [11:35:14<8:04:29, 16.13s/it]

Failed to fetch data for 8259927 (Status code: 200)


 74%|███████▍  | 5021/6801 [11:38:31<7:57:51, 16.11s/it]

Failed to fetch data for 8293634 (Status code: 200)


 75%|███████▍  | 5100/6801 [11:49:17<7:37:16, 16.13s/it]

Failed to fetch data for 8180841 (Status code: 200)


 75%|███████▌  | 5125/6801 [11:52:44<7:25:26, 15.95s/it]

Failed to fetch data for 8228334 (Status code: 200)


 76%|███████▌  | 5151/6801 [11:56:16<7:17:43, 15.92s/it]

Failed to fetch data for 8147046 (Status code: 200)


 77%|███████▋  | 5208/6801 [12:03:37<7:05:10, 16.01s/it]

Failed to fetch data for 8323049 (Status code: 200)


 77%|███████▋  | 5232/6801 [12:06:59<7:00:55, 16.10s/it]

Failed to fetch data for 8246214 (Status code: 200)


 77%|███████▋  | 5253/6801 [12:09:57<6:58:02, 16.20s/it]

Failed to fetch data for 8073969 (Status code: 200)


 78%|███████▊  | 5294/6801 [12:15:23<6:39:38, 15.91s/it]

Failed to fetch data for 8888529 (Status code: 200)


 78%|███████▊  | 5300/6801 [12:16:04<3:15:10,  7.80s/it]

Failed to download: https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/f0/c1/PMC8087475.tar.gz (Status code: 404)
Error extracting dfolder/8087475.tar.gz: [Errno 2] No such file or directory: 'dfolder/8087475.tar.gz'


 78%|███████▊  | 5302/6801 [12:16:48<6:50:58, 16.45s/it]

Failed to fetch data for 8069746 (Status code: 200)


 78%|███████▊  | 5319/6801 [12:19:20<6:37:07, 16.08s/it]

Failed to fetch data for 8067517 (Status code: 200)


 78%|███████▊  | 5336/6801 [12:21:51<6:33:31, 16.12s/it]

Failed to fetch data for 8359937 (Status code: 200)


 79%|███████▉  | 5356/6801 [12:24:46<6:34:30, 16.38s/it]

Failed to fetch data for 8193520 (Status code: 200)


 79%|███████▉  | 5397/6801 [12:30:11<6:18:06, 16.16s/it]

Failed to fetch data for 7962371 (Status code: 200)


 80%|███████▉  | 5416/6801 [12:33:02<6:14:38, 16.23s/it]

Failed to fetch data for 8075398 (Status code: 200)


 80%|████████  | 5456/6801 [12:37:51<2:37:09,  7.01s/it]

Error downloading https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/12/3b/PMC7943465.tar.gz: ('Connection broken: IncompleteRead(360448 bytes read, 1070911 more expected)', IncompleteRead(360448 bytes read, 1070911 more expected))
Error extracting dfolder/7943465.tar.gz: Compressed file ended before the end-of-stream marker was reached


 80%|████████  | 5457/6801 [12:38:28<5:58:00, 15.98s/it]

Failed to fetch data for 8410720 (Status code: 200)


 80%|████████  | 5459/6801 [12:39:12<7:35:22, 20.36s/it]

Failed to fetch data for 7927134 (Status code: 200)


 81%|████████  | 5485/6801 [12:43:17<6:12:51, 17.00s/it]

Failed to fetch data for 7937558 (Status code: 200)


 81%|████████▏ | 5542/6801 [12:51:10<5:47:44, 16.57s/it]

Failed to fetch data for 7908187 (Status code: 200)


 82%|████████▏ | 5563/6801 [12:54:19<5:39:47, 16.47s/it]

Failed to fetch data for 7855595 (Status code: 200)


 82%|████████▏ | 5578/6801 [12:56:39<5:29:47, 16.18s/it]

Failed to fetch data for 7905302 (Status code: 200)


 83%|████████▎ | 5627/6801 [13:03:30<5:13:57, 16.05s/it]

Failed to fetch data for 7796499 (Status code: 200)


 84%|████████▍ | 5711/6801 [13:14:15<4:49:12, 15.92s/it]

Failed to fetch data for 7717539 (Status code: 200)


 84%|████████▍ | 5741/6801 [13:18:27<4:44:14, 16.09s/it]

Failed to fetch data for 7726191 (Status code: 200)


 85%|████████▍ | 5753/6801 [13:20:21<4:41:37, 16.12s/it]

Failed to fetch data for 7686687 (Status code: 200)


 85%|████████▍ | 5763/6801 [13:22:04<4:44:26, 16.44s/it]

Failed to fetch data for 8131404 (Status code: 200)


 85%|████████▌ | 5783/6801 [13:24:54<4:30:20, 15.93s/it]

Failed to fetch data for 7741031 (Status code: 200)


 85%|████████▌ | 5795/6801 [13:26:59<4:44:40, 16.98s/it]

Failed to fetch data for 7666133 (Status code: 200)


 85%|████████▌ | 5805/6801 [13:28:41<4:29:19, 16.22s/it]

Failed to fetch data for 7658982 (Status code: 200)


 86%|████████▌ | 5832/6801 [13:32:30<4:28:13, 16.61s/it]

Failed to fetch data for 7643479 (Status code: 200)


 86%|████████▌ | 5844/6801 [13:34:29<4:20:56, 16.36s/it]

Failed to fetch data for 7604454 (Status code: 200)


 86%|████████▌ | 5848/6801 [13:35:29<4:53:07, 18.46s/it]

Failed to fetch data for 7603780 (Status code: 200)


 86%|████████▌ | 5862/6801 [13:37:41<4:15:59, 16.36s/it]

Failed to fetch data for 7732256 (Status code: 200)


 87%|████████▋ | 5894/6801 [13:42:03<4:03:55, 16.14s/it]

Failed to fetch data for 7572485 (Status code: 200)


 87%|████████▋ | 5941/6801 [13:48:18<3:48:49, 15.96s/it]

Failed to fetch data for 8521911 (Status code: 200)


 88%|████████▊ | 5956/6801 [13:50:34<3:47:24, 16.15s/it]

Failed to fetch data for 7554615 (Status code: 200)


 88%|████████▊ | 6007/6801 [13:57:22<3:46:21, 17.11s/it]

Failed to fetch data for 7592690 (Status code: 200)


 89%|████████▊ | 6030/6801 [14:00:36<3:24:29, 15.91s/it]

Failed to fetch data for 7487842 (Status code: 200)


 89%|████████▉ | 6036/6801 [14:01:49<3:37:23, 17.05s/it]

Failed to fetch data for 7671745 (Status code: 200)


 89%|████████▉ | 6047/6801 [14:03:37<3:25:07, 16.32s/it]

Failed to fetch data for 7369634 (Status code: 200)


 89%|████████▉ | 6081/6801 [14:08:24<3:11:55, 15.99s/it]

Failed to fetch data for 7605503 (Status code: 200)


 90%|████████▉ | 6098/6801 [14:10:58<3:09:22, 16.16s/it]

Failed to fetch data for 7450382 (Status code: 200)


 90%|█████████ | 6146/6801 [14:17:26<2:55:32, 16.08s/it]

Failed to fetch data for 7463112 (Status code: 200)


 91%|█████████ | 6157/6801 [14:19:18<2:54:24, 16.25s/it]

Failed to fetch data for 7382006 (Status code: 200)


 91%|█████████ | 6167/6801 [14:20:58<2:51:06, 16.19s/it]

Failed to fetch data for 7386130 (Status code: 200)


 91%|█████████ | 6172/6801 [14:22:06<3:08:40, 18.00s/it]

Failed to fetch data for 7689708 (Status code: 200)


 91%|█████████ | 6182/6801 [14:23:53<2:52:34, 16.73s/it]

Failed to fetch data for 7374556 (Status code: 200)


 92%|█████████▏| 6223/6801 [14:29:19<2:33:48, 15.97s/it]

Failed to fetch data for 7500810 (Status code: 200)


 92%|█████████▏| 6230/6801 [14:30:52<2:48:44, 17.73s/it]

Failed to fetch data for 7835306 (Status code: 200)


 92%|█████████▏| 6264/6801 [14:35:24<2:22:12, 15.89s/it]

Failed to fetch data for 7370590 (Status code: 200)


 92%|█████████▏| 6270/6801 [14:36:37<2:31:37, 17.13s/it]

Failed to fetch data for 7299377 (Status code: 200)


 92%|█████████▏| 6290/6801 [14:39:32<2:16:47, 16.06s/it]

Failed to fetch data for 7383150 (Status code: 200)


 93%|█████████▎| 6315/6801 [14:42:56<2:08:05, 15.81s/it]

Failed to fetch data for 7271529 (Status code: 200)


 93%|█████████▎| 6323/6801 [14:44:23<2:11:30, 16.51s/it]

Failed to fetch data for 7382768 (Status code: 200)


 93%|█████████▎| 6357/6801 [14:49:12<1:59:17, 16.12s/it]

Failed to fetch data for 7417970 (Status code: 200)


 94%|█████████▎| 6372/6801 [14:51:30<1:55:46, 16.19s/it]

Failed to fetch data for 7383876 (Status code: 200)


 94%|█████████▍| 6389/6801 [14:54:04<1:51:56, 16.30s/it]

Failed to fetch data for 7241955 (Status code: 200)


 95%|█████████▍| 6427/6801 [14:59:19<1:40:38, 16.15s/it]

Failed to fetch data for 7907981 (Status code: 200)


 95%|█████████▍| 6451/6801 [15:02:43<1:32:10, 15.80s/it]

Failed to fetch data for 7163924 (Status code: 200)


 95%|█████████▌| 6471/6801 [15:05:38<1:27:36, 15.93s/it]

Failed to fetch data for 7140337 (Status code: 200)


 95%|█████████▌| 6472/6801 [15:06:15<2:01:57, 22.24s/it]

Failed to fetch data for 7280943 (Status code: 200)


 95%|█████████▌| 6489/6801 [15:08:47<1:24:07, 16.18s/it]

Failed to fetch data for 7110730 (Status code: 200)


 96%|█████████▌| 6506/6801 [15:11:24<1:19:33, 16.18s/it]

Failed to fetch data for 7101159 (Status code: 200)


 96%|█████████▌| 6533/6801 [15:15:06<1:14:52, 16.76s/it]

Failed to fetch data for 7069351 (Status code: 200)


 96%|█████████▌| 6543/6801 [15:16:47<1:10:24, 16.38s/it]

Failed to fetch data for 7053087 (Status code: 200)


 97%|█████████▋| 6566/6801 [15:20:21<1:02:44, 16.02s/it]

Failed to fetch data for 7070974 (Status code: 200)


 97%|█████████▋| 6587/6801 [15:23:22<58:08, 16.30s/it]  

Failed to fetch data for 6961771 (Status code: 200)


 97%|█████████▋| 6617/6801 [15:27:24<49:27, 16.13s/it]

Failed to fetch data for 6974141 (Status code: 200)


 98%|█████████▊| 6632/6801 [15:29:59<47:15, 16.78s/it]

Failed to fetch data for 6958642 (Status code: 200)


 98%|█████████▊| 6648/6801 [15:32:22<40:49, 16.01s/it]

Failed to fetch data for 6942396 (Status code: 200)


 98%|█████████▊| 6667/6801 [15:35:07<36:08, 16.18s/it]

Failed to fetch data for 7299907 (Status code: 200)


 98%|█████████▊| 6682/6801 [15:37:20<31:23, 15.83s/it]

Failed to fetch data for 6917647 (Status code: 200)


 98%|█████████▊| 6686/6801 [15:38:19<35:12, 18.37s/it]

Failed to fetch data for 6951066 (Status code: 200)


 99%|█████████▊| 6704/6801 [15:40:56<25:56, 16.05s/it]

Failed to fetch data for 7437815 (Status code: 200)


 99%|█████████▉| 6721/6801 [15:43:28<21:26, 16.08s/it]

Failed to fetch data for 6978212 (Status code: 200)


 99%|█████████▉| 6727/6801 [15:44:40<21:02, 17.06s/it]

Failed to fetch data for 7470848 (Status code: 200)


 99%|█████████▉| 6745/6801 [15:47:16<15:00, 16.08s/it]

Failed to fetch data for 8149344 (Status code: 200)


100%|█████████▉| 6771/6801 [15:50:59<08:01, 16.05s/it]

Failed to fetch data for 7149322 (Status code: 200)


100%|█████████▉| 6785/6801 [15:53:21<04:27, 16.74s/it]

Failed to fetch data for 8029192 (Status code: 200)


100%|██████████| 6801/6801 [15:55:24<00:00,  8.43s/it]


In [6]:
import os
import shutil

def move_pdfs(source_folder="dfolder", destination_folder="pdfs"):
    # Ensure the destination folder exists
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    # Walk through the source folder recursively
    for root, _, files in os.walk(source_folder):
        for file in files:
            # Check if the file is a PDF
            if file.lower().endswith(".pdf"):
                # Build full paths for the source and destination
                source_path = os.path.join(root, file)
                destination_path = os.path.join(destination_folder, file)

                # Move the PDF file to the destination folder
                shutil.move(source_path, destination_path)
                print(f"Moved: {source_path} to {destination_path}")

# Call the function
move_pdfs()


Moved: dfolder/PMC8712250/cureus-0013-00000019934.pdf to pdfs/cureus-0013-00000019934.pdf
Moved: dfolder/PMC6914700/fnagi-11-00347.pdf to pdfs/fnagi-11-00347.pdf
Moved: dfolder/PMC10233237/bhad017.pdf to pdfs/bhad017.pdf
Moved: dfolder/PMC10023984/main.pdf to pdfs/main.pdf
Moved: dfolder/PMC11365916/12264_2024_Article_1218.pdf to pdfs/12264_2024_Article_1218.pdf
Moved: dfolder/PMC11365916/12264_2024_1218_MOESM1_ESM.pdf to pdfs/12264_2024_1218_MOESM1_ESM.pdf
Moved: dfolder/PMC7034269/NRR-15-1111_Suppl1.pdf to pdfs/NRR-15-1111_Suppl1.pdf
Moved: dfolder/PMC7034269/NRR-15-1111.pdf to pdfs/NRR-15-1111.pdf
Moved: dfolder/PMC7772800/13195_2020_Article_744.pdf to pdfs/13195_2020_Article_744.pdf
Moved: dfolder/PMC10172615/main.pdf to pdfs/main.pdf
Moved: dfolder/PMC9547798/401_2022_2469_MOESM2_ESM.pdf to pdfs/401_2022_2469_MOESM2_ESM.pdf
Moved: dfolder/PMC9547798/401_2022_Article_2469.pdf to pdfs/401_2022_Article_2469.pdf
Moved: dfolder/PMC9547798/401_2022_2469_MOESM1_ESM.pdf to pdfs/401_2022_2