# **NIE DOTYKAĆ**

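Notebook that downloads every PubMed 2025 baseline file (0001–1274), extracts the PMID, article title and abstract of each article, and saves them to `pubmed_articles_parallel.csv`.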

In [None]:
import pandas as pd
import requests
import gzip
import xml.etree.ElementTree as ET
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
def _joined_text(elem):
    """Return all text inside *elem*, including text nested in child tags.

    ``elem.text`` only yields the text that precedes the first child element,
    so a title such as ``Effect of <i>E. coli</i> on ...`` would be cut off at
    ``"Effect of "``. ``itertext()`` walks the element and its descendants.

    Returns None when *elem* is None or holds no text at all.
    """
    if elem is None:
        return None
    text = "".join(elem.itertext())
    return text or None


def parse_articles_from_gz(number):
    """Download one PubMed baseline file and extract its articles.

    Args:
        number: Index of the baseline file; it is zero-padded to four digits
            to build the ``pubmed25nNNNN.xml.gz`` file name.

    Returns:
        A list of ``(pmid, title, abstract)`` tuples, one per ``PubmedArticle``
        element. A field is None when its element is missing or empty. When an
        article has several ``AbstractText`` sections they are joined with a
        single space.

        Failures are best-effort: any exception (download or parsing) is
        printed and the articles parsed up to that point are returned, which
        is an empty list if the download itself failed.
    """
    number_str = str(number).zfill(4)
    url = f"https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n{number_str}.xml.gz"
    print(f"Downloading {url} ...")
    articles = []

    try:
        # The whole archive is buffered via ``response.content`` before it is
        # decompressed, so requesting a streamed response would gain nothing.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with gzip.GzipFile(fileobj=BytesIO(response.content)) as f:
            for _event, elem in ET.iterparse(f, events=('end',)):
                if elem.tag != 'PubmedArticle':
                    continue

                pmid = _joined_text(elem.find('.//PMID'))
                title = _joined_text(elem.find('.//ArticleTitle'))

                # Collect every AbstractText section, not just the first one.
                sections = [
                    _joined_text(section)
                    for section in elem.findall('.//Abstract/AbstractText')
                ]
                sections = [text for text in sections if text is not None]
                abstract = " ".join(sections) if sections else None

                articles.append((pmid, title, abstract))

                # Drop the parsed subtree so memory stays bounded per article.
                elem.clear()
    except Exception as e:
        print(f"Failed to process {url}: {e}")

    return articles


In [3]:
def collect_all_articles(numbers=None, max_workers=10):
    """Fetch and parse several PubMed baseline files concurrently.

    Args:
        numbers: Iterable of baseline file indices to fetch. Defaults to
            1 through 1274, i.e. files 0001 to 1274. Passing a subset makes
            it possible to re-fetch only the files that failed earlier.
        max_workers: Size of the thread pool used for the downloads.

    Returns:
        A single list with the article tuples of all requested files. Results
        are appended as each file finishes, so the order of files in the list
        follows completion order, not index order.
    """
    if numbers is None:
        numbers = range(1, 1275)  # From 0001 to 1274

    all_articles = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(parse_articles_from_gz, number) for number in numbers]

        for future in as_completed(futures):
            all_articles.extend(future.result())

    return all_articles


In [None]:
# Download and parse the baseline files; this is the long-running step.
all_articles = collect_all_articles()

# One row per article tuple, with the tuple fields mapped onto named columns.
df = pd.DataFrame(data=all_articles, columns=["PMID", "ArticleTitle", "Abstract"])
print(f"Total articles collected: {len(df.index)}")

# Persist the table without the positional index column.
df.to_csv('pubmed_articles_parallel.csv', index=False)

Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0001.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0002.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0003.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0004.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0005.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0006.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0007.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0008.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0009.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0010.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0011.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0012.xml.gz ...
Downloading http