NOTEBOOK TO FETCH A RANDOM SAMPLE OF PUBMED BASELINE ARTICLES (PMID, TITLE, ABSTRACT) AS TRAINING DATA

In [5]:
import pandas as pd
import requests
import gzip
import xml.etree.ElementTree as ET
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed
import random

In [6]:
def _element_text(elem):
    """Return all text inside ``elem``, or ``None`` if it is missing or empty.

    ``Element.text`` stops at the first child element, so a field such as
    ``Role of <i>E. coli</i> in ...`` would be cut to ``"Role of "``.
    ``itertext()`` walks the element and its descendants, so text inside and
    after inline markup is kept.
    """
    if elem is None:
        return None
    return ''.join(elem.itertext()) or None


def parse_articles_from_gz(number):
    """Download one PubMed baseline file and extract (PMID, title, abstract).

    Parameters
    ----------
    number : int or str
        Baseline file number; it is zero-padded to four digits to build the
        ``pubmed25nNNNN.xml.gz`` file name.

    Returns
    -------
    list[tuple]
        One ``(pmid, title, abstract)`` tuple per ``<PubmedArticle>``. A field
        is ``None`` when its element is absent or has no text. When an article
        has several ``<AbstractText>`` sections they are joined with a single
        space.

    Notes
    -----
    This is best-effort: any exception (network, gzip, XML) is printed rather
    than raised, and whatever was parsed before the failure is returned.
    """
    number_str = str(number).zfill(4)
    url = f"https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n{number_str}.xml.gz"
    print(f"Downloading {url} ...")
    articles = []

    try:
        # The archive is buffered in full through ``response.content`` below,
        # so ``stream=True`` is not requested: it would save no memory and
        # would keep the connection checked out if raise_for_status() raised.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with gzip.GzipFile(fileobj=BytesIO(response.content)) as f:
            for _event, elem in ET.iterparse(f, events=('end',)):
                if elem.tag != 'PubmedArticle':
                    continue

                pmid = _element_text(elem.find('.//PMID'))
                title = _element_text(elem.find('.//ArticleTitle'))

                # Collect every <AbstractText> directly under <Abstract>, not
                # just the first one, and skip sections with no text.
                sections = [
                    text
                    for text in map(_element_text, elem.findall('.//Abstract/AbstractText'))
                    if text
                ]
                abstract = ' '.join(sections) if sections else None

                articles.append((pmid, title, abstract))

                # Drop the processed subtree so memory does not grow with the
                # number of articles in the file.
                elem.clear()
    except Exception as e:
        print(f"Failed to process {url}: {e}")

    return articles

In [7]:
def collect_all_articles(num_files=20, file_numbers=range(200, 1000), max_workers=10, seed=None):
    """Download a random sample of baseline files and pool their articles.

    Parameters
    ----------
    num_files : int, default 20
        How many distinct file numbers to sample.
    file_numbers : sequence of int, default ``range(200, 1000)``
        Population of baseline file numbers to sample from.
    max_workers : int, default 10
        Number of threads downloading and parsing concurrently.
    seed : int or None, default None
        When given, the sample is drawn from a private ``random.Random(seed)``
        so the same files are picked on every run. When ``None`` the global
        ``random`` module state is used, as before.

    Returns
    -------
    list[tuple]
        Concatenation of the lists returned by ``parse_articles_from_gz`` for
        each sampled file, in the order the files were sampled.

    Raises
    ------
    ValueError
        Raised by ``sample`` if ``num_files`` exceeds ``len(file_numbers)``.
    """
    rng = random if seed is None else random.Random(seed)
    numbers = rng.sample(file_numbers, num_files)

    all_articles = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(parse_articles_from_gz, number) for number in numbers]

        # Read results in submission order rather than completion order: the
        # downloads still run concurrently, but the row order no longer
        # depends on which download finishes first.
        for future in futures:
            all_articles.extend(future.result())

    return all_articles


In [None]:
# Download and parse the sampled baseline files. This hits the network, and
# the sample is random, so each run can produce a different set of articles.
all_articles = collect_all_articles()

# One row per (pmid, title, abstract) tuple returned by the parser.
df = pd.DataFrame(all_articles, columns=["PMID", "ArticleTitle", "Abstract"])
print(f"Total articles collected: {len(df)}")


# Persist the sample so later cells can reload it without downloading again.
# Note: this overwrites any existing random_articles.csv.
df.to_csv('random_articles.csv', index=False)

Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0237.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0697.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0780.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0966.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0595.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0506.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0594.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0819.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0255.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0627.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0303.xml.gz ...
Downloading https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed25n0778.xml.gz ...
Downloading http

In [14]:
# Reload the saved sample from disk; requires random_articles.csv to exist
# in the working directory (it is written by the download cell).
random_articles = pd.read_csv('random_articles.csv')

In [None]:
# Keep only the rows that have an abstract. `.notna()` is the direct form of
# `isnull() == False` and selects exactly the same rows.
random_articles[random_articles['Abstract'].notna()]