In [1]:
# Download the zstandard library for faster decompression
! pip install zstandard

Collecting zstandard
  Downloading zstandard-0.21.0-cp311-cp311-win_amd64.whl (511 kB)
                                              0.0/511.3 kB ? eta -:--:--
     --------------------                   276.5/511.3 kB 5.7 MB/s eta 0:00:01
     -------------------------------------- 511.3/511.3 kB 6.4 MB/s eta 0:00:00
Installing collected packages: zstandard
Successfully installed zstandard-0.21.0


In [2]:
# load the dataset from remote files using the load_dataset function
from datasets import load_dataset

In [5]:
# PubMed title/abstract records (about 15.5 million rows), published as a single
# zstandard-compressed JSON Lines file.
data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"

# The generic 'json' builder reads the remote file; with no explicit split layout
# everything lands in the 'train' split.
pubmed_dataset = load_dataset(
    'json',
    data_files=data_files,
    split='train',
)

# Last expression of the cell: display the dataset summary (features, num_rows).
pubmed_dataset

Downloading and preparing dataset json/default to C:/Users/Raj/.cache/huggingface/datasets/json/default-6e3092816c4f845b/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.90G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to C:/Users/Raj/.cache/huggingface/datasets/json/default-6e3092816c4f845b/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


Dataset({
    features: ['meta', 'text'],
    num_rows: 15518009
})

In [None]:
# Datasets will decompress the files needed to load a dataset. To preserve diskspace you can pass DownloadConfig(delete_extracted=True) to the download_config argument of load_dataset(). See https://huggingface.co/docs/datasets/package_reference/builder_classes.html?#datasets.utils.DownloadConfig for more details.

In [6]:
# Inspect the first example by integer index.
# Each record is a dict with a 'meta' field and a 'text' field.
pubmed_dataset[0]

{'meta': {'pmid': 11409574, 'language': 'eng'},
 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age. Systematic review of the published literature. Out-patient clinics, emergency departments and hospitalisation wards in 23 health centres from 10 countries. Cohort studies reporting the frequency of hypoxaemia in children under 5 years of age with ALRI, and the association between hypoxaemia and the risk of dying. Prevalence of hypoxaemia measured in children with ARI and relative risks for the association between the severity of illness and the frequency of hypoxaemia, and between hypoxaemia and the risk of dying. Seventeen published studies were found that i

In [7]:
# a simple way to measure memory usage is to use the psutil library
! pip install psutil



In [8]:
import psutil

# psutil.Process() with no pid refers to the current (kernel) process;
# memory_info().rss is its resident set size in bytes.
rss_bytes = psutil.Process().memory_info().rss

# Convert bytes -> mebibytes (1024 ** 2) for the report.
print(f"RAM used: {rss_bytes / 1024 ** 2:.2f} MB")

RAM used: 443.49 MB


In [9]:
# Report how large the dataset is on disk.
# dataset_size is a size in bytes (not a file count), so the raw value is
# labelled as bytes and then converted to gibibytes (1024 ** 3) for readability.
print(f"Dataset size in bytes: {pubmed_dataset.dataset_size}")
size_gb = pubmed_dataset.dataset_size / (1024**3)
print(f"Size of dataset on disk: {size_gb:.2f} GB")

Number of files in dataset: 20978892555
Size of dataset on disk: 19.54 GB


In [13]:
import timeit

# Statement to benchmark: walk the whole dataset in slices of 1,000 rows,
# discarding every batch as soon as it has been read.
code_snippet = """batch_size = 1000

for i in range(0, len(pubmed_dataset), batch_size):
    _ = pubmed_dataset[i:i+batch_size]
"""

# globals() makes pubmed_dataset visible to the timed statement.
# One pass (number=1) is enough because a full sweep already takes a while.
timer = timeit.Timer(stmt=code_snippet, globals=globals())
time = timer.timeit(number=1)

# Throughput = dataset size divided by wall-clock seconds for the full sweep.
print(
    f"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

Iterated over 15518009 examples (about 19.5 GB) in 71.3s, i.e. 0.274 GB/s


In [14]:
# Open the same remote file again, this time in streaming mode (streaming=True):
# examples are read lazily while iterating instead of being downloaded and
# prepared up front.
pubmed_dataset_streamed = load_dataset('json', data_files=data_files, split='train', streaming=True)

In [15]:
# Fetch the first example of the streamed dataset by creating an iterator over it
# and advancing it once.
next(iter(pubmed_dataset_streamed))

{'meta': {'pmid': 11409574, 'language': 'eng'},
 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age. Systematic review of the published literature. Out-patient clinics, emergency departments and hospitalisation wards in 23 health centres from 10 countries. Cohort studies reporting the frequency of hypoxaemia in children under 5 years of age with ALRI, and the association between hypoxaemia and the risk of dying. Prevalence of hypoxaemia measured in children with ARI and relative risks for the association between the severity of illness and the frequency of hypoxaemia, and between hypoxaemia and the risk of dying. Seventeen published studies were found that i

In [17]:
# Tokenise the streamed dataset on the fly.
# map() is applied example by example as the stream is iterated, so results come
# back one at a time rather than for the whole dataset at once.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def _tokenize_example(example):
    """Run the tokenizer on the 'text' field of a single example."""
    return tokenizer(example['text'])


tokenized_dataset = pubmed_dataset_streamed.map(_tokenize_example)

# Pull the first processed example to check the result.
next(iter(tokenized_dataset))

{'meta': {'pmid': 11409574, 'language': 'eng'},
 'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age. Systematic review of the published literature. Out-patient clinics, emergency departments and hospitalisation wards in 23 health centres from 10 countries. Cohort studies reporting the frequency of hypoxaemia in children under 5 years of age with ALRI, and the association between hypoxaemia and the risk of dying. Prevalence of hypoxaemia measured in children with ARI and relative risks for the association between the severity of illness and the frequency of hypoxaemia, and between hypoxaemia and the risk of dying. Seventeen published studies were found that i

In [18]:
# Shuffling a streamed dataset is approximate: shuffle() keeps a buffer of
# buffer_size examples (10,000 here) and draws each returned example at random
# from that buffer, refilling it from the stream as it goes.
# The amount of mixing is controlled by buffer_size (not by a batch size);
# seed=42 fixes the random order.
shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
next(iter(shuffled_dataset))

{'meta': {'pmid': 11410799, 'language': 'eng'},
 'text': 'Randomized study of dose or schedule modification of granulocyte colony-stimulating factor in platinum-based chemotherapy for elderly patients with lung cancer.\nIt is generally believed that elderly patients are less able to tolerate aggressive cancer chemotherapy than their younger counterparts. Bone marrow cellularity diminishes with age and elderly patients may have decreased tolerance to myelosuppressive agents. Between November 1995 and October 1999, 68 chemotherapy-naive elderly (70 or more years old) patients with histologically or cytologically proven lung cancer who were to receive platinum-based chemotherapy were enrolled in this study. All patients had adequate cardiac, hematological, liver and renal function to receive chemotherapy. Patients were randomized into 3 groups. Patients in groups 1 and 2 received 2 microg/kg and 4 microg/kg granulocyte colony-stimulating factor (G-CSF, lenograstim), respectively, when gra

In [19]:
# Select the first five examples of the streamed dataset with take(),
# then materialise them into a list so they are displayed.
first_five = pubmed_dataset_streamed.take(5)
list(first_five)

[{'meta': {'pmid': 11409574, 'language': 'eng'},
  'text': 'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age. Systematic review of the published literature. Out-patient clinics, emergency departments and hospitalisation wards in 23 health centres from 10 countries. Cohort studies reporting the frequency of hypoxaemia in children under 5 years of age with ALRI, and the association between hypoxaemia and the risk of dying. Prevalence of hypoxaemia measured in children with ARI and relative risks for the association between the severity of illness and the frequency of hypoxaemia, and between hypoxaemia and the risk of dying. Seventeen published studies were found that

In [None]:
# Split the shuffled stream at position 1,000.

# Validation set: the first 1,000 shuffled examples.
validation_dataset = shuffled_dataset.take(1000)

# Training set: everything after those first 1,000 examples.
train_dataset = shuffled_dataset.skip(1000)