In [None]:
!pip install datasets transformers

In [35]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably needed because some datasets loaded below are gated — confirm.
from huggingface_hub import notebook_login
notebook_login()

## Loading a dataset in streaming mode


In [38]:
from datasets import load_dataset

# Open the 'en' configuration of oscar-corpus/OSCAR-2201 as a stream
# (streaming=True) instead of materialising the train split locally.
dataset = load_dataset(
    'oscar-corpus/OSCAR-2201',
    'en',
    split='train',
    streaming=True,
)

In [17]:
# Pull a single example off the stream and print it to inspect its fields.
example_stream = iter(dataset)
first_example = next(example_stream)
print(first_example)

{'id': 0, 'text': 'Founded in 2015, Golden Bees is a leading programmatic recruitment platform dedicated to employers, HR agencies and job boards. The company has developed unique HR-custom technologies and predictive algorithms to identify and attract the best candidates for a job opportunity.', 'meta': {'warc_headers': {'warc-record-id': '<urn:uuid:9951a661-03f4-4256-9c82-98a4e8c2eddd>', 'warc-date': '2021-12-03T19:34:50Z', 'content-type': 'text/plain', 'content-length': 276, 'warc-type': 'conversion', 'warc-identified-content-language': 'eng', 'warc-refers-to': '<urn:uuid:fc4d8c99-8e0e-4870-91d8-4990ca12acb4>', 'warc-target-uri': 'http://www.ecap-partner.com/transactions/golden-bees-completed-a-series-a-with-entrepreneur-venture/', 'warc-block-digest': 'sha1:G2ZQT6JSEY2R6EIDBUL55FZIXEWKC4J7'}, 'identification': {'label': 'en', 'prob': 0.9512954}, 'annotations': ['tiny'], 'line_identifications': [{'label': 'en', 'prob': 0.9512954}]}}


## Shuffle and Split


In [None]:
# Shuffle the stream through a 10,000-example buffer with a fixed seed so the
# order is reproducible.
shuffled_dataset = dataset.shuffle(seed=42, buffer_size=10_000)

# Display the first example of the shuffled stream as the cell output.
shuffled_stream = iter(shuffled_dataset)
next(shuffled_stream)

In [42]:
# Split the shuffled stream: the first 100 examples are held out for validation,
# everything after them is used for training.
# Skip the first 100 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(100)
# Take the first 100 examples for the validation set
validation_dataset = shuffled_dataset.take(100)

In [43]:
# Materialise the validation stream to confirm how many examples it yields.
validation_examples = list(validation_dataset)
print(len(validation_examples))

100


## Interleave multiple datasets

In [46]:
# Interleaving: mix three small in-memory datasets by sampling from them with
# fixed probabilities.
from datasets import Dataset, interleave_datasets

d1 = Dataset.from_dict({"a": [0, 1, 2]})
d2 = Dataset.from_dict({"a": [10, 11, 12, 13]})
d3 = Dataset.from_dict({"a": [20, 21, 22]})

seed = 42
probabilities = [0.3, 0.5, 0.2]  # sampling weight for d1, d2, d3 respectively

# With "all_exhausted" the result keeps drawing until every source has been
# fully consumed, so values from shorter sources can appear more than once.
dataset = interleave_datasets(
    [d1, d2, d3],
    probabilities=probabilities,
    seed=seed,
    stopping_strategy="all_exhausted",
)
dataset["a"]

[10, 11, 20, 12, 0, 21, 13, 10, 1, 11, 12, 22, 13, 20, 10, 2]

In [None]:
from datasets import interleave_datasets

# Stream the English and French OSCAR configurations.
en_dataset = load_dataset(
    'oscar', "unshuffled_deduplicated_en", split='train', streaming=True
)
fr_dataset = load_dataset(
    'oscar', "unshuffled_deduplicated_fr", split='train', streaming=True
)

# Sample from both languages with equal probability and a fixed seed.
multilingual_dataset_with_oversampling = interleave_datasets(
    [en_dataset, fr_dataset],
    probabilities=[0.5, 0.5],
    seed=42,
)

# Show the first two mixed examples as the cell output.
first_two_examples = multilingual_dataset_with_oversampling.take(2)
list(first_two_examples)

## Rename, cast and remove columns

In [60]:
from datasets import load_dataset

# Stream the English mC4 train split and show its original schema.
dataset = load_dataset('mc4', 'en', split='train', streaming=True)
print(dataset.features)

# Rename "text" to "content" and drop "timestamp", then show the new schema.
dataset = dataset.rename_column("text", "content").remove_columns('timestamp')
print(dataset.features)

{'text': Value(dtype='string', id=None), 'timestamp': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None)}
{'content': Value(dtype='string', id=None), 'url': Value(dtype='string', id=None)}


In [61]:
from datasets import ClassLabel, Value, load_dataset

# Load the (non-streaming) GLUE MRPC train split and show its schema.
dataset = load_dataset('glue', 'mrpc', split='train')
print(dataset.features)

# Build the target schema from a copy of the current one: widen "idx" to int64
# and relabel the two classes as 'negative' / 'positive'.
new_features = dataset.features.copy()
new_features["idx"] = Value('int64')
new_features["label"] = ClassLabel(names=['negative', 'positive'])

# Apply the new schema and show the result.
dataset = dataset.cast(new_features)
print(dataset.features)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}


Casting the dataset:   0%|          | 0/3668 [00:00<?, ? examples/s]

{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive'], id=None), 'idx': Value(dtype='int64', id=None)}


## Training with a streaming dataset

In [72]:
from transformers import AutoTokenizer

seed, buffer_size = 42, 10_000

# Create the tokenizer in this cell so `encode` does not rely on a global that
# is only defined by a later cell; the pipeline can then be iterated right away.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

dataset = load_dataset("mc4", "en", streaming=True, split="train")


def encode(examples):
    """Tokenize a batch of examples for the model.

    Args:
        examples: A batch mapping that contains a 'text' column.

    Returns:
        The tokenizer output for the batch, truncated and padded to the
        tokenizer's maximum length.
    """
    return tokenizer(examples['text'], truncation=True, padding='max_length')


# Tokenize in batches and drop the raw columns so only tokenizer outputs remain.
dataset = dataset.map(encode, batched=True, remove_columns=["text", "timestamp", "url"])
# Shuffle the tokenized stream through a buffer with a fixed seed.
dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)

In [None]:
from itertools import islice

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling, AutoTokenizer
from tqdm import tqdm

# Demo-sized training budget; raise these for a real run.
NUM_EPOCHS = 3
STEPS_PER_EPOCH = 5
LOG_EVERY = 10  # with 5 steps per epoch only step 0 of each epoch is logged

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Yield torch tensors from the stream and batch them with the language-modeling collator.
dataset = dataset.with_format("torch")
dataloader = DataLoader(dataset, collate_fn=DataCollatorForLanguageModeling(tokenizer), batch_size=16)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")
model.train().to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

for epoch in range(NUM_EPOCHS):
    # Tell the streaming dataset which epoch this is.
    # NOTE(review): per the datasets streaming guide this varies the shuffle order per epoch — confirm.
    dataset.set_epoch(epoch)
    # islice stops after STEPS_PER_EPOCH batches, so no extra batch is pulled
    # from the stream (and tokenized) just to be thrown away.
    epoch_batches = islice(dataloader, STEPS_PER_EPOCH)
    for i, batch in enumerate(tqdm(epoch_batches, total=STEPS_PER_EPOCH)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % LOG_EVERY == 0:
            # .item() prints the scalar value instead of the tensor repr.
            print(f"loss: {loss.item()}")