#### The Datasets library allows you to easily download and cache datasets by using ***load_dataset***

In [2]:
from datasets import load_dataset

# Download (or reuse the cached copy of) the "mrpc" configuration of the GLUE benchmark.
raw_datasets = load_dataset("glue", "mrpc")
# Printing the result lists every split with its features and row count.
print(raw_datasets)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


#### We can access any split of the dataset by its key, then any element by index. We can also get a slice of the dataset

In [3]:
# The "train" split on its own.
print(raw_datasets["train"])
# A single example, selected by integer index; it comes back as a dict of column -> value.
print(raw_datasets["train"][5])
# A slice of the first five examples; it comes back as a dict of column -> list of values.
print(raw_datasets["train"][:5])

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})
{'sentence1': 'Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .', 'sentence2': "With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .", 'label': 1, 'idx': 5}
{'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .", 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .', 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .', 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .'], 'sentence2': ['Referring to him as only " the witness 

#### The ***features*** attribute gives us more information about each column

In [4]:
# Display the feature type of every column in the train split (shown as the cell output).
raw_datasets["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

#### The ***map*** method applies a function to every split of a dataset; the keys of the dict it returns are added as new columns to each original split

In [5]:
from transformers import AutoTokenizer

# Pretrained checkpoint whose tokenizer is used for the sentence pairs.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` pair of one example.

    The pair is padded and truncated to a fixed length of 128 tokens.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(
        first_sentence,
        second_sentence,
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# No batched=True here, so tokenize_function is called once per example.
tokenized_datasets = raw_datasets.map(tokenize_function)
print(tokenized_datasets.column_names)

Map: 100%|██████████| 3668/3668 [00:00<00:00, 5640.86 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 5651.54 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4525.11 examples/s]

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}





#### You can preprocess faster with the option ***batched=True***: the applied function then receives multiple examples at each call, and processing several elements at the same time speeds up the whole operation

In [6]:
from transformers import AutoTokenizer

# Same checkpoint as in the per-example cell; the tokenizer is loaded again here.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(examples):
    """Tokenize a batch of ``sentence1``/``sentence2`` pairs in a single tokenizer call.

    Every pair is padded and truncated to a fixed length of 128 tokens.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(
        first_sentences,
        second_sentences,
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# batched=True hands several examples to tokenize_function at each call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets.column_names)

Map: 100%|██████████| 3668/3668 [00:00<00:00, 12125.89 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 12413.59 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 14246.45 examples/s]

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}





#### With just a few last tweaks, the dataset is then ready for training

In [7]:
# Drop the index and raw-text columns, keeping the label and the tokenizer outputs.
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
# Rename the "label" column to "labels".
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Display the resulting train split.
tokenized_datasets["train"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [8]:
# Keep only the first 100 rows of the tokenized train split.
small_train_dataset = tokenized_datasets["train"].select(range(100))
print(small_train_dataset)

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})


## 1.Load a custom dataset

#### Datasets provides several loading scripts to handle local and remote datasets
<div><img src="image/dataset1.png" width=800></div>

#### Load a local csv dataset

In [33]:
import shutil
from pathlib import Path
from urllib.request import urlopen

# Source of the white-wine quality CSV used by the local-file example.
WINE_CSV_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
wine_csv_path = Path("winequality-white.csv")

# Download only once; later runs reuse the local copy.
if not wine_csv_path.exists():
    # Write to a temporary name first so an interrupted download never leaves a
    # truncated "winequality-white.csv" that would pass the exists() check above.
    partial_path = wine_csv_path.with_name(wine_csv_path.name + ".part")
    with urlopen(WINE_CSV_URL) as response, partial_path.open("wb") as out_file:
        shutil.copyfileobj(response, out_file)
    partial_path.replace(wine_csv_path)

In [10]:
from datasets import load_dataset

# Load the local CSV file with the "csv" loader; the file uses ";" as its separator.
local_csv_dataset = load_dataset("csv", data_files="winequality-white.csv", sep=";")
# sep is an argument that is passed on to pandas.read_csv()
print(local_csv_dataset)
# Display the column types of the resulting "train" split.
local_csv_dataset["train"].features

DatasetDict({
    train: Dataset({
        features: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'],
        num_rows: 4898
    })
})


{'fixed acidity': Value(dtype='float64', id=None),
 'volatile acidity': Value(dtype='float64', id=None),
 'citric acid': Value(dtype='float64', id=None),
 'residual sugar': Value(dtype='float64', id=None),
 'chlorides': Value(dtype='float64', id=None),
 'free sulfur dioxide': Value(dtype='float64', id=None),
 'total sulfur dioxide': Value(dtype='float64', id=None),
 'density': Value(dtype='float64', id=None),
 'pH': Value(dtype='float64', id=None),
 'sulphates': Value(dtype='float64', id=None),
 'alcohol': Value(dtype='float64', id=None),
 'quality': Value(dtype='int64', id=None)}

#### Load remote csv dataset

In [11]:
from datasets import load_dataset

# Same CSV as the local example, but addressed by URL instead of a local path.
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
remote_csv_dataset = load_dataset("csv", data_files=dataset_url, sep=";")
# Display the loaded DatasetDict.
remote_csv_dataset

DatasetDict({
    train: Dataset({
        features: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'],
        num_rows: 4898
    })
})

#### Raw text files are read line by line to build the dataset
<div><img src="image/dataset2.png" width=500></div>

In [12]:
from datasets import load_dataset

# Remote plain-text file, loaded with the "text" loader.
dataset_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
text_dataset = load_dataset("text", data_files=dataset_url)
# First five rows: one row per line of the file (blank lines included), stored in a "text" column.
text_dataset["train"][:5]

{'text': ['First Citizen:',
  'Before we proceed any further, hear me speak.',
  '',
  'All:',
  'Speak, speak.']}

#### JSON files can be loaded in two main ways - line by line or by specifying a ***field*** in nested JSON

In [13]:
# JSON Lines file (.jsonl): loaded with the "json" loader and no extra arguments.
dataset_url = "https://raw.githubusercontent.com/hirupert/sede/main/data/sede/train.jsonl"
json_lines_dataset = load_dataset("json", data_files=dataset_url)
# Display the loaded DatasetDict.
json_lines_dataset

DatasetDict({
    train: Dataset({
        features: ['QuerySetId', 'Title', 'Description', 'QueryBody', 'CreationDate', 'validated'],
        num_rows: 10309
    })
})

In [14]:
# Nested JSON file: field="data" selects the top-level key whose content becomes the rows.
dataset_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
json_dataset = load_dataset("json", data_files=dataset_url, field="data")
# Display the loaded DatasetDict.
json_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
})

#### You can specify which splits to return with the ***data_files*** argument

In [15]:
# Common URL prefix of the two SQuAD v2.0 files.
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
# Map each split name to its file; the keys become the split names of the returned DatasetDict.
data_files = {"train": f"{url}train-v2.0.json", "validation": f"{url}dev-v2.0.json"}
json_dataset = load_dataset("json", data_files=data_files, field="data")
# Display the loaded DatasetDict.
json_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    validation: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 35
    })
})

## 2.Slice and dice a dataset
#### Dataset provides several methods to filter and transform a dataset; none of these methods operates in place on the current dataset - each one returns a new dataset
<div><img src="image/dataset3.png" width=600></div>

In [16]:
from datasets import load_dataset

# split="train" returns that single split as a Dataset rather than a DatasetDict.
squad = load_dataset("squad", split="train")
# Print the dataset summary, its first example, and its feature types.
print(squad, "\n", squad[0], "\n", squad.features)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
}) 
 {'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Be

#### You can shuffle the whole dataset with ***dataset.shuffle()***

In [17]:
# Shuffle the rows with a fixed seed, then look at the example that now comes first.
squad_shuffled = squad.shuffle(seed=666)
print(squad_shuffled[0])

{'id': '5727cc873acd2414000deca9', 'title': 'Oklahoma', 'context': 'Oklahoma is the 20th largest state in the United States, covering an area of 69,898 square miles (181,035 km2), with 68,667 square miles (177847 km2) of land and 1,281 square miles (3,188 km2) of water. It is one of six states on the Frontier Strip and lies partly in the Great Plains near the geographical center of the 48 contiguous states. It is bounded on the east by Arkansas and Missouri, on the north by Kansas, on the northwest by Colorado, on the far west by New Mexico, and on the south and near-west by Texas.', 'question': 'Where does Oklahoma rank by land area?', 'answers': {'text': ['20th'], 'answer_start': [16]}}


#### or create your own shuffled train and test splits with ***Dataset.train_test_split()***

In [18]:
from datasets import load_dataset

# Reload the SQuAD train split, then carve a 10% test split out of it.
squad = load_dataset("squad", split="train")
# A fixed seed makes the shuffled split reproducible across kernel restarts.
dataset = squad.train_test_split(test_size=0.1, seed=666)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 78839
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 8760
    })
})


#### You can return rows according to a list of indices using ***Dataset.select()***

In [19]:
from datasets import load_dataset

squad = load_dataset("squad", split="train")
# Row positions to keep.
indices = [0, 10, 20, 40, 80]
# select() returns a new dataset containing only those rows.
examples = squad.select(indices=indices)
print(examples)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5
})


In [20]:
# create a random sample by chaining with Dataset.shuffle()
from datasets import load_dataset

squad = load_dataset("squad", split="train")
# Shuffle with a fixed seed so the same five rows are sampled on every run,
# then keep the first five rows of the shuffled dataset.
sample = squad.shuffle(seed=666).select(range(5))
print(sample)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5
})


#### You can use ***Dataset.filter()*** to return rows that match a condition

In [21]:
from datasets import load_dataset

squad = load_dataset("squad", split="train")
# Keep only the rows whose title starts with "L".
squad_filtered = squad.filter(lambda x : x["title"].startswith("L"))

# Print the filtered dataset summary followed by its first example.
print(squad_filtered, squad_filtered[0])


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 2049
}) {'id': '56de0fef4396321400ee2583', 'title': 'Lighting', 'context': 'Lighting or illumination is the deliberate use of light to achieve a practical or aesthetic effect. Lighting includes the use of both artificial light sources like lamps and light fixtures, as well as natural illumination by capturing daylight. Daylighting (using windows, skylights, or light shelves) is sometimes used as the main source of light during daytime in buildings. This can save energy in place of using artificial lighting, which represents a major component of energy consumption in buildings. Proper lighting can enhance task performance, improve the appearance of an area, or have positive psychological effects on occupants.', 'question': 'What is used a main source of light for a building during the day?', 'answers': {'text': ['Daylighting'], 'answer_start': [245]}}


#### Use the ***rename_column()*** and ***remove_columns()*** methods to transform your columns

In [22]:
from datasets import load_dataset

squad = load_dataset("squad", split="train")
# Guard on the column's presence so re-running the cell after the rename does not fail.
if "context" in squad.features:
    squad = squad.rename_column("context", "passages")
print(squad)

Dataset({
    features: ['id', 'title', 'passages', 'question', 'answers'],
    num_rows: 87599
})


In [23]:
# Works on the `squad` left by the previous cell (its "context" column is already renamed).
# Guard on both columns so re-running the cell after the removal does not fail.
if "title" in squad.features and "id" in squad.features:
    squad = squad.remove_columns(["id", "title"])
print(squad)

Dataset({
    features: ['passages', 'question', 'answers'],
    num_rows: 87599
})


#### If your dataset has nested columns, flatten them with ***Dataset.flatten()***

In [24]:
from datasets import load_dataset

# Reload a fresh copy: earlier cells renamed and removed columns of `squad`.
squad = load_dataset("squad", split="train")
# "answers" is a nested feature holding "text" and "answer_start".
print(squad, "\n", squad.features["answers"])
# flatten() promotes the nested fields to top-level columns named "answers.text" and "answers.answer_start".
squad = squad.flatten()
print(squad)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
}) 
 Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)
Dataset({
    features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
    num_rows: 87599
})


#### The ***Dataset.map()*** method applies a processing function to each row; the function passed to ***Dataset.map()*** needs to return a dict
#### If a key of the returned dict is new to the original dataset, a new column is created; if it matches one of the original dataset's columns, that column is overwritten

In [25]:
from datasets import load_dataset

# Reload a fresh copy of the SQuAD train split (the previous cell flattened `squad`).
squad = load_dataset("squad", split="train")

def lowercase_title(example):
    """Return a one-key dict holding the lower-cased title under ``lower_case``."""
    title_lowered = example["title"].lower()
    return dict(lower_case=title_lowered)

# lowercase_title returns a key that is not an existing column, so map() adds a "lower_case" column.
squad_lower_case = squad.map(lowercase_title)
print(squad_lower_case)
# The original "title" column is left untouched (still mixed case).
print(squad_lower_case.shuffle(seed=666)["title"][:5])

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'lower_case'],
    num_rows: 87599
})
['Oklahoma', 'Sumer', 'Insect', 'Tuberculosis', 'Steven_Spielberg']


In [26]:
from datasets import load_dataset

# Reload the SQuAD train split for the column-overwrite example.
squad = load_dataset("squad", split="train")

def lowercase_title(example):
    """Return a one-key dict that maps ``title`` to the lower-cased title."""
    title_lowered = example["title"].lower()
    return dict(title=title_lowered)

# lowercase_title returns the existing "title" key, so map() overwrites that column instead of adding one.
squad_lower_case = squad.map(lowercase_title)
print(squad_lower_case)
# Same seed as in the previous cell: the same five titles come back, now lower-cased.
print(squad_lower_case.shuffle(seed=666)["title"][:5])

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})
['oklahoma', 'sumer', 'insect', 'tuberculosis', 'steven_spielberg']


#### The ***Dataset.map()*** method can work with batches. This is especially useful for tokenization, because tokenizers backed by the Tokenizers library can use fast multithreading to process batches in parallel

In [27]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_title(example):
    """Run the tokenizer on the ``title`` field and return its output unchanged."""
    titles = example["title"]
    return tokenizer(titles)


# batched=True with batch_size=500: map() hands tokenize_title up to 500 rows per call.
# NOTE(review): `squad` is whatever the previously executed cell left in the namespace.
squad = squad.map(tokenize_title, batched=True, batch_size=500)
print(squad)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask'],
    num_rows: 87599
})


## 3.Datasets + DataFrames

In [28]:
from datasets import load_dataset

# Load the "all" configuration of swiss_judgment_prediction (train split only);
# trust_remote_code=True opts in to running the dataset's own loading script.
dataset = load_dataset("swiss_judgment_prediction", "all", split="train", trust_remote_code=True)
print(dataset)
# In the default format a slice comes back as a dict of column -> list of values.
print(dataset[:5])

Dataset({
    features: ['id', 'year', 'text', 'label', 'language', 'region', 'canton', 'legal area', 'source_language'],
    num_rows: 59709
})
{'id': [2, 3, 4, 5, 6], 'year': [2000, 2000, 2000, 2000, 2000], 'text': ['A.- Der 1955 geborene V._ war seit 1. September 1986 hauptberuflich als technischer Kaufmann bei der Firma A._ AG tätig und im Rahmen einer Nebenbeschäftigung (Nachtarbeit) ab Mai 1990 bei einem Bewachungsdienst angestellt gewesen, als er am 10. Februar 1991 in Norwegen beim Hundeschlittenfahren eine Muskelruptur im Bereich des linken Oberschenkels erlitt. Die Verletzung wurde am 26. Februar 1991 mittels Muskelnaht operativ versorgt (Bericht des Dr. med. B._, Oberarzt, Chirurgische Klinik X._ vom 28. Februar 1991). Beweglichkeits- und Sensibilitätsausfälle führten zum Beizug des Dr. med. W._, Spezialarzt FMH Neurologie, welcher eine Ischiadicusparese links, wahrscheinlich traumatisch bedingt, diagnostizierte (Bericht vom 5. März 1991). Dr. med. S._, Spezialarzt für Chiru

#### The ***Dataset.with_format()*** or ***Dataset.set_format()*** can change the dataset format, ***Dataset.with_format()*** is not inplace and ***Dataset.set_format()*** is inplace 

In [59]:
# In-place change: from now on, indexing `dataset` returns pandas objects.
dataset.set_format("pandas")
# The object itself is still a Dataset; only what indexing returns has changed to a DataFrame.
print(dataset[:5], "\n", type(dataset), type(dataset[:]))

   id  year                                               text  label  \
0   2  2000  A.- Der 1955 geborene V._ war seit 1. Septembe...      0   
1   3  2000  Ansprüche nach OHG, hat sich ergeben: A.- X._ ...      1   
2   4  2000  Art. 4 aBV (Strafverfahren wegen falschen Zeug...      0   
3   5  2000  Art. 5 Ziff. 1 EMRK (Haftentlassung), hat sich...      1   
4   6  2000  Mietvertrag, hat sich ergeben: A.- Die CT Cond...      0   

  language                    region canton     legal area source_language  
0       de                    Zürich     zh  insurance law             n/a  
1       de       Central Switzerland     lu     public law             n/a  
2       de  Northwestern Switzerland     ag     public law             n/a  
3       de                       n/a    n/a     public law             n/a  
4       de                       n/a    n/a      civil law             n/a   
 <class 'datasets.arrow_dataset.Dataset'> <class 'pandas.core.frame.DataFrame'>


In [42]:
# Relies on the "pandas" format set in the previous cell: the full slice is a DataFrame.
df = dataset[:]
print(type(df))
# Display the first rows of the DataFrame.
df.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,id,year,text,label,language,region,canton,legal area,source_language
0,2,2000,A.- Der 1955 geborene V._ war seit 1. Septembe...,0,de,Zürich,zh,insurance law,
1,3,2000,"Ansprüche nach OHG, hat sich ergeben: A.- X._ ...",1,de,Central Switzerland,lu,public law,
2,4,2000,Art. 4 aBV (Strafverfahren wegen falschen Zeug...,0,de,Northwestern Switzerland,ag,public law,
3,5,2000,"Art. 5 Ziff. 1 EMRK (Haftentlassung), hat sich...",1,de,,,public law,
4,6,2000,"Mietvertrag, hat sich ergeben: A.- Die CT Cond...",0,de,,,civil law,


#### Under the hood, ***Dataset.set_format()*** changes what the ***__getitem__()*** method returns: instead of a dict, it now returns a DataFrame
<div><img src="image/dataset4.png" width=800></div>

#### Another way to create a DataFrame is with the ***Dataset.to_pandas()*** method

In [61]:
from datasets import load_dataset

# Reload the dataset, which also puts it back in the default (non-pandas) format.
dataset = load_dataset("swiss_judgment_prediction", "all", split="train", trust_remote_code=True)
# to_pandas() converts the whole dataset to a DataFrame without changing the dataset's format.
df = dataset.to_pandas()
# Display the first rows of the DataFrame.
df.head()

Unnamed: 0,id,year,text,label,language,region,canton,legal area,source_language
0,2,2000,A.- Der 1955 geborene V._ war seit 1. Septembe...,0,de,Zürich,zh,insurance law,
1,3,2000,"Ansprüche nach OHG, hat sich ergeben: A.- X._ ...",1,de,Central Switzerland,lu,public law,
2,4,2000,Art. 4 aBV (Strafverfahren wegen falschen Zeug...,0,de,Northwestern Switzerland,ag,public law,
3,5,2000,"Art. 5 Ziff. 1 EMRK (Haftentlassung), hat sich...",1,de,,,public law,
4,6,2000,"Mietvertrag, hat sich ergeben: A.- Die CT Cond...",0,de,,,civil law,


In [62]:
# How are languages distributed across regions?
# Count the rows for every (region, language) pair.
df.groupby("region")["language"].value_counts()

region                    language
Central Switzerland       de           4778
                          it              1
Eastern Switzerland       de           5650
                          it             57
Espace Mittelland         de           5150
                          fr           3104
                          it              3
Federation                de           1011
                          fr            227
                          it             70
Northwestern Switzerland  de           5654
                          fr              1
Région lémanique          fr          13100
                          de            336
Ticino                    it           2249
                          de              6
Zürich                    de           8785
                          fr              3
n/a                       fr           4744
                          de           4088
                          it            692
Name: count, dtype: int64

In [63]:
# Which legal area is most common?
# value_counts() lists the areas from most to least frequent.
df["legal area"].value_counts()

legal area
public law       15173
penal law        11795
civil law        11477
insurance law    11142
social law        9727
other              395
Name: count, dtype: int64

#### Remember to reset the format when you are finished by using the ***Dataset.reset_format()*** method to switch back to the Arrow format

In [64]:
# Switch to the pandas format: slices come back as DataFrames.
dataset.set_format("pandas")
print(dataset[:5], "\n", type(dataset[:]))

# Undo the format change: slices come back as plain dicts again.
dataset.reset_format()
print(dataset[:5], "\n", type(dataset[:]))

   id  year                                               text  label  \
0   2  2000  A.- Der 1955 geborene V._ war seit 1. Septembe...      0   
1   3  2000  Ansprüche nach OHG, hat sich ergeben: A.- X._ ...      1   
2   4  2000  Art. 4 aBV (Strafverfahren wegen falschen Zeug...      0   
3   5  2000  Art. 5 Ziff. 1 EMRK (Haftentlassung), hat sich...      1   
4   6  2000  Mietvertrag, hat sich ergeben: A.- Die CT Cond...      0   

  language                    region canton     legal area source_language  
0       de                    Zürich     zh  insurance law             n/a  
1       de       Central Switzerland     lu     public law             n/a  
2       de  Northwestern Switzerland     ag     public law             n/a  
3       de                       n/a    n/a     public law             n/a  
4       de                       n/a    n/a      civil law             n/a   
 <class 'pandas.core.frame.DataFrame'>
{'id': [2, 3, 4, 5, 6], 'year': [2000, 2000, 2000, 2000, 20

## 4.Memory mapping & streaming

#### Datasets use Arrow and Streaming to handle data at scale
#### Arrow is designed for high-performance data processing and represents each table-like dataset with column based format, which groups the elements in the table in consecutive blocks of RAM and unlocks fast access and processing
#### Streaming allows you to progressively download the raw data one element at a time, the result is a special object called IterableDataset
<div><img src="image/dataset5.png" width=800></div>

#### The IterableDataset can be obtained by setting ***streaming=True*** in the ***load_dataset()*** function. It is an iterable: it cannot be indexed, but it can be iterated with the iter() and next() functions

In [73]:
# streaming=True returns an IterableDataset that is read progressively.
# trust_remote_code=True matches the other loads of this dataset in the notebook.
dataset_streamed = load_dataset(
    "swiss_judgment_prediction", "all", split="train", streaming=True, trust_remote_code=True
)
print(type(dataset_streamed))
# An IterableDataset has no integer indexing; fetch the first example through an iterator.
print(next(iter(dataset_streamed)))

<class 'datasets.iterable_dataset.IterableDataset'>
{'id': 2, 'year': 2000, 'text': 'A.- Der 1955 geborene V._ war seit 1. September 1986 hauptberuflich als technischer Kaufmann bei der Firma A._ AG tätig und im Rahmen einer Nebenbeschäftigung (Nachtarbeit) ab Mai 1990 bei einem Bewachungsdienst angestellt gewesen, als er am 10. Februar 1991 in Norwegen beim Hundeschlittenfahren eine Muskelruptur im Bereich des linken Oberschenkels erlitt. Die Verletzung wurde am 26. Februar 1991 mittels Muskelnaht operativ versorgt (Bericht des Dr. med. B._, Oberarzt, Chirurgische Klinik X._ vom 28. Februar 1991). Beweglichkeits- und Sensibilitätsausfälle führten zum Beizug des Dr. med. W._, Spezialarzt FMH Neurologie, welcher eine Ischiadicusparese links, wahrscheinlich traumatisch bedingt, diagnostizierte (Bericht vom 5. März 1991). Dr. med. S._, Spezialarzt für Chirurgie FMH, Chefarzt Spital X._ (Bericht vom 28. Oktober 1992) bestätigte, dass es bei der Operation vom 10. Februar 1991 zu einer Druck

#### Because it cannot be indexed, ***Dataset.select()*** cannot be used with an IterableDataset; we use ***IterableDataset.take()*** and ***IterableDataset.skip()*** instead

In [78]:
# Skip the first 1000 examples and include the rest in the training set
train_dataset = dataset_streamed.skip(1000)
# Take the first 1000 examples for the validation set
validation_dataset = dataset_streamed.take(1000)

# Count by consuming each stream lazily; building a list would hold every example in memory at once.
train_count = sum(1 for _ in train_dataset)
validation_count = sum(1 for _ in validation_dataset)
print("training dataset length:", train_count, "\nvalidation dataset length:", validation_count)

training dataset length: 58709 
validation dataset length: 1000
