# Creating a Deep Lake dataset

In [1]:
from dotenv import load_dotenv
# NOTE(review): presumably populates the environment (e.g. ACTIVELOOP_TOKEN, which the
# next cell requires) from a local .env file — confirm
load_dotenv()

True

In [2]:
import deeplake

# The ACTIVELOOP_TOKEN environment variable must be set to your API token
# before this cell runs.

# create the dataset on Deep Lake
username = "taahasbajwa"
dataset_name = "test_dataset"
ds = deeplake.dataset(f"hub://{username}/{dataset_name}")

# sample texts to store: "text 1" ... "text 10"
texts = [f"text {i}" for i in range(1, 11)]

# Group all writes in a single `with ds:` context so that they are flushed
# together when the block exits, instead of after every individual update.
with ds:
    # create the `text` column
    ds.create_tensor('text', htype="text")

    # add one sample per text
    for text in texts:
        ds.append({"text": text})

\

Your Deep Lake dataset has been successfully created!



/

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/taahasbajwa/test_dataset



-

hub://taahasbajwa/test_dataset loaded successfully.



 

In [3]:
# commit the appended samples with the message "added texts";
# the returned commit id is displayed as the cell output
ds.commit("added texts")

 

'firstdbf9474d461a19e9333c2fd19b46115348f'

# Retrieving data from Deep Lake

### Deep Lake Data Loader for PyTorch

In [6]:
# Build the Deep Lake data loader: batches of `batch_size`, shuffled,
# and exposed through the PyTorch interface
batch_size = 3
train_loader = (
    ds.dataloader()
    .batch(batch_size)
    .shuffle()
    .pytorch()
)

# Walk through every batch and print each text sample it holds
for batch_no, batch in enumerate(train_loader):
    print(f"Batch {batch_no}")
    for sample_no, sample in enumerate(batch.get("text")):
        print(f"Sample {sample_no}: {sample}")
    print()

Please wait, filling up the shuffle buffer with samples.:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

2023-10-21 13:32:55,330 | INFO | SingleProcessIterator initialized 6413



Please wait, filling up the shuffle buffer with samples.:   0%|          | 101/2.00G [00:00<2288:24:27, 261B/s]   

Shuffle buffer filling is complete.
Batch 0
Sample 0: text 5
Sample 1: text 9
Sample 2: text 3

Batch 1
Sample 0: text 7
Sample 1: text 6
Sample 2: text 8

Batch 2
Sample 0: text 4
Sample 1: text 10
Sample 2: text 1

Batch 3
Sample 0: text 2






### PyTorch Datasets and PyTorch Data Loaders using Deep Lake

In [7]:
from torch.utils.data import DataLoader, Dataset

class DeepLakePyTorchDataset(Dataset):
    """PyTorch ``Dataset`` adapter that serves the ``text`` column of a wrapped dataset."""

    def __init__(self, ds):
        """Keep a reference to the wrapped dataset ``ds``."""
        self.ds = ds

    def __len__(self):
        """Return the length reported by the wrapped dataset."""
        sample_count = len(self.ds)
        return sample_count

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with a single ``"text"`` entry."""
        entry = self.ds.text[idx]
        # read the stored text value and cast it to a string type
        value = entry.text().astype(str)
        return dict(text=value)

In [8]:
# Wrap the Deep Lake dataset in the PyTorch Dataset adapter
ds_pt = DeepLakePyTorchDataset(ds)

# Plain PyTorch loader over the adapter: shuffled batches of three
dataloader_pytorch = DataLoader(
    ds_pt,
    batch_size=3,
    shuffle=True,
)

# Walk through every batch and print each text sample it holds
for batch_no, batch in enumerate(dataloader_pytorch):
    print(f"Batch {batch_no}")
    for sample_no, sample in enumerate(batch.get("text")):
        print(f"Sample {sample_no}: {sample}")
    print()

Batch 0
Sample 0: text 10
Sample 1: text 7
Sample 2: text 8

Batch 1
Sample 0: text 3
Sample 1: text 1
Sample 2: text 5

Batch 2
Sample 0: text 6
Sample 1: text 2
Sample 2: text 4

Batch 3
Sample 0: text 9



# Filtering and querying datasets (to get high-quality data)

In [10]:
# Keep only the samples whose text contains the character '1'
ds_view = ds.query("select * where contains(text, '1')")

# Build a Deep Lake data loader over the filtered view: batches of
# `batch_size`, shuffled, and exposed through the PyTorch interface
batch_size = 3
train_loader = (
    ds_view.dataloader()
    .batch(batch_size)
    .shuffle()
    .pytorch()
)

# Walk through every batch and print each text sample it holds
for batch_no, batch in enumerate(train_loader):
    print(f"Batch {batch_no}")
    for sample_no, sample in enumerate(batch.get("text")):
        print(f"Sample {sample_no}: {sample}")
    print()

Please wait, filling up the shuffle buffer with samples.:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

2023-10-21 13:43:12,555 | INFO | SingleProcessIterator initialized 6413

2023-10-21 13:43:12,555 | INFO | SingleProcessIterator initialized 6413



Please wait, filling up the shuffle buffer with samples.:   0%|          | 21.0/2.00G [00:00<60:29:06, 9.86kB/s]

Shuffle buffer filling is complete.
Batch 0
Sample 0: text 1
Sample 1: text 10






In [11]:
# save the filtered view under the id "strings_with_1";
# the path of the saved view is displayed as the cell output
ds_view.save_view(id="strings_with_1")

100%|██████████| 2/2 [00:00<00:00,  7.43it/s]
 

'hub://taahasbajwa/test_dataset/.queries/strings_with_1'

In [13]:
# load the saved view through the path that save_view returned
# NOTE(review): this rebinds `ds`, which until now referred to the full dataset —
# confirm that later cells do not still expect the original dataset under this name
ds = deeplake.dataset("hub://taahasbajwa/test_dataset/.queries/strings_with_1")

 

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/taahasbajwa/test_dataset?view=strings_with_1



-

hub://taahasbajwa/test_dataset/.queries/strings_with_1 loaded successfully.



 