<a href="https://colab.research.google.com/github/vkjadon/llm/blob/main/04DataLoader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from torch.utils.data import Dataset, DataLoader


In [1]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer=str.split):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        return tokens, self.labels[idx]


texts = ["I love PyTorch", "This is bad"]
labels = [1, 0]

ds = TextDataset(texts, labels)
loader = DataLoader(ds, batch_size=2, shuffle=True)

for batch in loader:
    print(batch)


[[('I', 'This'), ('love', 'is'), ('PyTorch', 'bad')], tensor([1, 0])]


In [2]:
sentences = [
    "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.",
    "Fame's a fickle friend, Harry.",
    "It is our choices, Harry, that show what we truly are, far more than our abilities.",
    "Soon we must all face the choice between what is right and what is easy.",
    "Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.",
    "You are awesome!"
]

# Define a custom dataset
class CustomDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]

# Create an instance of your custom dataset
custom_dataset = CustomDataset(sentences)

# Define batch size
batch_size = 2

# Create a DataLoader
dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

# Iterate through the DataLoader
for batch in dataloader:
    print(batch)

["If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.", "Fame's a fickle friend, Harry."]
['Soon we must all face the choice between what is right and what is easy.', 'It is our choices, Harry, that show what we truly are, far more than our abilities.']
['You are awesome!', 'Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.']


In [4]:
import torch
from torch.utils.data import Dataset
from torchtext.transforms import BasicEnglishNormalize
from torchtext.vocab import build_vocab_from_iterator

sentences = [
    "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.",
    "Fame's a fickle friend, Harry.",
    "It is our choices, Harry, that show what we truly are, far more than our abilities.",
    "Soon we must all face the choice between what is right and what is easy.",
    "Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.",
    "You are awesome!"
]

# ---- FIX 1: Modern tokenizer ----
tokenizer = BasicEnglishNormalize()

# ---- FIX 2: Build vocabulary ----
def yield_tokens(data):
    for text in data:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(sentences), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

# ---- FIX 3: Dataset using new tokenizer ----
class CustomDataset(Dataset):
    def __init__(self, sentences, tokenizer, vocab):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.vocab = vocab

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.sentences[idx])
        tensor_indices = torch.tensor([self.vocab[token] for token in tokens])
        return tensor_indices

# Create dataset
custom_dataset = CustomDataset(sentences, tokenizer, vocab)

print("Custom Dataset Length:", len(custom_dataset))

# Display items
for i in range(len(custom_dataset)):
    print(f"Item {i+1}: {custom_dataset[i]}")


ModuleNotFoundError: No module named 'torchtext'