# Data

> Create dataloader

In [1]:
#| default_exp data

In [2]:
#|export 
from fastcore.all import *
# Directory that holds the downloaded corpus. The path is relative, so it is resolved
# against the current working directory of whichever process runs this code.
path = Path('../static')
path.mkdir(exist_ok=True)  # create the directory if missing; no error when it already exists

In [3]:
#| hide
from nbdev.showdoc import *

## Text data
The text is taken from [Karpathy's nanogpt](https://github.com/karpathy/build-nanogpt).

In [4]:
#|export 
# Download the corpus only when it is not cached yet, so that importing the exported
# module (or re-running the notebook) does not hit the network every time.
# Delete `input.txt` to force a fresh download.
if not (path/'input.txt').exists():
    urlsave("https://raw.githubusercontent.com/karpathy/build-nanogpt/refs/heads/master/input.txt", path)
path/'input.txt'

Path('../static/input.txt')

### Tokenizer
Maps each character to a unique index. It has the following key attributes:
1. **vocab**: the sorted list of all characters present in the text
1. **encode**: encodes a given string into a list of tokens
1. **decode**: decodes a given list of tokens back into a string
1. **c2i** and **i2c**: helper functions that convert a char to a token and a token to a char, respectively

In [5]:
#|export
from typing import List
class Tokenizer:
    """
    Character-level tokenizer.

    The vocabulary is the sorted list of distinct characters found in `input.txt`;
    a character's token id is its position in that list.
    """
    def __init__(self):
        self.setup_vocab()

    @property
    def vocab(self) -> List[str]:
        """
        sequence of known characters; index in the sequence == token id
        """
        return self._vocab

    @vocab.setter
    def vocab(self, chars) -> None:
        # Rebuild the char -> id table every time the vocabulary is assigned, so that
        # `c2i` is a dict lookup instead of a linear `list.index` scan per character.
        self._vocab = chars
        lookup = {}
        for idx, ch in enumerate(chars):
            lookup.setdefault(ch, idx)  # first occurrence wins, same as list.index
        self._lookup = lookup

    def setup_vocab(self):
        """
        reads `input.txt` from `path`, stores the raw text in `self.txt` and
        builds `self.vocab` from its distinct characters
        """
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with open(path/'input.txt', 'r', encoding='utf-8') as file:
            self.txt = file.read()

        self.vocab = sorted(set(self.txt))
        print(f"After reading file got the vocab of shape : {len(self.vocab)}")

    def c2i(self, ch:str) -> int:
        """
        returns index of char ch from vocab

        raises ValueError if ch is not part of the vocab
        """
        try:
            return self._lookup[ch]
        except KeyError:
            # Keep the exception type that `list.index` raised before.
            raise ValueError(f"{ch!r} is not in the vocab") from None

    def i2c(self, idx:int) -> str:
        """
        returns char from vocab given index
        """
        return self.vocab[idx]

    def encode(self, inp:str) -> List[int]:
        """
        returns the list of token ids for the string, one id per character
        """
        return [self.c2i(ch) for ch in inp]

    def decode(self, inp:List[int]) -> str:
        """
        returns the string representation of the given list of token ids
        """
        return ''.join(self.i2c(idx) for idx in inp)

# Shared module-level instance; constructing it reads `input.txt` from `path` and prints the vocab size.
tokenizer = Tokenizer()

After reading file got the vocab of shape : 65


In [6]:
# Round-trip check: decoding the encoded string must reproduce the input exactly.
s = 'abc'
assert tokenizer.decode(tokenizer.encode(s))  == s

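The tokenizer is lossless: for text made of characters in the vocabulary, decoding the encoded tokens returns the original string.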

### DataLoader
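The Dataset should be made of **non-overlapping chunks**:
1. `__init__`: store the encoded text and config (`seq_len`, etc.)
1. `__len__`: return `len(encoded_text) // seq_len - 1` — integer-divide by `seq_len` to count the chunks (do not subtract `seq_len`), then drop one chunk so that the target of the last sample, which is shifted by one token, stays inside the text
1. `__getitem__`: use `idx * seq_len` as the starting position; the target is the same window shifted by one token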

In [7]:
#|export
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    """
    Non-overlapping, fixed-length chunks of encoded text for next-token prediction.

    Sample `i` is the pair `(inp, op)`: `inp` holds the tokens at positions
    `[i*seq_len, (i+1)*seq_len)` and `op` holds the same window shifted one
    token to the right.
    """
    def __init__(self, text, seq_len:int, tok=None):
        """
        text: raw string to encode
        seq_len: number of tokens per sample, must be >= 1
        tok: object with an `encode(str)` method returning a list of ints;
             defaults to the module-level `tokenizer`

        raises ValueError if seq_len < 1
        """
        if seq_len < 1:
            raise ValueError(f"seq_len must be a positive integer, got {seq_len}")
        self.text = text
        self.seq_len = seq_len
        self.encoded_text = (tokenizer if tok is None else tok).encode(text)

    def __len__(self):
        """
        returns the number of samples; never negative, even for very short text
        """
        # One chunk is dropped so that the shifted target of the last sample stays in
        # range; clamp at 0 because `len()` raises on a negative value.
        return max(len(self.encoded_text) // self.seq_len - 1, 0)

    def __getitem__(self, idx):
        """
        returns the `(inp, op)` tensor pair for sample `idx`; negative indices count
        from the end

        raises IndexError if idx is out of range
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        # Without this check an out-of-range index silently yields empty tensors and
        # plain iteration over the dataset (`for s in ds`) never stops.
        if not 0 <= idx < n_samples:
            raise IndexError(f"index out of range for GPTDataset with {n_samples} samples")

        start = idx * self.seq_len
        stop = start + self.seq_len
        inp = self.encoded_text[start:stop]
        op = self.encoded_text[start + 1:stop + 1]
        return torch.tensor(inp), torch.tensor(op)

In [8]:
#|export
def get_text_dl(bs:int=64, seq_len:int=128, train_frac:float=0.9):
    """
    Build the train / validation DataLoaders over the tokenizer's text.

    bs: batch size used by both loaders
    seq_len: number of tokens per sample
    train_frac: fraction of the text (by characters, taken from the start) used for
                training; the remainder is used for validation. Must be in (0, 1).

    returns a dict with keys 'train' (shuffled) and 'valid' (not shuffled)

    raises ValueError if train_frac is not strictly between 0 and 1
    """
    if not 0.0 < train_frac < 1.0:
        raise ValueError(f"train_frac must be strictly between 0 and 1, got {train_frac}")

    split_idx = int(len(tokenizer.txt) * train_frac)  # default: 9:1 train/validation split
    train_dataset = GPTDataset(tokenizer.txt[:split_idx], seq_len)
    val_dataset = GPTDataset(tokenizer.txt[split_idx:], seq_len)

    return {
        'train': DataLoader(train_dataset, batch_size=bs, shuffle=True),
        'valid': DataLoader(val_dataset, batch_size=bs, shuffle=False)
    }

In [9]:
dl = get_text_dl()
# Grab a single batch from the training loader.
x, y = next(iter(dl['train']))
assert x.shape == y.shape
# The target is the input shifted by one token: y[t] == x[t + 1] for every sample.
assert torch.equal(x[:, 1:], y[:, :-1])

In [12]:
# Peek at the first ten tokens of the first sample: the target row should be the input row shifted by one token.
x[0, :10], y[0,:10]

(tensor([ 0, 14, 63,  1, 57, 59, 41, 46,  1, 42]),
 tensor([14, 63,  1, 57, 59, 41, 46,  1, 42, 43]))

In [10]:
#| hide
# Run nbdev's export step so that cells marked `#|export` are written out to the library module.
import nbdev; nbdev.nbdev_export()