In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-09-03 09:59:52--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-09-03 09:59:52 (19.7 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# Load the whole downloaded corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

In [4]:
# Report the corpus size in characters.
print(f"Length of dataset: {len(text)}")

Length of dataset: 1115394


In [5]:
# Preview the first 100 characters of the corpus.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [11]:
# Spot-check a single character by index. An empty subscript (`text[]`) is a
# SyntaxError; index 2 is the 'r' of the opening word "First".
text[2]

'r'

In [10]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
# sorted() accepts any iterable and always returns a new list.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Length of set: {vocab_size}")

Length of set: 65


In [8]:
# Peek at the first ten vocabulary entries.
chars[:10]

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']

In [13]:
# Two lookup tables over the vocabulary. A character's id is its position in `chars`.
#   stoi: character -> integer id
#   itos: integer id -> character
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))

def encode(s):
  """Translate a string into a list of integer ids, one per character.

  Each character is looked up in the module-level `stoi` table; a character
  that is missing from the table raises KeyError.
  """
  token_ids = []
  for ch in s:
    token_ids.append(stoi[ch])
  return token_ids

def decode(ids_ls):
  """Translate a sequence of integer ids back into a list of characters.

  Each id is looked up in the module-level `itos` table; an id that is missing
  from the table raises KeyError. The result is a list of single characters,
  not a joined string, so callers join it themselves when they need text.
  """
  symbols = []
  for token_id in ids_ls:
    symbols.append(itos[token_id])
  return symbols

In [17]:
# Round-trip a sample string through encode/decode as a sanity check.
sample_string = "Hello I'm Tai"

# Encode once and reuse the ids for both the printout and the decode step, so
# the values shown are exactly the values that get decoded.
encode_ls = encode(sample_string)

print(f"Encode values: {encode_ls}")
# decode() returns a list of characters, so join them back into a string.
print(f"Decode values: {''.join(decode(encode_ls))}")

Encode values: [20, 43, 50, 50, 53, 1, 21, 5, 51, 1, 32, 39, 47]
Decode values: Hello I'm Tai


In [18]:
import torch

In [19]:
# Encode the entire corpus and hold it as one tensor of 32-bit integer ids
# (torch.int is an alias of torch.int32).
# NOTE(review): index-based losses such as torch.nn.functional.cross_entropy
# expect int64 (torch.long) class-index targets. Confirm whether this tensor
# will feed one, and switch the dtype if so.
data = torch.tensor(encode(text), dtype=torch.int32)

In [20]:
# Check the shape of the encoded corpus tensor.
data.shape

torch.Size([1115394])

## Divide dataset into train and validation

In [25]:
# Hold out the last 10% of the encoded corpus for validation; the first 90% is
# the training split. The split point is derived from `data` itself (the tensor
# being sliced) rather than from the raw text, so it stays correct even if the
# encoding stops producing exactly one id per character.
total_samples = len(data)
t_n = int(0.9 * total_samples)
train_data, val_data = data[:t_n], data[t_n:]

In [27]:
# Generate training dataset
block_size = 8
train_data[:block_size]

tensor([18, 47, 56, 57, 58,  1, 15, 47], dtype=torch.int32)

In [29]:
# Show the training examples packed into a single window. `y` is `x` shifted
# one position ahead, so for every prefix x[:t+1] the label is y[t].
# Slice from train_data (not the full `data` tensor) so this walkthrough never
# draws on the held-out validation tail.
x = train_data[:block_size]
y = train_data[1:block_size+1]
print(f"X: {x}")
print(f"Y: {y}")
for t in range(block_size):
  print(f"Current context: {x[:t+1]}")
  print(f"Label: {y[t]}")


X: tensor([18, 47, 56, 57, 58,  1, 15, 47], dtype=torch.int32)
Y: tensor([47, 56, 57, 58,  1, 15, 47, 58], dtype=torch.int32)
Current context: tensor([18], dtype=torch.int32)
Label: 47
Current context: tensor([18, 47], dtype=torch.int32)
Label: 56
Current context: tensor([18, 47, 56], dtype=torch.int32)
Label: 57
Current context: tensor([18, 47, 56, 57], dtype=torch.int32)
Label: 58
Current context: tensor([18, 47, 56, 57, 58], dtype=torch.int32)
Label: 1
Current context: tensor([18, 47, 56, 57, 58,  1], dtype=torch.int32)
Label: 15
Current context: tensor([18, 47, 56, 57, 58,  1, 15], dtype=torch.int32)
Label: 47
Current context: tensor([18, 47, 56, 57, 58,  1, 15, 47], dtype=torch.int32)
Label: 58


In [31]:
# Fix the global RNG seed so the random draw below is reproducible.
torch.manual_seed(1337)

batch_size, block_size = 4, 8

# Scratch check of torch.randint: ten integers drawn uniformly from [0, 10 - 2).
torch.randint(0, 10 - 2, (10,))

tensor([7, 5, 4, 0, 7, 1, 7, 2, 2, 2])