In [3]:
# Read the whole training corpus into memory as one string.
with open('data/input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print('length of the dataset in characters:', len(text))

length of the dataset in characters: 1115394


In [5]:
# Preview the start of the corpus (first 1000 characters).
print(text[0:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [15]:
# Character-level vocabulary: every distinct character in the text, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [23]:
# Lookup tables between characters and integer ids, built from the sorted
# vocabulary `chars`: stoi maps character -> id, itos maps id -> character.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string and return a list of integers, one per character.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[ch] for ch in s]


def decode(l):
    """Decoder: take an iterable of integers and return the corresponding string.

    Raises KeyError if an element of `l` is not a key of `itos`.
    """
    return ''.join(itos[ix] for ix in l)


# Round-trip sanity check: decoding the encoding should reproduce the input.
print(encode('hi there'))
print(decode(encode('hi there')))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


What is the difference between the two code snippets below?

```decode = lambda l: ''.join([itos[ix] for ix in l])```

```decode = lambda l: ''.join(itos[ix] for ix in l)```


Let me explain the difference between these two code snippets. 

While they achieve the same result, there's a subtle syntactic difference:

decode = lambda l: ''.join([itos[ix] for ix in l])

This version uses a list comprehension [itos[ix] for ix in l]
It creates a full list in memory first, then joins it
The square brackets [] indicate that a new list object is being created

decode = lambda l: ''.join(itos[ix] for ix in l)

This version uses a generator expression (itos[ix] for ix in l)
It generates values one at a time without creating a full list
No square brackets means it's a generator expression
The main practical differences are:
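Memory efficiency: In general, a generator expression avoids building an intermediate list because it produces each value on the fly. `str.join()` is a special case, though: CPython's `join` makes two passes over its input (one to compute the size of the result, one to copy the pieces), so it first converts any iterable that is not already a list or tuple into a list internally. The generator version (second one) therefore does not actually save memory here.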

Performance:
With `str.join()`, the list comprehension is usually slightly faster, for small and large sequences alike, because `join` otherwise has to build the list from the generator itself, which adds overhead.

In most cases the difference in performance is negligible, so choose whichever form reads better to you. Both will produce exactly the same output - they just differ in how they get there.

In [32]:
# Tokenize the entire dataset and store the resulting ids in a tensor.
import torch
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
print(data[0:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [35]:
# Train on the first 90% of the tokens; hold out the remaining 10% for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [37]:
# Time dimension of the input tensors fed into the transformer.
# A chunk of block_size + 1 tokens yields block_size (context, target) pairs,
# one for every context length from 1 up to block_size.
block_size = 8

x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context}, the target: {target}")

when input is tensor([18]), the target: 47
when input is tensor([18, 47]), the target: 56
when input is tensor([18, 47, 56]), the target: 57
when input is tensor([18, 47, 56, 57]), the target: 58
when input is tensor([18, 47, 56, 57, 58]), the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]), the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target: 58


In [43]:
# Batch dimension
# Several chunks of text are stacked into a single tensor (a mini-batch) so
# that they can be processed at the same time; GPUs are good at this kind of
# parallel processing.

batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for predictions
torch.manual_seed(1337)


def get_batch(split):
    """Sample a small random batch of inputs x and targets y.

    `split` selects the source tensor: 'train' uses `train_data`, any other
    value uses `val_data`. Returns a pair (x, y), each stacked from
    `batch_size` slices of length `block_size`; y is x shifted one position
    to the right within the source.
    """
    source = train_data if split == 'train' else val_data
    # Start offsets are drawn from [0, len(source) - block_size) so that both
    # the input slice and the one-step-shifted target slice are always full
    # block_size-long sequences.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = []
    targets = []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and print the inputs followed by the targets.
xb, yb = get_batch('train')
for label, batch in (('inputs: ', xb), ('targets: ', yb)):
    print(label)
    print(batch.shape)
    print(batch)

# Spell out every (context, target) pair contained in the batch.
print('-----')
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t + 1], yb[b, t]
        print("When input is ", context.tolist(), "the target: ", target.tolist())

inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
-----
When input is  [24] the target:  43
When input is  [24, 43] the target:  58
When input is  [24, 43, 58] the target:  5
When input is  [24, 43, 58, 5] the target:  57
When input is  [24, 43, 58, 5, 57] the target:  1
When input is  [24, 43, 58, 5, 57, 1] the target:  46
When input is  [24, 43, 58, 5, 57, 1, 46] the target:  43
When input is  [24, 43, 58, 5, 57, 1, 46, 43] the target:  39
When input is  [44] the target:  53
When input is  [44, 53] the target:  56
When input is  [44, 53, 56] the target:  1
When input is  [44, 53, 56, 1] the target:  58
When input is  [44, 53, 56, 1, 58] the targ

In [44]:
# Print the input batch `xb` once more.
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [45]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """Skeleton of a bigram language model; no layers or forward logic yet."""

    def __init__(self):
        # nn.Module has to be initialised before the instance can be called
        # or have parameters / submodules registered on it.
        super().__init__()

    def forward(self):
        """Placeholder forward pass: takes no inputs and returns None."""
        return None