In [188]:
#my imports
import pandas as pd
#requests is a popular python libarary for making HTTP requests, such as downloading from URLs
import requests
import urllib.request
import sentencepiece as spm
import tiktoken 
import torch 
import torch.nn as nn
from torch.nn import functional as F

## My Variables 
- vocab size=size of unique characters
- characters=the sorted list of unique characters
- train_data=data used to train the model
- test_data=data used to test the model
- context_length=# of characters the model references to make a prediction


In [189]:
## hyper-parameters 
characters=sorted(list(set(text)))
size_vocab=len(characters)
batch_size=35
sequence_length=8
num_epochs=5000
interval_eval=500
learning_rate=1e-2
device='cuda' if torch.cuda.is_available() else 'cpu' 
iters_eval=200
dim_embd=32
vocab_size=32



## General Notes
- tensor: a gernalization of scalers, vectors, matrices to a higher dimensional space/multi-dimensional arrays/primary data structure used to store and manipulate date 

In [190]:

#got the url by open the raw version of the file in github 
url = "https://raw.githubusercontent.com/yasminho/Shakespeare-gpt/main/input.txt?token=GHSAT0AAAAAACDEOEB7HCMDF43ROSZND5FAZF6U3OA"
try:
    # Download the dataset using urllib
    #downloads the contents from url and saves it as input.txt 
    urllib.request.urlretrieve(url, 'input.txt')
    print("Dataset downloaded successfully.")
#if there is an exception, the code will enter the exception block 
except Exception as e:
    print(f"the dataset did not download: {e}")

#will enter the exception block and print the exception 

Dataset downloaded successfully.


In [191]:
#reading in the dataset 
#'r' means read 
#common encoding for text files 
with open('input.txt', 'r', encoding='utf-8') as f:
    text=f.read()

print(len(text))


1115394


In [192]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [193]:
#list of the set of characters in the text 
characters=sorted(list(set(text)))
#number of unique characters in the text 
size_vocab=len(characters)

print(size_vocab)

65


## Tokenizing

In [194]:
#creating a mappings
#dictionary mapping characters to corresponding integers using enumerate
#enumerate returns an iterator that produces tuples containing index and the element from input sequence 
chars_int_mapping={char:int for int, char in enumerate(characters)}
#dictionary mpping integers to corresponding characters 
int_chars_mapping={int: char for int, char in enumerate(characters)}
#takes in a string and encodes it into a list of integers using our mapping/converts each character in string to corresponding integer value and returns list of integers
encoder=lambda str:[chars_int_mapping[char] for char in str]
#takes in a list of integers and decoes it to a string using our int_chars_mapping
decoder=lambda int_list: ''.join([int_chars_mapping[i] for i in int_list])

#when we use this encoder, we will get a list of numbers between 0-65 because that is the size of vocabulary 
#notice that we get a token for each character
encoder("Hi Yas")



[20, 47, 1, 37, 39, 57]

In [195]:
#tokenizing with tiktoken 
tik_token_encoder=tiktoken.get_encoding('gpt2')
#instead of 65 tokens (our original vocabulary size), it has 50257 tokens 
#when we use this encoder 
tik_token_encoder.n_vocab

#when we use tik_token_encoder, we get a number that anywhere from 0-50257
#notice that we get a token for each word 
tik_token_encoder.encode("Hi")

[17250]

# splitting data
- splitting into train and test set
- 80% will train and 20% will test 

In [196]:
#look into nested-cross-validation
encoded_data=torch.tensor(encoder(text), dtype=torch.long)
train_length=int(len(encoded_data)*0.90)
train_data=encoded_data[:train_length]
test_data=encoded_data[train_length:]


# Training

In [197]:
#working with multiple chunks/batchs together
#working with batch dimension 

#random number generator/changes this 
torch.manual_seed(1337) 


def batch(input):
    if input=='train':
        data=train_data
    else: 
        data=test_data
    index=torch.randint(len(data)-sequence_length, (batch_size,)) #grabbing head_size number of random offsets/index is going to be #defined by head_size randomly generated between the length of data and our contextual_length
    contextual=torch.stack([data[i:i+sequence_length] for i in index]) #all become a row in a 4 by 8 tensor 
    prediction=torch.stack([data[i+1:i+sequence_length+1] for i in index])
    contextual, prediction=contextual.to(device), prediction.to(device)
    return contextual, prediction

train_context, train_labels=batch('train')
print('inputs:')
print(train_context.shape)
print(train_context)
print('prediction:')
print(train_labels.shape)
print(train_labels)

print('----')

for b in range(batch_size): #number of heads
    for c in range(contextual_length): 
        context=train_context[b, :c+1]
        prediction=train_labels[b, c]
        print(f"when input is {context.tolist()} the prediction: {prediction}") 


#we are getting 4 rows (represent the heads) 
#we get 8 columns (represent the contextual part of it)


    
    

inputs:
torch.Size([35, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54],
        [57, 43, 60, 43, 52,  1, 63, 43],
        [60, 43, 42,  8,  0, 25, 63,  1],
        [56, 42,  5, 57,  1, 57, 39, 49],
        [43, 57, 58, 63,  6,  1, 58, 46],
        [43,  1, 51, 39, 63,  1, 40, 43],
        [58, 46, 43,  1, 43, 39, 56, 57],
        [39, 58, 47, 53, 52, 12,  1, 37],
        [53, 56, 43,  1, 21,  1, 41, 39],
        [50, 39, 52, 63,  1, 47, 58, 57],
        [56, 53, 63,  1, 42, 47, 42,  1],
        [39, 51,  1, 39, 44, 56, 39, 47],
        [17, 24, 21, 38, 13, 14, 17, 32],
        [ 1, 39, 52, 42,  1, 45, 43, 50],
        [ 1, 58, 46, 39, 58,  1, 42, 53],
        [ 1, 61, 53, 59, 50, 42,  1, 21],
        [59, 57, 40, 39, 52, 42,  1, 40],
        [52, 42,  8,  0,  0, 23, 21, 26],
        [45, 53, 42, 57,  0, 23, 43, 43],
        [52,  1, 61, 39, 57,  1, 51, 53],
      

# bigram language model 
- looks at a pair of words instead of just one just one word at a time
- looks to see how often each pair of words appear together
- this helps figure out the likelihood of certain words following each other
- a simple way to understand and predict the next word in a sentence
- another possible model to look into are n-grams
- this model helps in various autocorrect, text generation, and speech recognition models
- nn module is deep learning framework/providdes a collection of classes and functions that make it easier to define and train neural networks

### Constructor
- embeddings are a way to represent categorical varialbes as vectors 
- nn.embedding= specificies the number of unique categories (the size of the vocab in NLP) and desired size of the embedding vectors (a hyperparameter that determines the length of the embedding vector/eg if you have a vocab size of 10000 and embedding dimension of 300, this means that each word in the vocab will be represented as a 300-dim vector/smaller values are used when computational resources are limited and larer values might be preferred for more complex tasks):
     - input: an index of sequence of indices that represent the category you want to convert into an embedding (these indices could be word IDs where each word in the vocab is mapped to a unique integer)
     - output: the dense representation (embedding vector) of the input index. The output is a tensor with shape('batch size, embedding_dim)
 
### Forward Method 
- self.token_embedding_table(index): an embedding lookup for each index in the index tensor occurs. It retrieves the embedding from lookup table/return value is a tensor where each word in index has been converted into a desne vector representation
- if predictions is None: this will check whether we are training the model or not/if we are training then no loss calculation is required as the model is generating predictions
- if predictions is provided, this means that the model is being used for training and we calculate the loss using the cross-entropy function
- B, T, C:
    - B: Batch size
    - T: sequence length
    - C: embedding Dimension
- our_pred=our_pred.view(B*T, C):
    - breaks down our_pred into a 2-dimensional tensor 
    - this is done in preparation for the corss-entropy loss function
- prediction.view(B*T):
    - the prediction tensor is re-shaped into a 1 dimensional tensor
    - this is essnetial for matching the shapes of our_pred and prediction for entropy loss function
    - it expects the total number of elements in the batch 
- F.cross_entropy(our_pred, predictions)
    - the function is provided by PyTorch's functional interface
    - this function is particularly used in classification tasks
    - calculates the difference between predicted probability distribution and true probability distribution of the target lables
    - the goal of the cross-entropy loss is to miniize the dissimilarity between these distributions, and improve model accuracy
    - if the model's predicted probability for the true class is close to 1 (high confidence), the cross entropy loss will be close to zero
    - inputs:
         - Input tensor: representes the predicted raw scores from the model. It should be a 2-dimesnional tensor with shape (B, C), where    B=batch size and C is the number of classes/vocab size
         - Target tensor: represente the true class labels for each sample in the batch. It should be 1-dimensional tensor with shape (B), where B is the batch size/total number of elements in the batch=
    - outputs: a scaler value that represents the average loss across the batch
 
This may look like: 
- input_tensor=torch.tensor([[0.2, 0.5, 0.3], [1.2, 0.1,-0.2], [-0.5, 0.2,1.2], [0.1,0.2, 0.7]]]): batch_size=4, number of class/vocab size=3
- target_tensor=torch.tensor([1, 0, 2, 0])


### Generate Method 
- index is an array of indices for the current context/it is (B,T) array of Batch size and sequence length (number of tokens in the context)
- aim is to extent to be (B, T+1), (B,T+2), (B,T+3)...../continues generation until we hit the end of the sequence length
- it will do this for new_tokens
- we call .forward method, which computes the next token prediction
- we then slice our_pred: [:, -1,:]: we want to include all of the batch dimension/vocab_size, but we only want the last element (so we slice our sequence length to get the last element)/after slicing, we a re left with a tensor with a new shape of (B,C)/slicing effectively removes the second dimension (T)/we do this to focus on th e logits for the last time step 
- we convert to probabilities with the soft_max function is applied to convert the into probability scores/softmax function scales our_pred so that they will sum to 1/this produces a tensor of shape (B,C) representing the probabilities of each token in the vocabulary being the next token in the sequence/each element in probabilities represet the probabilitity of the token being the next token in the sequence
     - the dimension parameter determines the dimension the softmax operaiton is applied/it specifies the axis of the tensor on which the sum of exponential is calculated. The result is then normalized by dividing each element by the sum of the exponentials along that axis
     - dim=-1/is because we want the last dimension of the tensor, so probability distribution over the vocabulary/this calculates the probabilities for each batch sample seperatly, treating the tokens in the vocabulary as indpendent classes for each sample 
- torch.multinomial samples from those probabilities/we want one sample/used to sample a token index from the probabilitiy distribution:
    - probabilitiesL the probability distribution
    - samples_num: how many tokens to draw from the multinomial distribution for each batch element/we want 1 token for each batch element
    - the randomly sampled tokens is crucial for generating diverse and unpredictable sequences in a language model or text generation task
    - we get back the tensor (B,1): a tensor containing the sampled indices of the outcomes for each element in the batch 
         - contains the sampled token indices for each sequence in the batch
         - the elements in next_index tensor are integers that represent the indices of the tokens sampled from the probabilitiy distribution
         - For example:
              - if probs=torch.tensor([[0.2,0.3,0.1, 0.15, 0.25], [0.1, 0.05, 0.4, 0.15, 0.3]]) (these represent sample probabilities for each sequence in the batch
              - next_index=torch.multinomial(probs, num_samples=1)
              - if we were to print next_index we would get a tensor of shape tensor([4], [2]]): this rempresents the index of the sample token in the batch (the tensor has the shape (2,1))
          
- index=torch.cat((index, next_index), dimension=1):
          - has the effect of extending the existing context tensor
  
        





In [198]:
#reproducability 
torch.manual_seed(1337)

#Big gram language model 
#inherits from nn module  

class BigramLM(nn.Module):

    #constructor
    def __init__(self):
        super().__init__()
        self.token_embedding_table=nn.Embedding(size_vocab, dim_embd)
        #positional encoding
        self.pos_embedding_table=nn.Embedding(sequence_length, dim_embd) 
        #creating a self-attention head
        self.head_sa=Head(dim_embd)
        #creating a linear layer 
        self.head_lm=nn.Linear(dim_embd, size_vocab)

    def forward(self, index, predictions=None):
        B,T=index.shape

        #gives us token embeddings 
        token_embd=self.token_embedding_table(index) #(B,T,C)/this is the identity of the token 

        #we need a positional encoding 
        pos_embd=self.pos_embedding_table(torch.arange(T, device=device)) #this is the position of the token  
        
        #gives us our predictions, but be careful because the c in the token_emb is different from our_pred c 
        y=token_embd+pos_embd 
        y=self.head_sa(y)                            
        our_pred=self.head_lm(y)  #(B,T, vocab_size)
        
        
        if predictions is None:
            loss=None 
        else:
            B, T, C=our_pred.shape
            our_pred=our_pred.view(B*T,C)
            predictions=predictions.view(B*T)
            loss=F.cross_entropy(our_pred, predictions)
        return our_pred, loss
        
    def generate(self, index, new_tokens):
        for _ in range(new_tokens):
            condition_index=index[:, -sequence_length:]
            our_pred, loss=self.forward(condition_index)
            our_pred=our_pred[:, -1, :]
            probability=F.softmax(our_pred, dim=-1)
            next_index=torch.multinomial(probability, num_samples=1)
            index=torch.cat((index,next_index), dim=1)
        return index 



model=BigramLM()
model=model.to(device)
output, loss=model.forward(train_context, train_labels)
print(output.shape)
print(loss) #we expecting loss to be 4.17/we are guessing wrong/getting some entropy  
index=torch.zeros((1,1), dtype=torch.long, device=device) #feeding it a tensor of (1,1) that holds a zero and it how we kick of the generate function with type being integer/represents an empty context to start the sequence generation
print(decoder(model.generate(index, new_tokens=300)[0].tolist())) #we call the generate method with our empty context/we want to generate 300 characters here/we extract the first element of the tuple (this is because the generate function returns a tuple of (predicted value, loss)/we want predicted value 


torch.Size([280, 65])
tensor(4.2175, grad_fn=<NllLossBackward0>)


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [1, 8] but got: [1, 1].

In [184]:
#@torch.no_grad() tells pytorch we don't intend to do back propogation: doesn't save intermediate variables 
#averages out the loss over multiple batches 
#we do this iters_eval times and get the loss for both splits 

@torch.no_grad()
def loss_estimator(): 
    output={}
    model.eval()
    for type in ['train', 'test']:
        lossed=torch.zeros(iters_eval)
        for x in range(iters_eval):
            contextual, labeled=batch(type) 
            our_pred, loss=model(contextual, labeled)
            lossed[x]=loss.item()
        output[type]=lossed.mean()
    model.train()
    return output
    

# Training 

## Notes: 
- an optimizer object is an essnetial componenet used during the trianing of machine learning models, specifically neural networks
- it is responsible for updating the parameters durin the training process to minimize the loss function
- there are various optimizer classes that implement different optimization algorithms, such as Adam, SGD
- To use optimizer:
  - create an instance of the optimizer (what we did below) 
  - training loop:
       1. forward pass (pass the input data through our model to get predictions
       2.  calculate the loss (using predicted values and true labels (what we did with nn.CrossEntropyLoss())
       3.  backward pass (calculate gradient of the loss with respect to model paramaters)
       4.  update paramters: the optimizer updates the model's parameters based on the gradients and optimization algorithm
   
### Forward Pass 
- num_epochs: a hyperparameter that represents the total number of times the training dataset is passed through the learning algorithm during the training process/it is basically the number of times the model will see the entire training dataset and update its parameters (weights/biases) accordingly


In [185]:
#creating an optimizer object
optimizer=torch.optim.AdamW(model.parameters(), lr=1e-3)

In [199]:

#forward loop 

for iter in range(num_epochs):
    if iter % interval_eval==0:
        lossed=loss_estimator()
        print(f"step {iter}: train loss {lossed['train']}, test loss {lossed['test']}")

    
    #sample our batch of data 
    contextual, correct_labels=batch('train')
    #forward pass/including the loss calculation 
    our_pred, loss=model(contextual, correct_labels)
    #backward pass 
    optimizer.zero_grad(set_to_none=True) #getting the gradients for all our parameters 
    loss.backward() #using those gradients to update our parameters 
    #update our values 
    optimizer.step()
    
print(loss.item())

step 0: train loss 4.198831081390381, test loss 4.202894687652588
step 500: train loss 4.200320243835449, test loss 4.2017645835876465
step 1000: train loss 4.199172496795654, test loss 4.200868129730225
step 1500: train loss 4.200717449188232, test loss 4.202966213226318
step 2000: train loss 4.200040340423584, test loss 4.205711841583252
step 2500: train loss 4.200659275054932, test loss 4.202916145324707
step 3000: train loss 4.201669216156006, test loss 4.202471733093262
step 3500: train loss 4.199456691741943, test loss 4.2001214027404785
step 4000: train loss 4.19920015335083, test loss 4.202147006988525
step 4500: train loss 4.199309825897217, test loss 4.201878547668457
4.203585147857666


In [179]:
print(decoder(model.generate(index, new_tokens=800)[0].tolist())) #we call the generate method with our empty context/we want to generate 300 characters here/we extract the first element of the tuple (this is because the generate function returns a tuple of (predicted value, loss)/we want predicted value 


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [1, 8] but got: [1, 1].

## Self-Attention: 


- torch.trial: returns the lower triangle part of a matrix/prevents the matrix from using tokens in front of it/can only get context
- we can get the average of rows of a matrix by:
   1. take our matrix a/torch.sum(a, 1, keepdim=True): basically ensuring that each row in A adds up to 1
   2. we can then multiple a with another matrix to get matrix c
   3. matrix c will give us a matrix with the rows being averaged from a/b
 
Explaination of version with soft-max: 
- tril=torch.tril(torch.ones(T,T)): creates a square matrix of T x T filled with ones/torch.tril sets all elements above the main diagonal to zero
- weights=torch.zeros((T,T)): create another square matrix of Size T x T filled with zeros/used to store weights that determine how much information from each token should be aggregated
- weights=weights.masked_fill(tril==0, float('-inf')) we modify the weights matrix. basically we look at when tril==0, which occurs outside of the diagonal triangle matrix and set those to -inf-> results in zeros from the downward triangle matrix and a bunch of -inf that aren't a part of it
- F.softmax: apply to the weights matrix along the last dimension because weights matrix is size Tx T, the last dimension is the column dimension/some tokens find others more or more interesting
- After .softmax we get the weights matrix which contains probabilities for aggregating information from each token
- every single token will admit two vectors: query (what am i looking for), key vector (what I contain):
   - my query dot products with key vector and that becomes weights 

In [107]:
#Example 
torch.manual_seed(1337)
B,T,C=4, 8, 32
x=torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 32])

In [35]:
xbagofwords=torch.zeros((B,T,C))
for v in range(B):
    for y in range(T):
        prev=x[v,:y+1] #at this batch dimension/everything up to and including the t toke/slicing x to be shaped (t,C)-> keeps all the channels in-tact
        xbagofwords[v,y]=torch.mean(prev, 0) #we are averaging out the time/which gives us a 1-dimensional vector C/we store it in xbagofwords
        

In [54]:
weights=torch.tril(torch.ones(T,T))
weights=weights/torch.sum(weights,1, keepdim=True)
xbagofwords2=weights@x #multiplying the matrix weights with our matrix x  (B,T,T) @ (B,T,C)->(B,T,C) 


In [63]:
#Version with softmax

tril=torch.tril(torch.ones(T,T))
weights=torch.zeros((T,T)) #weights begin with zero/how much from each token do we want to aggregate
weights=weights.masked_fill(tril==0, float('-inf')) #tokens from the past cannot communicate
weights=F.softmax(weights, dim=-1) #normalization/exponenetiate and then divde by sum/how much each element fuses into this position
xbagofwords3=weights@x 
weights


tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [113]:
# single head of self attention 
torch.manual_seed(1337) 
B,T,C=4,8,32
x=torch.randn(B,T,C)

size_head=16
query=nn.Linear(C, size_head, bias=False)
key=nn.Linear(C, size_head, bias=False)
value=nn.Linear(C, size_head, bias=False) 
v=value(x)
q=query(x) # (B,T, 16)
k=key(x) #(B,T,16)
weights=q@k.transpose(-2,-1) #we need transpose the last two dimension (B,T,16) @ (B, 16, T) -> (B, T,T)

tril=torch.tril(torch.ones(T,T))
weights=torch.zeros((T,T)) #weights begin with zero/how much from each token do we want to aggregate
weights=weights.masked_fill(tril==0, float('-inf')) #tokens from the past cannot communicate
weights=F.softmax(weights, dim=-1) #normalization/exponenetiate and then divde by sum/how much each element fuses into this position
output=weights@v
output.shape





torch.Size([4, 8, 16])

In [200]:
#Single-head of self Attention 

class Head(nn.Module): 
    def __init__(self, size_head):
        super().__init__()
        self.query=nn.Linear(dim_embd, size_head, bias=False) 
        self.key=nn.Linear(dim_embd, size_head, bias=False)
        self.value=nn.Linear(dim_embd, size_head, bias=False)
        #no a paramteter of the model, so in py-torch we have to call it a register_buffer
        self.register_buffer('tril', torch.tril(torch.ones(sequence_length, sequence_length))) #lower triangular matrix 

    def forward(self, x):
        q=self.query(x)
        k=self.key(x)
        v=self.value(x)
        #we now compute the self-attention scores 
        weights=q@k.transpose(-2,-1) * k.shape[-1]**-0.5
        weights=weights.masked_fill(self.tril[:T, :T]==0, float('-inf'))
        weights=F.softmax(weights, dim=-1)
        output=weights @ v
        return output 
        