<a href="https://colab.research.google.com/github/vektor8891/llm/blob/main/projects/10_gpt/10_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [57]:
# !pip install torchtext==0.17.2
# !pip install portalocker==2.8.2
# !pip install transformers==4.35.2
# !pip install torch==2.2.0

# Text pipeline
## Dataset

In [58]:
from torchtext.datasets import IMDB

# Load the dataset
train_iter, val_iter = IMDB()

In [3]:
data_itr=iter(train_iter)
# retrieving the third first record
next(data_itr)
next(data_itr)
next(data_itr)

(1,
 "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />")

In [4]:
import torch

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cpu')

## Preprocessing data

In [5]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, EOS_IDX = 0, 1, 2
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<|endoftext|>' ]

In [6]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")

In [7]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter):

    for _,data_sample in data_iter:
        yield  tokenizer(data_sample)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=special_symbols, special_first=True)
vocab.set_default_index(UNK_IDX)



###  Text to index and index to Text

In [8]:
text_to_index=lambda text: [vocab(token) for token in tokenizer(text)]
index_to_en = lambda seq_en: " ".join([vocab.get_itos()[index] for index in seq_en])

In [9]:
#check
index_to_en(torch.tensor([0,1,2]))

'<unk> <pad> <|endoftext|>'

### Collate function

In [10]:
def get_sample(block_size, text):
    # Determine the length of the input text
    sample_leg = len(text)
    # Calculate the stopping point for randomly selecting a sample
    # This ensures the selected sample doesn't exceed the text length
    random_sample_stop = sample_leg - block_size


    # Check if a random sample can be taken (if the text is longer than block_size)
    if random_sample_stop >= 1:
        # Randomly select a starting point for the sample
        random_start = torch.randint(low=0, high=random_sample_stop, size=(1,)).item()
        # Define the endpoint of the sample
        stop = random_start + block_size

        # Create the input and target sequences
        src_sequence = text[random_start:stop]
        tgt_sequence= text[random_start + 1:stop + 1]

    # Handle the case where the text length is exactly equal or less the block size
    elif random_sample_stop <= 0:
        # Start from the beginning and use the entire text
        random_start = 0
        stop = sample_leg
        src_sequence= text[random_start:stop]
        tgt_sequence = text[random_start + 1:stop]
        # Append an empty string to maintain sequence alignment
        tgt_sequence.append( '<|endoftext|>')

    return src_sequence, tgt_sequence

In [11]:
BATCH_SIZE=1

batch_of_tokens=[]

for i in range(BATCH_SIZE):
  _,text =next(iter(train_iter))
  batch_of_tokens.append(tokenizer(text))

In [12]:
text=batch_of_tokens[0][0:100]
text[0:100]
batch_of_tokens

[['i',
  'rented',
  'i',
  'am',
  'curious-yellow',
  'from',
  'my',
  'video',
  'store',
  'because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it',
  'when',
  'it',
  'was',
  'first',
  'released',
  'in',
  '1967',
  '.',
  'i',
  'also',
  'heard',
  'that',
  'at',
  'first',
  'it',
  'was',
  'seized',
  'by',
  'u',
  '.',
  's',
  '.',
  'customs',
  'if',
  'it',
  'ever',
  'tried',
  'to',
  'enter',
  'this',
  'country',
  ',',
  'therefore',
  'being',
  'a',
  'fan',
  'of',
  'films',
  'considered',
  'controversial',
  'i',
  'really',
  'had',
  'to',
  'see',
  'this',
  'for',
  'myself',
  '.',
  'the',
  'plot',
  'is',
  'centered',
  'around',
  'a',
  'young',
  'swedish',
  'drama',
  'student',
  'named',
  'lena',
  'who',
  'wants',
  'to',
  'learn',
  'everything',
  'she',
  'can',
  'about',
  'life',
  '.',
  'in',
  'particular',
  'she',
  'wants',
  'to',
  'focus',
  'her',
  'attentions',
  'to',
  'making',
 

In [13]:
block_size=10
src_sequences, tgt_sequence=get_sample( block_size, text)

In [14]:
print("src: ",src_sequences)
print("tgt: ",tgt_sequence)

src:  ['in', 'particular', 'she', 'wants', 'to', 'focus', 'her', 'attentions', 'to', 'making']
tgt:  ['particular', 'she', 'wants', 'to', 'focus', 'her', 'attentions', 'to', 'making', 'some']


In [15]:
# Initialize empty lists to store source and target sequences
src_batch, tgt_batch = [], []

# Define the batch size
BATCH_SIZE = 2

# Loop to create batches of source and target sequences
for i in range(BATCH_SIZE):
    # Retrieve the next data point from the training iterator
    _,text = next(iter(train_iter))

    # Generate source and target sequences using the get_sample function
    src_sequence_text, tgt_sequence_text = get_sample(block_size, tokenizer(text))

    # Convert source and target sequences to tokenized vocabulary indices
    src_sequence_indices = vocab(src_sequence_text)
    tgt_sequence_indices = vocab(tgt_sequence_text)

    # Convert the sequences to PyTorch tensors with dtype int64
    src_sequence = torch.tensor(src_sequence_indices, dtype=torch.int64)
    tgt_sequence = torch.tensor(tgt_sequence_indices, dtype=torch.int64)

    # Append the source and target sequences to their respective batches
    src_batch.append(src_sequence)
    tgt_batch.append(tgt_sequence)

    # Print the output for every 2nd sample (adjust as needed)
    print(f"Sample {i}:")
    print("Source Sequence (Text):", src_sequence_text)
    print("Source Sequence (Indices):", src_sequence_indices)
    print("Source Sequence (Shape):", src_sequence.shape)
    print("Target Sequence (Text):", tgt_sequence_text)
    print("Target Sequence (Indices):", tgt_sequence_indices)
    print("Target Sequence (Shape):", tgt_sequence.shape)

Sample 0:
Source Sequence (Text): ['purposes', 'rather', 'than', 'just', 'to', 'shock', 'people', 'and', 'make', 'money']
Source Sequence (Indices): [4919, 253, 82, 45, 10, 1352, 89, 7, 94, 215]
Source Sequence (Shape): torch.Size([10])
Target Sequence (Text): ['rather', 'than', 'just', 'to', 'shock', 'people', 'and', 'make', 'money', 'to']
Target Sequence (Indices): [253, 82, 45, 10, 1352, 89, 7, 94, 215, 10]
Target Sequence (Shape): torch.Size([10])
Sample 1:
Source Sequence (Text): ['meat', 'and', 'potatoes', '(', 'no', 'pun', 'intended', ')', 'of', 'swedish']
Source Sequence (Indices): [2876, 7, 14661, 29, 56, 4419, 1218, 27, 9, 3994]
Source Sequence (Shape): torch.Size([10])
Target Sequence (Text): ['and', 'potatoes', '(', 'no', 'pun', 'intended', ')', 'of', 'swedish', 'cinema']
Target Sequence (Indices): [7, 14661, 29, 56, 4419, 1218, 27, 9, 3994, 534]
Target Sequence (Shape): torch.Size([10])


In [16]:
from torch.nn.utils.rnn import pad_sequence

BLOCK_SIZE=30
def collate_batch(batch):
    src_batch, tgt_batch = [], []
    for _,_textt in batch:
      src_sequence,tgt_sequence=get_sample(BLOCK_SIZE,tokenizer(_textt))
      src_sequence=vocab(src_sequence)
      tgt_sequence=vocab(tgt_sequence)
      src_sequence= torch.tensor(src_sequence, dtype=torch.int64)
      tgt_sequence = torch.tensor(tgt_sequence, dtype=torch.int64)
      src_batch.append(src_sequence)
      tgt_batch.append(tgt_sequence)


    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=False)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=False)

    return src_batch.to(DEVICE), tgt_batch.to(DEVICE)

In [17]:
from torch.utils.data import DataLoader

BATCH_SIZE=1
dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
val_dataloader= DataLoader(val_iter , batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

### Iterating through data samples

In [18]:
dataset=iter(dataloader)
for sample in range(10):
  src,trt=next(dataset)
  print("sample",sample)
  print("sorce:",index_to_en(src))
  print("\n")
  print("target:",index_to_en(trt))
  print("\n")

sample 0
sorce: scenes i saw made it obvious that the first howling was a great movie . so great , that seven horrible sequels had to be made . they started off


target: i saw made it obvious that the first howling was a great movie . so great , that seven horrible sequels had to be made . they started off with


sample 1
sorce: apparent that they are the only two people left on earth--as you learn in the really stupid and totally unconvincing conclusion . usually the twist at the end makes the


target: that they are the only two people left on earth--as you learn in the really stupid and totally unconvincing conclusion . usually the twist at the end makes the episode


sample 2
sorce: craven chose to go in the style of his older films , having no good tie but the main villain ' s name . even the actor playing dracula was


target: chose to go in the style of his older films , having no good tie but the main villain ' s name . even the actor playing dracula was different


sample 3
s

In [19]:
for  src,trt in dataset:
    print(trt.shape)
    print(src.shape)
    print(index_to_en(src[0,:]))
    print(index_to_en(trt[0,:]))
    break

torch.Size([30, 1])
torch.Size([30, 1])
trailer
i


In [20]:
print("source:",index_to_en(src))
print("target:",index_to_en(trt))

source: trailer i saw was very dynamic , that is not true for the film . that is to say the discrepancy between the trailer and the actual film is something
target: i saw was very dynamic , that is not true for the film . that is to say the discrepancy between the trailer and the actual film is something very


## Masking

In [21]:
def generate_square_subsequent_mask(sz,device=DEVICE):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [22]:
def create_mask(src,device=DEVICE):
    src_seq_len = src.shape[0]
    src_mask = generate_square_subsequent_mask(src_seq_len)
    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    return src_mask,src_padding_mask

## Positional encoding

In [23]:
import torch.nn as nn
from torch import Tensor

# add positional information to the input tokens
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

## Token embedding

In [24]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

## Custom GPT model architecture

`CustomGPTModel`: transformer-based model architecture for generative pre-trained models. Purpose: to generate text and perform various NLP tasks. Main components:

- **Initialization (`__init__`)**: initializes the embedding layer, positional encoding, transformer encoder layers, and a linear layer (`lm_head`) for generating logits over the vocabulary.

- **Weight initialization (`init_weights`)**: initializes model weights using Xavier uniform initialization.

- **Decoder (`decoder`)**: currently functions as the forward pass through the transformer encoder layers, followed by the generation of logits for the language modeling task. Adds positional encodings to the embeddings and applies a mask if necessary.

- **Forward pass (`forward`)**: similar to `decoder`. Defines the forward computation of the model. Processes the input through embedding layers, positional encoding, transformer encoder layers, and produces the final output using the `lm_head`.

- **Mask generation**: included in both `decoder` and `forward`. Purpose: to ensure prediction does not depend on future tokens.

In [25]:
import math

class CustomGPTModel(nn.Module):
    def __init__(self, embed_size,vocab_size, num_heads, num_layers, max_seq_len=500,dropout=0.1):

        super().__init__()

        self.init_weights()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, dropout=dropout)

        # Remaining layers are part of the TransformerDecoder
        encoder_layers = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)
        self.embed_size = embed_size
        self.lm_head = nn.Linear(embed_size, vocab_size)

    def init_weights(self):
      for p in self.parameters():
          if p.dim() > 1:
              nn.init.xavier_uniform_(p)

    def create_mask(src,device=DEVICE):
        src_seq_len = src.shape[0]
        src_mask = nn.Transformer.generate_square_subsequent_mask(src_seq_len)
        src_padding_mask = (src == PAD_IDX).transpose(0, 1)
        return src_mask,src_padding_mask

    def decoder(self, x,src_mask):
        seq_length = x.size(0)

        # Add positional embeddings to the input embeddings
        x = self.embed(x)* math.sqrt(self.embed_size)
        x = self.positional_encoding(x)

        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask, src_padding_mask = create_mask(x)

        output = self.transformer_encoder(x, src_mask)
        logits = self.lm_head(x)
        return logits

    def forward(self,x,src_mask=None,key_padding_mask=None):

        seq_length = x.size(0)

        # Add positional embeddings to the input embeddings
        x = self.embed(x)* math.sqrt(self.embed_size) #src = self.embedding(src) * math.sqrt(self.d_model)
        x = self.positional_encoding(x)


        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask, src_padding_mask = create_mask(x)

        output = self.transformer_encoder(x, src_mask,key_padding_mask)
        x = self.lm_head(x)

        return x


### Model configuration and initialization

- `ntokens`: # unique tokens in the vocabulary
- `emsize`: size of each embedding vector
- `nlayers`: # transformer encoder layers
- `nhead`: # attention heads
- `dropout`: regularization technique to ignore randomly selected neurons during training to prevent overfitting


In [26]:
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability

model = CustomGPTModel(embed_size=emsize, num_heads=nhead, num_layers=nlayers, vocab_size=ntokens,dropout=dropout).to(DEVICE)



### Prompting

Prompt: starting point for the model to generate text.

In [27]:
def encode_prompt(prompt, block_size=BLOCK_SIZE):
    # Handle None prompt
    while prompt is None:
        prompt = input("Sorry, prompt cannot be empty. Please enter a valid prompt: ")

    tokens = tokenizer(prompt)
    number_of_tokens = len(tokens)

    # Handle long prompts
    if number_of_tokens > block_size:
        tokens = tokens[-block_size:]  # Keep last block_size characters

    prompt_indices = vocab(tokens)
    prompt_encoded = torch.tensor(prompt_indices, dtype=torch.int64).reshape(-1, 1)
    return prompt_encoded

In [28]:
# print(index_to_en(encode_prompt(None)))

Sorry, prompt cannot be empty. Please enter a valid prompt: 0
0


In [29]:
print(index_to_en(encode_prompt("This is a prompt to get model generate next words." ) ))

this is a prompt to get model generate next words .


In [30]:
prompt_encoded=encode_prompt("This is a prompt to get model generate next words.").to(DEVICE)
prompt_encoded

tensor([[   15],
        [   11],
        [    6],
        [33700],
        [   10],
        [   86],
        [ 2076],
        [ 5673],
        [  388],
        [  665],
        [    3]])

In [31]:
logits = model.decoder(prompt_encoded,src_mask=None).to(DEVICE)
logits = logits.transpose(0, 1)
logits.shape

torch.Size([1, 11, 68813])

In [32]:
logit_preiction =logits[:,-1]
logit_preiction.shape

torch.Size([1, 68813])

In [33]:
 _, next_word_index = torch.max(logit_preiction, dim=1)
 next_word_index

tensor([15159])

In [34]:
index_to_en(next_word_index)

'booed'

## Autoregressive text generation

In [35]:
prompt="this is the beginning of"
prompt_encoded = encode_prompt(prompt).to(DEVICE)
print("Device for prompt_encoded:", prompt_encoded.shape)

Device for prompt_encoded: torch.Size([5, 1])


In [36]:
max_new_tokens=10

for i in range(max_new_tokens):
    logits = model.decoder(prompt_encoded,src_mask=None)
    logits = logits.transpose(0, 1)
    print(" ")
    print(f"Shape of logits at step {i}: {logits.shape}")

    logit_preiction = logits[:, -1]
    print(f"Shape of logit_prediction at step {i}: {logit_preiction.shape}")

    next_token_encoded = torch.argmax(logit_preiction, dim=-1).reshape(-1, 1)
    print(f"Shape of next_token_encoded at step {i}: {next_token_encoded.shape}")

    prompt_encoded = torch.cat((prompt_encoded, next_token_encoded), dim=0).to(DEVICE)
    print(f"Sequence for step {i}: {[index_to_en(j) for j in prompt_encoded]}")
    print(f"Shape of prompt_encoded after concatenation at step {i}: {prompt_encoded.shape}")

 
Shape of logits at step 0: torch.Size([1, 5, 68813])
Shape of logit_prediction at step 0: torch.Size([1, 68813])
Shape of next_token_encoded at step 0: torch.Size([1, 1])
Sequence for step 0: ['this', 'is', 'the', 'beginning', 'of', 'really--who']
Shape of prompt_encoded after concatenation at step 0: torch.Size([6, 1])
 
Shape of logits at step 1: torch.Size([1, 6, 68813])
Shape of logit_prediction at step 1: torch.Size([1, 68813])
Shape of next_token_encoded at step 1: torch.Size([1, 1])
Sequence for step 1: ['this', 'is', 'the', 'beginning', 'of', 'really--who', 'flinstones']
Shape of prompt_encoded after concatenation at step 1: torch.Size([7, 1])
 
Shape of logits at step 2: torch.Size([1, 7, 68813])
Shape of logit_prediction at step 2: torch.Size([1, 68813])
Shape of next_token_encoded at step 2: torch.Size([1, 1])
Sequence for step 2: ['this', 'is', 'the', 'beginning', 'of', 'really--who', 'flinstones', 'injuring']
Shape of prompt_encoded after concatenation at step 2: torch.S

In [37]:
# Define special symbols and indices
UNK_IDX, PAD_IDX, EOS_IDX = 0, 1, 2
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<|endoftext|>' ]
BLOCK_SIZE

30

In [38]:
#auto-regressive Language Model text generation
def generate(model, prompt=None, max_new_tokens=500, block_size=BLOCK_SIZE, vocab=vocab, tokenizer=tokenizer):
    # Move model to the specified device (e.g., GPU or CPU)
    model.to(DEVICE)

    # Encode the input prompt using the provided encode_prompt function
    prompt_encoded = encode_prompt(prompt).to(DEVICE)
    tokens = []

    # Generate new tokens up to max_new_tokens
    for _ in range(max_new_tokens):
        # Decode the encoded prompt using the model's decoder
        logits = model(prompt_encoded,src_mask=None,key_padding_mask=None)

        # Transpose the logits to bring the sequence length to the first dimension
        logits = logits.transpose(0, 1)

        # Select the logits of the last token in the sequence
        logit_prediction = logits[:, -1]

        # Choose the most probable next token from the logits(greedy decoding)
        next_token_encoded = torch.argmax(logit_prediction, dim=-1).reshape(-1, 1)

        # If the next token is the end-of-sequence (EOS) token, stop generation
        if next_token_encoded.item() == EOS_IDX:
            break

        # Append the next token to the prompt_encoded and keep only the last 'block_size' tokens
        prompt_encoded = torch.cat((prompt_encoded, next_token_encoded), dim=0)[-block_size:]

        # Convert the next token index to a token string using the vocabulary
        # Move the tensor back to CPU for vocab lookup if needed
        token_id = next_token_encoded.to('cpu').item()
        tokens.append(vocab.get_itos()[token_id])

    # Join the generated tokens into a single string and return
    return ' '.join(tokens)

In [39]:
generate(model,prompt="this is the beginning of",max_new_tokens=30,vocab=vocab,tokenizer=tokenizer)

'moroni **spoilers sequined name-homages high-story hurt sleep haystack demofilo point-they allende ridiculously-timed vietnamese poverty low caridad seize flame-throwing marriages trickle legions paradox chainsmoking willims raubal propelling kato dictate sharply cloth'

### Decoding the differences: Training vs. inference

- Training: using ground truth ("teacher forcing")
- Interence: use previous predictions

In [40]:
from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss(ignore_index=PAD_IDX)

In [41]:
src,tgt=next(iter(dataloader))

mask,padding_mask = create_mask(src)

In [42]:
logits = model(src,src_mask=mask,key_padding_mask=padding_mask)
print(logits.shape)

torch.Size([30, 1, 68813])




In [43]:
print("output shape",logits.shape)
print("source shape ",src)

output shape torch.Size([30, 1, 68813])
source shape  tensor([[  137],
        [    5],
        [   22],
        [ 2420],
        [ 1191],
        [ 2033],
        [    3],
        [   14],
        [  203],
        [    9],
        [  198],
        [    5],
        [    4],
        [  206],
        [   19],
        [   11],
        [   69],
        [ 1191],
        [    5],
        [   74],
        [   11],
        [  117],
        [   13],
        [   59],
        [    8],
        [   24],
        [  291],
        [13237],
        [   12],
        [   88]])


In [44]:
# drop the the first sample of the target
tgt
print(tgt.shape)
print(logits.reshape(-1, logits.shape[-1]).shape)
print(tgt.reshape(-1).shape)

torch.Size([30, 1])
torch.Size([30, 68813])
torch.Size([30])


In [45]:
def evaluate(model: nn.Module, eval_data) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    with torch.no_grad():
        for src,tgt in eval_data:
            tgt = tgt.to(DEVICE)
            #seq_len = src.size(0)
            logits = model(src,src_mask=None,key_padding_mask=None)
            total_loss +=  loss_fn(logits.reshape(-1, logits.shape[-1]), tgt.reshape(-1)).item()
    return total_loss / (len(list(eval_data)) - 1)

In [46]:
# evaluate(model,val_dataloader) # NOTE: this takes a long time

## Training the model

(Only if you have GPU)

In [47]:
# from torch.optim import Adam

# optimizer = Adam(model.parameters(), lr=1e-2, weight_decay=0.01, betas=(0.9, 0.999))
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10000, gamma=0.9)

# def train(model: nn.Module,train_data) -> None:
#     model.train()  # turn on train mode
#     total_loss = 0.
#     log_interval = 10000
#     start_time = time.time()

#     num_batches = len(list(train_data)) // block_size
#     for batch,srctgt in enumerate(train_data):
#         src= srctgt[0]
#         tgt= srctgt[1]
#         logits = model(src,src_mask=None)
#         logits_flat = logits.reshape(-1, logits.shape[-1])
#         loss = loss_fn(logits_flat, tgt.reshape(-1))

#         optimizer.zero_grad()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
#         optimizer.step()
#         total_loss += loss.item()

#         if (batch % log_interval == 0 and batch > 0) or batch==42060:
#             lr = scheduler.get_last_lr()[0]
#             ms_per_batch = (time.time() - start_time) * 1000 / log_interval
#             #cur_loss = total_loss / log_interval
#             cur_loss = total_loss / batch
#             ppl = math.exp(cur_loss)
#             print(f'| epoch {epoch:3d} | {batch//block_size:5d}/{num_batches:5d} batches | '
#                   f'lr {lr:02.4f} | ms/batch {ms_per_batch:5.2f} | '
#                   f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
#             start_time = time.time()

#     return total_loss

In [48]:
# import time

# best_val_loss = float('inf')
# epochs = 30
# Train_losses= []
# Val_losses = []
# for epoch in range(1, epochs + 1):
#     epoch_start_time = time.time()
#     train_loss = train(model,dataloader)
#     val_loss = evaluate(model, val_dataloader)
#     val_ppl = math.exp(val_loss)
#     Train_losses.append(train_loss)
#     Val_losses.append(val_loss)

#     elapsed = time.time() - epoch_start_time
#     print('-' * 89)
#     print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
#         f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
#     print('-' * 89)

#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         torch.save(model.state_dict(), 'model_best_val_loss.pt')

In [49]:
# import matplotlib.pyplot as plt

# # Calculate the number of epochs (assuming the lengths of train_losses and val_losses are equal)
# num_epochs = len(Train_losses)

# # Create a figure and a set of subplots
# fig, ax = plt.subplots()

# # Plot the training losses
# ax.plot(range(num_epochs), Train_losses, label='Training Loss', color='blue')

# # Plot the validation losses
# ax.plot(range(num_epochs), Val_losses, label='Validation Loss', color='orange')

# # Set the x-axis label
# ax.set_xlabel('Epoch')

# # Set the y-axis label
# ax.set_ylabel('Loss')

# # Set the title of the plot
# ax.set_title('Training and Validation Losses')

# # Add a legend to the plot
# ax.legend()

# # Show the plot
# plt.show()

## Loading the saved model

In [50]:
!wget 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/kyn1_OsXrzjef0xihlsXmg.pt'
model.load_state_dict(torch.load('kyn1_OsXrzjef0xihlsXmg.pt',map_location=torch.device('cpu')))

--2025-04-10 19:59:00--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/kyn1_OsXrzjef0xihlsXmg.pt
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 122251138 (117M) [binary/octet-stream]
Saving to: ‘kyn1_OsXrzjef0xihlsXmg.pt.1’


2025-04-10 19:59:06 (23.5 MB/s) - ‘kyn1_OsXrzjef0xihlsXmg.pt.1’ saved [122251138/122251138]



<All keys matched successfully>

In [51]:
print(generate(model,prompt="the movie was",max_new_tokens=10,vocab=vocab,tokenizer=tokenizer))

paid what makes for making such a flashback for making


## Loading GPT2 model from HuggingFace

In [52]:
# !pip install --upgrade torch

In [55]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load the tokenizer and model
tokenizer1 = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define the input prompt
#input_text = "Once upon a time in a faraway land,"
input_text = "the movie was"

# Tokenize the input text and prepare the input for the model
input_ids = tokenizer1.encode(input_text, return_tensors="pt")

# Generate text using the model
# Set the desired length of the generated text (max_length),
# and other generation parameters like temperature, top_k, and top_p
max_length = 15
temperature = 0.7
top_k = 50
top_p = 0.95

generated_ids = model.generate(
    input_ids,
    max_length=max_length,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    pad_token_id=tokenizer1.eos_token_id,
)

# Decode the generated text
generated_text = tokenizer1.decode(generated_ids[0], skip_special_tokens=True)

# Print the input prompt and the generated text
print(f"Input: {input_text}")
print(f"Generated Text: {generated_text}")



Input: the movie was
Generated Text: the movie was a bit of a disappointment, but it was a great movie


## Exercise: Creating a decoder model

In [54]:
ntokens = len(vocab)
emsize = 200
nlayers = 2
nhead = 2
dropout = 0.2

model = CustomGPTModel(embed_size=emsize, num_heads=nhead, num_layers=nlayers, vocab_size=ntokens,dropout=dropout).to(DEVICE)

print(generate(model,prompt="spring is",max_new_tokens=15,vocab=vocab,tokenizer=tokenizer))



vampire-signature dont britcoms manuel norge meandered demy hibernation arnold-film entropy double-whammy replies methane darwinism rest-stop


In [56]:
# !pip freeze > requirements.txt

[0m