# Constants and Setup

In [6]:
# --- Locations ---------------------------------------------------------------
# `path` is used as the prefix for outputs/checkpoints/logs; `root` is the prefix for data
# and is appended to sys.path before importing `utils`.
path = './'
root = '../'

# --- Reproducibility ---------------------------------------------------------
SEED = 23

# --- Optimisation / batching -------------------------------------------------
LR = 1e-3
BATCH_SIZE = 16
SEQ_LEN = 128

# --- Training schedule (all counted in batches) ------------------------------
MAX_ITERS = 50_000      # max num batches to train
PRINT_ITERS = 50        # frequency to print train loss
EVAL_ITERS = 250        # frequency to evaluate val loss and generate text from model
EVAL_ITER_COUNT = 500   # number of batches to estimate val loss with
SAVE_ITERS = 1_000      # frequency to save model and losses

# --- Model size --------------------------------------------------------------
N_EMBD = 128
N_FF = 4 * N_EMBD       # feed-forward hidden width
N_HEAD = 4
N_LAYER = 4

# Run identifier embedded in checkpoint / log / output file names.
MODEL_NAME = f"switch_{N_LAYER}_LAYERs_{N_HEAD}_HEAD_{N_EMBD}_EMBD_DIM_{SEQ_LEN}_SEQ_LEN"
print("Model Name:", MODEL_NAME)

Model Name: vt_4_LAYERs_4_HEAD_128_EMBD_DIM_128_SEQ_LEN


# Imports

In [7]:
import json
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torchinfo import summary
from tqdm import tqdm

In [8]:
import sys
sys.path.append(root)

from utils import set_seed, device

In [9]:
# Seed once up front with the project helper imported from `utils`
# (presumably seeds torch / numpy / random — confirm in utils.set_seed).
set_seed(SEED)

# Model
#### (TEMP). once validated, move into /models

In [None]:
class MLP(nn.Module):
    """Two-layer feed-forward network: Linear -> GELU -> Dropout -> Linear.

    Args:
        n_embd: size of the input and output feature dimension.
        n_ff: size of the hidden layer.
        dropout: dropout probability applied right after the activation.
    """

    def __init__(self, n_embd, n_ff, dropout=0.1):
        super().__init__()
        expand = nn.Linear(n_embd, n_ff)
        activation = nn.GELU()
        regularise = nn.Dropout(p=dropout)
        project = nn.Linear(n_ff, n_embd)
        # Kept as a single Sequential named `net` so parameter names stay net.0.* / net.3.*
        self.net = nn.Sequential(expand, activation, regularise, project)

    def forward(self, x):
        """Run x through the feed-forward stack and return the result."""
        transformed = self.net(x)
        return transformed

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are produced by bias-free linear projections of the
    same input, split into `n_head` heads of size `n_embd // n_head`, attended
    independently, re-combined and passed through a final bias-free projection.

    Args:
        n_embd: embedding (model) dimension; must be divisible by `n_head`.
        n_head: number of attention heads.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if `n_head` is not positive or does not divide `n_embd`.
    """

    def __init__(self, n_embd, n_head, dropout=0.1):
        super().__init__()

        # Fail early with a clear message; otherwise the mismatch only surfaces
        # later as an opaque shape error inside `view`.
        if n_head <= 0 or n_embd % n_head != 0:
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by a positive n_head ({n_head})"
            )

        self.n_embd = n_embd
        self.n_head = n_head
        self.head_dim = n_embd // n_head # Dimension of each head's key, query, and value

        self.drop = nn.Dropout(p=dropout)

        self.query = nn.Linear(n_embd, n_embd, bias=False)
        self.key = nn.Linear(n_embd, n_embd, bias=False)
        self.value = nn.Linear(n_embd, n_embd, bias=False)
        self.out = nn.Linear(n_embd, n_embd, bias=False)

    def split_heads(self, x):
        """Reshape [B, S, n_embd] into per-head form [B, n_head, S, head_dim]."""
        B, S, _ = x.size()
        # split dimension into n_head * head_dim, then transpose the sequence length w/ n_head
        return x.view(B, S, self.n_head, self.head_dim).transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of split_heads: [B, n_head, S, head_dim] -> [B, S, n_embd]."""
        B, _, S, _ = x.size() # the n_head and head_dim axes are merged back into n_embd
        # transpose makes the tensor non-contiguous, so .contiguous() is required before .view
        return x.transpose(1, 2).contiguous().view(B, S, self.n_embd)

    def scaled_dot_product(self, q, k, v, dropout, mask=None):
        """Compute softmax(q k^T / sqrt(head_dim)) v with optional masking.

        Args:
            q, k, v: tensors of shape [B, n_head, S, head_dim].
            dropout: callable applied to the attention weights (e.g. an nn.Dropout).
            mask: optional boolean tensor broadcastable to [B, n_head, S, S];
                positions that are True are filled with -inf before the softmax,
                i.e. they are hidden from attention.

        Returns:
            Tensor of shape [B, n_head, S, head_dim].
        """
        # wei = [B, n_head, S, S]
        wei = q @ k.transpose(-2, -1) / np.sqrt(self.head_dim)
        if mask is not None:
            wei = wei.masked_fill(mask, float('-inf'))
        wei = dropout(F.softmax(wei, dim=-1))
        return wei @ v

    def forward(self, x, mask=None):
        """Apply self-attention to x of shape (B, S, n_embd); returns (B, S, n_embd)."""
        # Step 1 and 2: Project full query, key, value, then split via reshaping
        q = self.split_heads(self.query(x))
        k = self.split_heads(self.key(x))
        v = self.split_heads(self.value(x))

        # Step 3: Compute scaled dot-product attention (mask, if given, hides True positions)
        attn = self.scaled_dot_product(q, k, v, self.drop, mask)

        # Step 4 and 5: Concatenate the heads, return projected output matrix
        out = self.out(self.combine_heads(attn)) # (B, S, n_embd)
        return out

### TODO: Switch Router

In [None]:
# class SwitchFeedForward(nn.Module):
#     def __init__(self,
#                  d_model,
#                  n_ff,
#                  capacity_factor,
#                  drop_tokens: bool,
#                  n_experts,
#                  expert: MLP,
#                  dropout=0.1):
#         super().__init__()

#         self.capacity_factor = capacity_factor
#         self.n_experts = n_experts
#         self.drop_tokens = drop_tokens
        
#         self.experts = nn.ModuleList([copy.deepcopy(expert(d_model,
#                                                           n_ff,
#                                                           dropout)) for _ in range(n_experts)])

#         # Routing layer
#         self.switch = nn.Linear(d_model, n_experts)

#     def forward(self, x):
#         # determine x shape

#         # pass through self.switch then softmax
#         # take torch.max dim = -1 to return max val and indices
#         # check capacity and drop tokens
#         # feed tokens to relevant experts
#         # obtain output from experts and identity from dropped tokens, scale with gated probs
#         # return output, + other metadata for loss

## TODO: Switch Block

#### Open question: does Switch alternate regular MLP blocks with SwitchFeedForward blocks? Maybe not,
#### but this block could still offer that as an option.

In [None]:
# ## TODO

# class SwitchBlock(nn.Module):
#     def __init__(self, n_embd, n_head, n_ff, dropout=0.1):
#         super().__init__()
#         self.sa = MultiHeadAttention(n_embd, n_head, dropout)
        # self.mlp = SwitchFeedForward(n_embd, n_ff,
        #                              capacity_factor, 
        #                              drop_tokens,
        #                              n_experts,
        #                              expert=MLP,
        #                              dropout=dropout)
#         self.ln1 = nn.LayerNorm(n_embd)
#         self.ln2 = nn.LayerNorm(n_embd)
#         self.drop = nn.Dropout(p=dropout)

#     def forward(self, x, mask):
#         # residual connection (stream)
#         # pre layer norm
#         x = x + self.drop(self.sa(self.ln1(x), mask))
#         x = x + self.drop(self.mlp(self.ln2(x)))
#         return x

In [None]:
class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encodings.

    Formula taken from the original Transformer paper:
    PE(pos, 2i (even)) = sin(pos/(10000^{2i/d_model}))
    PE(pos, 2i+1 (odd)) = cos(pos/(10000^{2i/d_model}))

    See reference for more details:
    https://kikaben.com/transformers-positional-encoding/

    Args:
        d_model: length of each positional vector (set to n_embd). May be odd;
            the trailing sine column then has no cosine partner.
        max_len: number of positions to precompute (set to seq_len).
    """

    def __init__(self, d_model, max_len):
        super().__init__()

        position = torch.arange(max_len).unsqueeze(1) # [max_len, 1]
        # One frequency per even column: ceil(d_model / 2) entries
        divisor = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        angles = position * divisor # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd columns; slicing keeps odd d_model
        # from raising a shape mismatch (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer (not a parameter): follows .to(device) and is saved in state_dict, but is never trained.
        self.register_buffer('pe', pe) # self.pe = [max_len, d_model]

    def forward(self, x):
        """Return the encodings for the first x.size(0) positions.

        Only the length of x's first dimension is used (e.g. x = torch.arange(seq_length));
        the encodings are returned on their own, not added to x.
        Output shape: (x.size(0), d_model).
        """
        return self.pe[:x.size(0)]

#### TODO:
- Selective precision
- Smaller weight initialization
- Separate, higher dropout rate for the expert layers

In [None]:
# ## TODO

# class SwitchTransformer(nn.Module):
#     def __init__(self, vocab_size, seq_length,
#                  n_embd, n_head, n_ff, n_layer,
#                  device, dropout=0.1):
#         super().__init__()

#         self.token_embedding = nn.Embedding(vocab_size, n_embd)
#         self.position_embedding = PositionalEncoding(n_embd, seq_length)

#         self.blocks = nn.Sequential(*[Block(n_embd,
#                                             n_head,
#                                             n_ff,
#                                             dropout) for _ in range(n_layer)])
#         self.lm_head = nn.Linear(n_embd, vocab_size)
#         self.drop = nn.Dropout(dropout)
#         self.seq_length = seq_length
#         self.device = device
#         self.init_params()

#     # weight initialization (Xavier uniform)
#     def init_params(self, default_initialization=False):
#         if not default_initialization:
#             for name, p in self.named_parameters():
#                 if p.dim() > 1:
#                     nn.init.xavier_uniform_(p)

#     # Remark: Xavier normal is not supported at this time.

#     def get_causal_mask(self,  x):
#         """
#         Generates causal mask for decoding
#         """
#         B, S = x.shape # x = (batch_size x seq_len)
#         attn_shape = (B, 1, S, S)
#         subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8') # k = 1 shifts the diagonal, so that the main diagonal gets 0's
#         return (torch.from_numpy(subsequent_mask) == 0).to(self.device)
#         # True along main diagonal + below, False elsewhere

#     def forward(self, x):
        
#         x = x.to(torch.int64)
#         B, S = x.shape

#         # get mask
#         mask = self.get_causal_mask(x).to(self.device)
#         # mask = (B x 1 x S x S)

#         tok_emb = self.token_embedding(x)
#         pos_emb = self.position_embedding(torch.arange(S))
#         x = self.drop(tok_emb + pos_emb)
#         # (B, S, n_embd)
#         for block in self.blocks:
#             x = block(x, ~mask) # (B, S, n_embd)
#         # negate mask to fill originally False values with -inf later
#         logits = self.lm_head(x) # (B, S, vocab_size)

#         return logits


#     def generate(self, input_ids, method='multinomial',
#                  max_new_tokens=1000, temp=None,
#                  num_beams=None, p_nucleus=None, k=None):

#         # input_ids begins as (B, S)
#         self.eval()

#         for _ in range(max_new_tokens):
#             if method in ['multinomial', 'temperature', 'greedy', 'nucleus', 'top-k']:
#                 # i) Truncate to the most recent `max length` tokens
#                 text_cond = input_ids[:, -self.seq_length:]
#                 # ii) Retrieve predictions
#                 with torch.no_grad():
#                     logits = self(text_cond)
#                 # model output: (B, S, vocab_size)
#                 # iii) Find last token logits of each
#                 logits = logits[:, -1, :] # (B, vocab_size)

#                 # if temperature sampling, divide logits by temp before applying softmax
#                 if method == 'temperature':
#                     logits = logits / temp

#                 # iv) Take softmax along each
#                 probs = F.softmax(logits, dim=-1)

#                 # v) Sample next token depending on method
#                 if method == 'greedy':
#                     next_idx = probs.argmax(dim=-1).unsqueeze(-1)

#                 elif method in ['multinomial', 'temperature', 'nucleus', 'top-k']:
#                     if method == 'nucleus':
#                         assert p_nucleus is not None and (0 < p_nucleus) and (p_nucleus <= 1)

#                         sorted_probs, sorted_idx = probs.sort(dim=-1, descending=True)
#                         prob_cumsum = sorted_probs.cumsum(dim=-1)
#                         idx_remove = prob_cumsum > p_nucleus
#                         # shift one right to ensure the first token is above the threshold
#                         idx_remove[..., 1:] = idx_remove[..., :-1].clone()
#                         idx_remove[..., 0] = False
#                         # retrieve original indices by reverse-sorting
#                         remove_mask = idx_remove.gather(dim=-1,
#                                           index=sorted_idx.argsort(dim=-1))
#                         # ^ specifically, we do this by first argsorting the indices which were returned from argsort
#                         # you can show that this returns indices that when used to subset a sorted array, returns the original array in unsorted order
#                         # https://stackoverflow.com/questions/52127723/pytorch-better-way-to-get-back-original-tensor-order-after-torch-sort
#                         probs[remove_mask] = 0

#                     if method == 'top-k':
#                         remove_mask = probs < torch.topk(probs, k).values[..., -1, None] # topk returns (B, 1), leaving only the
#                         # kth largest probs (i.e. the cutoff value for each). Then mask is same size as probs (B, vocab_size)
#                         probs[remove_mask] = 0

#                     # Sample probabilistically via scores
#                     next_idx = torch.multinomial(probs, num_samples=1) # (B, 1)

#                 # vi) Autoregressively append to input_text
#                 input_ids = torch.cat((input_ids, next_idx), dim=-1)

#                 # now input_text = (B, S + 1)
        
#         return input_ids

# Data

In [10]:
# Load the raw corpus. The encoding is pinned so the character vocabulary does not
# depend on the platform's default locale encoding.
with open(f"{root}/data/tiny-shakespeare.txt", 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
VOCAB_SIZE = len(chars)
print(f'Vocab: {chars}')
print(f'Vocab size: {VOCAB_SIZE}')

Vocab: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Vocab size: 65


In [11]:
# Character-level tokenizer built from the sorted vocabulary `chars`.
# txt2idx maps each character to its integer id; idx2txt is the inverse mapping.
txt2idx = {}
idx2txt = {}
for i, ch in enumerate(chars):
    txt2idx[ch] = i
    idx2txt[i] = ch


def encode(s):
    """Turn a string into the list of integer ids of its characters."""
    return [txt2idx[c] for c in s]


def decode(l):
    """Turn a sequence of integer ids back into a string."""
    return ''.join([idx2txt[i] for i in l])


# Round-trip sanity check
sample_ids = encode("tiny-shakespeare is sick")
print(sample_ids)
print(decode(encode("tiny-shakespeare is sick")))

[58, 47, 52, 63, 7, 57, 46, 39, 49, 43, 57, 54, 43, 39, 56, 43, 1, 47, 57, 1, 57, 47, 41, 49]
tiny-shakespeare is sick


In [12]:
# Tokenize the whole corpus into one long 1-D tensor of character ids
data = torch.tensor(encode(text), dtype=torch.long)

# 90-10 split: the first 90% of the stream is train, the remainder is validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
print('train_data len:', len(train_data), 'val_data len:', len(val_data))

def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        (x, y): two [BATCH_SIZE, SEQ_LEN] tensors on `device`, where y is x shifted
        one position to the right in the source stream (next-token targets).
    """
    source = train_data if split == 'train' else val_data
    # Upper bound leaves room for the extra target token at the end of each window
    starts = torch.randint(len(source) - SEQ_LEN, (BATCH_SIZE,)).tolist()
    x = torch.stack([source[s:s + SEQ_LEN] for s in starts])
    y = torch.stack([source[s + 1:s + SEQ_LEN + 1] for s in starts])
    return x.to(device), y.to(device)

train_data len: 1003854 val_data len: 111540


# Training

### TODO: tweak hyperparameters to keep effective parameter count or FLOP count constant

In [13]:
# Re-seed immediately before model construction.
# NOTE(review): the constructor call below is still commented out, so this cell does not
# create `model`; any later cell that uses `model` needs it from somewhere else — confirm.
set_seed(SEED)
# model = SwitchTransformer(VOCAB_SIZE, SEQ_LEN,
#                  N_EMBD, N_HEAD, N_FF, N_LAYER,
#                  device, dropout=0.1)

In [14]:
# Show the per-layer parameter counts with torchinfo.
# NOTE(review): `model` is not assigned in any cell shown above (the SwitchTransformer
# construction is commented out), so this only works with leftover kernel state — confirm.
summary(model)

Layer (type:depth-idx)                   Param #
VanillaTransformer                       --
├─Embedding: 1-1                         8,320
├─PositionalEncoding: 1-2                --
├─Sequential: 1-3                        --
│    └─Block: 2-1                        --
│    │    └─MultiHeadAttention: 3-1      65,536
│    │    └─MLP: 3-2                     131,712
│    │    └─LayerNorm: 3-3               256
│    │    └─LayerNorm: 3-4               256
│    │    └─Dropout: 3-5                 --
│    └─Block: 2-2                        --
│    │    └─MultiHeadAttention: 3-6      65,536
│    │    └─MLP: 3-7                     131,712
│    │    └─LayerNorm: 3-8               256
│    │    └─LayerNorm: 3-9               256
│    │    └─Dropout: 3-10                --
│    └─Block: 2-3                        --
│    │    └─MultiHeadAttention: 3-11     65,536
│    │    └─MLP: 3-12                    131,712
│    │    └─LayerNorm: 3-13              256
│    │    └─LayerNorm: 3-14         

## TODO: add auxiliary loss

In [15]:
# TODO: add the auxiliary loss here once the Switch router is implemented.
def calc_loss(logits, targets):
    """Mean token-level cross-entropy between logits and integer targets.

    Restored from the commented-out draft because the training and evaluation
    cells below call `calc_loss`; without a definition a fresh kernel fails with
    a NameError.

    Args:
        logits: tensor of shape (B, S, C) with unnormalised class scores.
        targets: tensor of shape (B, S) with class indices in [0, C).

    Returns:
        Scalar tensor: cross-entropy averaged over all B*S positions.
    """
    B, S, C = logits.shape
    # reshape (rather than view) also accepts non-contiguous inputs
    flat_logits = logits.reshape(B * S, C)
    flat_targets = targets.reshape(B * S)
    return F.cross_entropy(flat_logits, flat_targets)

In [113]:
# Quick smoke-test training loop: no validation, generation or checkpointing.
# Observed speed: around 50 min per 10000 steps, so 3.3 batches/sec = 53 samples per sec = 6780 char/s
n_steps = 10000
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
for step in range(n_steps):
    optimizer.zero_grad(set_to_none=True)

    inputs, targets = get_batch('train')
    logits = model(inputs)
    loss = calc_loss(logits, targets)

    loss.backward()
    optimizer.step()
    # PRINT_ITERS is the constant from the setup cell (the previous `PRINT_ITER`
    # is not defined anywhere in this notebook and raises NameError on a fresh kernel).
    if step % PRINT_ITERS == 0:
        print(f"Step {step}/{n_steps} | Loss: {loss.item()}")

Step 0/10000 | Loss: 6.962159633636475
Step 50/10000 | Loss: 3.401878595352173
Step 100/10000 | Loss: 3.175950765609741
Step 150/10000 | Loss: 2.812167167663574
Step 200/10000 | Loss: 2.6810946464538574
Step 250/10000 | Loss: 2.664418935775757
Step 300/10000 | Loss: 2.659294366836548
Step 350/10000 | Loss: 2.694620370864868
Step 400/10000 | Loss: 2.6138129234313965
Step 450/10000 | Loss: 2.5488266944885254
Step 500/10000 | Loss: 2.5062320232391357
Step 550/10000 | Loss: 2.4803411960601807
Step 600/10000 | Loss: 2.4527230262756348
Step 650/10000 | Loss: 2.38985013961792
Step 700/10000 | Loss: 2.3893115520477295
Step 750/10000 | Loss: 2.3820767402648926
Step 800/10000 | Loss: 2.32847261428833
Step 850/10000 | Loss: 2.3164374828338623
Step 900/10000 | Loss: 2.2766780853271484
Step 950/10000 | Loss: 2.297649383544922
Step 1000/10000 | Loss: 2.2336761951446533
Step 1050/10000 | Loss: 2.255802869796753
Step 1100/10000 | Loss: 2.123323440551758
Step 1150/10000 | Loss: 2.198436737060547
Step 1

KeyboardInterrupt: 

### TODO: the functions now work, but the frequency for saving train times needs reconsidering — we don't want to write them every step. Maybe use the same frequency as EVAL?

In [21]:
# Re-seed before constructing the model (set_seed comes from utils; presumably it seeds
# torch so the initial weights are repeatable — confirm).
# NOTE(review): VanillaTransformer is neither defined nor imported in the cells shown in
# this notebook — it has to be brought into scope before this cell runs on a fresh kernel.
set_seed(SEED)
model = VanillaTransformer(VOCAB_SIZE, SEQ_LEN,
                 N_EMBD, N_HEAD, N_FF, N_LAYER,
                 device, dropout=0.1)
# AdamW over all model parameters with the learning rate from the constants cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

In [22]:
def train(model, optimizer, device,
          train_loss_list=None, val_loss_list=None, train_time_list=None):
    """Train `model` for MAX_ITERS steps with periodic logging, evaluation and checkpointing.

    Args:
        model: module to train; called as model(inputs), with the result passed to calc_loss.
        optimizer: optimizer over the model's parameters.
        device: device the model is moved to before training starts.
        train_loss_list: optional existing list of per-step train losses to keep
            appending to (e.g. when resuming); a new list is used when None.
        val_loss_list: optional existing list of validation-loss estimates to extend.
        train_time_list: optional existing list of per-step throughput values
            (tokens/sec) to extend.

    Side effects:
        - every PRINT_ITERS steps: prints running training statistics;
        - every EVAL_ITERS steps: estimates the validation loss, writes generated text
          to {path}/outputs/ and rewrites the JSON logs in {path}/train_logs/;
        - every SAVE_ITERS steps: saves a model + optimizer checkpoint to {path}/checkpoints/.
        The target directories are expected to exist already.
    """
    train_losses = train_loss_list if train_loss_list is not None else []
    val_losses = val_loss_list if val_loss_list is not None else []
    train_times = train_time_list if train_time_list is not None else []

    model.train()
    model.to(device)

    # Set up prompt generation
    generation_file_path = f"{path}/outputs/OUTPUT_{MODEL_NAME}_SEED_{SEED}.txt"
    empty_tokens = torch.zeros((1, 1), dtype=torch.long).to(device)
    cond_prompts = ["KING TERRY: Thou art",
                    "DANIEL: Ay, my dear,"]

    cond_token_list = [encode(prompt) for prompt in cond_prompts]

    for step in range(MAX_ITERS):

        start = time.perf_counter()

        optimizer.zero_grad(set_to_none=True)

        inputs, targets = get_batch('train')
        logits = model(inputs)
        loss = calc_loss(logits, targets)
        train_losses.append(loss.item())

        loss.backward()

        # Monitor gradient norm (global L2 norm over all parameters that received a gradient)
        grads = [
            param.grad.detach().flatten()
            for param in model.parameters()
            if param.grad is not None
        ]
        norm = torch.cat(grads).norm()

        # Throughput covers batch sampling, forward, backward and the norm computation;
        # the optimizer step below is not included in the timed window.
        train_time = time.perf_counter() - start
        tokens_per_sec = (1 / train_time) * BATCH_SIZE * SEQ_LEN
        train_times.append(tokens_per_sec)

        optimizer.step()

        # print training statistics
        if step % PRINT_ITERS == 0 and step != 0:
            print(f"Step {step}/{MAX_ITERS} | Running Avg Train Loss: {np.mean(train_losses):.5f} |",
                  f"Grad Norm: {norm:.3f} | Running Avg Tokens/Sec: {np.mean(train_times):.3f}")

        # estimate val loss, generate text, and persist the logs
        if step % EVAL_ITERS == 0 and step != 0:
            val_losses = estimate_loss(model, val_losses)
            generate(model, generation_file_path, empty_tokens, cond_token_list, step)
            model.train()  # estimate_loss / generate leave the model in eval mode

            # The logs are written here rather than on every step: val_losses only grows
            # at evaluation steps, and the sub-sampled train_times are taken at the same
            # stride, so rewriting both files each step was pure overhead.
            with open(f'{path}/train_logs/{MODEL_NAME}_SEED_{SEED}_val_losses.json', 'w') as f:
                json.dump(val_losses, f)

            with open(f'{path}/train_logs/{MODEL_NAME}_SEED_{SEED}_train_times.json', 'w') as f2:
                json.dump(train_times[EVAL_ITERS::EVAL_ITERS], f2) # match freq of val_losses
                # note this means if you load from checkpoint to continue training you will have a sparser train_times
                # list in computing running avg

        # save model and optimizer state
        if step % SAVE_ITERS == 0 and step != 0:
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict()},
                f'{path}/checkpoints/{MODEL_NAME}_STEP_{step}_SEED_{SEED}.pt')

In [23]:
@torch.no_grad()
def estimate_loss(model, val_losses):
    """Estimate the validation loss over EVAL_ITER_COUNT random batches.

    The mean loss is printed, appended to `val_losses` (mutated in place) and the
    same list is returned. The model is left in eval mode on return.
    """
    model.eval()
    batch_losses = []
    for _ in range(EVAL_ITER_COUNT):
        # any split name other than 'train' makes get_batch draw from the validation data
        inputs, targets = get_batch('test')
        batch_losses.append(calc_loss(model(inputs), targets).item())
    val_loss = torch.tensor(batch_losses).mean().item()
    val_losses.append(val_loss)
    # keep model in eval, next call is to .generate() anyway
    print(f"Est. Val Loss: {val_loss:.5f}")
    return val_losses

In [24]:
def generate(model, generation_file_path, empty_tokens, cond_token_list, step):
    """Sample text from the model, append it to a file and print it.

    Produces two unconditional samples (top-k with k=5, nucleus with p=0.5) starting
    from `empty_tokens`, and one top-k sample per prompt in `cond_token_list`.

    Args:
        model: model exposing .generate(input_ids, method=..., max_new_tokens=..., ...).
        generation_file_path: text file the formatted samples are appended to.
        empty_tokens: start tokens for unconditional generation.
        cond_token_list: list of already-encoded prompts (lists of token ids).
        step: current training step, used only in the report header.
    """
    # Sampling is seeded with a fixed value so that samples from different steps are
    # comparable. That must not leak into training: without isolation, every call would
    # leave torch's global RNG in the same post-seed state, so (assuming set_seed reseeds
    # torch's global generator) the batches and dropout masks drawn after each evaluation
    # would repeat. fork_rng restores the torch CPU/CUDA RNG state on exit.
    # NOTE(review): if set_seed also reseeds numpy / random, those are not restored here.
    with torch.random.fork_rng():
        set_seed(42)

        uncond_res1 = decode(model.generate(empty_tokens,
                                            method='top-k',
                                            k=5,
                                            max_new_tokens=500)[0].tolist())
        uncond_res2 = decode(model.generate(empty_tokens,
                                            method='nucleus',
                                            p_nucleus=0.5,
                                            max_new_tokens=500)[0].tolist())

        cond_res_list = []
        for prompt in cond_token_list:
            cond_res = decode(model.generate(torch.tensor(prompt).unsqueeze(0).long().to(device),
                          method='top-k', k=5,
                          max_new_tokens=500)[0].tolist())
            cond_res_list.append(cond_res)

    cond_res_list = '\n\n'.join(cond_res_list)

    generation_text = f"""{MODEL_NAME} Output, Step {step}
    UNCONDITIONAL GENERATION:

    Top-k (5) (500 max_tokens):
    {uncond_res1}

    Nucleus (0.5) (500 max_tokens):
    {uncond_res2}

    #####################################################
    CONDITIONAL GENERATION (Top-k (5), 500 max_tokens):
    {cond_res_list}
    -----------------------------------------------------
    """
    # Append so the file accumulates one report per evaluation step
    with open(generation_file_path, 'a') as file:
        file.write(generation_text)
    print(generation_text)

In [None]:
## Driver code
# Runs the full training loop defined in train() above with fresh loss/time lists;
# uses the `model` and `optimizer` created earlier and the `device` imported from utils.
train(model, optimizer, device)

Step 0/50000 | Running Avg Train Loss: 6.24023 | Grad Norm: 20.527 | Running Avg Tokens/Sec: 2993.337
Step 50/50000 | Running Avg Train Loss: 4.21355 | Grad Norm: 1.394 | Running Avg Tokens/Sec: 6741.455
Step 100/50000 | Running Avg Train Loss: 3.77272 | Grad Norm: 2.327 | Running Avg Tokens/Sec: 6705.369
Est. Val Loss: 3.10417
vt_4_LAYERs_4_HEAD_128_EMBD_DIM_128_SEQ_LEN Output, Step 100
    UNCONDITIONAL GENERATION:

    Top-k (5) (500 max_tokens):
    
  ae  t ae e ha t ses sthist sh toue hee th   h at s sh hes ho s he th a  as hi thon enes t t t ha ate h  seat s teatothe  tha hh ta h h s

e shes harot  a he t he s hhh  aresh  thoshe  sh s
h ho t hhhe hhea h sh s h s hhhe hoth  he he

 s
hah ha hos s ahh he s
heh  hhho hahe he  thha t he at   toth tha a hothhar se  h h h s a a th ho h t hh h sh she anhan h s tha  t  the he han

he h s h har a soh ha t ah ha   a ha th tos shor h arhe thohh  tho arorarohe


 hos


ho  thar h at
he thos hhe t


 

    Nucleus (0.5) (500 max_tokens):
   

# Generation

*After 2250 steps × 16 batch_size, training loss 1.8277:*

In [114]:
# Unconditional sample: start from a single token with id 0 and let the model continue.
# The start tokens are moved to `device`, as the training-time generation code does,
# so this also works when the model lives on an accelerator.
start_tokens = torch.zeros((1, 1), dtype=torch.long).to(device)
print(decode(model.generate(start_tokens, max_new_tokens=500)[0].tolist()))



AUCENTIO:
Which the may sich that nough to the slay'd one so to;
His be knot, I Vistrengs, thy the good our kim in to call:
No, thou man I good, Say, for pmburds tell eack.

HESSend.

MENCIO:
Dever and will'd my Vaing, life to suke in lise,
These'll was of yret, fol smy his no Fear shom gestard:
Retil appoutis commentex e'epon tend her his him buse,
And what ityer am the iends, come; God foll ding:
by appeerk.

LOUCIO:
Petwild, bake you, that I same, what wear;
from in in or my speak as For Jul


In [115]:
# Conditional sample: encode a prompt and let the model continue it.
input_txt = "TERRY: thou art"
ctx = encode(input_txt)
# Shape (1, len(prompt)); moved to `device` to match the model, as in the training-time
# generation code.
prompt_ids = torch.tensor(ctx).unsqueeze(0).long().to(device)
print(decode(model.generate(prompt_ids, max_new_tokens=500)[0].tolist()))

TERRY: thou arte a my she
Which have may and of contain.

DUKE VINCENTIO:
Good as I tall no knrow, for shalt agarnt
And mpo; and Kong a m, not outhpile Mesce.

HENRY VI:
When I will thy lookess, oner the pexstrey
The the the hee voagh gresed livioe.

MENCIO:
My her callis his peaced of to that
We where's by shall bore: as shall myselvea
The plender feuls!

PAPELLANT:
In the the into balby me dods to love,
In but the giving of nyou ase. I tall it-me?'e Goveuling
The theer haught art praver count madeng Camen:
T
