# Assignment 4

In [1]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
import math
from tqdm import tqdm

### 1. Study the transformer code provided in the [module](https://github.com/DrUzair/NLP/blob/master/Transformer/5%20LM_TransformerBlock_MLP_PosEmb_AddNormOpt.py)

In [11]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

''' Look at all previous tokens to generate next
    @Author: Uzair Ahmad
    2022
    +TransformerBlock 
'''


class TransformerBlockLM(nn.Module):
    class TransformerBlock(nn.Module):
        def __init__(self, head_count, in_size, out_size,device="cuda"):
            super().__init__()
            self.comm = TransformerBlockLM.MultiHeadAttention(head_count=head_count,
                                                              in_size=in_size,
                                                              out_size=out_size).to(device)
            self.think = TransformerBlockLM.MLP(embed_size=out_size).to(device)

        def forward(self, x):
            return x + self.think(x + self.comm(x))

    class MLP(nn.Module):
        # FFNN (embed_size, embed_size*4, embed_size)
        def __init__(self, embed_size,device="cuda"):
            super().__init__()
            self.mlp = nn.Sequential(nn.Linear(embed_size, embed_size * 4).to(device),
                                     nn.ReLU(),
                                     nn.Linear(embed_size * 4, embed_size).to(device))
            self.layerNorm = nn.LayerNorm(embed_size).to(device)

        def forward(self, x):  # think
            return self.layerNorm(self.mlp(x))  # paper - after
            # return self.mlp(self.layerNorm(x)) # alternate - before

    class MultiHeadAttention(nn.Module):
        """
        multiple parallel SA heads (communication among words)
        """

        def __init__(self, head_count, in_size, out_size,device="cuda"):
            super().__init__()
            self.heads = nn.ModuleList(
                TransformerBlockLM.SelfAttentionHead(in_size, out_size // head_count).to(device)
                for _ in range(head_count)
            )
            self.layerNorm = nn.LayerNorm(out_size).to(device)
            # self.proj = nn.Linear(out_size, out_size)

        def forward(self, x):
            # concat over channel/embeddings_size dimension
            return self.layerNorm(torch.cat([head(x) for head in self.heads], dim=-1))  # paper - after
            # return torch.cat([head(self.layerNorm(x)) for head in self.heads], dim=-1) # alternate - before
            # return self.proj(torch.cat([head(x) for head in self.heads], dim=-1))

    class SelfAttentionHead(nn.Module):
        def __init__(self, in_size, out_size,device="cuda"):
            """
            in_size is embed_size
            out_size is head_size
            """
            super().__init__()
            self.head_size = out_size
            self.K = nn.Linear(in_size, self.head_size, bias=False).to(device)
            self.Q = nn.Linear(in_size, self.head_size, bias=False).to(device)
            self.V = nn.Linear(in_size, self.head_size, bias=False).to(device)

        def forward(self, x):
            keys = self.K(x)
            queries = self.Q(x)
            # affinities :
            # all the queries will dot-product with all the keys
            # transpose (swap) second dimension (input_length) with third (head_size)
            keys_t = keys.transpose(1, 2)
            autocorrs = (queries @ keys_t) * (self.head_size ** -0.5)  # (batch_size x input_length x input_length)
            '''
            (batch_size x input_length x embed_size) @ (batch_size x embed_size x input_length) ----> (batch_size x input_length x input_length)
            '''
            autocorrs = torch.tril(autocorrs)
            autocorrs = autocorrs.masked_fill(autocorrs == 0, float('-inf'))
            autocorrs = torch.softmax(autocorrs, dim=-1)
            values = self.V(x)  # (batch_size x input_length x head_size)
            out = autocorrs @ values
            return out

    def __init__(self, batch_size=4,
                 input_length=8,
                 embed_size=16,
                 sa_head_size=8,
                 sa_multihead_count=4,
                 pos_embed=False,
                 include_mlp=False):
        super().__init__()
        self.blocks = None
        self.ffn = None
        self.sa_heads = None
        # sa_head_size head_size of self-attention module
        self.sa_head_size = sa_head_size
        self.sa_multihead_count = sa_multihead_count

        self.val_data = None
        self.train_data = None
        self.val_text = None
        self.train_text = None
        self.K = None
        self.linear_sahead_to_vocab = None
        self.vocab = None
        self.token_embeddings_table = None
        self.vocab_size = None
        self.encoder = None
        self.decoder = None
        self.vocab_size: int
        self.is_pos_emb = pos_embed
        self.include_mlp = include_mlp
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # input_length = how many consecutive tokens/chars in one input
        self.input_length = input_length
        # batch_size = how many inputs are going to be processed in-parallel (on GPU)
        self.batch_size = batch_size
        # embed_size = embedding size
        self.embed_size = embed_size

        self.lm_head = None
        self.position_embeddings_table = None

    def forward(self, in_ids, target=None):
        # print("Hello")
        # print(in_ids.shape)
        # print(self.token_embeddings_table)
        in_ids_emb = self.token_embeddings_table(in_ids[:, -self.input_length:])
        # print("Hello")
        if self.is_pos_emb:
            in_ids_pos_emb = self.position_embeddings_table(
                torch.arange(in_ids[:, -self.input_length:].shape[1], device=self.device)
            )
            in_ids_emb = in_ids_emb + in_ids_pos_emb
        block_outputs = self.blocks((in_ids_emb))
        logits = self.linear_sahead_to_vocab(block_outputs).to(self.device)  # compute
        if target is None:
            ce_loss = None
        else:
            batch_size, input_length, vocab_size = logits.shape
            logits_ = logits.view(batch_size * input_length, vocab_size)
            targets = target.view(batch_size * input_length)
            ce_loss = F.cross_entropy(logits_, targets)
        return logits, ce_loss

    def fit(self, train_iters=100, eval_iters=10, lr=0.0001):
        """
        train_iters = how many training iterations
        eval_iters = how many batches to evaluate to get average performance
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        for iteration in tqdm(range(train_iters)):
            if iteration % eval_iters == 0:
                avg_loss = self.eval_loss(eval_iters)
                print(f"iter {iteration}: train {avg_loss['train']} val {avg_loss['eval']}")
            inputs, targets = self.get_batch(split='train')
            _, ce_loss = self(inputs, targets)
            optimizer.zero_grad(set_to_none=True)  # clear gradients of previous step
            ce_loss.backward()  # propagate loss back to each unit in the network
            optimizer.step()  # update network parameters w.r.t the loss
        # torch.save(self, 'sa_pos_')

    def generate(self, context_token_ids, max_new_tokens):
        for _ in range(max_new_tokens):
            token_rep, _ = self(context_token_ids)
            last_token_rep = token_rep[:, -1, :]
            probs = F.softmax(last_token_rep, dim=1)
            next_token = torch.multinomial(probs, num_samples=1)
            context_token_ids = torch.cat((context_token_ids, next_token), dim=1)
        output_text = self.decoder(context_token_ids[0].tolist())
        return output_text

    @torch.no_grad()  # tell torch not to prepare for back-propagation (context manager)
    def eval_loss(self, eval_iters):
        perf = {}
        # set dropout and batch normalization layers to evaluation mode before running inference.
        self.eval()
        for split in ['train', 'eval']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                tokens, targets = self.get_batch(split)  # get random batch of inputs and targete
                _, ce_loss = self(tokens, targets)  # forward pass
                losses[k] = ce_loss.item()  # the value of loss tensor as a standard Python number
            perf[split] = losses.mean()
        self.train()  # turn-on training mode-
        return perf

    def prep(self, corpus):
        self.vocab = sorted(list(set(corpus)))
        self.vocab_size = len(self.vocab)
        c2i = {c: i for i, c in
               enumerate(self.vocab)}  # char c to integer i map. assign value i for every word in vocab
        i2c = {i: c for c, i in c2i.items()}  # integer i to char c map

        self.encoder = lambda doc: [c2i[c] for c in doc]
        self.decoder = lambda nums: ''.join([i2c[i] for i in nums])
        text=corpus
        n = len(text)
        self.train_text = text[:int(n * 0.9)]
        self.val_text = text[int(n * 0.9):]

        self.train_data = torch.tensor(self.encoder(self.train_text), dtype=torch.long)
        self.val_data = torch.tensor(self.encoder(self.val_text), dtype=torch.long)

        # look-up table for embeddings (vocab_size x embed_size)
        # it will be mapping each token id to a vector of embed_size
        # a wrapper to store vector representations of each token
        self.token_embeddings_table = \
            nn.Embedding(self.vocab_size, self.embed_size).to(self.device)

        if self.is_pos_emb:
            self.position_embeddings_table = nn.Embedding(self.input_length, self.embed_size).to(self.device)

        self.blocks = nn.Sequential(
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
        )
        # linear projection of sa_head output to vocabulary
        self.linear_sahead_to_vocab = nn.Linear(self.sa_head_size, self.vocab_size).to(self.device)
        return c2i
    def get_batch(self, split='train'):
        data = self.train_data if split == 'train' else self.val_data
        # get random chunks of length batch_size from data
        ix = torch.randint(len(data) - self.input_length,
                           (self.batch_size,))
        inputs_batch = torch.stack([data[i:i + self.input_length] for i in ix])
        targets_batch = torch.stack([data[i + 1:i + self.input_length + 1] for i in ix])
        inputs_batch = inputs_batch.to(self.device)
        targets_batch = targets_batch.to(self.device)
        # inputs_batch is
        return inputs_batch, targets_batch

In [12]:
with open('emily_dickonson.txt', 'r') as f:
    text = f.read()

# text = 'a quick brown fox jumps over the lazy dog.\n ' \
#        'lazy dog and a quick brown fox.\n' \
#        'the dog is lazy and the fox jumps quickly.\n' \
#        'a fox jumps over the dog because he is lazy.\n' \
#        'dog is lazy and fox is brown. she quickly jumps over the lazy dog.'

model = TransformerBlockLM(batch_size=64,
                           input_length=32,
                           embed_size=128,
                           sa_multihead_count=8,
                           sa_head_size=128,
                           pos_embed=True,
                           include_mlp=True)
model = model.to(model.device)
# model="cpu"
model.prep(text)
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
print(f'params {sum([np.prod(p.size()) for p in model_parameters])}')
# input_batch, output_batch = model.get_batch(split='train')
# _, _ = model(input_batch, output_batch)


params 1112398


In [13]:
model.fit(train_iters=4000, eval_iters=1000, lr=1e-3)


  0%|          | 1/4000 [00:23<26:16:58, 23.66s/it]

iter 0: train 5.754003524780273 val 5.807081699371338


 25%|██▌       | 1001/4000 [01:53<3:19:43,  4.00s/it]

iter 1000: train 1.663784146308899 val 1.7214789390563965


 50%|█████     | 2001/4000 [03:22<2:00:21,  3.61s/it]

iter 2000: train 1.3952398300170898 val 1.7330751419067383


 75%|███████▌  | 3001/4000 [04:46<51:54,  3.12s/it]  

iter 3000: train 1.1602956056594849 val 1.8472594022750854


100%|██████████| 4000/4000 [05:47<00:00, 11.51it/s]


In [14]:
outputs = model.generate(context_token_ids=torch.zeros((1, 1),dtype=torch.long,device=model.device),max_new_tokens=1000)
print(outputs)



PUTUL.

The gave of overges
   I tell hem the hills —
The istants 'd leave the resent the size
As but one, believed for thought.

Next not affoded pope; apart of paraise.
Sincle placiled as the nest haids,
We cribsence rarered them in reached out open, relieves in low,
And wend well to knowl, that well.

Mage wandering back nought, it real, —
To denied it, it makes all,
And the bodiless slipped,
Sill were suspected landscape
     For the birds stars to dails upon man,

Speers surpasse,
And vented, and dewle a thing
Capated etern-ustance plush
I could not exist it will come a well!
  The woods are plain.

XV.

THE LOST JOUTY.

I'll sound at opposite,
This, the midnights all well,
   We knowing it is low mat
Is rambles on the crose at warm.

My second angels, we mornings blossom into report;
A ridd the dew:
If were in the dare run;
When closer twill we are known
And then the brakes 't is, the maids?
Begeads we doubt, buy night
    For heaven everywhere.

To see if the wall do,
Spinsibl

### 1.A) Identify the points where code is different from the proposed architecture in Google's patent (provided in the module) (5 points)


* **Position Embedding**: The code implements Learned Position Embedding, while the Google patent utilizes Sinusoidal Position Embedding, demonstrating superior position understanding.

* **Architecture Composition**: The code comprises a variation of the decoder component, diverging from the proposed architecture in the Google patent, which includes both encoder and decoder sections.

* **Multi-Head Attention Variations**: While the Google patent introduces three types of Multi-Head Attention (Multi-Head Attention, Encoder-Decoder Multi-Head Attention, and Masked Multi-Head Attention), the code solely employs Masked Multi-Head Attention.

* **Decoder Layer Modifications**: In the proposed model, the decoder layer incorporates Masked Multi-Head Attention, Encoder-Decoder Multi-Head Attention, Layer Normalization, Multi-layer Perceptron layer, and skip connections. Conversely, the code implements Masked Multi-Head Attention, Layer Normalization, Multi-layer Perceptron layer, and skip connections, with variations from the original patent architecture.

* **Padding**: In the proposed model by google, they use padding whereas oin the given code there is no padding of input tokens.

### 1.B) Re-design the code to make it more intuitive. Give arguments why do you think your code is better. (5 points)
#### e.g. Find a bug and fix it, OR restructure the classes, OR read the parameters from a config file OR ask chatgpt OR add visualizations of key, query, value OR any other idea you deem necessary to improve the code.

### Re-designing the Code for Improved Intuitiveness

#### My Arguments:

1. **Position Embedding**:  
   *Learned Position Embeddings vs. Sinusoidal Position Embeddings*  
   - *Learned Position Embeddings*: These embeddings are learned during training, offering flexibility to adapt to specific sequence patterns.
   - *Sinusoidal Position Embeddings*: Provide fixed embeddings based on sine and cosine functions, offering simplicity but lack adaptability.

2. **Bugs**:  
   - Some parts of the code lack compatibility with GPU settings, hindering performance and efficiency.
   - Incorrect usage of the number of heads and head dimension leads to incorrect multi-head attention implementation.

3. **Code Structure**:  
   - The current code structure does not adhere to industry standards, making it challenging to maintain and scale.

4. **Data Preparation**:  
   - The variable named "text" in data preparation should be renamed to "corpus" for clarity and consistency.

5. **Model Architecture**:  
   - The Self Attention module only performs Masked Self Attention, lacking the flexibility to switch between Masked and Non-Masked Self Attention.

#### Proposed Improvements:
1. **Bug Fixes**: Address compatibility issues with GPU settings and correct calculations for multi-head attention dimensions.
2. **Code Refactoring**: Restructure the code following industry best practices for improved readability and maintainability.
3. **Parameterization**: Utilize configuration files to manage hyperparameters, enhancing flexibility and ease of experimentation.
4. **Comprehensive Attention Mechanisms**: Implement both Masked and Non-Masked Self Attention options to enhance model versatility.
5. **Documentation and Comments**: Enhance code documentation and add explanatory comments to improve understanding and collaboration.

I have addressed these issues and redesigned the architecture in my version of the model named "model_venky".

### Below is a rough architecture used in the Google Patent

In [2]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(TransformerModel, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.transformer = nn.Transformer(d_model, nhead=num_heads, num_encoder_layers=num_layers, num_decoder_layers=num_layers, dim_feedforward=d_ff, dropout=dropout)
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt):
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))
        output = self.transformer(src_embedded, tgt_embedded)
        output = self.fc(output)
        return output

### I redesigned the given code to encorporate both Encoder and Decoder architecture

In [16]:
class TransformerBlockLM(nn.Module):
    class TransformerBlock(nn.Module):
        def __init__(self, head_count, in_size):
            super().__init__()
            self.encoder_bl= nn.Sequential(*[TransformerBlockLM.encoder_block(head_count,in_size) for _ in range(6)])
            self.decoder_bl= nn.ModuleList([TransformerBlockLM.decoder_block(head_count,in_size) for _ in range(6)])

        def forward(self, x,target):
            encoder_output=self.encoder_bl(x)
            for decoder in self.decoder_bl:
                output_temp=decoder(target,encoder_output)
                target=output_temp
            # print(output_temp.shape)
            return output_temp

    class MLP(nn.Module):
        # FFNN (embed_size, embed_size*4, embed_size)
        def __init__(self, embed_size):
            super().__init__()
            self.mlp = nn.Sequential(nn.Linear(embed_size, embed_size * 4),
                                     nn.ReLU(),
                                     nn.Linear(embed_size * 4, embed_size))
            self.layerNorm = nn.LayerNorm(embed_size)

        def forward(self, x):  # think
            return self.layerNorm(self.mlp(x))  # paper - after
            # return self.mlp(self.layerNorm(x)) # alternate - before

    class MultiHeadAttention(nn.Module):
        """
        multiple parallel SA heads (communication among words)
        """

        def __init__(self, head_count, in_size,mask=False):
            super().__init__()
            self.heads = nn.ModuleList(
                TransformerBlockLM.SelfAttentionHead(in_size, in_size // head_count,mask)
                for _ in range(head_count)
            )
            self.layerNorm = nn.LayerNorm(in_size)
            # self.proj = nn.Linear(out_size, out_size)

        def forward(self, q,k,v):
            # concat over channel/embeddings_size dimension
            return self.layerNorm(torch.cat([head(q,k,v) for head in self.heads], dim=-1))  # paper - after
            # return torch.cat([head(self.layerNorm(x)) for head in self.heads], dim=-1) # alternate - before
            # return self.proj(torch.cat([head(x) for head in self.heads], dim=-1))

    class SelfAttentionHead(nn.Module):
        def __init__(self, in_size, head_size,mask=False):
            """
            in_size is embed_size
            out_size is head_size
            """
            super().__init__()
            self.head_size = head_size
            self.K = nn.Linear(in_size, self.head_size, bias=False)
            self.Q = nn.Linear(in_size, self.head_size, bias=False)
            self.V = nn.Linear(in_size, self.head_size, bias=False)
            self.mask=mask

        def forward(self, q,k,v):
            keys = self.K(k)
            queries = self.Q(q)
            # affinities :
            # all the queries will dot-product with all the keys
            # transpose (swap) second dimension (input_length) with third (head_size)
            keys_t = keys.transpose(1, 2)
            autocorrs = (queries @ keys_t) * (self.head_size ** -0.5)  # (batch_size x input_length x input_length)
            '''
            (batch_size x input_length x embed_size) @ (batch_size x embed_size x input_length) ----> (batch_size x input_length x input_length)
            '''
            if self.mask==True:
                autocorrs = torch.tril(autocorrs)
                autocorrs = autocorrs.masked_fill(autocorrs == 0, float('-inf'))
            autocorrs = torch.softmax(autocorrs, dim=-1)
            values = self.V(v)  # (batch_size x input_length x head_size)
            out = autocorrs @ values
            return out
    
    class encoder_block(nn.Module):
        def __init__(self,head_count, in_size):
            super().__init__()
            embd_size=in_size
            self.mha=TransformerBlockLM.MultiHeadAttention(head_count, in_size)
            self.mlp=TransformerBlockLM.MLP(embd_size)
        def forward(self,input):
            x1= self.mha(input,input,input)+input
            x2= self.mlp(x1)+x1
            return x2
            
    class decoder_block(nn.Module):
        def __init__(self,head_count, in_size):
            super().__init__()
            self.mask_mha=TransformerBlockLM.MultiHeadAttention(head_count, in_size,mask=True)
            self.mha=TransformerBlockLM.MultiHeadAttention(head_count, in_size,mask=False)
            self.mlp=TransformerBlockLM.MLP(in_size)
        def forward(self,target,enc_output):
            x1= self.mha(target, target,target)+target
            x2= self.mha(x1, enc_output,enc_output)+x1
            x3= self.mlp(x2)+x2
            return x3
    
    def __init__(self, batch_size=4,
                 input_length=8,
                 embed_size=16,
                 sa_multihead_count=4,
                 num_encoders=6,
                 num_decoders=6,
                 pos_embed=False,
                 include_mlp=False):
    
        super().__init__()
        self.blocks = None
        self.ffn = None
        self.sa_heads = None
        # sa_head_size head_size of self-attention module
        # self.sa_head_size = sa_head_size
        self.sa_multihead_count = sa_multihead_count

        self.val_data = None
        self.train_data = None
        self.val_text = None
        self.train_text = None
        self.K = None
        self.linear_sahead_to_vocab = None
        self.vocab = None
        self.token_embeddings_table = None
        self.vocab_size = None
        self.encoder = None
        self.decoder = None
        self.vocab_size: int
        self.is_pos_emb = pos_embed
        self.include_mlp = include_mlp
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # input_length = how many consecutive tokens/chars in one input
        self.input_length = input_length
        # batch_size = how many inputs are going to be processed in-parallel (on GPU)
        self.batch_size = batch_size
        # embed_size = embedding size
        self.embed_size = embed_size

        self.lm_head = None
        self.position_embeddings_table = None
        self.num_encoders=num_encoders
        self.num_decoders=num_decoders
        self.Transformer=TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,in_size=self.embed_size)
        

    def forward(self, in_ids, target=None):
        in_ids_emb = self.token_embeddings_table(in_ids[:, -self.input_length:])
        if self.is_pos_emb:
            in_ids_emb=self.positional_encoding(self.encoder_embedding(in_ids_emb))
        if target is None:
            decoder_output=self.Transformer(in_ids_emb,target)
        else:
            target_emb = self.token_embeddings_table(target)
            target_emb=self.positional_encoding(self.encoder_embedding(target_emb))
            decoder_output=self.Transformer(in_ids_emb,target_emb)
        logits = self.linear_vocab(decoder_output)  # compute

        if target is None:
            ce_loss = None
        else:
            batch_size, input_length, vocab_size = logits.shape
            logits_ = logits.view(batch_size * input_length, vocab_size)            
            targets = target.view(batch_size * input_length)
            ce_loss = F.cross_entropy(logits_, targets)
        return logits, ce_loss

    def fit(self, train_iters=100, eval_iters=10, lr=0.0001):
        """
        train_iters = how many training iterations
        eval_iters = how many batches to evaluate to get average performance
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        for iteration in range(train_iters):
            # print(iteration)
            if iteration % eval_iters == 0:
                avg_loss = self.eval_loss(eval_iters)
                print(f"iter {iteration}: train loss: {avg_loss['train']} val loss: {avg_loss['eval']}")
            inputs, targets = self.get_batch(split='train')
            _, ce_loss = self(inputs, targets)
            optimizer.zero_grad(set_to_none=True)  # clear gradients of previous step
            ce_loss.backward()  # propagate loss back to each unit in the network
            optimizer.step()  # update network parameters w.r.t the loss
        # torch.save(self, 'sa_pos_')

    def generate(self, context_token_ids, max_new_tokens):
        for _ in range(max_new_tokens):
            token_rep, _ = self(context_token_ids)
            last_token_rep = token_rep[:, -1, :]
            probs = F.softmax(last_token_rep, dim=1)
            next_token = torch.multinomial(probs, num_samples=1)
            context_token_ids = torch.cat((context_token_ids, next_token), dim=1)
        output_text = self.decoder(context_token_ids[0].tolist())
        return output_text

    @torch.no_grad()  # tell torch not to prepare for back-propagation (context manager)
    def eval_loss(self, eval_iters):
        perf = {}
        # set dropout and batch normalization layers to evaluation mode before running inference.
        self.eval()
        for split in ['train', 'eval']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                tokens, targets = self.get_batch(split)  # get random batch of inputs and targete
                _, ce_loss = self(tokens, targets)  # forward pass
                losses[k] = ce_loss.item()  # the value of loss tensor as a standard Python number
            perf[split] = losses.mean()
        self.train()  # turn-on training mode-
        return perf

    def prep(self, corpus):
        self.vocab = sorted(list(set(corpus)))
        self.vocab_size = len(self.vocab)
        c2i = {c: i for i, c in
               enumerate(self.vocab)}  # char c to integer i map. assign value i for every word in vocab
        i2c = {i: c for c, i in c2i.items()}  # integer i to char c map

        self.encoder = lambda doc: [c2i[c] for c in doc]
        self.decoder = lambda nums: ''.join([i2c[i] for i in nums])
        n = len(corpus)
        # print(corpus)
        self.train_text = corpus[:int(n * 0.9)]
        self.val_text = corpus[int(n * 0.9):]

        self.train_data = torch.tensor(self.encoder(self.train_text), dtype=torch.long)
        self.val_data = torch.tensor(self.encoder(self.val_text), dtype=torch.long)

        # look-up table for embeddings (vocab_size x embed_size)
        # it will be mapping each token id to a vector of embed_size
        # a wrapper to store vector representations of each token
        self.token_embeddings_table = \
            nn.Embedding(self.vocab_size, self.embed_size)

        if self.is_pos_emb:
            # self.position_embeddings_table = PositionalEncoding(self.embed_size, self.input_length)
            self.position_embeddings_table = PositionalEncoding(self.embed_size, self.input_length)
        
        # linear projection of sa_head output to vocabulary
        self.linear_vocab = nn.Linear(self.embed_size, self.vocab_size)

    def get_batch(self, split='train'):
        data = self.train_data if split == 'train' else self.val_data
        # get random chunks of length batch_size from data
        ix = torch.randint(len(data) - self.input_length,(self.batch_size,))
        inputs_batch = torch.stack([data[i:i + self.input_length] for i in ix])
        targets_batch = torch.stack([data[i + 1:i + self.input_length + 1] for i in ix])
        inputs_batch = inputs_batch.to(self.device)
        targets_batch = targets_batch.to(self.device)
        # inputs_batch is
        return inputs_batch, targets_batch

In [18]:
model_enc_dec=TransformerBlockLM(batch_size=4,input_length=8,embed_size=16,sa_multihead_count=4,num_encoders=6,num_decoders=6,pos_embed=True,include_mlp=True)
print(model_enc_dec)

TransformerBlockLM(
  (Transformer): TransformerBlock(
    (encoder_bl): Sequential(
      (0): encoder_block(
        (mha): MultiHeadAttention(
          (heads): ModuleList(
            (0-3): 4 x SelfAttentionHead(
              (K): Linear(in_features=16, out_features=4, bias=False)
              (Q): Linear(in_features=16, out_features=4, bias=False)
              (V): Linear(in_features=16, out_features=4, bias=False)
            )
          )
          (layerNorm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
        (mlp): MLP(
          (mlp): Sequential(
            (0): Linear(in_features=16, out_features=64, bias=True)
            (1): ReLU()
            (2): Linear(in_features=64, out_features=16, bias=True)
          )
          (layerNorm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        )
      )
      (1): encoder_block(
        (mha): MultiHeadAttention(
          (heads): ModuleList(
            (0-3): 4 x SelfAttentionHead(
          

In [218]:
model_enc_dec.fit(train_iters=100, eval_iters=10, lr=1e-3)

iter 0: train loss: 6.543510437011719 val loss: 6.64241886138916
iter 10: train loss: 2.300812244415283 val loss: 2.2039215564727783
iter 20: train loss: 0.3769061863422394 val loss: 0.3353438973426819
iter 30: train loss: 0.10858909785747528 val loss: 0.08668842166662216
iter 40: train loss: 0.02294185198843479 val loss: 0.02188847027719021
iter 50: train loss: 0.010622554458677769 val loss: 0.010144343599677086
iter 60: train loss: 0.0023777219466865063 val loss: 0.0025417529977858067
iter 70: train loss: 0.0029227244667708874 val loss: 0.003797037061303854
iter 80: train loss: 0.0007192973280325532 val loss: 0.0021690037101507187
iter 90: train loss: 0.0022298030089586973 val loss: 0.0020070835016667843


### 2. Apply Transformer code provided in the module to train a language models that generates financial discourse in Warren Buffet's style. Train you model using Warren Buffets's Annual Letters to shareholders 

### 2.A) Data Pre-processing

In [4]:
import math
import pandas as pd
import re
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import warnings
# import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/desai.ven/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/desai.ven/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/desai.ven/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [81]:
# Data Cleaning

file_path = 'WarrenBuffet.txt'
with open(file_path, 'r') as f:
    text = f.read()

english_stopwords = stopwords.words('english')
ps = PorterStemmer()

# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()

def clean_text(text_data):
    text_lower = text_data.lower()
    text_link_removed = re.sub(r'http\S+', '', text_lower)
    text_punc_removed = re.sub(r'[^\w\s]', '', text_link_removed)
    text_tokenized = word_tokenize(text_punc_removed)
    # text_cleaned = [word for word in text_tokenized if word not in english_stopwords]
    text_cleaned = [ps.stem(word) for word in text_tokenized]
#     text_cleaned = [lemmatizer.lemmatize(word) for word in text_cleaned]
    return text_cleaned

cleaned_text = clean_text(text)
len(cleaned_text)

53837

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


### 2.B) Changes in the given model - Venky Model

In [34]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length, device="cuda"):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0).to(device))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [35]:
class SelfAttentionHead(nn.Module):
        def __init__(self, in_size, head_size,device="cuda"):
            """
            in_size is embed_size
            out_size is head_size
            """
            super().__init__()
            self.head_size = head_size
            self.K = nn.Linear(in_size, self.head_size, bias=False).to(device)
            self.Q = nn.Linear(in_size, self.head_size, bias=False).to(device)
            self.V = nn.Linear(in_size, self.head_size, bias=False).to(device)

        def forward(self, x,mask=True):
            keys = self.K(x)
            queries = self.Q(x)
            # affinities :
            # all the queries will dot-product with all the keys
            # transpose (swap) second dimension (input_length) with third (head_size)
            keys_t = keys.transpose(1, 2)
            autocorrs = (queries @ keys_t) * (self.head_size ** -0.5)  # (batch_size x input_length x input_length)
            '''
            (batch_size x input_length x embed_size) @ (batch_size x embed_size x input_length) ----> (batch_size x input_length x input_length)
            '''
            if mask:
                autocorrs = torch.tril(autocorrs)
                autocorrs = autocorrs.masked_fill(autocorrs == 0, float('-inf'))
            autocorrs = torch.softmax(autocorrs, dim=-1)
            values = self.V(x) # (batch_size x input_length x head_size)
            out = autocorrs @ values
            return out

In [36]:
class MultiHeadAttention(nn.Module):
        """
        multiple parallel SA heads (communication among words)
        """

        def __init__(self, head_count, in_size,device="cuda"):
            super().__init__()
            self.heads = nn.ModuleList(SelfAttentionHead(in_size, in_size // head_count).to(device)
                for _ in range(head_count)
            )
            self.layerNorm = nn.LayerNorm(in_size).to(device)
            # self.proj = nn.Linear(out_size, out_size)

        def forward(self, x):
            # concat over channel/embeddings_size dimension
            return self.layerNorm(torch.cat([head(x) for head in self.heads], dim=-1))  # paper - after

In [37]:
class MLP(nn.Module):
        # FFNN (embed_size, embed_size*4, embed_size)
        def __init__(self, embed_size,device="cuda"):
            super().__init__()
            self.mlp = nn.Sequential(nn.Linear(embed_size, embed_size * 4).to(device),
                                     nn.ReLU(),
                                     nn.Linear(embed_size * 4, embed_size).to(device))
            self.layerNorm = nn.LayerNorm(embed_size).to(device)

        def forward(self, x):  # think
            return self.layerNorm(self.mlp(x))

In [38]:
class TransformerBlock(nn.Module):
        def __init__(self, head_count, in_size):
            super().__init__()
            self.comm = MultiHeadAttention(head_count=head_count,in_size=in_size)#.to(self.device)
            self.think = MLP(embed_size=in_size)#.to(self.device)

        def forward(self, x):
            return (x + self.think(x + self.comm(x)))

In [39]:
class TransformerBlockLM(nn.Module):
    def __init__(self, batch_size=4,
                 input_length=8,
                 embed_size=16,
                 sa_multihead_count=4,
                 pos_embed=True,
                 include_mlp=True):
        super().__init__()
        self.blocks = None
        self.ffn = None
        self.sa_heads = None
        # sa_head_size head_size of self-attention module
        self.sa_multihead_count = sa_multihead_count

        self.val_data = None
        self.train_data = None
        self.val_text = None
        self.train_text = None
        self.K = None
        self.linear_sahead_to_vocab = None
        self.vocab = None
        self.token_embeddings_table = None
        self.vocab_size = None
        self.encoder = None
        self.decoder = None
        self.vocab_size: int
        self.is_pos_emb = pos_embed
        self.include_mlp = include_mlp
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # input_length = how many consecutive tokens/chars in one input
        self.input_length = input_length
        # batch_size = how many inputs are going to be processed in-parallel (on GPU)
        self.batch_size = batch_size
        # embed_size = embedding size
        self.embed_size = embed_size

        self.lm_head = None
        self.position_embeddings_table = None

    def forward(self, in_ids, target=None):
        in_ids_emb = self.token_embeddings_table(in_ids[:, -self.input_length:]).to(self.device)
        if self.is_pos_emb:
            in_ids_emb=self.position_embeddings_table(in_ids_emb).to(self.device)
        block_outputs = self.blocks(in_ids_emb)
        logits = self.linear_sahead_to_vocab(block_outputs).to(self.device)  # compute
        
        if target is None:
            ce_loss = None
        else:
            batch_size, input_length, vocab_size = logits.shape
            logits_ = logits.view(batch_size * input_length, vocab_size)
            targets = target.view(batch_size * input_length)
            ce_loss = F.cross_entropy(logits_, targets)
        return logits, ce_loss

    def fit(self, train_iters=100, eval_iters=10, lr=0.0001):
        """
        train_iters = how many training iterations
        eval_iters = how many batches to evaluate to get average performance
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        for iteration in tqdm(range(train_iters)):
            if iteration % eval_iters == 0:
                avg_loss = self.eval_loss(eval_iters)
                print(f"iter {iteration}: train {avg_loss['train']} val {avg_loss['eval']}")
            inputs, targets = self.get_batch(split='train')
            _, ce_loss = self(inputs.to(self.device), targets.to(self.device))
            optimizer.zero_grad(set_to_none=True)  # clear gradients of previous step
            ce_loss.backward()  # propagate loss back to each unit in the network
            optimizer.step()  # update network parameters w.r.t the loss
        # torch.save(self, 'sa_pos_')

    def generate(self, context_token_ids, max_new_tokens):
        for _ in range(max_new_tokens):
            token_rep, _ = self(context_token_ids)
            last_token_rep = token_rep[:, -1, :]
            probs = F.softmax(last_token_rep, dim=1)
            next_token = torch.multinomial(probs, num_samples=1)
            context_token_ids = torch.cat((context_token_ids, next_token), dim=1)
        output_text = self.decoder(context_token_ids[0].tolist())
        return output_text

    @torch.no_grad()  # tell torch not to prepare for back-propagation (context manager)
    def eval_loss(self, eval_iters):
        perf = {}
        # set dropout and batch normalization layers to evaluation mode before running inference.
        self.eval()
        for split in ['train', 'eval']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                tokens, targets = self.get_batch(split)  # get random batch of inputs and targete
                _, ce_loss = self(tokens, targets)  # forward pass
                losses[k] = ce_loss.item()  # the value of loss tensor as a standard Python number
            perf[split] = losses.mean()
        self.train()  # turn-on training mode-
        return perf

    def prep(self, corpus):
        self.vocab = sorted(list(set(corpus)))
        self.vocab_size = len(self.vocab)
        c2i = {c: i for i, c in
               enumerate(self.vocab)}  # char c to integer i map. assign value i for every word in vocab
        i2c = {i: c for c, i in c2i.items()}  # integer i to char c map

        self.encoder = lambda doc: [c2i[c] for c in doc]
        self.decoder = lambda nums: ' '.join([i2c[i] for i in nums])

        text=corpus
        n = len(text)#text is the corpus
        self.train_text = text[:int(n * 0.9)]
        self.val_text = text[int(n * 0.9):]

        self.train_data = torch.tensor(self.encoder(self.train_text), dtype=torch.long)
        self.val_data = torch.tensor(self.encoder(self.val_text), dtype=torch.long)

        # look-up table for embeddings (vocab_size x embed_size)
        # it will be mapping each token id to a vector of embed_size
        # a wrapper to store vector representations of each token
        self.token_embeddings_table = nn.Embedding(self.vocab_size, self.embed_size).to(self.device)

        if self.is_pos_emb:
            # self.position_embeddings_table = nn.Embedding(self.input_length, self.embed_size)
            self.position_embeddings_table = PositionalEncoding(self.embed_size, self.input_length).to(self.device)
            

        self.blocks = nn.Sequential(
            TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size),
            TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size),
            TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size),
            TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size),
            TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size),
            TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size),
        )
        # linear projection of sa_head output to vocabulary
        self.linear_sahead_to_vocab = nn.Linear(self.embed_size, self.vocab_size).to(self.device)
        return c2i

    def get_batch(self, split='train'):
        data = self.train_data if split == 'train' else self.val_data
        # get random chunks of length batch_size from data
        ix = torch.randint(len(data) - self.input_length,(self.batch_size,))
        inputs_batch = torch.stack([data[i:i + self.input_length] for i in ix])
        targets_batch = torch.stack([data[i + 1:i + self.input_length + 1] for i in ix])
        inputs_batch = inputs_batch.to(self.device)
        targets_batch = targets_batch.to(self.device)
        # inputs_batch is
        
        return inputs_batch, targets_batch

In [40]:
model_venky = TransformerBlockLM(batch_size=64,
                           input_length=32,
                           embed_size=128,
                           sa_multihead_count=8,
                           pos_embed=True,
                           include_mlp=True)
model_venky = model_venky.to(model_venky.device)
vocab=model_venky.prep(cleaned_text)
model_parameters = filter(lambda p: p.requires_grad, model_venky.parameters())
print(f'params {sum([np.prod(p.size()) for p in model_parameters])}')

params 2400755


In [41]:
print(model_venky)

TransformerBlockLM(
  (token_embeddings_table): Embedding(5107, 128)
  (position_embeddings_table): PositionalEncoding()
  (blocks): Sequential(
    (0): TransformerBlock(
      (comm): MultiHeadAttention(
        (heads): ModuleList(
          (0-7): 8 x SelfAttentionHead(
            (K): Linear(in_features=128, out_features=16, bias=False)
            (Q): Linear(in_features=128, out_features=16, bias=False)
            (V): Linear(in_features=128, out_features=16, bias=False)
          )
        )
        (layerNorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
      (think): MLP(
        (mlp): Sequential(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): ReLU()
          (2): Linear(in_features=512, out_features=128, bias=True)
        )
        (layerNorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      )
    )
    (1): TransformerBlock(
      (comm): MultiHeadAttention(
        (heads): ModuleList(
          (0-7):

In [92]:
# model_venky.fit(train_iters=20000, eval_iters=1000, lr=1e-4)

  0%|          | 1/20000 [00:25<141:46:18, 25.52s/it]

iter 0: train 9.808846473693848 val 9.797698020935059


  5%|▌         | 1001/20000 [01:47<24:14:36,  4.59s/it]

iter 1000: train 4.214922904968262 val 6.238032817840576


 10%|█         | 2001/20000 [03:08<22:28:02,  4.49s/it]

iter 2000: train 2.2400100231170654 val 6.643329620361328


 15%|█▌        | 3001/20000 [04:30<15:24:12,  3.26s/it]

iter 3000: train 1.1253684759140015 val 7.279648303985596


 20%|██        | 4001/20000 [06:02<20:38:39,  4.65s/it]

iter 4000: train 0.6694943308830261 val 7.815800189971924


 25%|██▌       | 5001/20000 [07:30<10:42:04,  2.57s/it]

iter 5000: train 0.4890514016151428 val 8.238455772399902


 30%|███       | 6001/20000 [08:49<9:59:01,  2.57s/it] 

iter 6000: train 0.3979106843471527 val 8.593734741210938


 35%|███▌      | 7001/20000 [10:12<13:18:33,  3.69s/it]

iter 7000: train 0.3452211320400238 val 8.911792755126953


 40%|████      | 8001/20000 [11:37<14:52:57,  4.47s/it]

iter 8000: train 0.3048756718635559 val 9.17525863647461


 45%|████▌     | 9001/20000 [12:56<12:09:03,  3.98s/it]

iter 9000: train 0.2758282721042633 val 9.408883094787598


 50%|█████     | 10001/20000 [14:22<11:40:02,  4.20s/it]

iter 10000: train 0.26136037707328796 val 9.545560836791992


 55%|█████▌    | 11001/20000 [15:48<7:31:26,  3.01s/it] 

iter 11000: train 0.2503507435321808 val 9.741219520568848


 60%|██████    | 12001/20000 [17:10<6:14:59,  2.81s/it]

iter 12000: train 0.23830480873584747 val 9.870199203491211


 65%|██████▌   | 13001/20000 [18:36<7:02:15,  3.62s/it]

iter 13000: train 0.22813695669174194 val 10.062262535095215


 70%|███████   | 14001/20000 [20:02<6:23:20,  3.83s/it]

iter 14000: train 0.21847420930862427 val 10.219225883483887


 75%|███████▌  | 15001/20000 [21:24<4:02:48,  2.91s/it]

iter 15000: train 0.21461302042007446 val 10.253060340881348


 80%|████████  | 16001/20000 [22:43<3:03:46,  2.76s/it]

iter 16000: train 0.20829837024211884 val 10.416168212890625


 85%|████████▌ | 17001/20000 [23:57<3:02:01,  3.64s/it]

iter 17000: train 0.2042946219444275 val 10.49260425567627


 90%|█████████ | 18001/20000 [25:22<1:41:01,  3.03s/it]

iter 18000: train 0.20247820019721985 val 10.610663414001465


 95%|█████████▌| 19001/20000 [26:41<43:04,  2.59s/it]  

iter 19000: train 0.19791699945926666 val 10.659485816955566


100%|██████████| 20000/20000 [27:48<00:00, 11.99it/s]


### Save Model

In [93]:
# torch.save(model_venky.state_dict(), 'transformer_model_venky_no_stopwords_e4.pth')

### Load the saved parameters into the model

In [43]:
model_venky.load_state_dict(torch.load('transformer_model_venky_no_stopwords_e4.pth'))

<All keys matched successfully>

In [45]:
outputs_venky = model_venky.generate(context_token_ids=torch.zeros((1, 1),dtype=torch.long,device=model_venky.device),max_new_tokens=1000)
print(outputs_venky)


02 236 1974 1978 244 43 201 1975 1979 301 147 154 1976 1980 334 139 195 1977 1981 290 81 209 1978 1982 299 141 158 1979 1983 316 173 143 1980 1984 270 148 122 1981 1985 326 146 180 1982 1986 315 198 117 liabil net the senior citizen off the senior citizen who run start neg in other insur compani under 2002 well be costfre under hi leadership and we dave sokol had about the enorm 1999 304 285 mitek enorm proud of our yearend arizona by hold in aggreg our dividend repurchas higher all the big at midamerican famili 2006 80636 compound growth rate 19652006 275 perform will be about explain whi those for a sharehold quit grow becaus of the quarter from our endoftheyear estim made or iou to the rest of the world like i felt i owe my berkshir share by tad montross at bottom a sound insur oper requir four disciplin 1 an understand of all exposur that might caus a polici to incur loss 2 a conserv evalu of the likelihood of ani 107 41 2401 2105 39037142 walmart store inc 11 1893 2105 358936125 w

### 2.C) Prof. Model response

In [46]:
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

''' Look at all previous tokens to generate next
    @Author: Uzair Ahmad
    2022
    +TransformerBlock 
'''


class TransformerBlockLM(nn.Module):
    class TransformerBlock(nn.Module):
        def __init__(self, head_count, in_size, out_size,device="cuda"):
            super().__init__()
            self.comm = TransformerBlockLM.MultiHeadAttention(head_count=head_count,
                                                              in_size=in_size,
                                                              out_size=out_size).to(device)
            self.think = TransformerBlockLM.MLP(embed_size=out_size).to(device)

        def forward(self, x):
            return x + self.think(x + self.comm(x))

    class MLP(nn.Module):
        # FFNN (embed_size, embed_size*4, embed_size)
        def __init__(self, embed_size,device="cuda"):
            super().__init__()
            self.mlp = nn.Sequential(nn.Linear(embed_size, embed_size * 4).to(device),
                                     nn.ReLU(),
                                     nn.Linear(embed_size * 4, embed_size).to(device))
            self.layerNorm = nn.LayerNorm(embed_size).to(device)

        def forward(self, x):  # think
            return self.layerNorm(self.mlp(x))  # paper - after
            # return self.mlp(self.layerNorm(x)) # alternate - before

    class MultiHeadAttention(nn.Module):
        """
        multiple parallel SA heads (communication among words)
        """

        def __init__(self, head_count, in_size, out_size,device="cuda"):
            super().__init__()
            self.heads = nn.ModuleList(
                TransformerBlockLM.SelfAttentionHead(in_size, out_size // head_count).to(device)
                for _ in range(head_count)
            )
            self.layerNorm = nn.LayerNorm(out_size).to(device)
            # self.proj = nn.Linear(out_size, out_size)

        def forward(self, x):
            # concat over channel/embeddings_size dimension
            return self.layerNorm(torch.cat([head(x) for head in self.heads], dim=-1))  # paper - after
            # return torch.cat([head(self.layerNorm(x)) for head in self.heads], dim=-1) # alternate - before
            # return self.proj(torch.cat([head(x) for head in self.heads], dim=-1))

    class SelfAttentionHead(nn.Module):
        def __init__(self, in_size, out_size,device="cuda"):
            """
            in_size is embed_size
            out_size is head_size
            """
            super().__init__()
            self.head_size = out_size
            self.K = nn.Linear(in_size, self.head_size, bias=False).to(device)
            self.Q = nn.Linear(in_size, self.head_size, bias=False).to(device)
            self.V = nn.Linear(in_size, self.head_size, bias=False).to(device)

        def forward(self, x):
            keys = self.K(x)
            queries = self.Q(x)
            # affinities :
            # all the queries will dot-product with all the keys
            # transpose (swap) second dimension (input_length) with third (head_size)
            keys_t = keys.transpose(1, 2)
            autocorrs = (queries @ keys_t) * (self.head_size ** -0.5)  # (batch_size x input_length x input_length)
            '''
            (batch_size x input_length x embed_size) @ (batch_size x embed_size x input_length) ----> (batch_size x input_length x input_length)
            '''
            autocorrs = torch.tril(autocorrs)
            autocorrs = autocorrs.masked_fill(autocorrs == 0, float('-inf'))
            autocorrs = torch.softmax(autocorrs, dim=-1)
            values = self.V(x)  # (batch_size x input_length x head_size)
            out = autocorrs @ values
            return out

    def __init__(self, batch_size=4,
                 input_length=8,
                 embed_size=16,
                 sa_head_size=8,
                 sa_multihead_count=4,
                 pos_embed=False,
                 include_mlp=False):
        super().__init__()
        self.blocks = None
        self.ffn = None
        self.sa_heads = None
        # sa_head_size head_size of self-attention module
        self.sa_head_size = sa_head_size
        self.sa_multihead_count = sa_multihead_count

        self.val_data = None
        self.train_data = None
        self.val_text = None
        self.train_text = None
        self.K = None
        self.linear_sahead_to_vocab = None
        self.vocab = None
        self.token_embeddings_table = None
        self.vocab_size = None
        self.encoder = None
        self.decoder = None
        self.vocab_size: int
        self.is_pos_emb = pos_embed
        self.include_mlp = include_mlp
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # input_length = how many consecutive tokens/chars in one input
        self.input_length = input_length
        # batch_size = how many inputs are going to be processed in-parallel (on GPU)
        self.batch_size = batch_size
        # embed_size = embedding size
        self.embed_size = embed_size

        self.lm_head = None
        self.position_embeddings_table = None

    def forward(self, in_ids, target=None):
        # print("Hello")
        # print(in_ids.shape)
        # print(self.token_embeddings_table)
        in_ids_emb = self.token_embeddings_table(in_ids[:, -self.input_length:])
        # print("Hello")
        if self.is_pos_emb:
            in_ids_pos_emb = self.position_embeddings_table(
                torch.arange(in_ids[:, -self.input_length:].shape[1], device=self.device)
            )
            in_ids_emb = in_ids_emb + in_ids_pos_emb
        block_outputs = self.blocks((in_ids_emb))
        logits = self.linear_sahead_to_vocab(block_outputs).to(self.device)  # compute
        if target is None:
            ce_loss = None
        else:
            batch_size, input_length, vocab_size = logits.shape
            logits_ = logits.view(batch_size * input_length, vocab_size)
            targets = target.view(batch_size * input_length)
            ce_loss = F.cross_entropy(logits_, targets)
        return logits, ce_loss

    def fit(self, train_iters=100, eval_iters=10, lr=0.0001):
        """
        train_iters = how many training iterations
        eval_iters = how many batches to evaluate to get average performance
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        for iteration in tqdm(range(train_iters)):
            if iteration % eval_iters == 0:
                avg_loss = self.eval_loss(eval_iters)
                print(f"iter {iteration}: train {avg_loss['train']} val {avg_loss['eval']}")
            inputs, targets = self.get_batch(split='train')
            _, ce_loss = self(inputs, targets)
            optimizer.zero_grad(set_to_none=True)  # clear gradients of previous step
            ce_loss.backward()  # propagate loss back to each unit in the network
            optimizer.step()  # update network parameters w.r.t the loss
        # torch.save(self, 'sa_pos_')

    def generate(self, context_token_ids, max_new_tokens):
        for _ in range(max_new_tokens):
            token_rep, cse_loss = self(context_token_ids)
            last_token_rep = token_rep[:, -1, :]
            probs = F.softmax(last_token_rep, dim=1)
            next_token = torch.multinomial(probs, num_samples=1)
            context_token_ids = torch.cat((context_token_ids, next_token), dim=1)
        output_text = self.decoder(context_token_ids[0].tolist())
        return output_text

    @torch.no_grad()  # tell torch not to prepare for back-propagation (context manager)
    def eval_loss(self, eval_iters):
        perf = {}
        # set dropout and batch normalization layers to evaluation mode before running inference.
        self.eval()
        for split in ['train', 'eval']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                tokens, targets = self.get_batch(split)  # get random batch of inputs and targete
                _, ce_loss = self(tokens, targets)  # forward pass
                losses[k] = ce_loss.item()  # the value of loss tensor as a standard Python number
            perf[split] = losses.mean()
        self.train()  # turn-on training mode-
        return perf

    def prep(self, corpus):
        self.vocab = sorted(list(set(corpus)))
        self.vocab_size = len(self.vocab)
        c2i = {c: i for i, c in
               enumerate(self.vocab)}  # char c to integer i map. assign value i for every word in vocab
        i2c = {i: c for c, i in c2i.items()}  # integer i to char c map

        self.encoder = lambda doc: [c2i[c] for c in doc]
        self.decoder = lambda nums: ' '.join([i2c[i] for i in nums])
        text=corpus
        n = len(text)
        self.train_text = text[:int(n * 0.9)]
        self.val_text = text[int(n * 0.9):]

        self.train_data = torch.tensor(self.encoder(self.train_text), dtype=torch.long)
        self.val_data = torch.tensor(self.encoder(self.val_text), dtype=torch.long)

        # look-up table for embeddings (vocab_size x embed_size)
        # it will be mapping each token id to a vector of embed_size
        # a wrapper to store vector representations of each token
        self.token_embeddings_table = \
            nn.Embedding(self.vocab_size, self.embed_size).to(self.device)

        if self.is_pos_emb:
            self.position_embeddings_table = nn.Embedding(self.input_length, self.embed_size).to(self.device)

        self.blocks = nn.Sequential(
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
            TransformerBlockLM.TransformerBlock(head_count=self.sa_multihead_count,
                                                in_size=self.embed_size,
                                                out_size=self.sa_head_size),
        )
        # linear projection of sa_head output to vocabulary
        self.linear_sahead_to_vocab = nn.Linear(self.sa_head_size, self.vocab_size).to(self.device)
        return c2i
    def get_batch(self, split='train'):
        data = self.train_data if split == 'train' else self.val_data
        # get random chunks of length batch_size from data
        ix = torch.randint(len(data) - self.input_length,
                           (self.batch_size,))
        inputs_batch = torch.stack([data[i:i + self.input_length] for i in ix])
        targets_batch = torch.stack([data[i + 1:i + self.input_length + 1] for i in ix])
        inputs_batch = inputs_batch.to(self.device)
        targets_batch = targets_batch.to(self.device)
        # inputs_batch is
        return inputs_batch, targets_batch

model = TransformerBlockLM(batch_size=64,
                           input_length=32,
                           embed_size=128,
                           sa_multihead_count=8,
                           sa_head_size=128,
                           pos_embed=True,
                           include_mlp=True)
model = model.to(model.device)


In [47]:
vocab=model.prep(cleaned_text)
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
print(f'params {sum([np.prod(p.size()) for p in model_parameters])}')


params 2404851


In [21]:
model.fit(train_iters=20000, eval_iters=1000, lr=1e-4)

  0%|          | 1/20000 [00:25<142:55:49, 25.73s/it]

iter 0: train 10.834053993225098 val 10.818982124328613


  5%|▌         | 1001/20000 [01:50<17:03:35,  3.23s/it]

iter 1000: train 5.281001091003418 val 6.567872047424316


 10%|█         | 2003/20000 [03:20<11:48:43,  2.36s/it]

iter 2000: train 2.9567742347717285 val 6.759474754333496


 15%|█▌        | 3003/20000 [04:47<14:00:59,  2.97s/it]

iter 3000: train 1.7320491075515747 val 7.25107479095459


 20%|██        | 4003/20000 [06:14<11:36:19,  2.61s/it]

iter 4000: train 1.0589375495910645 val 7.79243278503418


 25%|██▌       | 5001/20000 [07:46<16:43:28,  4.01s/it]

iter 5000: train 0.7249415516853333 val 8.351359367370605


 30%|███       | 6001/20000 [09:18<17:21:20,  4.46s/it]

iter 6000: train 0.5594689249992371 val 8.754654884338379


 35%|███▌      | 7001/20000 [10:43<10:01:12,  2.78s/it]

iter 7000: train 0.46669816970825195 val 9.127132415771484


 40%|████      | 8001/20000 [12:12<10:18:10,  3.09s/it]

iter 8000: train 0.40510857105255127 val 9.44779109954834


 45%|████▌     | 9001/20000 [13:35<10:28:55,  3.43s/it]

iter 9000: train 0.3588620126247406 val 9.716057777404785


 50%|█████     | 10001/20000 [15:03<8:28:20,  3.05s/it]

iter 10000: train 0.32399845123291016 val 9.969474792480469


 55%|█████▌    | 11003/20000 [16:37<6:41:51,  2.68s/it]

iter 11000: train 0.2953031063079834 val 10.221504211425781


 60%|██████    | 12001/20000 [18:03<6:13:45,  2.80s/it]

iter 12000: train 0.2738821506500244 val 10.466636657714844


 65%|██████▌   | 13001/20000 [19:28<5:24:38,  2.78s/it]

iter 13000: train 0.25600090622901917 val 10.681992530822754


 70%|███████   | 14001/20000 [20:52<4:14:17,  2.54s/it]

iter 14000: train 0.24152036011219025 val 10.898113250732422


 75%|███████▌  | 15001/20000 [22:17<3:28:13,  2.50s/it]

iter 15000: train 0.22816354036331177 val 11.106903076171875


 80%|████████  | 16001/20000 [23:48<3:34:14,  3.21s/it]

iter 16000: train 0.2183024287223816 val 11.269912719726562


 85%|████████▌ | 17001/20000 [25:08<2:35:51,  3.12s/it]

iter 17000: train 0.20949403941631317 val 11.472888946533203


 90%|█████████ | 18003/20000 [26:30<1:18:31,  2.36s/it]

iter 18000: train 0.20234547555446625 val 11.673047065734863


 95%|█████████▌| 19001/20000 [27:50<51:39,  3.10s/it]  

iter 19000: train 0.19651968777179718 val 11.7802734375


100%|██████████| 20000/20000 [28:53<00:00, 11.54it/s]


### Save the model

In [22]:
# torch.save(model.state_dict(), 'transformer_model_prof_no_stopwords_e4.pth')

### Load the model

In [48]:
model.load_state_dict(torch.load('transformer_model_prof_no_stopwords_e4.pth'))
model = model.to(model.device)

In [23]:
outputs = model.generate(context_token_ids=torch.zeros((1, 1),dtype=torch.long,device=model.device),max_new_tokens=1000)
print(outputs)

02 236 1974 1978 244 43 201 1975 1979 301 147 154 1976 1980 334 139 195 1977 1981 290 81 209 1978 1982 299 141 158 1979 1983 316 173 143 1980 1984 270 148 122 1981 1985 326 146 180 1982 1986 315 198 117 1983 1987 274 164 110 1984 1988 250 152 98 1985 1989 311 203 108 1986 1990 229 131 98 1987 1991 254 153 101 1988 1992 256 158 98 1989 1993 244 145 99 1990 1994 186 87 99 1991 1995 256 165 91 1992 1996 242 152 90 1993 1997 269 202 67 1994 1998 337 240 97 1995 1999 304 285 19 1996 2000 229 183 46 1997 2001 148 107 41 1998 2002 104 06 110 1999 2003 60 well the frailti thought process and i have need your here 4 pm on saturday afternoon for sharehold who have come from outsid of north america everi year our meet draw mani peopl from around the globe and charli and i want to be sure we person greet those who have come so far last year we enjoy meet more than 400 of you from mani dozen of countri ani sharehold who come from other than the us or canada will be given a special credenti and inst

### 2.D) Perplexity


Implemented the model perplexity defined in this article - https://medium.com/@priyankads/perplexity-of-language-models-41160427ed72

In [25]:
def generate(text, vocab,device):
    cleaned_data=clean_text(text)
    # print(cleaned_data)
    indx_id=[]
    for text in cleaned_data:
        try:
            indx_id.append(vocab[text])
        except:
            print("Not present in vocab:",text)
    tensor = torch.tensor(indx_id, dtype=torch.long, device=device)
    return tensor

In [26]:
test_text = text[:int(len(text) * 0.1)]
# text_2=test_text[:200]
print("Input:",test_text)
tokens=generate(test_text,vocab,"cuda")

Input: BERKSHIRE HATHAWAY INC. 



To the Shareholders of Berkshire Hathaway Inc.: 

Our gain in net worth during 2006 was $16.9 billion, which increased the per-share book value of 
both our Class A and Class B stock by 18.4%. Over the last 42 years (that is, since present management 
took over) book value has grown from $19 to $70,281, a rate of 21.4% compounded annually.* 

We believe that $16.9 billion is a record for a one-year gain in net worth - more than has ever 
been booked by any American business, leaving aside boosts that have occurred because of mergers (e.g., 
AOL's purchase of Time Warner). Of course, Exxon Mobil and other companies earn far more than 
Berkshire, but their earnings largely go to dividends and/or repurchases, rather than to building net worth. 

All that said, a confession about our 2006 gain is in order. Our most important business, 
insurance, benefited from a large dose of luck: Mother Nature, bless her heart, went on vacation. After 
hammering us wit

#### Venky Model

Venky Model - without stopwords - lr=6e-3

In [None]:
_,loss= model_venky(tokens[0:25].reshape(1,-1),tokens[1:26].reshape(1,-1))
print("Perplexity",np.exp(loss.cpu().tolist()))

Venky model - lr=6e-3

In [97]:
_,loss= model_venky(tokens[0:25].reshape(1,-1),tokens[1:26].reshape(1,-1))
print("Perplexity",np.exp(loss.cpu().tolist()))

Perplexity 1.6726893632587998


Venky Model - lr=e4

In [95]:
_,loss= model_venky(tokens[0:25].reshape(1,-1),tokens[1:26].reshape(1,-1))
print("Perplexity",np.exp(loss.cpu().tolist()))

Perplexity 1.3393933768307407


Venky Model - lr=e5

In [99]:
_,loss= model_venky(tokens[0:25].reshape(1,-1),tokens[1:26].reshape(1,-1))
print("Perplexity",np.exp(loss.cpu().tolist()))

Perplexity 4.489131593828046


#### Prof. Model


Prof. Model (lr=1e-4)

In [29]:
_,loss= model(tokens[0:25].reshape(1,-1),tokens[1:26].reshape(1,-1))
print("Perplexity",np.exp(loss.cpu().tolist()))

Perplexity 1.3162753617234262


Prof. Model (lr=6e-3)

In [31]:
_,loss= model(tokens[0:25].reshape(1,-1),tokens[1:26].reshape(1,-1))
print("Perplexity",np.exp(loss.cpu().tolist()))

Perplexity 1.3775242324140997


### 2.D) Results

In [65]:
def generate(text, vocab,device):
    cleaned_data=clean_text(text)
    # print(cleaned_data)
    indx_id=[]
    input=[]
    for text in cleaned_data:
        try:
            indx_id.append(vocab[text])
            input.append(text)
        except:
            print("Not present in vocab:",text)
    tensor = torch.tensor(indx_id, dtype=torch.long, device=device)
    return tensor, input

        

In [None]:
og_popular = [
    "Rule No. 1 is never lose money. Rule No. 2 is never forget Rule No. 1.",
    "It's far better to buy a wonderful company at a fair price than a fair company at a wonderful price",
    "Opportunities come infrequently. When it rains gold, put out the bucket, not the thimble",
    "We simply attempt to be fearful when others are greedy and to be greedy only when others are fearful",
    "The most important quality for an investor is temperament, not intellect. You need a temperament that neither derives great pleasure from being with the crowd or against the crowd"
]

In [66]:
print("Original: The most important quality for an investor is temperament, not intellect. You need a temperament that neither derives great pleasure from being with the crowd or against the crowd")
print("************************************************************************************************************************************************************************************************************************")
text_1 ="The most important quality for an investor is temperament, not"
tokens,input=generate(text_1,vocab,"cuda")
print("Input tokens:", input)
print("************************************************************************************************************************************************************************************************************************")
tokens=tokens.reshape(1,-1) #BxL
outputs = model.generate(context_token_ids=tokens,max_new_tokens=1000)
# outputs = model.generate(context_token_ids,max_new_tokens=1000)
print("Prof. Model: ",outputs)

print("************************************************************************************************************************************************************************************************************************")

# print(tokens.reshape(-1,1).shape)

outputs = model_venky.generate(context_token_ids=tokens,max_new_tokens=1000)
# outputs = model.generate(context_token_ids,max_new_tokens=1000)
print("Venky's Model: ",outputs)

Original: The most important quality for an investor is temperament, not intellect. You need a temperament that neither derives great pleasure from being with the crowd or against the crowd
************************************************************************************************************************************************************************************************************************
Input tokens: ['the', 'most', 'import', 'qualiti', 'for', 'an', 'investor', 'is', 'tempera', 'not']
************************************************************************************************************************************************************************************************************************
Prof. Model:  the most import qualiti for an investor is tempera not ga had million at the qwest s stand will be surpris to wwwgeicocom lord 8 stock 198 compound annual an charli and i we earn repres no when we bought justin industri but we are usual ignor instead consult and

In [82]:
test_text = text[:int(len(text) * 0.1)]
print("Input:",test_text)
tokens, input =generate(test_text,vocab,"cuda")

Input: BERKSHIRE HATHAWAY INC. 



To the Shareholders of Berkshire Hathaway Inc.: 

Our gain in net worth during 2006 was $16.9 billion, which increased the per-share book value of 
both our Class A and Class B stock by 18.4%. Over the last 42 years (that is, since present management 
took over) book value has grown from $19 to $70,281, a rate of 21.4% compounded annually.* 

We believe that $16.9 billion is a record for a one-year gain in net worth - more than has ever 
been booked by any American business, leaving aside boosts that have occurred because of mergers (e.g., 
AOL's purchase of Time Warner). Of course, Exxon Mobil and other companies earn far more than 
Berkshire, but their earnings largely go to dividends and/or repurchases, rather than to building net worth. 

All that said, a confession about our 2006 gain is in order. Our most important business, 
insurance, benefited from a large dose of luck: Mother Nature, bless her heart, went on vacation. After 
hammering us wit

In [71]:
# tokens=tokens.reshape(1,-1) #BxL
tokens=tokens[0:32].reshape(1,-1) #BxL

print("Input Tokens:",input)
print()
print("***************************************************************************************************************************")
print()
outputs = model.generate(context_token_ids=tokens,max_new_tokens=1000)
print("Prof's Model:",outputs)
print()
print("***************************************************************************************************************************")
print()
venky_outputs= model_venky.generate(context_token_ids=tokens,max_new_tokens=1000)
# outputs = model.generate(context_token_ids,max_new_tokens=1000)
print("Venky's Model:",venky_outputs)

Input Tokens: ['berkshir', 'hathaway', 'inc', 'to', 'the', 'sharehold', 'of', 'berkshir', 'hathaway', 'inc', 'our', 'gain', 'in', 'net', 'worth', 'dure', '2006', 'wa', '169', 'billion', 'which', 'increas', 'the', 'pershar', 'book', 'valu', 'of', 'both', 'our', 'class', 'a', 'and', 'class', 'b', 'stock', 'by', '184', 'over', 'the', 'last', '42', 'year', 'that', 'is', 'sinc', 'present', 'manag', 'took', 'over', 'book', 'valu', 'ha', 'grown', 'from', '19', 'to', '70281', 'a', 'rate', 'of', '214', 'compound', 'annual', 'we', 'believ', 'that', '169', 'billion', 'is', 'a', 'record', 'for', 'a', 'oneyear', 'gain', 'in', 'net', 'worth', 'more', 'than', 'ha', 'ever', 'been', 'book', 'by', 'ani', 'american', 'busi', 'leav', 'asid', 'boost', 'that', 'have', 'occur', 'becaus', 'of', 'merger', 'eg', 'aol', 'purchas', 'of', 'time', 'warner', 'of', 'cours', 'exxon', 'mobil', 'and', 'other', 'compani', 'earn', 'far', 'more', 'than', 'berkshir', 'but', 'their', 'earn', 'larg', 'go', 'to', 'dividend', '

In [79]:
text ="Best suggestion that warren buffet gave was"
text="Last year I told you that if you had a new son or grandson to be sure to name him"
text = "In continuation of our investment philosophy, let's discuss"
text = "In a world of short-term noise, Berkshire Hathaway believes in"
text="The most important thing to remember when investing is"
print(text)
print("************************************************************************************************************************************************************************************************************************")
tokens,input=generate(text,vocab,"cuda")
print("Input tokens:", input)
print("************************************************************************************************************************************************************************************************************************")
tokens=tokens.reshape(1,-1) #BxL
outputs = model.generate(context_token_ids=tokens,max_new_tokens=1000)
# outputs = model.generate(context_token_ids,max_new_tokens=1000)
print("Prof. Model: ",outputs)

print("************************************************************************************************************************************************************************************************************************")

# print(tokens.reshape(-1,1).shape)

outputs = model_venky.generate(context_token_ids=tokens,max_new_tokens=1000)
# outputs = model.generate(context_token_ids,max_new_tokens=1000)
print("Venky's Model: ",outputs)

The most important thing to remember when investing is
************************************************************************************************************************************************************************************************************************
Input tokens: ['the', 'most', 'import', 'thing', 'to', 'rememb', 'when', 'invest', 'is']
************************************************************************************************************************************************************************************************************************
Prof. Model:  the most import thing to rememb when invest is much these long on a less lofti level sing a ceo to list for both prestig and busi that exactli what weve ad be extraordinarili rich in the movi we made last year berkshir onli becaus the combin though our origin loan but our custom satisfact midamerican acquir to at 2006 mark it to use all learn where i cant prove of cours expect that we will retain will tel

#### Discuss the most impressive text your model generated. What are the high impact design choices behind the generated text.

**Input Text = "The most important thing to remember when investing is"**

**Model Response** = "the most import thing to rememb when invest is the point i detail of the demand bancorp equiti put togeth with ibmec up wall street pressur from the agenc forc and broker or simpli a refus by a testosteronedriven ceo to accept shrink volum ha led too mani insur to write busi at inadequ price the other guy is do it sometim these year the largest view offer limit achiev by the opportun"


**What could the output possibly mean ?**

Ans. I asked the ChatGPT if it could understand the output words and rephrase them into meaningful sentences. It gave me this response "The most important thing to remember when investing is the careful consideration of demand for Bancorp equities, combined with insights from IBMEC and awareness of Wall Street pressure exerted by agencies and brokers. Additionally, it's crucial not to succumb to pressure from a testosterone-driven CEO to accept reduced volumes, as this has led many insurers to write business at inadequate prices. Other investors have also faced challenges similar to this. Sometime this year, achieving the largest view offer limit will depend on seizing opportunities"


**Model Design Parameters**

model_venky = TransformerBlockLM(batch_size=64, input_length=32, embed_size=128, sa_multihead_count=8, pos_embed=True, include_mlp=True)
model_venky.fit(train_iters=20000, eval_iters=1000, lr=1e-4)

The high impact design choices could be due positional encoding, skip connections, layer normalizations, use of multiple decoders, self-attention mechanism which understands relation of each word w.r.t the other words in the sentence