# MT5 Small Model

In this notebook, we fine-tune the mT5 sequence-to-sequence Transformer model to translate a structured natural-language card specification into Java code.

### Check for CUDA availability

In [7]:
import torch
from torch import nn
# Report whether PyTorch can see a CUDA device in this environment.
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


# Tokenizer for the MT5 Model

In [8]:
# Tokenizers
import transformers
# Checkpoint whose tokenizer is loaded below.
pretrained_model_name = "google/mt5-small"

# Load the tokenizer that ships with the checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name)

# Sanity check: encode a short phrase, then map the ids back to token strings
# to see how the text was split.
context_ids = tokenizer.encode("wearing an N95 mask.")
context_tokens = tokenizer.convert_ids_to_tokens(context_ids)
print(context_tokens)

['▁wear', 'ing', '▁an', '▁N', '95', '▁mask', '.', '</s>']


In [6]:
# Two inputs of different length, to show truncation and padding in a batch.
ctx_list = [
    "You can protect yourself by wearing an N95 mask.",
    "wearing an N95 mask"
]

# Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments for a list of strings.
tokenizer_output = tokenizer(
    ctx_list,
    max_length=12,              # sequences longer than 12 tokens are truncated
    truncation=True,
    padding='longest',          # pad to the longest sequence in this batch
    return_attention_mask=True,
    return_tensors='pt'         # return PyTorch tensors
)

# Show every field of the encoding (input_ids, attention_mask).
for key, value in tokenizer_output.items():
    print(key)
    print(value)
    print('-------------------------')

input_ids
tensor([[ 1662,   738, 36478, 23191,   455, 25972,   347,   461,   441,  4910,
          6408,     1],
        [25972,   347,   461,   441,  4910,  6408,     1,     0,     0,     0,
             0,     0]])
-------------------------
attention_mask
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])
-------------------------


In [17]:

class ModelOutputs:
    def __init__(self, start_logits=None, end_logits=None, loss=None):
        """
        Plain container for the values returned by CardTranslationModel.forward.

        Parameters
        ----------
        start_logits : optional
            Per-position scores for the span start; stored unchanged.
        end_logits : optional
            Per-position scores for the span end; stored unchanged.
        loss : optional
            Training loss, or None when no targets were supplied.

        """
        self.start_logits = start_logits
        self.end_logits = end_logits
        self.loss = loss



class CardTranslationModel(nn.Module):

    def __init__(self, lm=None, dropout=0.1):
        """
        Wrap a pretrained language model with a two-output linear head.

        NOTE(review): the head (`qa_outputs`, start/end logits) is a
        span-extraction head. The notebook's stated goal is text-to-code
        generation, which would presumably need a generation head instead;
        confirm the intended design.

        Parameters
        ----------
        lm : nn.Module
            Pretrained backbone. It must expose a `config` carrying its hidden
            width and return an output indexable by "last_hidden_state".
        dropout : float
            Dropout probability applied to the hidden states before the head.

        Raises
        ------
        ValueError
            If `lm` is None.

        """
        if lm is None:
            raise ValueError(
                "CardTranslationModel requires a pretrained language model (lm)."
            )
        super().__init__()
        self.qa_outputs = nn.Linear(self._hidden_size(lm.config), 2)
        self.lm = lm
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def _hidden_size(config):
        """Return the hidden-state width recorded in a model config."""
        # Different config families name the width differently: `d_model`
        # (T5/mT5), `hidden_size` (BERT-style), `dim` (DistilBERT).
        for attr in ("d_model", "hidden_size", "dim"):
            size = getattr(config, attr, None)
            if size is not None:
                return size
        raise AttributeError(
            "Could not find the hidden size on the language model config "
            "(tried d_model, hidden_size, dim)."
        )

    def forward(self, input_ids=None, attention_mask=None,
                expected=None, decoder_input_ids=None):
        """
        Run the backbone, score every position with the head, and optionally
        compute the training loss.

        Parameters
        ----------
        input_ids : int[]
            shape (batch_size, seq_len) ids of the input tokens
        attention_mask : int[]
            shape (batch_size, seq_len) attention mask for `input_ids`
        expected : (int[], int[]), optional
            Pair `(start_positions, end_positions)` of target indices, each of
            shape (batch_size,). When given, the loss is computed.
            NOTE(review): the original code used `start_positions` and
            `end_positions` without defining them; reading them from
            `expected` is an assumption to confirm against the training loop.
        decoder_input_ids : int[], optional
            Forwarded to the backbone only when provided. Encoder-decoder
            backbones such as MT5Model require it; `last_hidden_state` (and
            therefore the logits) then follows the decoder sequence length.

        Returns
        -------
        ModelOutputs
            `start_logits` and `end_logits`, each with one score per position
            of `last_hidden_state`, and `loss`: the mean of the start and end
            cross-entropy losses, or None when `expected` is None.

        """
        # Feed our input ids into the pretrained transformer
        lm_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}
        if decoder_input_ids is not None:
            lm_kwargs["decoder_input_ids"] = decoder_input_ids
        lm_output = self.lm(**lm_kwargs)
        hidden_states = lm_output["last_hidden_state"]

        # Project every hidden state to two scores, then split the last axis
        # into separate start / end logits of shape (batch, positions).
        logits = self.qa_outputs(self.dropout(hidden_states))
        start_logits, end_logits = logits.unbind(dim=-1)
        start_logits = start_logits.contiguous()
        end_logits = end_logits.contiguous()

        total_loss = None
        if expected is not None:
            start_positions, end_positions = expected

            loss_fct = nn.CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)

            # Average the two losses so both ends contribute equally.
            total_loss = (start_loss + end_loss) / 2

        return ModelOutputs(
            start_logits=start_logits,
            end_logits=end_logits,
            loss=total_loss)
        

In [11]:
# Load the pretrained backbone from the same checkpoint as the tokenizer
# (`pretrained_model_name` is set in the tokenizer cell).
lm_pretrained = transformers.AutoModel.from_pretrained(pretrained_model_name)

# Wrap the backbone with the task head. The class defined in this notebook is
# CardTranslationModel, not QuestionAnsweringModel.
model = CardTranslationModel(lm_pretrained)

# Use the GPU when one is present and fall back to CPU otherwise, so the cell
# does not crash on machines without CUDA.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Downloading:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/mt5-small were not used when initializing MT5Model: ['lm_head.weight']
- This IS expected if you are initializing MT5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MT5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:

# Example target-side strings to feed the decoder.
decoder_list = [
    "hello world",
    "public class"
]

# Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments for a list of strings.
decoder_tokenized = tokenizer(
    decoder_list,
    max_length=12,
    truncation=True,
    padding='longest',
    return_attention_mask=False,
    return_tensors='pt'
)

# The backbone may have been moved to the GPU together with `model`, so put
# the inputs on whatever device its parameters live on.
lm_device = next(lm_pretrained.parameters()).device

# Keyword arguments make the mapping explicit instead of relying on the
# positional order of the backbone's forward signature.
# NOTE(review): the tokenized targets are passed as decoder inputs unchanged;
# T5-style training normally shifts them right first. Confirm before reusing
# this for training.
lm_pretrained(
    input_ids=tokenizer_output["input_ids"].to(lm_device),
    attention_mask=tokenizer_output["attention_mask"].to(lm_device),
    decoder_input_ids=decoder_tokenized["input_ids"].to(lm_device),
)

Seq2SeqModelOutput(last_hidden_state=tensor([[[-0.2847,  0.0122,  0.0427,  ...,  0.0160,  0.1034,  0.1684],
         [-0.7667,  0.0132,  0.3480,  ..., -0.0117,  0.1370, -0.0580],
         [-0.2628,  0.1356,  0.4360,  ..., -0.0551,  0.0601, -0.1534]],

        [[-0.8850, -0.0634, -0.3341,  ...,  0.4440, -0.2975, -0.2454],
         [ 0.0252, -0.0385, -0.0856,  ...,  0.0708, -0.1840, -0.0376],
         [-0.1595,  0.0676,  0.1992,  ..., -0.0364,  0.0111, -0.0969]]],
       grad_fn=<MulBackward0>), past_key_values=((tensor([[[[ 2.2908,  0.6447,  1.0895,  ..., -0.1596, -1.0412, -1.5044],
          [-1.5115,  1.2403, -0.1899,  ...,  0.3731,  1.3173,  0.0486],
          [-0.9841, -0.3238,  1.3064,  ..., -1.1924,  0.1291,  1.2711]],

         [[ 0.3688, -0.7613, -0.5332,  ...,  0.2858,  0.1711, -0.0721],
          [ 0.7757, -0.5995,  0.7747,  ...,  0.0086,  0.7643, -1.1765],
          [ 0.7209, -0.6039, -0.2207,  ...,  0.5430, -0.4988, -0.0220]],

         [[-0.3608,  0.3277,  0.2261,  ...,  1.

In [23]:
# Tokenize a sentence containing <extra_id_N> markers to see which ids the
# tokenizer assigns to them.
sentinel_text = "The <extra_id_0> walks in <extra_id_1> park"
tokenizer(sentinel_text)

{'input_ids': [486, 250099, 12747, 263, 281, 250098, 10676, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}