# MT5 Small Model

In this notebook, we fine-tune the mT5 sequence-to-sequence Transformer to translate a structured natural-language card specification into Java code.

### Check for CUDA availability

In [None]:
import torch
import torch.nn as nn
# Bare last expression: the notebook displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [None]:
# Toggle: True when running on Google Colab with the project stored on Drive.
using_google_drive = True

# Candidate project roots, one per collaborator's Drive layout.
mahmoud_path = '/content/gdrive/MyDrive/Final Project/'
tommy_path = '/content/gdrive/MyDrive/Colab Notebooks/Final Project/'

if using_google_drive:
    from google.colab import drive
    drive.mount('/content/gdrive')
    path = mahmoud_path
else:
    # Local run: resolve project files relative to the working directory so
    # that PATH is always defined for the cells below.
    path = './'

# PATH is concatenated directly with file names later on, so it must end in '/'.
PATH = path

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
%%bash
# Quietly install the packages the notebook imports.
# NOTE(review): versions are unpinned; sentencepiece is presumably needed by the
# mT5 tokenizer — confirm and pin versions for reproducibility.
pip -q install transformers
pip -q install tqdm
pip -q install sentencepiece 

# Tokenizer for the MT5 Model

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named below.
import transformers
pretrained_model_name = 'google/mt5-small'

tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name)
# Sanity check: encode one sample rules sentence and print its sub-word pieces.
context_ids = tokenizer.encode("When this creature enters the battle field, target creature loses 2 life.")
print(tokenizer.convert_ids_to_tokens(context_ids))

['▁', 'When', '▁this', '▁c', 'reature', '▁enter', 's', '▁the', '▁battle', '▁field', ',', '▁target', '▁c', 'reature', '▁los', 'es', '▁2', '▁life', '.', '</s>']


In [None]:
# Toy batch with two sequences of different lengths, to illustrate padding.
ctx_list = [
    "You can protect yourself by wearing an N95 mask.", 
    "wearing an N95 mask"
]

# Truncate each sequence to at most 12 tokens, pad the batch to its longest
# member, and return PyTorch tensors together with the attention mask.
tokenizer_output = tokenizer.batch_encode_plus(
    ctx_list,
    max_length = 12,
    truncation=True,
    padding='longest',
    return_attention_mask=True,
    return_tensors='pt'
)

# Print every field the tokenizer returned, one after another.
for key, value in tokenizer_output.items():
    print(key)
    print(value)
    print('-------------------------')

input_ids
tensor([[ 1662,   738, 36478, 23191,   455, 25972,   347,   461,   441,  4910,
          6408,     1],
        [25972,   347,   461,   441,  4910,  6408,     1,     0,     0,     0,
             0,     0]])
-------------------------
attention_mask
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])
-------------------------


# Dataset Collection and Processing

Load the dataset. Per-example preprocessing is done by the `preprocess_datapoint` function, which is currently an identity transform and leaves the dataset unchanged.

In [None]:
# Each split is stored as two files that are paired line by line: line i of
# the `.in` file is zipped with line i of the `.out` file below.
# The encoding is pinned so decoding does not depend on the machine's locale.
with open(PATH + 'datasets_uwu/train_magic.in', encoding='utf-8') as f:
    train_x = f.readlines()
with open(PATH + 'datasets_uwu/train_magic.out', encoding='utf-8') as f:
    train_y = f.readlines()
with open(PATH + 'datasets_uwu/test_magic.in', encoding='utf-8') as f:
    test_x = f.readlines()
with open(PATH + 'datasets_uwu/test_magic.out', encoding='utf-8') as f:
    test_y = f.readlines()

# zip() silently drops trailing lines when its inputs differ in length, which
# would hide a misaligned card/code pair of files. Fail loudly instead.
if len(train_x) != len(train_y) or len(test_x) != len(test_y):
    raise ValueError(
        "Card/code files are misaligned: "
        f"train {len(train_x)} vs {len(train_y)}, test {len(test_x)} vs {len(test_y)} lines"
    )

# Structure the dataset somewhat similarly to the dataset objects
training_dataset = [{ 'card': x, 'code': y } for x, y in zip(train_x, train_y)]
testing_dataset  = [{ 'card': x, 'code': y } for x, y in zip(test_x,  test_y )]

dataset = {
    "train": training_dataset,
    "test":  testing_dataset
}

In [None]:
import json
import random
from multiprocessing import Pool
from tqdm import tqdm, trange


def preproc_init(tokenizer_for_model):
    """
    Publish the tokenizer as the module-level global `tokenizer`.

    Intended to be run once inside a newly started worker so that code
    executed there can refer to `tokenizer` without receiving it as an
    argument.

    Parameters
    ----------
    tokenizer_for_model: fn
        The tokenizer for the pretrained transformer model
    """
    # Same effect as `global tokenizer; tokenizer = ...`: rebinds the module global.
    globals()["tokenizer"] = tokenizer_for_model

def preprocess_datapoint(datapoint):
    """
    Per-example preprocessing hook; currently returns its input unchanged.

    This is the single place to rewrite one example (replace characters, trim
    the input or output, ...). A future implementation may return None to
    signal that the example should be dropped from the dataset.

    Parameters
    ----------
    datapoint: dict
        One example of the dataset with two entries:
        "card": the string for the card description and meta data
        "code": the string for the card implementation in Java

    Returns
    -------
    dict
        The (possibly transformed) example; at the moment the very same object.
    """
    # Globals assigned by preproc_init are visible from here.
    processed = datapoint
    return processed
    
    
def preprocess_dataset(dataset_list, threads, tokenizer):
    """
    Preprocesses the entire dataset in a pool of `threads` worker processes

    Every worker first runs `preproc_init(tokenizer)` so the tokenizer is
    available as a global inside it, then `preprocess_datapoint` is mapped
    over the datapoints in order. Falsy results (e.g. None) are dropped, and
    the surviving datapoints are written to `processed_dataset.json` under
    `PATH` before being returned.

    Parameters
    ----------
    dataset_list: dict[]
        A list of datapoints, where each datapoint is in the shape:
        "card": the string for the card description and meta data
        "code": the string for the card implementation in Java
    threads: int
        The number of worker processes to run the preprocessing on
    tokenizer: fn
        The tokenizer for the particular pretrained model

    Returns
    -------
    dict[]
        The preprocessed datapoints, in input order, without dropped entries
    """

    # The initializer must be preproc_init: it is what installs the tokenizer
    # global in each worker. preprocess_datapoint is only the mapped task.
    with Pool(threads, initializer=preproc_init, initargs=(tokenizer,)) as p:
        processed_dataset = list(tqdm(p.imap(preprocess_datapoint, dataset_list), total=len(dataset_list)))
    # Remove None (and any other falsy) values in the list
    processed_dataset = [x for x in processed_dataset if x]

    # Context manager so the file is flushed and closed deterministically.
    with open(PATH + "/processed_dataset.json", 'w') as cache_file:
        json.dump(processed_dataset, cache_file)
    return processed_dataset

# Preprocess the training split with a pool of 16 workers, using the tokenizer loaded above.
processed_dataset = preprocess_dataset(dataset['train'], 16, tokenizer)


100%|██████████| 11969/11969 [00:01<00:00, 7873.45it/s]


# Building the Model

In [None]:

class ModelOutputs:
    """
    Plain container for the values returned by CardTranslationModel.forward.

    Attributes
    ----------
    output_logits : torch.tensor
        Logits produced by the language model.
        NOTE(review): presumably shaped (batch_size, ans_len, vocab_size) — confirm.
    loss : torch.tensor
        The loss of the output, or None when no loss was supplied
    """

    def __init__(self, output_logits=None, loss=None):
        # Both fields are optional and stored exactly as given.
        self.output_logits, self.loss = output_logits, loss
        
class CardTranslationModel(nn.Module):
    """
    Thin nn.Module wrapper around a pretrained sequence-to-sequence language
    model that maps card descriptions to code and reports the LM's own loss.
    """

    def __init__(self, lm=None):
        """
        Initializes the CardTranslationModel with the provided language model

        Parameters
        ----------
        lm : pretrained transformer
            The wrapped language model. It is called with `input_ids`,
            `attention_mask`, `labels` and `use_cache` keyword arguments and
            its result is indexed with 'loss' and 'logits'.

        """
        super().__init__()
        self.lm = lm

    def forward(self, input_ids=None, attention_mask=None, label_ids=None):
        """
        Run the wrapped language model and package its logits and loss.

        Parameters
        ----------
        input_ids : torch.tensor
            shape (batch_size, seq_len) ids of the concatenated input tokens
        attention_mask : torch.tensor
            shape (batch_size, seq_len) concatenated attention masks
        label_ids: torch.tensor
            shape (batch_size, ans_len) the expected code output, forwarded
            to the language model as `labels`.
            NOTE(review): the loss is read unconditionally, so despite the
            None default this presumably has to be provided — confirm.

        Returns
        -------
        ModelOutputs
            `output_logits` holds the language model's 'logits' entry and
            `loss` holds its 'loss' entry.

        """
        # Feed our input ids into the pretrained transformer.
        # use_cache=False: no decoder key/value cache is requested.
        lm_output = self.lm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=label_ids,
            use_cache=False
        )

        # The language model computes the loss itself from `labels`; reuse it.
        return ModelOutputs(
            output_logits=lm_output['logits'],
            loss=lm_output['loss'])
        

In [None]:
from transformers import MT5ForConditionalGeneration, T5Tokenizer

# Rebinds the global `tokenizer` to a T5Tokenizer for the same checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name)
# Create the CardTranslationModel using the MT5 Conditional Generation model
lm_pretrained = MT5ForConditionalGeneration.from_pretrained(pretrained_model_name)
# .cuda() moves the wrapped model's parameters to the GPU; a CUDA device is required.
model = CardTranslationModel(lm_pretrained).cuda()

## Up Next: Training

In [None]:
import torch

# Hyper-parameters: you could try playing with different settings
num_epochs = 1
learning_rate = 3e-5
weight_decay = 1e-5
eps = 1e-6
batch_size = 2
warmup_rate = 0.05
# Maximum token lengths used when truncating cards (input) and code (labels)
card_max_length = 448
code_max_length = 448

# Calculating the number of warmup steps.
# Ceiling division: one step per (possibly partial) batch. `n // b + 1` would
# count one step too many whenever n is an exact multiple of the batch size.
# NOTE(review): the training cell trains on a 6000-example slice of
# processed_dataset, so the real number of steps is smaller than t_total.
num_training_cases = len(processed_dataset)
t_total = ((num_training_cases + batch_size - 1) // batch_size) * num_epochs
ext_warmup_steps = int(warmup_rate * t_total)

# Initializing an AdamW optimizer
ext_optim = torch.optim.AdamW(model.parameters(), lr=learning_rate,
                              eps=eps, weight_decay=weight_decay)

# Initializing the learning rate scheduler [details are in the BERT paper]
# (currently disabled, so ext_warmup_steps is computed but unused)
# ext_sche = transformers.get_linear_schedule_with_warmup(
#     ext_optim, num_warmup_steps=ext_warmup_steps, num_training_steps=t_total
# )

print("***** Training Info *****")
# Report the example count here, not the step count.
print("  Num examples = %d" % num_training_cases)
print("  Num Epochs = %d" % num_epochs)
print("  Batch size = %d" % batch_size)
print("  Total optimization steps = %d" % t_total)

***** Training Info *****
  Num examples = 5985
  Num Epochs = 1
  Batch size = 2
  Total optimization steps = 5985


In [None]:
def vectorize_batch(batch, tokenizer):
    """
    Converts the batch of processed datapoints into separate tensors of token ids

    Cards and code are tokenized separately, truncated to the module-level
    `card_max_length` / `code_max_length`, padded to the longest sequence in
    the batch and moved to the GPU. Padding positions in the labels are
    replaced with -100, the index the Hugging Face seq2seq loss ignores;
    otherwise the model is also trained to predict pad tokens for every
    shorter target in the batch.

    Parameters
    ----------
    batch: dict[]
        Datapoints, each holding a "card" string and a "code" string
    tokenizer: fn
        Tokenizer exposing `batch_encode_plus`; converts the batch to tensors
        of input and output ids

    Returns
    -------
    input_ids: torch.tensor
        shape (batch_size, max_input_len)
    input_attn_mask: torch.tensor
        shape (batch_size, max_input_len)
    label_ids: torch.tensor
        shape (batch_size, max_output_len), with -100 at padding positions
    """

    # Separate the batch into input and output
    card_batch = [card_data['card'] for card_data in batch]
    code_batch = [code_data['code'] for code_data in batch]

    # Encode the card's natural language representation
    card_encode = tokenizer.batch_encode_plus(
        card_batch,
        max_length = card_max_length,
        truncation = True,
        padding = 'longest',
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    # Encode the card's java code representation
    code_encode = tokenizer.batch_encode_plus(
        code_batch,
        max_length = code_max_length,
        truncation = True,
        padding = 'longest',
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    # Mask out label padding: positions where the attention mask is 0 were
    # added by the tokenizer and must not contribute to the loss.
    label_ids = code_encode['input_ids'].masked_fill(
        code_encode['attention_mask'] == 0, -100
    )

    # Move the training batch to GPU
    card_ids        = card_encode['input_ids'].cuda()
    card_attn_mask  = card_encode['attention_mask'].cuda()
    code_ids        = label_ids.cuda()

    return card_ids, card_attn_mask, code_ids
    
    

In [None]:
import gc

model.train()
# Gradients are clipped to this global L2 norm before every optimizer step.
max_grad_norm = 1

# Train on the first 6000 preprocessed examples. Slicing copies the list, so
# the shuffle below does not reorder processed_dataset itself.
training_dataset = processed_dataset[:6000]
num_training_cases = len(training_dataset)

step_id = 0
for _ in range(num_epochs):

    random.shuffle(training_dataset)

    for i in range(0, num_training_cases, batch_size):
        # Memory is reclaimed aggressively between stages of every step
        # (presumably to stay within GPU memory).
        gc.collect()
        torch.cuda.empty_cache()

        batch = training_dataset[i: i + batch_size]
        input_ids, input_attn_mask, label_ids = vectorize_batch(batch, tokenizer)

        gc.collect()
        torch.cuda.empty_cache()

        model.zero_grad() # Does the same as ext_optim.zero_grad()

        # Get the model outputs (logits and loss), stored as a ModelOutputs object
        outputs = model(
            input_ids=input_ids,
            attention_mask=input_attn_mask,
            label_ids=label_ids
        )

        gc.collect()
        torch.cuda.empty_cache()

        # Back-propagate the loss signal and clip the gradients
        loss = outputs.loss.mean()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # Update neural network parameters and the learning rate
        ext_optim.step()
        # ext_sche.step() # Update learning rate for better convergence

        if step_id % 100 == 0:
            print(f'At step {step_id}, the extraction loss = {loss}')

        step_id += 1

        # Drop the references to this step's tensors so their GPU memory can
        # be released. (Tensor.detach() returns a new tensor rather than
        # modifying its receiver, so calling it without using the result
        # frees nothing.)
        del input_ids
        del input_attn_mask
        del label_ids
        del outputs

        torch.cuda.empty_cache()

print('Finished Training')

At step 0, the extraction loss = 25.254150390625
At step 100, the extraction loss = 18.907732009887695
At step 200, the extraction loss = 13.932411193847656
At step 300, the extraction loss = 9.968170166015625
At step 400, the extraction loss = 8.454705238342285
At step 500, the extraction loss = 7.8662261962890625
At step 600, the extraction loss = 5.209946632385254
At step 700, the extraction loss = 4.320058822631836
At step 800, the extraction loss = 3.4333794116973877
At step 900, the extraction loss = 2.139693021774292
At step 1000, the extraction loss = 2.3690686225891113
At step 1100, the extraction loss = 1.8576878309249878
At step 1200, the extraction loss = 2.5573413372039795
At step 1300, the extraction loss = 2.447085380554199
At step 1400, the extraction loss = 2.2938404083251953
At step 1500, the extraction loss = 1.7430731058120728
At step 1600, the extraction loss = 1.3317904472351074
At step 1700, the extraction loss = 1.7130123376846313
At step 1800, the extraction lo

In [None]:
# Persist the model's state_dict (not the whole module) to Drive.
# NOTE(review): this absolute path is independent of PATH and differs from the
# location the load cell below reads from — confirm which one is intended.
torch.save(model.state_dict(), '/content/gdrive/MyDrive/MT5_checkpoint/MT5.pt')

In [None]:
import torch
import gc
# Collect garbage and release cached CUDA blocks first, then print the
# allocator's memory report.
gc.collect()
torch.cuda.empty_cache()
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    4589 MB |    9100 MB |   29047 GB |   29042 GB |
|       from large pool |    4301 MB |    8742 MB |   26755 GB |   26751 GB |
|       from small pool |     288 MB |     540 MB |    2291 GB |    2291 GB |
|---------------------------------------------------------------------------|
| Active memory         |    4589 MB |    9100 MB |   29047 GB |   29042 GB |
|       from large pool |    4301 MB |    8742 MB |   26755 GB |   26751 GB |
|       from small pool |     288 MB |     540 MB |    2291 GB |    2291 GB |
|---------------------------------------------------------------

In [None]:

# Restore weights into the existing model from a file under PATH.
# NOTE(review): no cell in this notebook writes PATH + '/checkpoint' (the save
# cell writes to a different absolute path), and the recorded output of this
# cell is a RuntimeError — verify the file and that it matches this model.
model.load_state_dict(torch.load(PATH + '/checkpoint'))

RuntimeError: ignored

In [None]:
#---Sandbox code---#
# Draw a random mini-batch from the training subset and run one forward pass.

import random
batch_size = 2
# The last valid start index leaves room for a full batch; starting at
# num_training_cases - 1 would yield a single example and break code that
# indexes both entries. max(0, ...) guards datasets smaller than a batch.
i = random.randint(0, max(0, num_training_cases - batch_size))

datapoint = training_dataset[i:i + batch_size]
#print(datapoint)
card_ids, card_attn_masks, code_ids = vectorize_batch(datapoint, tokenizer)

# Inspection only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    output = model(card_ids, card_attn_masks, code_ids)



In [None]:
#---Sandbox code---#
# Generate code for the cards sampled in the previous sandbox cell and print it.

#print(output.output_logits)
# Normalise over the last axis (the vocabulary) to get per-token
# probabilities; dim=1 would normalise across sequence positions instead.
softmax = torch.nn.Softmax(dim=-1)(output.output_logits)

# Use every sampled card rather than assuming exactly two are present.
card_texts = [card_data['card'] for card_data in datapoint]
card_inputs = tokenizer(card_texts, padding=True, return_tensors='pt')
input_id = card_inputs.input_ids.cuda()
print('Card: ' + datapoint[0]['card'])
#print(input_id)
# Forward the attention mask so padded positions of shorter cards are ignored.
outputs = model.lm.generate(input_id, attention_mask=card_inputs.attention_mask.cuda())
print(outputs.squeeze(0))
code = tokenizer.batch_decode(outputs)
print(code)
#print('Ground-truth code: ' + datapoint[0]['code'])
print(type(tokenizer))
print(type(code))


Card: Moriok Rigger NAME_END 2 ATK_END 2 DEF_END {2}{B} COST_END NIL DUR_END Creature - Human Rogue Rigger TYPE_END Fifth Dawn PLAYER_CLS_END 54 RACE_END R RARITY_END Whenever an artifact is put into a graveyard from the battlefield , you may put a +1/+1 counter on Moriok Rigger .

tensor([[     0,   2821,   3931,  48359,    925,    582,  48519,  42685,    263,
          12495, 205046,    785,   3792,  16077,    785,   3792,  10961,  48359,
            925,    582],
        [     0,   2821,   3931, 102372,    470,  18283, 127739,   1513,  42685,
            263,  12495, 205046,    785,   3792,  10961, 102372,    470,  18283,
         127739,   1513]], device='cuda:0')
['<pad> public class MoriokRigger extends CardImpl {§static {§public MoriokR', '<pad> public class AnthemOfRakdos extends CardImpl {§public AnthemOfRakdos']
<pad> public class AnthemManaCostImpl {§public AnthemManaCost(UUID
<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>
<class 'list'>


In [None]:
def autoregressive_generate(model, tokenizer, card_desc, **generate_kwargs):
    '''
    Applies autoregressive generation on a card description to generate its corresponding code

    The description(s) are tokenized with padding, moved to the GPU and passed
    to `model.lm.generate` together with the attention mask, so padded
    positions of shorter descriptions in a batch are ignored.

    Parameters
    ----------
    model: CardTranslationModel
        Model used for autoregressive generation; its `lm` attribute must expose `generate`
    tokenizer: transformers.models.tokenizer.PreTrainedTokenizer
        Tokenizer for encoding the card description
    card_desc: str or list[str] (batch_size length)
        card description
    **generate_kwargs:
        Extra keyword arguments forwarded unchanged to `model.lm.generate`
        (for example a length limit). With none given, generation uses the
        language model's default settings. A caller-supplied `attention_mask`
        takes precedence over the one computed here.

    Returns
    ---------
    torch.Tensor containing the sequence of code ids generated from the card description
        shape: (batch_size, max_seq_len)
    '''
    card_inputs = tokenizer(card_desc, padding=True, return_tensors='pt')
    card_ids = card_inputs.input_ids.cuda()
    # setdefault keeps an explicitly passed mask and avoids a duplicate keyword.
    generate_kwargs.setdefault('attention_mask', card_inputs.attention_mask.cuda())
    return model.lm.generate(card_ids, **generate_kwargs)

def decode(tokenizer, code_ids, skip_special_tokens=False):
    '''
    Translates batches of code ids back into text

    Parameters
    ----------
    tokenizer: transformers.models.tokenizer.PreTrainedTokenizer
        Tokenizer for decoding
    code_ids: torch.Tensor
        shape: (batch_size, max_seq_len)
    skip_special_tokens: bool
        Forwarded to `tokenizer.batch_decode`. When True, sentinel tokens such
        as the padding and end-of-sequence markers are left out of the text.
        Defaults to False, which keeps them.

    Returns
    ---------
    list containing one decoded string per id sequence in the batch
        length: batch_size
    '''
    return tokenizer.batch_decode(code_ids, skip_special_tokens=skip_special_tokens)
