## Sequence2Sequence Transformer for Code Summarization

### Setting up the Environment - Installing Dependencies

In [None]:
!python -m spacy download en_core_web_sm # for spacy tokenizer
!pip install portalocker==2.1 # dependency for the torch.text library to work on google colab environment.

2023-08-10 10:54:28.426525: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-10 10:54:28.483643: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the pack

### Mounting Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Importing Libraries

In [None]:
import json
import regex as re
from tqdm import tqdm

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.metrics import bleu_score
from typing import Iterable, List

from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from timeit import default_timer as timer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Loading the Dataset from Google Drive

In [None]:
# Names of the three dataset split files (JSON Lines: one JSON object per line)
train_filename = "train.jsonl"
valid_filename = "valid.jsonl"
test_filename = "test.jsonl"

# Directory holding the split files. Keep the trailing slash: load_data builds the
# full path by plain string concatenation of file_path and the file name.
file_path = "/content/drive/MyDrive/CodeSearchNet-Python/" # change this to your path

In [None]:
def load_data(file_path, file_name):
    """Read one JSON Lines dataset split and return parallel code / docstring lists.

    Each non-blank line must be a JSON object with the keys ``code``,
    ``docstring``, ``code_tokens`` and ``docstring_tokens``. Records whose
    ``code_tokens`` or ``docstring_tokens`` list has 256 or more entries are
    dropped. Triple-quoted blocks are removed from the kept code so the summary
    is not visible in the model input.

    Args:
        file_path: Directory prefix, concatenated as-is with ``file_name``
            (so it must end with a path separator).
        file_name: Name of the ``.jsonl`` file inside ``file_path``.

    Returns:
        Tuple ``(code_corpus, docstrings_corpus)`` of equally long lists of str.
    """
    # Matches both """[docstring]""" and '''[docstring]''' (non-greedy, spanning newlines).
    # Compiled once instead of once per record.
    docstring_pattern = re.compile(r'"""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\'')

    code_corpus, docstrings_corpus = [], []
    # Explicit UTF-8: the platform default encoding may not decode non-ASCII source code
    with open(f"{file_path}{file_name}", 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            data = json.loads(line)
            if len(data['code_tokens']) < 256 and len(data['docstring_tokens']) < 256:
                code_corpus.append(docstring_pattern.sub('', data['code'].strip()))
                docstrings_corpus.append(data['docstring'].strip())

    return code_corpus, docstrings_corpus

In [None]:
# Load each split as two parallel lists (code strings, docstring strings);
# load_data has already dropped records with 256 or more code/docstring tokens.
train_code_corpus, train_docstrings_corpus = load_data(file_path, train_filename)
valid_code_corpus, valid_docstrings_corpus = load_data(file_path, valid_filename)
test_code_corpus, test_docstrings_corpus = load_data(file_path, test_filename)

### Building Vocabulary from the Dataset

In [None]:
# Keys used to index the per-field tokenizer / vocabulary dictionaries
source_data = 'code'
target_data = 'summary'

# Registries filled in by later cells: one tokenizer and one vocabulary per key
tokenizer = {}
vectorizer = {}

In [None]:
# Initializing the tokenizers that split text into tokens.
# The same spaCy English pipeline is used for both the code and the summary field.
tokenizer[source_data] = get_tokenizer('spacy', language='en_core_web_sm')
tokenizer[target_data] = get_tokenizer('spacy', language='en_core_web_sm')


# generator that tokenizes one field of every (code, docstring) sample
def yield_tokens(data_iter, language):
    """Yield the token list of the `language` field for each sample in `data_iter`.

    `data_iter` yields (code, docstring) pairs; `language` is either
    `source_data` (selects the code) or `target_data` (selects the docstring).
    """
    # position of each field inside a sample
    field_position = dict(zip((source_data, target_data), (0, 1)))

    for sample in data_iter:
        text = sample[field_position[language]]
        yield tokenizer[language](text)

# Special symbols; each symbol's id is its position in this list
# because the specials are inserted first into every vocabulary.
special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
unk_idx, pad_idx, bos_idx, eos_idx = range(len(special_tokens))

for ln in [source_data, target_data]:
    # Fresh pass over the training pairs for every field (a zip iterator is single-use)
    vocab = build_vocab_from_iterator(
        yield_tokens(zip(train_code_corpus, train_docstrings_corpus), ln),
        min_freq=5,
        specials=special_tokens,
        special_first=True,
    )
    # Tokens missing from the vocabulary map to <unk>
    vocab.set_default_index(unk_idx)
    vectorizer[ln] = vocab

##### Printing the Vocabulary Size for Code and Summary Tokens

In [None]:
print("PythonCode Vocab Size: ", len(vectorizer[source_data]))
print("Docstring Vocab Size: ", len(vectorizer[target_data]))

PythonCode Vocab Size:  63637
Docstring Vocab Size:  31465


### Tokenization and Vector Embedding

In [None]:
# compose several text transforms into a single callable applied left to right
def sequential_transforms(*transforms):
    """Return a function that feeds its input through every transform in order.

    The output of each transform is the input of the next one; with no
    transforms the returned function is the identity.
    """
    def apply_all(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result
    return apply_all

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids):
    """Wrap a list of token ids in BOS/EOS and return them as a 1-D long tensor.

    Args:
        token_ids: List of integer vocabulary ids (may be empty).

    Returns:
        Tensor of dtype ``torch.long`` laid out as ``[bos_idx, *token_ids, eos_idx]``.
    """
    # dtype is pinned explicitly: torch.tensor([]) defaults to float32, which would
    # otherwise promote the concatenated result to float for an empty token list.
    return torch.cat((torch.tensor([bos_idx], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([eos_idx], dtype=torch.long)))

# One preprocessing pipeline per field, built with sequential_transforms:
# raw string -> tokens -> vocabulary ids -> id tensor wrapped in BOS/EOS
text_transform = {
    field: sequential_transforms(tokenizer[field],    # tokenization
                                 vectorizer[field],   # numericalization
                                 tensor_transform)    # add BOS/EOS and create tensor
    for field in [source_data, target_data]
}


# run every (code, docstring) pair through its text_transform pipeline
def vectorize(data):
    """Convert (code, docstring) string pairs into pairs of encoded id sequences.

    Trailing newlines are stripped before encoding. A pair is kept only when
    both encoded sequences are shorter than 256 entries.

    Returns:
        List of ``(code_ids, docstring_ids)`` tuples.
    """
    kept_pairs = []
    for code_data, docstring_data in data:
        code_ids = text_transform[source_data](code_data.rstrip("\n"))
        docstring_ids = text_transform[target_data](docstring_data.rstrip("\n"))
        # skip the pair as soon as either side is too long
        if len(code_ids) >= 256 or len(docstring_ids) >= 256:
            continue
        kept_pairs.append((code_ids, docstring_ids))
    return kept_pairs

##### Converting the text strings into vectors and deleting variables that are no longer needed to free up memory

In [None]:
# Encode each split, then drop the raw strings of the train/validation splits to save memory
train_corpus_vectors = vectorize(zip(train_code_corpus, train_docstrings_corpus))
del train_code_corpus, train_docstrings_corpus
validate_corpus_vectors = vectorize(zip(valid_code_corpus, valid_docstrings_corpus))
del valid_code_corpus, valid_docstrings_corpus
test_corpus_vectors = vectorize(zip(test_code_corpus, test_docstrings_corpus))
# test_code_corpus / test_docstrings_corpus are intentionally NOT deleted:
# later cells read the raw test strings for sample summaries and metric evaluation.

##### Positional Encoding, Token Embedding, and the Transformer Model

In [None]:
# Sinusoidal positional encoding so the model can tell token positions apart
class PositionalEncoding(nn.Module):
    """Add fixed sine/cosine position signals to token embeddings, then apply dropout.

    The table is precomputed for `maxlen` positions and stored as the
    non-trainable buffer `pos_embedding` of shape (maxlen, 1, emb_size).
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        # one frequency per pair of embedding dimensions
        inv_freq = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)   # even dimensions
        table[:, 1::2] = torch.cos(angles)   # odd dimensions

        self.dropout = nn.Dropout(dropout)
        # extra middle axis so the table broadcasts over the second input dimension
        self.register_buffer('pos_embedding', table.unsqueeze(-2))

    def forward(self, token_embedding: Tensor):
        """Return dropout(token_embedding + positions), sliced to the input's first dimension."""
        seq_len = token_embedding.size(0)
        positioned = token_embedding + self.pos_embedding[:seq_len, :]
        return self.dropout(positioned)

# Embedding lookup that turns token indices into scaled embedding vectors
class TokenEmbedding(nn.Module):
    """Map token ids to embeddings multiplied by sqrt(emb_size)."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Look up `tokens` (cast to long) and scale the vectors by sqrt(emb_size)."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

# Encoder-decoder Transformer that maps source token ids to target-vocabulary logits
class Seq2SeqTransformer(nn.Module):
    """Token embeddings + positional encoding around ``torch.nn.Transformer``.

    `generator` projects decoder outputs to target-vocabulary logits. The same
    `positional_encoding` module is shared by the source and target sides.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Submodules are created in this order on purpose: it fixes the order of
        # parameters() / state_dict() entries.
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def _embed_source(self, src: Tensor):
        """Embed source ids and add positional information."""
        return self.positional_encoding(self.src_tok_emb(src))

    def _embed_target(self, tgt: Tensor):
        """Embed target ids and add positional information."""
        return self.positional_encoding(self.tgt_tok_emb(tgt))

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        No memory mask is used (it is passed as None).
        """
        src_emb = self._embed_source(src)
        tgt_emb = self._embed_target(trg)
        outs = self.transformer(src_emb,
                                tgt_emb,
                                src_mask=src_mask,
                                tgt_mask=tgt_mask,
                                memory_mask=None,
                                src_key_padding_mask=src_padding_mask,
                                tgt_key_padding_mask=tgt_padding_mask,
                                memory_key_padding_mask=memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Return the encoder output (memory) for `src`."""
        return self.transformer.encoder(self._embed_source(src), mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Return decoder hidden states for `tgt` given `memory`; logits come from `generator`."""
        return self.transformer.decoder(self._embed_target(tgt), memory, tgt_mask=tgt_mask)

Function to build a subsequent-token (causal) mask that prevents the model from attending to future tokens when making predictions.

In [None]:
def generate_square_subsequent_mask(sz):
    """Return an (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it.

    Row i therefore leaves positions 0..i open and blocks every later position.
    The tensor is created on the module-level `device`.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=device)
    # keep -inf strictly above the diagonal; triu zeroes everything else
    return torch.triu(blocked, diagonal=1)


def create_mask(src, tgt):
    """Build the attention and padding masks for one (src, tgt) batch.

    Both inputs are indexed as (sequence, batch). Returns, in order:
      * src_mask: all-False boolean (src_len, src_len) mask, i.e. nothing blocked
      * tgt_mask: result of generate_square_subsequent_mask(tgt_len)
      * src_padding_mask / tgt_padding_mask: (batch, sequence) booleans that are
        True where the token equals pad_idx
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    src_mask = torch.zeros((src_len, src_len), device=device).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_padding_mask, tgt_padding_mask = (
        (seq == pad_idx).transpose(0, 1) for seq in (src, tgt)
    )
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

### Hyperparameters

In [None]:
# --- Checkpointing ---
# load_model: the training cell restores model/optimizer state from model_filepath before training
# save_model: the training cell writes a checkpoint to model_filepath
load_model = True
save_model = True
model_filepath = "/content/drive/MyDrive/model_saves/my_checkpoint.pth.tar" # change this to your path
# Both intervals are counted in epochs
model_checkpoint_interval = 1
sample_summarize_interval = 1

# --- Optimization and model size ---
n_epochs = 100
learning_rate = 0.0001 
# Adam epsilon (passed as eps= to the optimizer below)
eps = 1e-9
source_vocab_size = len(vectorizer[source_data])
target_vocab_size = len(vectorizer[target_data])
embed_size = 256
n_attention_heads = 8
# Passed positionally as dim_feedforward of Seq2SeqTransformer
ffn_dim = 32
batch_size = 64 
n_encoder_layers = 3
n_decoder_layers = 3


transformer = Seq2SeqTransformer(n_encoder_layers, n_decoder_layers, embed_size,
                                 n_attention_heads, source_vocab_size, target_vocab_size, ffn_dim) # Initialize model

# Xavier-uniform initialization for every parameter with more than one dimension
# (1-D parameters such as biases keep their default initialization)
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)

# Padding positions do not contribute to the loss
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)

optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=eps)

##### Code to Perform Inferencing

In [None]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode one target sequence for a single source sequence.

    Starting from `start_symbol`, the highest-scoring next token is appended
    repeatedly until `eos_idx` is produced or the output holds `max_len` tokens.

    Args:
        model: Object exposing ``encode(src, src_mask)``,
            ``decode(ys, memory, tgt_mask)`` and ``generator(hidden)``.
        src: Source ids indexed as (sequence, batch) with batch size 1.
        src_mask: Attention mask forwarded to ``model.encode``.
        max_len: Maximum number of tokens in the result, start symbol included.
        start_symbol: Id of the first decoder token.

    Returns:
        Long tensor of shape (generated_length, 1) on `device`, beginning with
        `start_symbol`.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)

    # Inference only: disable autograd so no graph is recorded for the whole decode loop
    with torch.no_grad():
        # The encoder output does not change between steps, so move it to the device once
        memory = model.encode(src, src_mask).to(device)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=device)
        for _ in range(max_len - 1):
            # Boolean causal mask: True marks positions that may NOT be attended to
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(device)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Only the last position predicts the next token
            prob = model.generator(out[:, -1])
            next_word = torch.max(prob, dim=1)[1].item()

            ys = torch.cat([ys,
                            torch.full((1, 1), next_word, dtype=ys.dtype, device=ys.device)], dim=0)
            if next_word == eos_idx:
                break
    return ys


# entry point: take a code string and return the generated English summary
def summarize_code(model: torch.nn.Module, sample_code: str):
    """Generate a summary string for `sample_code` with greedy decoding.

    The model is switched to eval mode. At most ``num_tokens + 5`` tokens are
    decoded, where ``num_tokens`` is the encoded source length. The "<bos>" and
    "<eos>" markers are removed from the joined output; surrounding spaces are kept.
    """
    model.eval()

    src = text_transform[source_data](sample_code).view(-1, 1)
    num_tokens = src.shape[0]
    # all-False mask: nothing is blocked on the source side
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)

    decoded = greedy_decode(model, src, src_mask,
                            max_len=num_tokens + 5, start_symbol=bos_idx)
    token_ids = list(decoded.flatten().cpu().numpy())
    words = vectorizer[target_data].lookup_tokens(token_ids)

    summary = " ".join(words)
    for marker in ("<bos>", "<eos>"):
        summary = summary.replace(marker, "")
    return summary

### Collation

This function adds padding to the sequences in each batch to make them of equal length.

This is performed batch wise to reduce the amount of padding required, and hence reduce the amount of computation required.





Training and evaluation loop that will be called for each
epoch.




In [None]:
def collate_fn(batch):
    """Pad a list of (src, tgt) id tensors into two sequence-first batch tensors.

    Each side is padded with `pad_idx` up to the longest sequence of that side
    in the batch; the results have shape (max_len, batch_size).
    """
    sources = [src_sample for src_sample, _ in batch]
    targets = [tgt_sample for _, tgt_sample in batch]
    return (pad_sequence(sources, padding_value=pad_idx),
            pad_sequence(targets, padding_value=pad_idx))

#####  Functions to perform Training and Evaluation of the model

In [None]:
def train_model(model, optimizer):
    """Train `model` for one epoch over `train_corpus_vectors`.

    Uses teacher forcing: the decoder input is the target without its last
    token, and the loss is computed against the target without its first token.

    Args:
        model: Seq2seq model called as
            ``model(src, tgt_input, src_mask, tgt_mask, src_padding_mask,
            tgt_padding_mask, memory_key_padding_mask)``.
        optimizer: Optimizer over the model's parameters.

    Returns:
        Mean of the per-batch losses for this epoch (float).
    """
    model.train()
    total_loss = 0.0
    train_dataloader = DataLoader(train_corpus_vectors, batch_size=batch_size,
                                  shuffle=True, collate_fn=collate_fn)

    for src, tgt in tqdm(train_dataloader):
        src = src.to(device)
        tgt = tgt.to(device)

        # Decoder sees every target token except the last one
        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        # The source padding mask doubles as the memory key padding mask
        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        # Expected outputs are the targets shifted left by one position
        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        total_loss += loss.item()

    # len(dataloader) is the batch count; materializing list(dataloader) would
    # iterate and collate the whole dataset a second time just to count batches.
    return total_loss / len(train_dataloader)


def evaluate_model(model):
    """Compute the mean per-batch loss of `model` over `validate_corpus_vectors`.

    The model is put in eval mode and no gradients are recorded. Batches are
    taken in dataset order, so repeated calls on the same weights give the same
    value.

    Returns:
        Mean of the per-batch validation losses (float).
    """
    model.eval()
    total_loss = 0.0

    # No shuffling: ordering has no benefit for evaluation and only makes the
    # batch composition (and thus the mean of batch means) vary between runs.
    validate_dataloader = DataLoader(validate_corpus_vectors, batch_size=batch_size,
                                     shuffle=False, collate_fn=collate_fn)

    # Evaluation needs no backward pass; skipping autograd saves memory and time
    with torch.no_grad():
        for src, tgt in validate_dataloader:
            src = src.to(device)
            tgt = tgt.to(device)

            # Decoder sees every target token except the last one
            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

            # Expected outputs are the targets shifted left by one position
            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            total_loss += loss.item()

    # len(dataloader) is the batch count; list(dataloader) would re-run the whole loader
    return total_loss / len(validate_dataloader)

Load the checkpoint, if available, and continue training from there. A checkpoint is saved every `model_checkpoint_interval` epochs.




In [None]:
# Training driver: optionally resume from a checkpoint, then train/evaluate once per
# epoch, saving checkpoints and printing a sample summary at the configured intervals.
epoch_counter = 1
train_loss_list = []
eval_loss_list = []

if load_model:
    try:
        # map_location lets a checkpoint written on a GPU be restored on a CPU-only runtime
        checkpoint = torch.load(model_filepath, map_location=device)
    except FileNotFoundError:
        # No checkpoint yet (e.g. first run): start from the freshly initialized model
        print(f"No checkpoint found at {model_filepath}; training from scratch.")
    else:
        transformer.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        # Resume with the epoch after the last completed one
        epoch_counter = checkpoint["epoch_counter"] + 1
        train_loss_list = checkpoint["train_loss_list"]
        eval_loss_list = checkpoint["eval_loss_list"]

# Epochs are numbered from 1, so the last epoch to run is n_epochs itself
while epoch_counter <= n_epochs:
    start_time = timer()
    mean_epoch_loss = train_model(transformer, optimizer)
    end_time = timer()  # the reported epoch time covers training only
    train_loss_list.append(mean_epoch_loss)

    mean_eval_loss = evaluate_model(transformer)
    # Compare with earlier epochs only: once the new loss is in the list it can
    # never be strictly smaller than the list minimum.
    is_best = mean_eval_loss < min(eval_loss_list, default=float("inf"))
    eval_loss_list.append(mean_eval_loss)

    if save_model and epoch_counter % model_checkpoint_interval == 0:
        checkpoint = {
            "state_dict": transformer.state_dict(),
            "optimizer": optimizer.state_dict(),
            "epoch_counter": epoch_counter,
            "train_loss_list": train_loss_list,
            "eval_loss_list": eval_loss_list
        }
        torch.save(checkpoint, model_filepath)
        print("\nCheckpoint Saved!\n")
        if is_best:
            torch.save(checkpoint, f"/content/drive/MyDrive/model_saves/best_model_eval_loss{mean_eval_loss:.3f}_epoch_{epoch_counter}_checkpoint.pth.tar") # change this to your path
            print("\nBest model by far, Saved it!\n")

    if epoch_counter % sample_summarize_interval == 0:
        # Fixed test example so the generated summary can be compared across epochs
        print("Sample Summary Generation:\n")
        print("Reference: ", test_docstrings_corpus[456])
        print("Generated: ", summarize_code(transformer, test_code_corpus[456]))
        print("\n")

    print((f"Epoch: {epoch_counter}, Train loss: {mean_epoch_loss:.3f}, Validation loss: {mean_eval_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
    epoch_counter += 1

100%|██████████| 2980/2980 [05:06<00:00,  9.71it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme from current theme . 


Epoch: 38, Train loss: 2.407, Validation loss: 3.551, Epoch time = 309.450s


100%|██████████| 2980/2980 [05:03<00:00,  9.83it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme name from current theme . 


Epoch: 39, Train loss: 2.394, Validation loss: 3.532, Epoch time = 305.496s


100%|██████████| 2980/2980 [05:02<00:00,  9.84it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme name from current theme . 


Epoch: 40, Train loss: 2.382, Validation loss: 3.558, Epoch time = 305.136s


100%|██████████| 2980/2980 [05:02<00:00,  9.84it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get theme from theme using theme name . 


Epoch: 41, Train loss: 2.370, Validation loss: 3.569, Epoch time = 305.102s


100%|██████████| 2980/2980 [05:02<00:00,  9.85it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme name based on theme name . 


Epoch: 42, Train loss: 2.358, Validation loss: 3.577, Epoch time = 304.593s


100%|██████████| 2980/2980 [05:03<00:00,  9.83it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Set current theme to use . 


Epoch: 43, Train loss: 2.348, Validation loss: 3.575, Epoch time = 305.479s


100%|██████████| 2980/2980 [05:02<00:00,  9.84it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme name from current theme . 


Epoch: 44, Train loss: 2.336, Validation loss: 3.576, Epoch time = 305.031s


100%|██████████| 2980/2980 [05:04<00:00,  9.79it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme from current theme . 

     Default override is used to create a new theme name . 


Epoch: 45, Train loss: 2.325, Validation loss: 3.572, Epoch time = 306.467s


100%|██████████| 2980/2980 [05:02<00:00,  9.84it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get theme from theme . 


Epoch: 46, Train loss: 2.315, Validation loss: 3.584, Epoch time = 305.101s


100%|██████████| 2980/2980 [05:03<00:00,  9.82it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme name from current theme . 

     : param override : default theme name . Default is None . 
     : return : theme name . 


Epoch: 47, Train loss: 2.305, Validation loss: 3.595, Epoch time = 305.517s


100%|██████████| 2980/2980 [05:03<00:00,  9.82it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme name from current theme . 

     If no theme is given , use the default theme name . 


Epoch: 48, Train loss: 2.295, Validation loss: 3.588, Epoch time = 305.621s


100%|██████████| 2980/2980 [05:03<00:00,  9.81it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme from current theme . 

     Default is ' <unk> ' , which returns a new theme name . 


Epoch: 49, Train loss: 2.286, Validation loss: 3.612, Epoch time = 306.087s


100%|██████████| 2980/2980 [05:03<00:00,  9.81it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme name from theme . 

     Default is ' <unk> ' , which returns theme name . 


Epoch: 50, Train loss: 2.277, Validation loss: 3.624, Epoch time = 305.823s


100%|██████████| 2980/2980 [05:03<00:00,  9.81it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme from current theme 

     : param override : default theme name 
     : returns : new theme name 


Epoch: 51, Train loss: 2.268, Validation loss: 3.623, Epoch time = 306.101s


100%|██████████| 2980/2980 [05:03<00:00,  9.81it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme from theme . 

     : param override : name of theme to use . Default is None . 
     : returns : new theme name . 


Epoch: 52, Train loss: 2.259, Validation loss: 3.631, Epoch time = 305.994s


100%|██████████| 2980/2980 [05:03<00:00,  9.81it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get a theme from the current theme . 

     : param override : default theme to use . Default : None . 
     : returns : a new theme name . 


Epoch: 53, Train loss: 2.251, Validation loss: 3.602, Epoch time = 306.098s


100%|██████████| 2980/2980 [05:03<00:00,  9.82it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme from current theme . 

     If no theme is given , use current theme . If no theme is given , return current theme . 


Epoch: 54, Train loss: 2.243, Validation loss: 3.631, Epoch time = 305.575s


100%|██████████| 2980/2980 [05:03<00:00,  9.81it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get theme from theme . 

     Default is used to create theme from theme . 

     : param override : name of theme to use . Default is None . 
     : returns : current theme used to use to use theme . 


Epoch: 55, Train loss: 2.234, Validation loss: 3.646, Epoch time = 306.058s


100%|██████████| 2980/2980 [05:03<00:00,  9.81it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get theme from theme . 

     Default theme is used to override theme in the current theme . 

     : param override : theme to use to use to use theme ( defaults to current theme ) 
     : returns : theme name 


Epoch: 56, Train loss: 2.226, Validation loss: 3.647, Epoch time = 305.853s


100%|██████████| 2980/2980 [05:03<00:00,  9.83it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme from current theme . 

     If no theme is given , use the current theme name . 


Epoch: 57, Train loss: 2.217, Validation loss: 3.641, Epoch time = 305.244s


100%|██████████| 2980/2980 [05:03<00:00,  9.82it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Set theme to use . If no theme is given , use the current theme name . 

     If no theme is given , use the current theme name . 


Epoch: 58, Train loss: 2.211, Validation loss: 3.682, Epoch time = 305.526s


100%|██████████| 2980/2980 [05:03<00:00,  9.82it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme from current theme 


Epoch: 59, Train loss: 2.203, Validation loss: 3.666, Epoch time = 305.672s


100%|██████████| 2980/2980 [05:03<00:00,  9.80it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get theme from theme . 

     If no theme is given , use the current theme name . 


Epoch: 60, Train loss: 2.195, Validation loss: 3.683, Epoch time = 306.113s


100%|██████████| 2980/2980 [05:02<00:00,  9.84it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get current theme name from theme . 

     If no theme is specified , use the current theme name . 

     : param override : name of theme to use . If no theme name is given , use current theme name . 
     : returns : current theme name . 


Epoch: 61, Train loss: 2.189, Validation loss: 3.681, Epoch time = 305.103s


100%|██████████| 2980/2980 [05:03<00:00,  9.80it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get theme from current theme . 

     If no theme is given , use the current theme name . If no theme is given , 
     will be used . 


Epoch: 62, Train loss: 2.182, Validation loss: 3.682, Epoch time = 306.111s


100%|██████████| 2980/2980 [05:05<00:00,  9.75it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get a theme from the theme . If no theme is given , use the current theme . 

     Default is used . 


Epoch: 63, Train loss: 2.175, Validation loss: 3.684, Epoch time = 307.713s


100%|██████████| 2980/2980 [05:03<00:00,  9.83it/s]



Checkpoint Saved!

Sample Summary Generation:

Reference:  Returns theme name.

    Checks in this order:
    1. override
    2. cookies
    3. settings
Generated:   Get a theme from the theme . 

     Default is ' <unk> ' , which returns the theme name . 

     If no theme is given , use the current theme name . 


Epoch: 64, Train loss: 2.168, Validation loss: 3.701, Epoch time = 305.534s


 21%|██        | 621/2980 [01:03<04:01,  9.78it/s]


KeyboardInterrupt: ignored

##### Sample Inference

In [None]:
# Position of the test example to inspect; test_code_corpus and
# test_docstrings_corpus are parallel lists, so one index selects a matching pair.
index = 4081
print("Sample Summary Generation:\n")
print("Input Code: ", test_code_corpus[index])
print("Reference: ", test_docstrings_corpus[index])
print("\n")
print("Generated: ", summarize_code(transformer, test_code_corpus[index]))


Sample Summary Generation:

Input Code:  def get_permissions(self):
        

        permissions = ''
        if self.groups.filter(name='Admin').exists() or self.is_superuser:
            permissions = 'admin'

        return permissions
Reference:  Returns the user's permissions.


Generated:   Returns permissions of the user . 


### Evaluation

Installing Dependencies

In [None]:
!pip install pytorch-ignite

Collecting pytorch-ignite
  Downloading pytorch_ignite-0.4.12-py3-none-any.whl (266 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/266.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m194.6/266.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.8/266.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-ignite
Successfully installed pytorch-ignite-0.4.12


Importing libraries for evaluation

In [None]:
from ignite.metrics import RougeL
from ignite.metrics.nlp import Bleu

Function to calculate the BLEU and RougeL scores for the model

In [None]:
# calculate the BLEU-1 and Rouge-L scores of the model over a dataset of (code, docstring) string pairs
def calculate_bleu_rougel(model: torch.nn.Module, data):
    """Greedy-decode a summary for every sample and score all of them with BLEU-1 and Rouge-L.

    Args:
        model: Trained seq2seq model, forwarded to ``greedy_decode``.
        data: Iterable of ``(code_string, reference_docstring_string)`` pairs.

    Returns:
        Tuple ``(bleu, rouge)`` holding the values returned by the ``compute()``
        methods of the ignite ``Bleu`` and ``RougeL`` metrics.
    """
    model.eval()
    # Control symbols are decoding artifacts, not part of the summary; leaving
    # them in the hypothesis would distort n-gram precision.
    special_ids = {bos_idx, eos_idx, pad_idx}

    targets = []
    outputs = []
    for src_sample, tgt_sample in tqdm(data):
        src = text_transform[source_data](src_sample).view(-1, 1)
        num_tokens = src.shape[0]
        src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
        tgt_tokens = greedy_decode(
            model, src, src_mask, max_len=num_tokens + 5, start_symbol=bos_idx).flatten()

        token_ids = [idx for idx in tgt_tokens.tolist() if idx not in special_ids]
        generated_summary = vectorizer[target_data].lookup_tokens(token_ids)
        correct_summary_tokens = tokenizer[target_data](tgt_sample.rstrip("\n"))

        outputs.append(generated_summary)
        # The ignite metrics expect a LIST of references per hypothesis; each
        # sample has exactly one reference, hence the extra level of nesting.
        targets.append([correct_summary_tokens])

    rouge_metric = RougeL(multiref="best")
    bleu = Bleu(ngram=1, smooth="smooth1")

    # Score every collected sample, not only the last one produced by the loop
    bleu.update((outputs, targets))
    rouge_metric.update((outputs, targets))

    bleu_value = bleu.compute()
    rouge_value = rouge_metric.compute()

    print("BLEU-1: ", bleu_value)
    print("RougeL: ", rouge_value)

    return bleu_value, rouge_value

Calculate the BLEU and RougeL scores for the model on the test dataset

In [None]:
calculate_bleu_rougel(transformer, list(zip(test_code_corpus,test_docstrings_corpus)))

100%|██████████| 11450/11450 [28:12<00:00,  6.76it/s]


BLEU-1:  tensor(0.1000, dtype=torch.float64)
RougeL:  {'Rouge-L-P': 0.04868377655271807, 'Rouge-L-R': 0.766002440190948, 'Rouge-L-F': 0.766002440190948}


(tensor(0.1000, dtype=torch.float64),
 {'Rouge-L-P': 0.04868377655271807,
  'Rouge-L-R': 0.766002440190948,
  'Rouge-L-F': 0.766002440190948})