In [1]:
!pip install --upgrade transformers
!pip install sentencepiece
!pip install pandas

Collecting transformers
  Downloading transformers-4.8.1-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 1.4 MB/s eta 0:00:01
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 12.9 MB/s eta 0:00:01
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting regex!=2019.12.17
  Downloading regex-2021.4.4-cp37-cp37m-manylinux2014_x86_64.whl (720 kB)
[K     |████████████████████████████████| 720 kB 13.8 MB/s eta 0:00:01
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 16.6 MB/s eta 0:00:01
Collecting joblib
  Downloading joblib-1.0.1-py3-none-any.whl (303 kB)
[K     |████████████████████████████████| 303 kB 22.5 MB/s eta 0:00:01
[?25hCollectin

In [2]:
!nvidia-smi

Sun Jun 27 21:00:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.04    Driver Version: 455.23.04    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 3090    On   | 00000000:42:00.0 Off |                  N/A |
|  0%   50C    P8    38W / 350W |      0MiB / 24267MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
import re
import io
import os
import glob
import tokenize
import sentencepiece as sp
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from transformers import get_constant_schedule_with_warmup
from transformers.models.t5 import T5Model, T5Config, T5ForConditionalGeneration
from transformers.models.bart import BartForConditionalGeneration, BartConfig

In [34]:
# Constants

path = "."
MODEL_VERSION = 'result'
SAMPLE_SIZE = 0.6
VOCAB_SIZE = 30_000
P_BPE = 0.2
D_MODEL = 1024
NUM_LAYERS = 8
NUM_DECODER_LAYERS = 8
D_FF = 3072
NUM_HEADS = 16
DROPOUT_RATE = 0.1
LEARNING_RATE = 5e-5
NUM_WARMUP_STEPS = 500
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 8
MAX_EPOCHS = 8
PATIENCE = 0

In [7]:
!wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
!unzip python.zip

--2021-06-27 21:01:33--  https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.110.198
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.110.198|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 940909997 (897M) [application/zip]
Saving to: ‘python.zip’


2021-06-27 21:02:10 (25.0 MB/s) - ‘python.zip’ saved [940909997/940909997]

Archive:  python.zip
   creating: python/
   creating: python/final/
   creating: python/final/jsonl/
   creating: python/final/jsonl/train/
  inflating: python/final/jsonl/train/python_train_9.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_12.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_10.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_0.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_6.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_2.jsonl.gz  
  inflating: python/final/jsonl/train/p

In [8]:
def jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a single pandas DataFrame.

    Args:
        file_list: paths of gzip-compressed JSON-lines files.
        columns: list of column names to keep from every file; ``None``
            (the default) keeps all columns.

    Returns:
        The per-file frames concatenated in ``file_list`` order. The row
        index of each file is kept as-is, so index values repeat across files.
    """
    frames = []
    for f in file_list:
        frame = pd.read_json(f,
                             orient='records',
                             compression='gzip',
                             lines=True)
        # Indexing with ``None`` would raise a KeyError, so only select
        # columns when the caller actually asked for a subset.
        if columns is not None:
            frame = frame[columns]
        frames.append(frame)
    return pd.concat(frames, sort=False)

def get_dfs(path):
    """Load the train / valid / test splits found under ``path``.

    For each split, every ``.gz`` file in ``<path>/<split>`` is collected,
    the sorted list of paths is printed, and the ``code`` and ``docstring``
    columns are loaded into one DataFrame.

    Returns:
        A list ``[train_df, valid_df, test_df]``.
    """
    frames = []
    for split_name in ("train", "valid", "test"):
        folder = os.path.join(path, split_name)
        # Plain string sort: "..._10" is ordered before "..._2".
        shard_paths = sorted(
            os.path.join(folder, entry)
            for entry in os.listdir(folder)
            if entry.endswith('.gz')
        )
        print(shard_paths)
        frames.append(jsonl_list_to_dataframe(shard_paths, ["code", "docstring"]))

    return frames

df_trn, df_val, df_tst = get_dfs("python/final/jsonl")
df_trn.head()

['python/final/jsonl/train/python_train_0.jsonl.gz', 'python/final/jsonl/train/python_train_1.jsonl.gz', 'python/final/jsonl/train/python_train_10.jsonl.gz', 'python/final/jsonl/train/python_train_11.jsonl.gz', 'python/final/jsonl/train/python_train_12.jsonl.gz', 'python/final/jsonl/train/python_train_13.jsonl.gz', 'python/final/jsonl/train/python_train_2.jsonl.gz', 'python/final/jsonl/train/python_train_3.jsonl.gz', 'python/final/jsonl/train/python_train_4.jsonl.gz', 'python/final/jsonl/train/python_train_5.jsonl.gz', 'python/final/jsonl/train/python_train_6.jsonl.gz', 'python/final/jsonl/train/python_train_7.jsonl.gz', 'python/final/jsonl/train/python_train_8.jsonl.gz', 'python/final/jsonl/train/python_train_9.jsonl.gz']
['python/final/jsonl/valid/python_valid_0.jsonl.gz']
['python/final/jsonl/test/python_test_0.jsonl.gz']


Unnamed: 0,code,docstring
0,"def train(train_dir, model_save_path=None, n_n...",Trains a k-nearest neighbors classifier for fa...
1,"def predict(X_img_path, knn_clf=None, model_pa...",Recognizes faces in given image using a traine...
2,"def show_prediction_labels_on_image(img_path, ...",Shows the face recognition results visually.\n...
3,"def _rect_to_css(rect):\n """"""\n Convert ...",Convert a dlib 'rect' object to a plain tuple ...
4,"def _trim_css_to_bounds(css, image_shape):\n ...","Make sure a tuple in (top, right, bottom, left..."


In [9]:
def isLatin(s):
    """Return True when ``s`` is a string made only of ASCII characters.

    Despite the name, the check is for pure ASCII (code points 0-127), not
    for the Latin script. The empty string counts as ASCII.

    Values that are not ``str`` (e.g. ``None`` or a float NaN coming from a
    missing DataFrame cell) and strings that cannot be encoded at all, such
    as lone surrogates, yield ``False`` instead of raising.
    """
    return isinstance(s, str) and s.isascii()

# Fixed seed so the random subsample is the same on every run of the notebook.
SAMPLE_SEED = 42

df_trn = df_trn.sample(frac=SAMPLE_SIZE, random_state=SAMPLE_SEED)
df_val = df_val.sample(frac=SAMPLE_SIZE, random_state=SAMPLE_SEED)
df_tst = df_tst.sample(frac=SAMPLE_SIZE, random_state=SAMPLE_SEED)

# Keep only the rows whose docstring is pure ASCII ...
df_trn = df_trn[df_trn['docstring'].apply(isLatin)]
df_val = df_val[df_val['docstring'].apply(isLatin)]
df_tst = df_tst[df_tst['docstring'].apply(isLatin)]

# ... and whose code is pure ASCII as well.
df_trn = df_trn[df_trn['code'].apply(isLatin)]
df_val = df_val[df_val['code'].apply(isLatin)]
df_tst = df_tst[df_tst['code'].apply(isLatin)]

In [10]:
def remove_comments_and_docstrings(source):
    """Return ``source`` with comments and docstring-like strings stripped.

    The source is re-assembled token by token. Comment tokens are dropped.
    A string token is kept only when it does not directly follow an INDENT
    or NEWLINE token and does not start at column 0; strings in those
    positions are treated as docstrings and dropped. Lines that end up
    empty or whitespace-only are removed from the result.

    Errors raised by ``tokenize`` on malformed source propagate to the caller.
    """
    pieces = []
    previous_type = tokenize.INDENT
    prev_end_row = -1
    prev_end_col = 0
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        kind, text = tok.type, tok.string
        row, col = tok.start
        # The first token of a new line measures its gap from column 0.
        if row > prev_end_row:
            prev_end_col = 0
        # Re-create the horizontal gap between the previous token and this one.
        if col > prev_end_col:
            pieces.append(" " * (col - prev_end_col))
        if kind == tokenize.STRING:
            is_regular_string = (previous_type != tokenize.INDENT
                                 and previous_type != tokenize.NEWLINE
                                 and col > 0)
            if is_regular_string:
                pieces.append(text)
        elif kind != tokenize.COMMENT:
            pieces.append(text)
        previous_type = kind
        prev_end_row, prev_end_col = tok.end
    rebuilt = "".join(pieces)
    return '\n'.join(line for line in rebuilt.splitlines() if line.strip())

def filter_docstrings(df):
    """Strip comments and docstrings from the ``code`` column of ``df``.

    Each row's code is passed through ``remove_comments_and_docstrings``.
    Rows whose code cannot be processed (for example because it does not
    tokenize) are skipped on purpose: the error message is printed and the
    row is left out of the result.

    Args:
        df: DataFrame with ``code`` and ``docstring`` columns.

    Returns:
        A new DataFrame with columns ``code`` (cleaned) and ``docstring``
        and a fresh 0..n-1 index.
    """
    methods = []
    comments = []
    # Iterate over the two columns directly instead of materialising
    # ``list(df.iterrows())``, which builds one Series object per row.
    pairs = zip(df["code"], df["docstring"])
    for code, docstring in tqdm(pairs, total=len(df)):
        try:
            cleaned = remove_comments_and_docstrings(code)
        except Exception as ex:
            # Best effort: report the problem and drop this sample.
            print(ex)
            continue
        methods.append(cleaned)
        comments.append(docstring)

    new_df = pd.DataFrame(list(zip(methods, comments)), columns = ["code", "docstring"])

    return new_df

df_trn = filter_docstrings(df_trn);
df_val = filter_docstrings(df_val);
df_tst = filter_docstrings(df_tst);

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=243387.0), HTML(value='')))

unindent does not match any outer indentation level (<tokenize>, line 24)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=13539.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=13176.0), HTML(value='')))




In [11]:
# Drop samples whose docstring is the empty string.
df_trn = df_trn[df_trn['docstring'] != '']
df_val = df_val[df_val['docstring'] != '']
df_tst = df_tst[df_tst['docstring'] != '']

# Within each split keep only the first sample for every distinct docstring.
df_trn = df_trn.drop_duplicates(subset='docstring')
df_val = df_val.drop_duplicates(subset='docstring')
df_tst = df_tst.drop_duplicates(subset='docstring')

len(df_trn), len(df_val), len(df_tst)

(234017, 13224, 12905)

In [12]:
df_trn.itertuples()

<map at 0x7f3c20b7ad90>

In [9]:
def df_to_txt_file(df, output, cols):
    """Dump the given DataFrame columns into a text file for SentencePiece training.

    The values of each column in ``cols`` are written one after another,
    separated by newlines, to ``<output>/text.txt``. An existing file is
    overwritten.

    Args:
        df: DataFrame whose selected columns contain strings.
        output: directory that receives ``text.txt``; created if missing.
        cols: names of the columns to write, in order.

    Returns:
        The path of the written file.
    """
    # The directory may not exist yet on a fresh checkout.
    if output:
        os.makedirs(output, exist_ok=True)
    file_path = os.path.join(output,'text.txt')
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, 'w', encoding='utf-8') as f:
        for col in cols:
            f.write('\n'.join(df[col]))
            f.write('\n')

    return file_path

def gen_sp_model(df, output, tokenizer_name, cols, vocab_size=8000):
    """Train a SentencePiece model on the text of the given DataFrame columns.

    The columns are first dumped to a text file via ``df_to_txt_file``; the
    trainer then writes its model files using the prefix
    ``<output>/<tokenizer_name>``. Special ids are fixed to unk=0, pad=1,
    bos=2, eos=3, and ``vocab_size`` is passed with ``hard_vocab_limit=false``.
    """
    corpus_path = df_to_txt_file(df, output, cols)
    model_prefix = os.path.join(output, tokenizer_name)
    # Flags are passed to the trainer as one space-separated string.
    trainer_flags = ' '.join([
        f'--input={corpus_path}',
        f'--model_prefix={model_prefix}',
        '--hard_vocab_limit=false',
        f'--vocab_size={vocab_size}',
        '--unk_id=0 --pad_id=1 --bos_id=2 --eos_id=3',
    ])
    sp.SentencePieceTrainer.train(trainer_flags)

In [10]:
# os.makedirs('shared_bpe', exist_ok=True)

In [11]:
shared_bpe_path = 'shared_bpe'

In [12]:
shared_tokenizer = 'shared_bpe'
# gen_sp_model(df_trn.sample(frac = P_BPE), 'shared_bpe', shared_tokenizer, ["code", "docstring"], vocab_size=VOCAB_SIZE)

In [13]:
shared_spm = sp.SentencePieceProcessor()
shared_spm.Load(os.path.join(shared_bpe_path, shared_tokenizer + '.model'))

True

In [14]:
shared_spm.encode_as_ids("def bar(x,y):")

[26, 2621, 7, 79, 6, 151, 17]

In [15]:
shared_spm.encode_as_pieces("def bar(x,y):")

['▁def', '▁bar', '(', 'x', ',', 'y', '):']

In [16]:
en = shared_spm.tokenize("def bar(x,y):", int,add_bos=True, add_eos=True,)
en

[2, 26, 2621, 7, 79, 6, 151, 17, 3]

In [13]:
df_trn.head(5)

Unnamed: 0,code,docstring
0,def mtf_image_transformer_tiny():\n hparams =...,Catch bugs locally...
1,"def get_child_family_ids(self, family_id):\n ...",Gets the child ``Ids`` of the given family.\n\...
2,def end_headers(self):\n if self.reques...,Send the blank line ending the MIME headers.
3,"def view_call(method_name, *args, **kwargs):\n...",Creates an effect that will drop the current e...
4,"def get_logger(name=None, level=logging.DEBUG,...",returns a colorized logger. This function can ...


In [14]:
shared_spm.pad_id()

NameError: name 'shared_spm' is not defined

In [15]:
class CodeToDocstringDataset(Dataset):
    """Tokenized (code, docstring) pairs for sequence-to-sequence training.

    All texts are tokenized once in ``__init__`` and stored untruncated;
    truncation to ``max_code_len`` / ``max_docstring_len`` and padding happen
    per batch in ``collate``, which is meant to be used as the DataLoader's
    ``collate_fn``.

    Args:
        tokenizer: callable returning a mapping with an ``'input_ids'`` list
            for a text; must expose ``pad_token_id``. If it also exposes
            ``eos_token_id``, that id is preserved at the end of truncated
            sequences.
        codes: iterable of source-code strings.
        docstrings: iterable of docstring strings, aligned with ``codes``.
        max_code_len: maximum number of code token ids per sample in a batch.
        max_docstring_len: maximum number of docstring token ids per sample.
    """
    def __init__(self, tokenizer,
                 codes, docstrings,
                 max_code_len=340, max_docstring_len=340
                ):
        self.tokenizer = tokenizer
        self.codes = [tokenizer(c)['input_ids'] for c in tqdm(codes)]
        self.docstrings = [tokenizer(d)['input_ids'] for d in tqdm(docstrings)]
        self.max_code_len = max_code_len
        self.max_docstring_len = max_docstring_len
        
    def __getitem__(self, idx):
        """Return the untruncated ``(code_ids, docstring_ids)`` pair at ``idx``."""
        return (self.codes[idx], self.docstrings[idx])
    
    def __len__(self):
        """Number of samples (length of the tokenized code list)."""
        return len(self.codes)

    def _truncate(self, ids, max_len):
        """Clip ``ids`` to ``max_len`` entries without losing a trailing EOS.

        A plain slice would cut off the end-of-sequence token of every long
        sample, so when the full sequence ended with the tokenizer's EOS id
        the last kept position is overwritten with that id.
        """
        if len(ids) <= max_len:
            return ids
        clipped = list(ids[:max_len])
        eos_id = getattr(self.tokenizer, 'eos_token_id', None)
        if eos_id is not None and clipped and ids[-1] == eos_id:
            clipped[-1] = eos_id
        return clipped

    def collate(self, rows):
        """Turn a list of dataset items into two padded ``LongTensor`` batches.

        Returns:
            ``(code_tensor, doc_tensor)``, each of shape
            ``(len(rows), longest_truncated_sequence)`` and right-padded with
            ``tokenizer.pad_token_id``.
        """
        codes = [torch.tensor(self._truncate(row[0], self.max_code_len),
                              dtype=torch.long) for row in rows]
        
        comments = [torch.tensor(self._truncate(row[1], self.max_docstring_len),
                                 dtype=torch.long) for row in rows]
        
        code_tensor = pad_sequence(codes,batch_first=True,padding_value=self.tokenizer.pad_token_id)
        doc_tensor = pad_sequence(comments,batch_first=True,padding_value=self.tokenizer.pad_token_id)
        
        return code_tensor, doc_tensor
        

In [16]:
from transformers import BartTokenizerFast

In [17]:
# bart_tokenizer.pad_token_id

In [18]:
bart_tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=898823.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1355863.0), HTML(value='')))




In [19]:
import re

In [20]:
df_trn.columns

Index(['code', 'docstring'], dtype='object')

In [21]:
# Collapse every run of whitespace (including newlines) into a single space.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
df_trn['code_cleaned'] = df_trn['code'].apply(lambda c: re.sub(r"\s+", " ", c))

In [22]:
# Collapse every run of whitespace (including newlines) into a single space.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
df_tst['code_cleaned'] = df_tst['code'].apply(lambda c: re.sub(r"\s+", " ", c))

In [23]:
# Collapse every run of whitespace (including newlines) into a single space.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
df_val['code_cleaned'] = df_val['code'].apply(lambda c: re.sub(r"\s+", " ", c))

In [24]:
# Same whitespace normalisation for the docstrings of all three splits.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
df_trn['doc_cleaned'] = df_trn['docstring'].apply(lambda c: re.sub(r"\s+", " ", c))
df_tst['doc_cleaned'] = df_tst['docstring'].apply(lambda c: re.sub(r"\s+", " ", c))
df_val['doc_cleaned'] = df_val['docstring'].apply(lambda c: re.sub(r"\s+", " ", c))


In [25]:
for c in df_trn['code_cleaned']:
    print(c)
    break

def mtf_image_transformer_tiny(): hparams = mtf_image_transformer_base() hparams.hidden_size = 128 hparams.d_ff = 256 hparams.batch_size = 4 hparams.num_encoder_layers = 1 hparams.num_decoder_layers = 4 hparams.num_heads = 4 hparams.attention_key_size = 128 hparams.attention_value_size = 128 hparams.block_length = 32 hparams.mesh_shape = "batch:2" hparams.layout = "batch:batch" return hparams


In [26]:
train_dataset = CodeToDocstringDataset(bart_tokenizer,
                                       df_trn['code_cleaned'], df_trn['doc_cleaned'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=234017.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (5698 > 1024). Running this sequence through the model will result in indexing errors





HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=234017.0), HTML(value='')))




In [27]:
train_dataset.collate([train_dataset[0], train_dataset[1]])

(tensor([[    0,  9232,   475, 41407,  1215, 20094,  1215,  9981, 22098,  1215,
          40743, 49536,  1368, 49237,  5457,   475, 41407,  1215, 20094,  1215,
           9981, 22098,  1215, 11070, 43048,  1368, 49237,     4, 37392,  1215,
          10799,  5457, 13950,  1368, 49237,     4,   417,  1215,  3145,  5457,
          22078,  1368, 49237,     4, 35001,  1215, 10799,  5457,   204,  1368,
          49237,     4, 42666,  1215, 14210, 15362,  1215,   462, 24950,  5457,
            112,  1368, 49237,     4, 42666,  1215, 11127, 15362,  1215,   462,
          24950,  5457,   204,  1368, 49237,     4, 42666,  1215, 16560,  5457,
            204,  1368, 49237,     4,  2611, 19774,  1215,  5282,  1215, 10799,
           5457, 13950,  1368, 49237,     4,  2611, 19774,  1215, 19434,  1215,
          10799,  5457, 13950,  1368, 49237,     4, 16776,  1215, 16096,  5457,
           2107,  1368, 49237,     4,   119,  4891,  1215, 43882,  5457,    22,
          35001,    35,   176,   113,  1

In [28]:
validation_dataset = CodeToDocstringDataset(bart_tokenizer,
                                       df_val['code_cleaned'], df_val['doc_cleaned'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=13224.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=13224.0), HTML(value='')))




In [29]:
# shared_spm.vocab_size()

In [30]:
# model_config = BartConfig(
#     vocab_size=shared_spm.vocab_size(),
#     d_model=D_MODEL,
#     num_layers=NUM_LAYERS,
#     num_decoder_layers=NUM_DECODER_LAYERS,
#     d_ff=D_FF,
#     num_heads=NUM_HEADS, 
#     dropout_rate=DROPOUT_RATE,
#     decoder_start_token_id=shared_spm.bos_id(),
#     tie_word_embeddings = False
# )

model_config = BartConfig.from_pretrained("facebook/bart-base")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1627.0), HTML(value='')))




In [31]:
class BestModel:
    """Checkpoint holder for the best model seen so far.

    Stores the model's ``state_dict`` together with the criterion value that
    earned it the "best" title in a single file at ``path``.

    Args:
        path: file the checkpoint is written to / read from.
        initial_criterion: starting value of ``criterion`` before any update.
    """
    def __init__(self, path, initial_criterion):
        self.path = path
        self.criterion = initial_criterion
        
    def update(self, model, criterion):
        """Record ``criterion`` as the new best and overwrite the checkpoint file.

        No comparison is done here; the caller decides when to call this.
        """
        self.criterion = criterion
        torch.save({'model_state': model.state_dict(), 'criterion': criterion}, self.path)
        
    def load_model_data(self, map_location=None):
        """Read the checkpoint dict (keys ``'model_state'`` and ``'criterion'``).

        Args:
            map_location: forwarded to ``torch.load``; ``None`` keeps
                ``torch.load``'s default device placement.

        NOTE: ``torch.load`` unpickles the file, so only load checkpoints
        that come from a trusted source.
        """
        return torch.load(self.path, map_location=map_location)
    
    def restore(self, model):
        """Copy the saved weights into ``model`` in place.

        The checkpoint is loaded onto the CPU first so that no second copy
        of the weights is allocated on the accelerator; ``load_state_dict``
        then copies the values into the model's existing parameters.
        """
        model_data = self.load_model_data(map_location='cpu')
        model.load_state_dict(model_data['model_state'])

In [32]:
import torch.nn.functional as F

def masked_crossentropy(logits, targets, padding_idx):
    """Mean cross-entropy over all target positions that are not padding.

    Args:
        logits: tensor whose last dimension indexes the classes; all leading
            dimensions are flattened.
        targets: integer class ids with one entry per flattened logit row.
        padding_idx: target value that is excluded from the loss.

    Returns:
        A scalar tensor.
    """
    # reshape (rather than view) also accepts non-contiguous inputs such as
    # a sliced label tensor like ``yy[:, 1:]``.
    return F.cross_entropy(logits.reshape(-1, logits.size(-1)),
                           targets.reshape(-1), ignore_index=padding_idx)

In [33]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=557941479.0), HTML(value='')))




In [35]:
optim = torch.optim.Adam(lr=LEARNING_RATE,params=model.parameters())

In [36]:
scheduler = get_constant_schedule_with_warmup(optim, num_warmup_steps=NUM_WARMUP_STEPS)

In [37]:
next(iter(optim.param_groups))['lr']

0.0

In [38]:
bart_tokenizer.bos_token_id

0

In [39]:
def test_model(df, sample_size = 10, max_input_len=330):
    """Print generated docstrings for a random sample of rows of ``df``.

    Uses the notebook-level ``model`` and ``bart_tokenizer``. For every
    sampled row the cleaned code, the reference docstring and the decoded
    beam-search output are printed.

    Args:
        df: DataFrame with ``code_cleaned`` and ``docstring`` columns.
        sample_size: number of rows to sample (capped at ``len(df)``).
        max_input_len: input token ids beyond this position are cut off.
    """
    print("[Test model results]")
    
    sample = df.sample(min(sample_size, len(df)))
    
    # This function is also called in the middle of a training epoch, while
    # the model is in training mode. Generation has to run with dropout
    # switched off, so go to eval mode and put the previous mode back after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for idx, row in sample.iterrows():
                code = row['code_cleaned']
                inp = bart_tokenizer(code)['input_ids']
                inp = torch.tensor(inp).view(1,-1)[:,:max_input_len].clone()
                # Decoding starts from BOS, matching how the decoder inputs
                # are built in the training loop of this notebook.
                generated = model.generate(input_ids=inp.to(model.device), 
                                           decoder_start_token_id=bart_tokenizer.bos_token_id, 
                                           num_beams=10,
                                           max_length=25, no_repeat_ngram_size=4)
                generated = generated[0].cpu().tolist()
                print("---------------------------")
                print(f"Code: {code}")
                print(f"Target: {row.docstring}")
                print(f"Generated :{bart_tokenizer.decode(generated)}")
    finally:
        model.train(was_training)
            
    print("---------------------------")

In [49]:
test_model(df_tst, 5)

[Test model results]
---------------------------
Code: def _kl_uniform_uniform(a, b, name=None): with tf.name_scope(name or "kl_uniform_uniform"): final_batch_shape = distribution_util.get_broadcast_shape( a.low, b.low, a.high, b.high) dtype = dtype_util.common_dtype( [a.low, a.high, b.low, b.high], tf.float32) return tf.where((b.low <= a.low) & (a.high <= b.high), tf.math.log(b.high - b.low) - tf.math.log(a.high - a.low), tf.broadcast_to( dtype_util.as_numpy_dtype(dtype)(np.inf), final_batch_shape))
Target: Calculate the batched KL divergence KL(a || b) with a and b Uniform.

  Note that the KL divergence is infinite if the support of `a` is not a subset
  of the support of `b`.

  Args:
    a: instance of a Uniform distribution object.
    b: instance of a Uniform distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_uniform_uniform".

  Returns:
    Batchwise KL(a || b)
Generated :<s><s>def _kl_uniform_uniform(a, b, name=None): with tf.na

In [41]:
GEN_EVERY=25_000 // TRAIN_BATCH_SIZE

In [42]:
def compute_loss(model, encoder_inputs, decoder_inputs, decoder_labels, padding_idx):
    """Run one forward pass and return the padding-masked cross-entropy loss.

    Args:
        model: seq2seq model whose output contains ``'logits'``.
        encoder_inputs: token ids fed to the encoder.
        decoder_inputs: token ids fed to the decoder.
        decoder_labels: target token ids compared against the logits.
        padding_idx: id of the padding token; padded positions are masked
            out of both attention masks and ignored by the loss.

    Returns:
        A scalar loss tensor.
    """
    attention_mask = (encoder_inputs != padding_idx).type(torch.long)
    decoder_attention_mask = (decoder_inputs != padding_idx).type(torch.long)
            
    # Call the module itself instead of ``.forward`` so that registered
    # forward hooks are executed as well.
    pass_result = model(encoder_inputs, attention_mask, 
                        decoder_input_ids=decoder_inputs, 
                        decoder_attention_mask=decoder_attention_mask, return_dict=True)
    logits = pass_result['logits']
            
    loss = masked_crossentropy(logits, decoder_labels, padding_idx)
    
    return loss

def train_model(model, optimizer, lr_scheduler, scheduler_step_frequency,
                loaders, max_epochs, device, 
                best_model: BestModel, patience, padding_idx):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Args:
        model: the seq2seq model to optimise; passed to ``compute_loss``.
        optimizer: optimizer stepped once per training batch.
        lr_scheduler: learning-rate scheduler, or ``None`` to disable scheduling.
        scheduler_step_frequency: ``'step'`` to step the scheduler after every
            batch, ``'epoch'`` to step it once after each training epoch.
        loaders: mapping with ``'train'`` and ``'validation'`` entries, each
            yielding ``(xx, yy)`` batches.
        max_epochs: upper bound on the number of epochs.
        device: device the batches are moved to.
        best_model: checkpoint holder; updated whenever the mean validation
            loss drops below ``best_model.criterion``.
        patience: number of consecutive non-improving epochs tolerated before
            training stops; with ``0`` the first non-improving epoch stops it.
        padding_idx: padding token id forwarded to ``compute_loss``.

    Returns:
        None. Also relies on the notebook-level names ``GEN_EVERY``,
        ``df_tst`` and ``test_model`` to print sample generations.
    """
    allowed_epochs_without_improvement = patience
    
    for epoch in tqdm(range(max_epochs)):
        model.train()
        train_iter = tqdm(loaders['train'])
        running_sum_loss = 0.0
        running_total_batches = 0
        for i, (xx, yy) in enumerate(train_iter):
            optimizer.zero_grad()
            encoder_inputs = xx.to(device)
            # Teacher forcing: the decoder sees the target without its last
            # token and is scored against the target shifted left by one.
            decoder_inputs = yy[:,:-1].clone().to(device)
            decoder_labels = yy[:,1:].clone().to(device)

            loss = compute_loss(model, encoder_inputs, decoder_inputs, decoder_labels, padding_idx)
            
            loss.backward()
            optimizer.step()
            if lr_scheduler is not None and scheduler_step_frequency == 'step':
                lr_scheduler.step()
            
            running_sum_loss += loss.item()
            running_total_batches += 1
            
            # Show the running mean and the current batch loss on the progress bar.
            train_iter.set_postfix({
                "avg_train_loss": np.round(running_sum_loss / running_total_batches, 4),
                "train_step_loss": np.round(loss.item(), 4)
            })
            
            # Print sample generations every GEN_EVERY batches; this also
            # fires on the very first batch of each epoch (i == 0).
            if i % GEN_EVERY == 0:
                test_model(df_tst, 5)

        if lr_scheduler is not None and scheduler_step_frequency == 'epoch':
            lr_scheduler.step()
            
        # Validation pass: no gradients, model in eval mode.
        with torch.no_grad():
            model.eval()
            
            val_iter = tqdm(loaders['validation'])
            
            running_sum_loss = 0.0
            running_total_batches = 0
            
            for (xx, yy) in val_iter:
                encoder_inputs = xx.to(device)
                decoder_inputs = yy[:,:-1].clone().to(device)
                decoder_labels = yy[:,1:].clone().to(device)
                
                loss = compute_loss(model, encoder_inputs, decoder_inputs, decoder_labels, padding_idx)
                
                running_sum_loss += loss.item()
                running_total_batches += 1
                
                val_iter.set_postfix({
                    "avg_val_loss": np.round(running_sum_loss / running_total_batches, 4),
                    "val_step_loss": np.round(loss.item(), 4)
                })
                
            # Mean of the per-batch validation losses for this epoch.
            final_val_loss = running_sum_loss / running_total_batches
            prev_loss = best_model.criterion
            if final_val_loss < prev_loss:
                # Improvement: save a checkpoint and reset the patience counter.
                best_model.update(model, final_val_loss)
                print(f"Model saved due to improvement in 'val_loss'"\
                      f" from {prev_loss} to {final_val_loss}")
                allowed_epochs_without_improvement = patience
            else:
                # No improvement: stop once the patience budget is used up.
                # The model is left as it is; the best weights are not restored here.
                if allowed_epochs_without_improvement <= 0:
                    print(f"Training stopped due to no improvement in {patience} epochs")
                    return
                allowed_epochs_without_improvement -= 1
           
        test_model(df_tst, 5)
        

In [43]:
torch.set_num_threads(4)

In [44]:
dataloaders = {}
dataloaders['train'] = DataLoader(train_dataset, TRAIN_BATCH_SIZE, shuffle=True, collate_fn=train_dataset.collate)
dataloaders['validation'] = DataLoader(validation_dataset,
                                       VAL_BATCH_SIZE, collate_fn=validation_dataset.collate)

In [45]:
MODEL_VERSION = "rt_bart"

In [46]:
best_model = BestModel(f"model_{MODEL_VERSION}", initial_criterion=10000)

In [47]:
device = 'cuda'

In [48]:
model = model.to(device)

In [50]:
# del model

In [51]:
train_model(model, 
            optimizer=optim, 
            lr_scheduler=scheduler, 
            scheduler_step_frequency='step',
            loaders=dataloaders, 
            max_epochs=MAX_EPOCHS, 
            device=device,
            best_model=best_model,
            patience=PATIENCE, 
            padding_idx=bart_tokenizer.pad_token_id
            )

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=29253.0), HTML(value='')))

[Test model results]
---------------------------
Code: def call(self, inputs): collapsed_shape = tf.concat(([-1], tf.shape(input=inputs)[-2:]), axis=0) out = tf.reshape(inputs, collapsed_shape) out = self.bilstm(out) expanded_shape = tf.concat((tf.shape(input=inputs)[:-2], [-1]), axis=0) out = tf.reshape(out, expanded_shape) out = self.output_layer(out) loc = out[..., :self.latent_size] scale_diag = tf.nn.softplus(out[..., self.latent_size:]) + 1e-5 return tfd.MultivariateNormalDiag(loc=loc, scale_diag=scale_diag)
Target: Runs the model to generate a distribution `q(f | x_{1:T})`.

    This generates a list of batched MultivariateNormalDiag
    distributions using the output of the recurrent model at each
    timestep to parameterize each distribution.

    Args:
      inputs: A batch of intermediate representations of image frames
        across all timesteps, of shape [..., batch_size, timesteps,
        hidden_size].

    Returns:
      A batched MultivariateNormalDiag distribution 

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1653.0), HTML(value='')))


Model saved due to improvement in 'val_loss' from 10000 to 2.834005860653489
[Test model results]
---------------------------
Code: def parse_log(log_file): template = OrderedDict([ ("clean_len", 0), ("total_trim", 0), ("total_trim_perc", 0), ("5trim", 0), ("3trim", 0), ("bad_reads", 0) ]) with open(log_file) as fh: for line in fh: fields = [int(x) for x in line.strip().split()[-4:]] if not fields[0]: template["bad_reads"] += 1 template["5trim"] += fields[1] template["3trim"] += fields[3] template["total_trim"] += fields[1] + fields[3] template["clean_len"] += fields[0] total_len = template["clean_len"] + template["total_trim"] if total_len: template["total_trim_perc"] = round( (template["total_trim"] / total_len) * 100, 2) else: template["total_trim_perc"] = 0 return template
Target: Retrieves some statistics from a single Trimmomatic log file.

    This function parses Trimmomatic's log file and stores some trimming
    statistics in an :py:class:`OrderedDict` object. This object co

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=29253.0), HTML(value='')))

[Test model results]
---------------------------
Code: def normalize_profile(in_profile, log=False, return_offset = True): if log: tmp_prefactor = in_profile.max(axis=1) tmp_prof = np.exp(in_profile.T - tmp_prefactor).T else: tmp_prefactor = 0.0 tmp_prof = in_profile norm_vector = tmp_prof.sum(axis=1) return (np.copy(np.einsum('ai,a->ai',tmp_prof,1.0/norm_vector)), (np.log(norm_vector) + tmp_prefactor) if return_offset else None)
Target: return a normalized version of a profile matrix

    Parameters
    ----------
    in_profile : np.array
        shape Lxq, will be normalized to one across each row
    log : bool, optional
        treat the input as log probabilities
    return_offset : bool, optional
        return the log of the scale factor for each row

    Returns
    -------
    tuple
        normalized profile (fresh np object) and offset (if return_offset==True)
Generated :<s>Normalize a profile. Parameters ---------- in_profile : numpy.ndarray, shape = [n_</s>
--------------

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1653.0), HTML(value='')))


Model saved due to improvement in 'val_loss' from 2.834005860653489 to 2.752179807413728
[Test model results]
---------------------------
Code: def action_logging(f): @functools.wraps(f) def wrapper(*args, **kwargs): assert args assert isinstance(args[0], Namespace), "1st positional argument should be argparse.Namespace instance, " "but {}".format(args[0]) metrics = _build_metrics(f.__name__, args[0]) cli_action_loggers.on_pre_execution(**metrics) try: return f(*args, **kwargs) except Exception as e: metrics['error'] = e raise finally: metrics['end_datetime'] = datetime.utcnow() cli_action_loggers.on_post_execution(**metrics) return wrapper
Target: Decorates function to execute function at the same time submitting action_logging
    but in CLI context. It will call action logger callbacks twice,
    one for pre-execution and the other one for post-execution.

    Action logger will be called with below keyword parameters:
        sub_command : name of sub-command
        start_datetim

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=29253.0), HTML(value='')))

[Test model results]
---------------------------
Code: def reg_on_exit(self, callable_object, *args, **kwargs): persistent = kwargs.pop('persistent', False) event = self._create_event(callable_object, 'exit', persistent, *args, **kwargs) self.exit_callbacks.append(event) return event
Target: Register a function/method to be called on program exit,
        will get executed regardless of successs/failure of the program running
Generated :<s>Registers an event to be executed when an exception is raised. Args: callable_object (callable</s>
---------------------------
Code: def signin_card(card: SigninCard) -> Attachment: if not isinstance(card, SigninCard): raise TypeError('CardFactory.signin_card(): `card` argument is not an instance of an SigninCard, ' 'unable to prepare attachment.') return Attachment(content_type=CardFactory.content_types.signin_card, content=card)
Target: Returns an attachment for a signin card. For channels that don't natively support signin cards an alternative
   

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1653.0), HTML(value='')))


Model saved due to improvement in 'val_loss' from 2.752179807413728 to 2.7199637416774984
[Test model results]
---------------------------
Code: def create_build(self, tarball_url, env=None, app_name=None): data = { 'source_blob': { 'url': tarball_url } } if env: data['overrides'] = {'env': env} if app_name: data['app'] = {'name': app_name} return self.api_request('POST', '/app-setups', data=data)
Target: Creates an app-setups build. Returns response data as a dict.

        :param tarball_url: URL of a tarball containing an ``app.json``.
        :param env: Dict containing environment variable overrides.
        :param app_name: Name of the Heroku app to create.
        :returns: Response data as a ``dict``.
Generated :<s>Create a new app. :param tarball_url: :param env: :param app_name: :</s>
---------------------------
Code: def group_transactions(self): groups = [] if self: last_txn = self.tail_transaction current_group = [last_txn] for current_txn in self.transactions[1:]: if cur

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=29253.0), HTML(value='')))

[Test model results]
---------------------------
Code: def _num_tasks_per_fetch_process(self): return max(1, int(math.ceil(1.0 * len(self.tasks) / self._sync_parallelism)))
Target: How many Celery tasks should be sent to each worker process.

        :return: Number of tasks that should be used per process
        :rtype: int
Generated :<s>Returns the number of tasks per fetch process.</s>
---------------------------
Code: def anneal(self): strip = self.copy() gaps = strip.find_gaps(index=True) if not gaps: return for gap in gaps: before = strip[gap] after = strip[gap + 1] if strip.order == 'depth': t = (after.top.z-before.base.z)/2 before.base = before.base.z + t after.top = after.top.z - t else: t = (after.base-before.top)/2 before.top = before.top.z + t after.base = after.base.z - t return strip
Target: Fill in empty intervals by growing from top and base.

        Note that this operation happens in-place and destroys any information
        about the ``Position`` (e.g. metadata as

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1653.0), HTML(value='')))


Model saved due to improvement in 'val_loss' from 2.7199637416774984 to 2.7086113689887616
[Test model results]
---------------------------
Code: def authentication(login, password): session = requests.Session() response = session.get('https://m.vk.com') url = re.search(r'action="([^\"]+)"', response.text).group(1) data = {'email': login, 'pass': password} response = session.post(url, data=data) return session
Target: Authentication on vk.com.

    :param login: login on vk.com.
    :param password: password on vk.com.
    :returns: `requests.Session` session with cookies.
Generated :<s>Authenticate against m.vk.com.</s>
---------------------------
Target: determine if a level should print to
        stderr, includes all levels but INFO and QUIET
Generated :<s>Emit an error. Args: level (int): The level of error. Returns: bool: True if</s>
---------------------------
Code: def possible_exc_types(node): excs = [] if isinstance(node.exc, astroid.Name): inferred = utils.safe_infer(node.e

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=29253.0), HTML(value='')))

[Test model results]
---------------------------
Code: def _call(self, utterances_batch: list, utterances_ids: Optional[list]=None) -> list: batch_size = len(utterances_batch) ids = utterances_ids or list(range(batch_size)) batch_history = [self.history[utt_id] for utt_id in ids] responses = [] filtered = self.skills_filter(utterances_batch, batch_history) for skill_i, (filtered_utterances, skill) in enumerate(zip(filtered, self.wrapped_skills)): skill_i_utt_indexes = [utt_index for utt_index, utt_filter in enumerate(filtered_utterances) if utt_filter] if skill_i_utt_indexes: skill_i_utt_batch = [utterances_batch[i] for i in skill_i_utt_indexes] skill_i_utt_ids = [ids[i] for i in skill_i_utt_indexes] res = [(None, 0.)] * batch_size predicted, confidence = skill(skill_i_utt_batch, skill_i_utt_ids) for i, predicted, confidence in zip(skill_i_utt_indexes, predicted, confidence): res[i] = (predicted, confidence) responses.append(res) responses = self.skills_processor(utterances_batch, batc

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1653.0), HTML(value='')))


Training stopped due to no improvement in 0 epochs



In [None]:
torch.cuda.empty_cache()

In [None]:
# The model trained in this notebook is BART fine-tuned on ids produced by
# `bart_tokenizer`, so the same tokenizer has to encode the input and decode
# the output. (The SentencePiece model `shared_spm` uses a different vocabulary.)
model.eval()
with torch.no_grad():
    code = "def foo(x,y): x + y"
    inp = bart_tokenizer(code)['input_ids']
    inp = torch.tensor(inp).view(1,-1)
    generated = model.generate(input_ids=inp.to(model.device), 
                               decoder_start_token_id=bart_tokenizer.bos_token_id, 
                               num_beams=10,
                               max_length=100, no_repeat_ngram_size=4)
    generated = generated[0].cpu().tolist()
    print(bart_tokenizer.decode(generated))