In [1]:
!pip install --upgrade transformers
!pip install sentencepiece
!pip install pandas



In [2]:
import re
import io
import os
import glob
import tokenize
import sentencepiece as sp
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from transformers import get_constant_schedule_with_warmup
from transformers.models.t5 import T5Model, T5Config, T5ForConditionalGeneration
from transformers.models.bart import BartForConditionalGeneration, BartConfig

In [3]:
# Constants

path = "."
MODEL_VERSION = 'result'
SAMPLE_SIZE = 0.4
VOCAB_SIZE = 24_000
P_BPE = 0.2
D_MODEL = 512
NUM_LAYERS = 6
NUM_DECODER_LAYERS = 6
D_FF = 1024
NUM_HEADS = 8
DROPOUT_RATE = 0.1
LEARNING_RATE = 5e-5
NUM_WARMUP_STEPS = 500
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 8
MAX_EPOCHS = 8
PATIENCE = 0

In [4]:
# ! wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
! unzip python.zip

Archive:  python.zip
   creating: python/
   creating: python/final/
   creating: python/final/jsonl/
   creating: python/final/jsonl/train/
  inflating: python/final/jsonl/train/python_train_9.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_12.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_10.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_0.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_6.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_2.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_4.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_8.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_11.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_5.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_13.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_3.jsonl.gz  
  inflating: python/final/jsonl/train/python_train_1.jsonl.gz  
  inflating: python/fin

In [5]:
def jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f,
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

def get_dfs(path):
    """Grabs the different data splits and converts them into dataframes"""
    dfs = []
    for split in ["train", "valid", "test"]:
        split_dir = os.path.join(path, split)
        files = []
        for file in os.listdir(split_dir):
            if file.endswith('.gz'):
                files.append(os.path.join(split_dir, file))
        files = sorted(files)
        print(files)
        df = jsonl_list_to_dataframe(files, ["code", "docstring"])
        dfs.append(df)
        
    return dfs

df_trn, df_val, df_tst = get_dfs("python/final/jsonl")
df_trn.head()

['python/final/jsonl/train/python_train_0.jsonl.gz', 'python/final/jsonl/train/python_train_1.jsonl.gz', 'python/final/jsonl/train/python_train_10.jsonl.gz', 'python/final/jsonl/train/python_train_11.jsonl.gz', 'python/final/jsonl/train/python_train_12.jsonl.gz', 'python/final/jsonl/train/python_train_13.jsonl.gz', 'python/final/jsonl/train/python_train_2.jsonl.gz', 'python/final/jsonl/train/python_train_3.jsonl.gz', 'python/final/jsonl/train/python_train_4.jsonl.gz', 'python/final/jsonl/train/python_train_5.jsonl.gz', 'python/final/jsonl/train/python_train_6.jsonl.gz', 'python/final/jsonl/train/python_train_7.jsonl.gz', 'python/final/jsonl/train/python_train_8.jsonl.gz', 'python/final/jsonl/train/python_train_9.jsonl.gz']
['python/final/jsonl/valid/python_valid_0.jsonl.gz']
['python/final/jsonl/test/python_test_0.jsonl.gz']


Unnamed: 0,code,docstring
0,"def train(train_dir, model_save_path=None, n_n...",Trains a k-nearest neighbors classifier for fa...
1,"def predict(X_img_path, knn_clf=None, model_pa...",Recognizes faces in given image using a traine...
2,"def show_prediction_labels_on_image(img_path, ...",Shows the face recognition results visually.\n...
3,"def _rect_to_css(rect):\n """"""\n Convert ...",Convert a dlib 'rect' object to a plain tuple ...
4,"def _trim_css_to_bounds(css, image_shape):\n ...","Make sure a tuple in (top, right, bottom, left..."


In [6]:
def isASCII(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

df_trn = df_trn.sample(frac = SAMPLE_SIZE)
df_val = df_val.sample(frac = SAMPLE_SIZE)
df_tst = df_tst.sample(frac = SAMPLE_SIZE)

df_trn = df_trn[df_trn['docstring'].apply(lambda x: isASCII(x))]
df_val = df_val[df_val['docstring'].apply(lambda x: isASCII(x))]
df_tst = df_tst[df_tst['docstring'].apply(lambda x: isASCII(x))]

df_trn = df_trn[df_trn['code'].apply(lambda x: isASCII(x))]
df_val = df_val[df_val['code'].apply(lambda x: isASCII(x))]
df_tst = df_tst[df_tst['code'].apply(lambda x: isASCII(x))]

In [7]:
def remove_comments_and_docstrings(source):
    io_obj = io.StringIO(source)
    out = ""
    prev_toktype = tokenize.INDENT
    last_lineno = -1
    last_col = 0
    for tok in tokenize.generate_tokens(io_obj.readline):
        token_type = tok[0]
        token_string = tok[1]
        start_line, start_col = tok[2]
        end_line, end_col = tok[3]
        ltext = tok[4]
        if start_line > last_lineno:
            last_col = 0
        if start_col > last_col:
            out += (" " * (start_col - last_col))
        if token_type == tokenize.COMMENT:
            pass
        elif token_type == tokenize.STRING:
            if prev_toktype != tokenize.INDENT:
                if prev_toktype != tokenize.NEWLINE:
                    if start_col > 0:
                        out += token_string
        else:
            out += token_string
        prev_toktype = token_type
        last_col = end_col
        last_lineno = end_line
    out = '\n'.join(l for l in out.splitlines() if l.strip())
    return out

def filter_docstrings(df):
    methods = []
    comments = []
    for i, row in tqdm(list(df.iterrows())):
        code = row["code"]
        methods.append(remove_comments_and_docstrings(code))
        comments.append(row["docstring"]) 
        
    new_df = pd.DataFrame(zip(methods, comments), columns = ["code", "docstring"])

    return new_df

df_trn = filter_docstrings(df_trn);
df_val = filter_docstrings(df_val);
df_tst = filter_docstrings(df_tst);

  0%|          | 0/162210 [00:00<?, ?it/s]

  0%|          | 0/9010 [00:00<?, ?it/s]

  0%|          | 0/8788 [00:00<?, ?it/s]

In [8]:
df_trn = df_trn[~(df_trn['docstring'] == '')]
df_val = df_val[~(df_val['docstring'] == '')]
df_tst = df_tst[~(df_tst['docstring'] == '')]

df_trn = df_trn[~df_trn['docstring'].duplicated()]
df_val = df_val[~df_val['docstring'].duplicated()]
df_tst = df_tst[~df_tst['docstring'].duplicated()]

len(df_trn), len(df_val), len(df_tst)

(157263, 8854, 8660)

In [9]:
df_trn.itertuples()

<map at 0x7f5ddba8b1f0>

In [10]:
def df_to_txt_file(df, output, cols):
    """Converts a dataframe and converts it into a text file that SentencePiece can use to train a BPE model"""
    file_path = os.path.join(output,'text.txt')
    with open(file_path, 'w') as f:
        for i, col in enumerate(cols):
            f.write('\n'.join(list(df[col])))
            f.write('\n')
            
    return file_path

def gen_sp_model(df, output, tokenizer_name, cols, vocab_size=8000):
    """Trains a SentencePiece BPE model from a pandas dataframe"""
    fname = df_to_txt_file(df, output, cols)
    sp.SentencePieceTrainer.train(f'--input={fname} --model_prefix={os.path.join(output, tokenizer_name)}'\
                                  f' --hard_vocab_limit=false --vocab_size={vocab_size}'\
                                  ' --unk_id=0 --pad_id=1 --bos_id=2 --eos_id=3')

In [11]:
# os.makedirs('shared_bpe', exist_ok=True)

In [12]:
shared_bpe_path = 'shared_bpe'

In [13]:
shared_tokenizer = 'shared_bpe'
# gen_sp_model(df_trn.sample(frac = P_BPE), 'shared_bpe', shared_tokenizer, ["code", "docstring"], vocab_size=VOCAB_SIZE)

In [14]:
shared_spm = sp.SentencePieceProcessor()
shared_spm.Load(os.path.join(shared_bpe_path, shared_tokenizer + '.model'))

True

In [15]:
shared_spm.encode_as_ids("def bar(x,y):")

[25, 3244, 7, 82, 6, 165, 17]

In [16]:
shared_spm.encode_as_pieces("def bar(x,y):")

['▁def', '▁bar', '(', 'x', ',', 'y', '):']

In [17]:
en = shared_spm.tokenize("def bar(x,y):", int,add_bos=True, add_eos=True,)
en

[2, 25, 3244, 7, 82, 6, 165, 17, 3]

In [18]:
df_trn.head(5)

Unnamed: 0,code,docstring
0,"def transform(self, trans):\n if isinst...",Compute a transformation in place using a 4x4 ...
1,"def perform_word_selection(self, event=None):\...",Performs word selection\n :param event:...
2,"def create(self, stream, start, parameters, so...",Create a hitorics preview job.\n\n ...
3,"def reset_object(self, driver_wrapper=None):\n...",Reset each page element object\n\n :par...
4,"def _mom(self, k, dist, length, cache):\n ...",Moment generating function.\n\n Example...


In [19]:
shared_spm.pad_id()

1

In [20]:
class CodeToDocstringDataset(Dataset):
    def __init__(self, method_spm, comment_spm,
                 codes, docstrings,
                 max_code_len=340, max_docstring_len=340
                ):
        self.method_spm = method_spm
        self.comment_spm = comment_spm
        self.codes = [method_spm.tokenize(c, int) for c in tqdm(codes)]
        self.docstrings = [comment_spm.tokenize(d, int,
                                                add_bos=True, add_eos=True) for d in tqdm(docstrings)]
        self.max_code_len = max_code_len
        self.max_docstring_len = max_docstring_len
        
    def __getitem__(self, idx):
        return (self.codes[idx], self.docstrings[idx])
    
    def __len__(self):
        return len(self.codes)

    def collate(self, rows):
        codes = [torch.tensor(row[0][:self.max_code_len], 
                              dtype=torch.long) for row in rows]
        
        comments = [torch.tensor(row[1][:self.max_docstring_len],
                                 dtype=torch.long) for row in rows]
        
        code_tensor = pad_sequence(codes,batch_first=True,padding_value=self.method_spm.pad_id())
        doc_tensor = pad_sequence(comments,batch_first=True,padding_value=self.comment_spm.pad_id())
        
        return code_tensor, doc_tensor
        

In [21]:
train_dataset = CodeToDocstringDataset(shared_spm, shared_spm, 
                                       df_trn['code'], df_trn['docstring'])

  0%|          | 0/157263 [00:00<?, ?it/s]

  0%|          | 0/157263 [00:00<?, ?it/s]

In [22]:
validation_dataset = CodeToDocstringDataset(shared_spm, shared_spm, 
                                       df_val['code'], df_val['docstring'])

  0%|          | 0/8854 [00:00<?, ?it/s]

  0%|          | 0/8854 [00:00<?, ?it/s]

In [23]:
shared_spm.vocab_size()

24000

In [24]:
model_config = BartConfig(
    vocab_size=shared_spm.vocab_size(),
    d_model=D_MODEL,
    num_layers=NUM_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    d_ff=D_FF,
    num_heads=NUM_HEADS, 
    dropout_rate=DROPOUT_RATE,
    decoder_start_token_id=shared_spm.bos_id(),
    tie_word_embeddings = False
)

In [25]:
class BestModel:
    def __init__(self, path, initial_criterion):
        self.path = path
        self.criterion = initial_criterion
        
    def update(self, model, criterion):
        self.criterion = criterion
        torch.save({'model_state': model.state_dict(), 'criterion': criterion}, self.path)
        
    def load_model_data(self):
        return torch.load(self.path)
    
    def restore(self, model):
        model_data = self.load_model_data()
        model.load_state_dict(model_data['model_state'])

In [26]:
import torch.nn.functional as F

def masked_crossentropy(logits, targets, padding_idx):
    return F.cross_entropy(logits.view(-1, logits.size(-1)),
                           targets.view(-1), ignore_index=padding_idx)

In [27]:
model = BartForConditionalGeneration(model_config)

In [28]:
optim = torch.optim.Adam(lr=LEARNING_RATE,params=model.parameters())

In [29]:
scheduler = get_constant_schedule_with_warmup(optim, num_warmup_steps=NUM_WARMUP_STEPS)

In [30]:
next(iter(optim.param_groups))['lr']

0.0

In [31]:
def test_model(df, sample_size = 10):
    print("[Test model results]")
    
    sample = df.sample(sample_size)
    
    with torch.no_grad():
        for idx, row in sample.iterrows():
            code = row.code
            inp = shared_spm.tokenize(code)
            inp = torch.tensor(inp).view(1,-1)
            generated = model.generate(input_ids=inp.to(model.device), 
                                       decoder_start_token_id=shared_spm.bos_id(), 
                                       num_beams=10,
                                       max_length=25, no_repeat_ngram_size=4)
            generated = generated[0].cpu().tolist()
            print("---------------------------")
            print(f"Code: {code}")
            print(f"Target: {row.docstring}")
            print(f"Generated :{shared_spm.decode(generated)}")
            
    print("---------------------------")

In [32]:
def compute_loss(model, encoder_inputs, decoder_inputs, decoder_labels, padding_idx):
    attention_mask = (encoder_inputs != padding_idx).type(torch.long)
    decoder_attention_mask = (decoder_inputs != padding_idx).type(torch.long)
            
    pass_result = model.forward(encoder_inputs, attention_mask, 
                          decoder_input_ids=decoder_inputs, 
                          decoder_attention_mask=decoder_attention_mask, return_dict=True)
    logits = pass_result['logits']
            
    loss = masked_crossentropy(logits, decoder_labels, padding_idx)
    
    return loss

def train_model(model, optimizer, lr_scheduler, scheduler_step_frequency,
                loaders, max_epochs, device, 
                best_model: BestModel, patience, padding_idx):
    allowed_epochs_without_improvement = patience
    
    for epoch in tqdm(range(max_epochs)):
        model.train()
        train_iter = tqdm(loaders['train'])
        running_sum_loss = 0.0
        running_total_batches = 0
        for (xx, yy) in train_iter:
            optimizer.zero_grad()
            encoder_inputs = xx.to(device)
            decoder_inputs = yy[:,:-1].clone().to(device)
            decoder_labels = yy[:,1:].clone().to(device)

            loss = compute_loss(model, encoder_inputs, decoder_inputs, decoder_labels, padding_idx)
            
            loss.backward()
            optimizer.step()
            if lr_scheduler is not None and scheduler_step_frequency == 'step':
                lr_scheduler.step()
            
            running_sum_loss += loss.item()
            running_total_batches += 1
            
            train_iter.set_postfix({
                "avg_train_loss": np.round(running_sum_loss / running_total_batches, 4),
                "train_step_loss": np.round(loss.item(), 4)
            })

        if lr_scheduler is not None and scheduler_step_frequency == 'epoch':
            lr_scheduler.step()
            
        with torch.no_grad():
            model.eval()
            
            val_iter = tqdm(loaders['validation'])
            
            running_sum_loss = 0.0
            running_total_batches = 0
            
            for (xx, yy) in val_iter:
                encoder_inputs = xx.to(device)
                decoder_inputs = yy[:,:-1].clone().to(device)
                decoder_labels = yy[:,1:].clone().to(device)
                
                loss = compute_loss(model, encoder_inputs, decoder_inputs, decoder_labels, padding_idx)
                
                running_sum_loss += loss.item()
                running_total_batches += 1
                
                val_iter.set_postfix({
                    "avg_val_loss": np.round(running_sum_loss / running_total_batches, 4),
                    "val_step_loss": np.round(loss.item(), 4)
                })
                
            final_val_loss = running_sum_loss / running_total_batches
            prev_loss = best_model.criterion
            if final_val_loss < prev_loss:
                best_model.update(model, final_val_loss)
                print(f"Model saved due to improvement in 'val_loss'"\
                      f" from {prev_loss} to {final_val_loss}")
                allowed_epochs_without_improvement = patience
            else:
                if allowed_epochs_without_improvement <= 0:
                    print(f"Training stopped due to no improvement in {patience} epochs")
                    return
                allowed_epochs_without_improvement -= 1
           
        test_model(df_tst, 5)
        

In [33]:
dataloaders = {}
dataloaders['train'] = DataLoader(train_dataset, TRAIN_BATCH_SIZE, shuffle=True, collate_fn=train_dataset.collate)
dataloaders['validation'] = DataLoader(validation_dataset,
                                       VAL_BATCH_SIZE, collate_fn=validation_dataset.collate)

In [34]:
best_model = BestModel(f"model_{MODEL_VERSION}", initial_criterion=10000)

In [35]:
device = 'cuda'

In [36]:
model = model.to(device)

In [None]:
train_model(model, 
            optimizer=optim, 
            lr_scheduler=scheduler, 
            scheduler_step_frequency='step',
            loaders=dataloaders, 
            max_epochs=MAX_EPOCHS, 
            device=device,
            best_model=best_model,
            patience=PATIENCE, 
            padding_idx=shared_spm.pad_id()
            )

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/19658 [00:00<?, ?it/s]

  0%|          | 0/1107 [00:00<?, ?it/s]

Model saved due to improvement in 'val_loss' from 10000 to 4.6164942726748635
[Test model results]


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


---------------------------
Code: def from_ascii(path, seperator=None, names=True, skip_lines=0, skip_after=0, **kwargs):
    import vaex.ext.readcol as rc
    ds = vaex.dataframe.DataFrameArrays(path)
    if names not in [True, False]:
        namelist = names
        names = False
    else:
        namelist = None
    data = rc.readcol(path, fsep=seperator, asdict=namelist is None, names=names, skipline=skip_lines, skipafter=skip_after, **kwargs)
    if namelist:
        for name, array in zip(namelist, data.T):
            ds.add_column(name, array)
    else:
        for name, array in data.items():
            ds.add_column(name, array)
    return ds
Target: Create an in memory DataFrame from an ascii file (whitespace seperated by default).

    >>> ds = vx.from_ascii("table.asc")
    >>> ds = vx.from_ascii("table.csv", seperator=",", names=["x", "y", "z"])

    :param path: file path
    :param seperator: value seperator, by default whitespace, use "," for comma seperated values.


  0%|          | 0/19658 [00:00<?, ?it/s]

  0%|          | 0/1107 [00:00<?, ?it/s]

Model saved due to improvement in 'val_loss' from 4.6164942726748635 to 4.315538340467747
[Test model results]
---------------------------
Code: def diode_spec(calON_obs,calOFF_obs,calflux,calfreq,spec_in,average=True,oneflux=False,**kwargs):
    obs = Waterfall(calON_obs,max_load=150)
    freqs = obs.populate_freqs()
    ncoarse = obs.calc_n_coarse_chan()
    nchans = obs.header['nchans']
    chan_per_coarse = nchans/ncoarse
    f_ON, f_OFF = f_ratios(calON_obs,calOFF_obs,chan_per_coarse,**kwargs)
    centerfreqs = get_centerfreqs(freqs,chan_per_coarse)
    calfluxes = get_calfluxes(calflux,calfreq,spec_in,centerfreqs,oneflux)
    C_o = calfluxes/(1/f_ON-1/f_OFF)
    Tsys = C_o/f_OFF
    if average==True:
        return np.mean(C_o),np.mean(Tsys)
    else:
        return C_o,Tsys
Target: Calculate the coarse channel spectrum and system temperature of the noise diode in Jy given two noise diode
    measurements ON and OFF the calibrator source with the same frequency and time resolutio

  0%|          | 0/19658 [00:00<?, ?it/s]

  0%|          | 0/1107 [00:00<?, ?it/s]

Model saved due to improvement in 'val_loss' from 4.315538340467747 to 4.108572504815478
[Test model results]
---------------------------
Code: async def close(self):
        if self._is_closed:
            return
        else:
            await self.http.close()
            self._is_closed = True
Target: This function is a coroutine.

        Closes all connections.
Generated :Gets the ``OsidSession`` associated with the objective bank. arg: proxy (osid.proxy.Proxy): a
---------------------------
Code: def start(host, port, profiler_stats, dont_start_browser, debug_mode):
    stats_handler = functools.partial(StatsHandler, profiler_stats)
    if not debug_mode:
        sys.stderr = open(os.devnull, 'w')
    print('Starting HTTP server...')
    if not dont_start_browser:
        webbrowser.open('http://{}:{}/'.format(host, port))
    try:
        StatsServer((host, port), stats_handler).serve_forever()
    except KeyboardInterrupt:
        print('Stopping...')
        sys.exit(0)
Targe

  0%|          | 0/19658 [00:00<?, ?it/s]

In [None]:
torch.cuda.empty_cache()

In [None]:
with torch.no_grad():
    code = "def foo(x,y): x + y"
    inp = shared_spm.tokenize(code)
    inp = torch.tensor(inp).view(1,-1)
    generated = model.generate(input_ids=inp.to(model.device), 
                               decoder_start_token_id=shared_spm.bos_id(), 
                               num_beams=10,
                               max_length=100, no_repeat_ngram_size=4)
    generated = generated[0].cpu().tolist()
    print(shared_spm.decode(generated))