# After getting files

## Insert into database

In [1]:
from pathlib import Path
# Reuse `dataset` when an earlier run (or the user) already defined it;
# otherwise fall back to ./dataset resolved against the current directory.
# Only NameError means "not defined yet" - a bare `except:` would also
# swallow KeyboardInterrupt and genuine errors.
try:
    dataset
except NameError:
    print('getting dataset location')
    dataset = Path('dataset').absolute()


getting dataset location


In [2]:
import os
# Make the dataset root the working directory.
os.chdir(dataset)

# Directory holding the JavaScript part of the dataset.
js_dir = dataset.joinpath('javascript')


In [3]:
import xml.etree.ElementTree as ET

In [4]:
import sqlite3, json
from tqdm import tqdm
from contextlib import contextmanager

# Training split directory; it becomes the working directory, so relative
# paths used later (such as the default database name) resolve inside it.
train = js_dir.joinpath('final', 'jsonl', 'train')
os.chdir(train)

@contextmanager
def get_cursor(database_name='rsn_train'):
    """Yield a cursor on the SQLite database `database_name`.

    The transaction is committed when the ``with`` body finishes normally and
    rolled back when it raises; in both cases the connection is then closed.
    sqlite3's own connection context manager only ends the transaction and
    leaves the connection open, so without the explicit close every call
    would leak a connection.

    The cursor, and anything read lazily from it, must be consumed inside the
    ``with`` body: it is unusable once the connection has been closed.
    """
    conn = sqlite3.connect(database_name)
    try:
        with conn:
            yield conn.cursor()
    finally:
        conn.close()




In [5]:
import regex

def simple_parse_xml(content, nested):
    pattern = r'\s*<([^\s]*?)>\s*'
    open = re.search(pattern, content)
    if not open:
        return [content]
    [open_start, open_end] = open.span()
    before = content[:open_start]
    tag = open.group(1)
    inner_and_after = content[open_end:]
    close = re.search(f'</{re.escape(tag)}>', inner_and_after)
    if close:
        [close_start, close_end] = close.span()
    else:
        close = re.search(pattern, inner_and_after)
        if close:
            [close_start, _] = close.span()
        else:
            close_start = len(inner_and_after)
        close_end = close_start
    inner = inner_and_after[:close_start]
    after = inner_and_after[close_end:]
    inner = simple_parse_xml(inner, nested) if nested else inner
    return [[before, tag, inner], *simple_parse_xml(after, nested)]

def atom_to_re(s):
    """Turn a code fragment into a whitespace-insensitive regex.

    The fragment is split on whitespace and at every boundary next to a
    non-word character; each non-empty piece is escaped literally and the
    pieces are joined so that any amount of whitespace (including none) may
    appear between them and around the whole fragment.
    """
    flexible_space = r'\s*'
    escaped = []
    for piece in re.split(r'\s+|(?=\W)|(?<=\W)', s.strip()):
        if piece:
            escaped.append(regex.escape(piece.strip(), special_only=True))
    return flexible_space + flexible_space.join(escaped) + flexible_space

def str_to_re(s):
    """Build a regex for `s` in which comments and ellipses act as wildcards.

    `s` is cut at every ``//`` line comment, ``/* */`` block comment and run
    of three or more dots; each remaining fragment is converted with
    `atom_to_re` and the fragments are joined by a lazy match-anything group.
    """
    wildcard_marker = r'\s*(?://[^\n]*(?:\n|$)|/\*.*?\*/|\.{3,})\s*'
    fragments = re.split(wildcard_marker, s, flags=re.DOTALL)
    return '(?:.*?)'.join(map(atom_to_re, fragments))

def node_to_re(node, c):
    """Convert one parsed node into ``(regex source, tag descriptors)``.

    A plain string node becomes its `str_to_re` pattern with no descriptors.
    A ``[before, tag, content]`` node becomes the pattern for `before`, an
    optional literal opening tag, the pattern for `content` and an optional
    literal closing tag. Both tag literals sit in capturing groups that may
    also match the empty string.

    `c` is a one-element list used as a mutable counter of capturing groups
    emitted so far; it is advanced once for the opening group (before the
    content is converted) and once for the closing group (after), which is
    the order in which the groups appear in the pattern.

    Each descriptor is ``(tag, open_group, close_group, child_descriptors)``.
    """
    if isinstance(node, str):
        return str_to_re(node), []
    before, tag, content = node
    before_re = str_to_re(before)
    c[0] += 1
    open_gr = c[0]
    content_re, content_tags = make_regex(content, c)
    c[0] += 1
    close_gr = c[0]
    # Raw strings: '\s' in a normal literal is an invalid escape sequence.
    open_re = r'\s*(|<' + re.escape(tag) + r'>)\s*'
    close_re = r'\s*(|</' + re.escape(tag) + r'>)\s*'
    return before_re + open_re + content_re + close_re, [(tag, open_gr, close_gr, content_tags)]


def make_regex(tree, c):
    """Convert a list of parsed nodes into ``(regex source, tag descriptors)``.

    Every node is converted with `node_to_re` (sharing the group counter
    `c`); the patterns are concatenated, runs of consecutive whitespace
    wildcards are collapsed into one, and the per-node descriptor lists are
    flattened into a single list in node order.
    """
    converted = [node_to_re(node, c) for node in tree]
    regs, tags = zip(*converted)
    pattern = re.sub(r'(\\s\*)+', r'\\s*', ''.join(regs))
    descriptors = []
    for node_tags in tags:
        descriptors.extend(node_tags)
    return pattern, descriptors



In [None]:
import json, re
# Number of parts a usable answer file must split into.
# NOTE(review): presumably 10 tagged sections plus the trailing text, matching
# the 10 snippets read per file further down - confirm.
EXPECTED_PARTS = 11

omitted = []   # text files whose split did not have the expected length
output = []    # one diagnostic record per processed file, dumped to parse3.log
for text_file in js_dir.glob('thread2-*.txt'):
    # Swap only the real file extension; str.replace on the whole path would
    # also rewrite any other '.txt' that happens to occur earlier in it.
    outfile = text_file.with_suffix('.json')
    if os.path.exists(outfile):
        # Already converted on a previous run.
        continue
    with open(text_file, 'r') as file:
        file_contents = file.read()
    parsed = simple_parse_xml(file_contents, nested=False)
    ok = len(parsed) == EXPECTED_PARTS
    output.append({
        'file': str(text_file),
        # Text found before each tag, plus the text after the last tag.
        'split': [p[0] for p in parsed[:-1]] + [parsed[-1]],
        'len': len(parsed),
        'ok': ok
    })
    if not ok:
        omitted.append(text_file)
        continue
    with open(outfile, 'w') as file:
        # p[1:] is [tag, inner] for a tag triple. The final element of
        # `parsed` is a plain string, so it is written minus its first char.
        file.write(json.dumps([p[1:] for p in parsed]))

with open('parse3.log', 'w') as file:
    file.write(json.dumps(output))
# Echo the files that were converted successfully in this run.
for out in output:
    if not out['ok']:
        continue
    print(out['file'])
    print(out['split'])

In [None]:
# Create the two result tables unless they already exist. Uncomment the
# `drop` statements to start again from empty tables.
with get_cursor() as cursor:
    # cursor.execute('drop table if exists snippets')
    # cursor.execute('drop table if exists region')
    cursor.execute(
        'create table if not exists snippets ('
        'ID INTEGER PRIMARY KEY, idx INTEGER, code TEXT, '
        'locations JSON, regions JSON, SRP boolean)'
    )
    cursor.execute(
        'create table if not exists region ('
        'ID INTEGER PRIMARY KEY, code TEXT, vector JSON)'
    )


In [6]:
def insert_region(code):
    """Store `code` as a new row of table ``region`` and return its row id."""
    with get_cursor() as cursor:
        cursor.execute('insert into region (code) values (?)', (code, ))
        new_row_id = cursor.lastrowid
    return new_row_id

def insert_snippet(id, index, code, locations, regions, srp):
    """Insert one row into table ``snippets``.

    `locations` and `regions` are stored as JSON text; the remaining values
    are written unchanged.
    """
    statement = 'insert into snippets (ID, idx, code, locations, regions, SRP) values (?, ?, ?, ?, ?, ?)'
    with get_cursor() as cursor:
        row = (id, index, code, json.dumps(locations), json.dumps(regions), srp)
        cursor.execute(statement, row)

def flat_wrong_tags(tags, code, m):
    """Run `flat_wrong_tag` over `tags` and concatenate what it returns.

    The list built so far is handed to every call, so a call may rewrite the
    most recently accepted descriptor before its own result is appended.
    """
    clean_tags = []
    for tag in tags:
        clean_tags.extend(flat_wrong_tag(tag, clean_tags, code, m))
    return clean_tags

def flat_wrong_tag(tag, clean_tags, code, m):
    """Keep `tag`, or dissolve it when its opening group matched real text.

    `tag` is ``(name, open_group, close_group, sub_tags)`` and `m` is the
    match of the generated regex. The children are cleaned first.

    * If the opening group matched the empty string, the tag is kept (with
      its cleaned children) and returned as a one-element list.
    * Otherwise the tag is dropped and its cleaned children are returned in
      its place. In addition, when the last entry of `clean_tags` has a
      closing group that matched the empty string, that entry is rewritten
      in place to use this tag's closing group instead.
    """
    name, open_gr, close_gr, sub_tags = tag
    cleaned_children = flat_wrong_tags(sub_tags, code, m)
    if not m.group(open_gr):
        return [(name, open_gr, close_gr, cleaned_children)]
    if clean_tags:
        prev_name, prev_open, prev_close, prev_children = clean_tags[-1]
        if not m.group(prev_close):
            clean_tags[-1] = (prev_name, prev_open, close_gr, prev_children)
    return cleaned_children

def tag_to_json(tag, code, m, handle_region):
    """Register the region described by `tag` and return its region list.

    The region body is the code between the tag's opening and closing group
    positions, with nested tags replaced by calls (see `tags_to_json`). It is
    wrapped as ``function <name> () {...}`` and passed to `handle_region`,
    whose return value is used as the region id. The result is
    ``[(start, region_id)]`` followed by the entries of the nested regions.
    """
    name, open_gr, close_gr, sub_tags = tag
    start = m.start(open_gr)
    nested_regions, body, end = tags_to_json(start, sub_tags, code, m, handle_region)
    body = body + code[end:m.start(close_gr)]
    region_id = handle_region(f'function {name} () {{\n{body}\n}}')
    return [(start, region_id), *nested_regions]


def tags_to_json(outer_index, tags, code, m, handle_region):
    """Process sibling `tags` and build the enclosing body around them.

    Starting at `outer_index`, the code before each tag is copied to the
    enclosing body and the tag itself is replaced by a ``<name>();`` call
    line; each tag is then registered via `tag_to_json`.

    Returns ``(regions, outer_body, index)`` where `index` is the position of
    the last tag's closing group (or `outer_index` when `tags` is empty).
    """
    regions = []
    body_parts = []
    position = outer_index
    for tag in tags:
        name, open_gr, close_gr, _ = tag
        body_parts.append(code[position:m.span(open_gr)[0]] + '\n' + name + '();\n')
        position = m.span(close_gr)[0]
        regions.extend(tag_to_json(tag, code, m, handle_region))
    return regions, ''.join(body_parts), position


def to_json(tags, code, m, handle_region):
    """Register the top-level region of `code` plus all tagged regions.

    The top-level body is `code` with every top-level tag replaced by a call
    (see `tags_to_json`). The result starts with ``(0, outer_id)``, continues
    with the tagged regions, and - only when there is at least one tagged
    region - ends with ``(end, outer_id)``, where `end` is the position
    returned by `tags_to_json`.
    """
    inner_regions, body, end = tags_to_json(0, tags, code, m, handle_region)
    region_id = handle_region(body + code[end:])
    regions = [(0, region_id), *inner_regions]
    if len(regions) > 1:
        regions.append((end, region_id))
    return regions

def strip_js_comments(js_code):
    """Remove ``//`` line comments and ``/* */`` block comments from JS text.

    Each line comment, together with one directly preceding newline (if any)
    and its terminating newline, is replaced by a single newline. A line
    comment that ends the text without a trailing newline is removed as
    well; the original pattern required a newline and left it in place.
    Block comments are removed entirely.

    This is purely textual: ``//`` or ``/*`` inside a string literal (for
    example a URL) is treated as a comment start too.
    """
    js_code = re.sub(r'\n?//[^\n]*(?:\n|\Z)', '\n', js_code)
    js_code = re.sub(r'/\*.*?\*/', '', js_code, flags=re.DOTALL)
    return js_code


In [None]:
# For every converted answer file: load the snippets it refers to, align each
# tagged answer with the comment-stripped original and store the regions.
n_ok, n_all = 0, 0   # snippets whose answer matched the original / all snippets
limit = 10           # snippets covered by one answer file
for text_file in tqdm(list(js_dir.glob('thread2-*.json'))):
    # Raw pattern: '\.' in a normal string literal is an invalid escape.
    index = int(re.match(r'.*thread2-(.*)\.json', str(text_file)).group(1))
    with get_cursor() as cursor:
        # NOTE(review): the offset starts at limit * index + 1, i.e. row 0 of
        # `shuffled` is never read - confirm this is intended.
        codes = list(cursor.execute('select id, code from shuffled limit ? offset ?', (limit, limit * index + 1)))
    with open(text_file, 'r') as file:
        file_contents = file.read()
    # `snippet_id` rather than `id`, so the builtin is not shadowed.
    for (snippet_id, code), obj in zip(codes, json.loads(file_contents)):
        original = strip_js_comments(code)
        xml = obj[1]
        reg_str, tags = make_regex(simple_parse_xml(xml, True), [0])
        reg = regex.compile(reg_str, flags=regex.DOTALL)
        m = reg.match(original)
        n_all += 1
        if m:
            n_ok += 1
            tags = flat_wrong_tags(tags, original, m)
            # A single tag without children is treated as "no regions".
            if len(tags) == 1 and not len(tags[0][3]):
                tags = []
            regions = to_json(tags, original, m, insert_region)
            # zip(*regions) separates the (location, region id) pairs into the
            # `locations` and `regions` columns.
            insert_snippet(snippet_id, index, original, *zip(*regions), len(tags) == 0)


print(f'{n_ok}/{n_all}')

In [None]:
# Row counts: snippets grouped by (idx < 650, SRP), then the number of regions.
with get_cursor() as cursor:
    print(cursor.execute('select idx < 650, count(*), SRP from snippets group by idx < 650, SRP').fetchall())
    print(cursor.execute('select count(*) from region').fetchall())

In [7]:
from transformers import RobertaTokenizerFast, RobertaModel
import torch

# Load the tokenizer and the encoder from the same
# "microsoft/graphcodebert-base" checkpoint. The warning printed below about
# newly initialised pooler weights concerns a layer whose output this
# notebook never reads (only `last_hidden_state` is used).
tokenizer = RobertaTokenizerFast.from_pretrained("microsoft/graphcodebert-base")
model = RobertaModel.from_pretrained("microsoft/graphcodebert-base")


Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def adapt(v):
    """Return ``v / 2 + 0.5`` (this maps -1 to 0 and 1 to 1)."""
    halved = v / 2
    return halved + .5
def get_labels(ids):
    """Yield ``(region id, vector)`` for every region whose id is in `ids`.

    `ids` is a comma-separated string that is pasted into the SQL ``in``
    list as is. For each region the code is tokenized (with truncation), run
    through the global `model` without gradient tracking, and the last hidden
    states are averaged over the token dimension. The vector is yielded as a
    list of the averaged values.

    NOTE(review): `model` is the same object the training loop updates, so
    these target vectors change as training proceeds - confirm intended.
    """
    query = f'select id, code from region where id in ({ids})'
    with get_cursor() as cursor:
        # Materialise the rows first so the cursor is free while encoding.
        rows = list(cursor.execute(query))
        for region_id, code in rows:
            encoded = tokenizer([code], padding=True, truncation=True, return_tensors="pt")
            encoded.to(device)
            with torch.no_grad():
                hidden_states = model(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1)
            yield region_id, list(pooled.cpu().numpy()[0])


In [11]:
# Number of rows in train_snippets (the table is created in a later cell).
with get_cursor() as cursor:
    print(cursor.execute('select count(*) from train_snippets').fetchone()[0])

5106


In [None]:
# Start from an empty `tokenized` table: drop any existing one and recreate
# it. Each row holds a snippet id plus two JSON columns (token ids and the
# per-token region ids written by the next cell).
with get_cursor() as cursor:
    cursor.execute('drop table if exists tokenized')
    cursor.execute('create table if not exists tokenized (ID INTEGER PRIMARY KEY, input_ids JSON, region_ids JSON)')


In [None]:
# Tokenize every training snippet (idx < 650) and label each token with the
# id of the region it falls into.
with get_cursor() as cursor:
    snippets = list(cursor.execute('select id, code, locations, regions from snippets where idx < 650'))
# `snippet_id` rather than `id`, so the builtin is not shadowed.
for (snippet_id, code, locations, regions) in tqdm(snippets):
    tokens = tokenizer.encode_plus(code, truncation=True, return_offsets_mapping=True)
    offset_mapping = tokens['offset_mapping']
    regions, locations = json.loads(regions), json.loads(locations)
    if not len(regions):
        continue
    # Sentinel so that locations[i + 1] exists for the last region.
    locations.append(len(code))
    i = 0
    region_ids = []
    # Skip the first and last entries (the special tokens added by the
    # tokenizer); they get region id 0 below.
    for (start, end) in offset_mapping[1:-1]:
        # Region i starts at locations[i], so a token that starts exactly on
        # locations[i + 1] already belongs to the next region. The original
        # `>` comparison left the first token of every region in the previous
        # one. The bound keeps `i` on a real region and off the sentinel.
        while i + 1 < len(regions) and start >= locations[i + 1]:
            i += 1
        region_ids.append(regions[i])
    region_ids = [0] + region_ids + [0]
    with get_cursor() as cursor:
        cursor.execute(
            'insert into tokenized (ID, input_ids, region_ids) values (?, ?, ?)',
            (snippet_id, json.dumps(tokens['input_ids']), json.dumps(region_ids)))


In [None]:
# Rebuild train_snippets as a copy of the snippets with idx < 650 (the same
# cut-off the tokenization cell uses).
with get_cursor() as cursor:
    cursor.execute('drop table if exists train_snippets')
    cursor.execute('create table train_snippets as select * from snippets where idx < 650')

In [None]:
# Create the single-column progress table and seed it when it is empty.
with get_cursor() as cursor:
    cursor.execute('create table if not exists current_batch(idx INTEGER)')
    if not len(list(cursor.execute('select * from current_batch'))):
        # NOTE(review): the original cell ended at the `if` header, which is a
        # syntax error. Seeding the table with 0 mirrors how `last_batch` is
        # initialised when empty - confirm this is what was intended. Nothing
        # else in the notebook reads `current_batch`.
        cursor.execute('insert into current_batch values (0)')

In [32]:
# Progress table read and written by get_last_batch / set_last_batch.
with get_cursor() as cursor:
    cursor.execute(
        'create table if not exists last_batch('
        'idx INTEGER, epoch_idx INTEGER)'
    )


In [43]:
import torch
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def set_last_batch(last_batch, epoch_idx):
    """Persist the training position ``(last_batch, epoch_idx)``.

    Every existing row of table ``last_batch`` is overwritten. When the table
    is still empty a row is inserted instead; a bare UPDATE would silently
    store nothing in that case.
    """
    with get_cursor() as cursor:
        cursor.execute('update last_batch set idx = ?, epoch_idx = ?', (last_batch, epoch_idx))
        if cursor.rowcount == 0:
            cursor.execute('insert into last_batch values (?, ?)', (last_batch, epoch_idx))

def get_unsafe_last_batch(batches):
    """Return the stored ``(batch index, epoch index)`` pair.

    When table ``last_batch`` is empty it is seeded with ``(0, 0)`` and that
    pair is returned. The original returned the bare integer 0 here, which
    the caller cannot unpack into two values.

    "Unsafe" because the batch index is not checked against `batches`; the
    parameter is accepted but not used.
    """
    with get_cursor() as cursor:
        cursor.execute('select * from last_batch')
        item = cursor.fetchone()
        if not item:
            cursor.execute('insert into last_batch values (0, 0)')
            return 0, 0
        return item

def get_last_batch(batches):
    """Return the ``(batch index, epoch index)`` at which to resume.

    If the stored batch index has reached `batches`, training moves on to the
    next epoch: position ``(0, epoch + 1)`` is persisted and returned.
    Otherwise the stored pair is returned unchanged.
    """
    last_batch, last_epoch = get_unsafe_last_batch(batches)
    if last_batch < batches:
        return last_batch, last_epoch
    next_epoch = last_epoch + 1
    set_last_batch(0, next_epoch)
    return 0, next_epoch
        
def get_batch_tokens(batch_size, i):
    """Load the tokenized rows for one batch.

    Takes `batch_size` ids from ``train_snippets_shuffled`` starting at row
    offset `i`, then fetches the matching rows of ``tokenized``. Returns a
    list of ``(input_ids, region_ids)`` pairs decoded from JSON. Ids with no
    row in ``tokenized`` are simply absent, so the list may be shorter than
    `batch_size`; both numbers are printed.
    """
    with get_cursor() as cursor:
        id_rows = cursor.execute('select id from train_snippets_shuffled limit ? offset ?', (batch_size, i))
        ids = ','.join(str(row[0]) for row in id_rows)
        tokens = cursor.execute(f'select input_ids, region_ids from tokenized where id in ({ids})')
        zz = []
        for input_ids, region_ids in tokens:
            zz.append((json.loads(input_ids), json.loads(region_ids)))
        print(batch_size, len(zz))
        return zz
        

def get_batch_regions(tokens):
    """Return ``{region id: vector}`` for every region id used in `tokens`.

    `tokens` is a list of ``(input_ids, region_ids)`` pairs. The distinct
    region ids are passed to `get_labels` as one comma-separated string, in
    sorted order so the generated query is the same on every run.

    No database cursor is opened here: `get_labels` opens its own, and the
    one the original opened was never used.
    """
    distinct_ids = {str(region_id) for (_, region_ids) in tokens for region_id in region_ids}
    region_ids_str = ','.join(sorted(distinct_ids))
    return dict(get_labels(region_ids_str))

def prepare_iteration(input_ids, region_ids, regions, max_length=512, label_size=768, pad_token_id=0):
    """Pad one tokenized snippet and build its per-token label vectors.

    Args:
        input_ids: token ids of the snippet; left unmodified.
        region_ids: one region id per token; a falsy id means "no region".
        regions: mapping from region id to its label vector.
        max_length: length the outputs are padded to.
        label_size: length of the all-zero vector used for tokens without a
            region and for padding.
        pad_token_id: id appended to `input_ids` as padding.
            NOTE(review): 0 is the value the notebook has always used -
            confirm it is the tokenizer's actual padding id.

    Returns:
        ``(padded_input_ids, attention_mask, labels)`` as plain lists. The
        mask is 1 for real tokens and 0 for padding.

    Raises:
        KeyError: if a non-zero region id is missing from `regions`.

    Inputs longer than `max_length` are returned unpadded (their length is
    printed, as before).
    """
    size = len(input_ids)
    if size > max_length:
        print(size)
    empties = max_length - size
    null_vector = [0] * label_size
    label = [regions[r_id] if r_id else null_vector for r_id in region_ids]
    label += [null_vector] * empties
    # Build a new list instead of extending the caller's in place.
    padded_ids = list(input_ids) + [pad_token_id] * empties
    attention = [1] * size + [0] * empties
    return padded_ids, attention, label

def get_batch(batch_size, i):
    """Assemble one batch as ``(input_ids, attention_mask, labels)`` tensors.

    Loads the tokenized rows, looks up the label vector of every region they
    mention, pads each row with `prepare_iteration` and stacks the results
    into an int tensor (ids) and two float tensors (mask, labels).
    """
    tokens = get_batch_tokens(batch_size, i)
    regions = get_batch_regions(tokens)
    prepared = []
    for input_ids, region_ids in tokens:
        prepared.append(prepare_iteration(input_ids, region_ids, regions))
    input_ids, attention, label = zip(*prepared)
    ids_tensor = torch.IntTensor(input_ids)
    attention_tensor = torch.FloatTensor(attention)
    label_tensor = torch.FloatTensor(label)
    return ids_tensor, attention_tensor, label_tensor


def get_epoch_part(i0, epoch, batch_size):
    """Lazily yield ``(i, batch)`` for ``i`` in ``range(i0, epoch, batch_size)``.

    The range is built immediately; each batch is only loaded (through
    `get_batch`) when the returned generator reaches it.
    """
    offsets = range(i0, epoch, batch_size)

    def _batches():
        for i in offsets:
            yield i, get_batch(batch_size, i)

    return _batches()

def tee(text):
    """Print `text` (wrapped in carriage returns) and append it as one line
    to ``train-resnet.log`` in the current working directory."""
    print(f"\r{text}\r")
    with open('train-resnet.log', 'a') as log_file:
        log_line = f"{text}\n"
        log_file.write(log_line)

def log(title, start_time, sizes, epoch, i, mean_loss, loss):
    """Report progress for batch `i` (0-based) through `tee`.

    `sizes` is a 3-tuple whose second and third items are the number of
    epochs and of batches. The line shows the time elapsed since
    `start_time`, a remaining-time estimate that assumes every batch takes
    the average time so far, the 1-based epoch and batch numbers, and the
    mean and current loss.
    """
    _, num_epochs, batches = sizes
    i = i + 1
    dt = time.time() - start_time
    seconds_left = dt * (batches - i) / i
    elapsed = timedelta(seconds=int(dt))
    remaining = timedelta(seconds=int(seconds_left))
    tee(f"{title}: {elapsed}<{remaining} Epoch {epoch+1}/{num_epochs} - Batch {i}/{batches}, Loss: {mean_loss:.4f} {loss.item():.4f}")

def save(epoch, i):
    """Checkpoint the model and optimizer state and record the position.

    Writes ``searchnet-bcemodel-<epoch>-<i>.pt`` and
    ``searchnet-bceoptimizer-<epoch>-<i>.pt``, stores ``(i, epoch)`` as the
    last batch, then logs the name of the model file.
    """
    model_state = model.state_dict()
    torch.save(model_state, f'searchnet-bcemodel-{epoch}-{i}.pt')
    optimizer_state = optimizer.state_dict()
    torch.save(optimizer_state, f'searchnet-bceoptimizer-{epoch}-{i}.pt')
    set_last_batch(i, epoch)
    tee(f'saving searchnet-bcemodel-{epoch}-{i}.pt\n')

def handle_train_batch(i, epoch, batch, performance, sizes):
    """Run one optimisation step on `batch` and update the running loss.

    `performance` is ``(min_alpha, alpha, running_loss, start_time)``. The
    running loss is an exponential average whose weight for the new loss is
    ``min_alpha + alpha``; `alpha` is halved on every call, so the weight
    decays towards `min_alpha`. Progress is logged for every batch and a
    checkpoint is saved whenever `i` is a multiple of 100 (including 0).

    Returns ``(running_loss, alpha)`` for the next call.
    """
    input_ids, attention, labels = [part.to(device) for part in batch]
    min_alpha, alpha, running_loss, start_time = performance
    # Unpacked only to insist on a 3-tuple; the values are not used here.
    train_batchs, num_epochs, batches = sizes

    optimizer.zero_grad()
    hidden_states = model(input_ids, attention_mask=attention).last_hidden_state
    loss = loss_fn(hidden_states, labels)
    loss.backward()
    optimizer.step()

    current_alpha = min_alpha + alpha
    running_loss = (1 - current_alpha) * running_loss + current_alpha * loss.item()

    log('training', start_time, sizes, epoch, i, running_loss, loss)
    if i % 100 == 0:
        save(epoch, i)
    return running_loss, alpha * .5
    
def handle_val_batch(i0, i, epoch, batch, start_time, cum_loss, sizes):
    """Evaluate `batch` without gradients and accumulate its loss.

    `i0` is the index of the first validation batch, so the mean that gets
    logged is the accumulated loss divided by ``i - i0 + 1``. Returns the
    updated accumulated loss.
    """
    input_ids, attention, labels = [part.to(device) for part in batch]
    # Unpacked only to insist on a 3-tuple; the values are not used here.
    _, num_epochs, batches = sizes

    with torch.no_grad():
        hidden_states = model(input_ids, attention_mask=attention).last_hidden_state
        loss = loss_fn(hidden_states, labels)

    total_loss = cum_loss + loss.item()
    batches_seen = i - i0 + 1
    log('validation', start_time, sizes, epoch, i, total_loss / batches_seen, loss)
    return total_loss
    

def handle_epoch(batch_size, num_epochs):
    """Train on the first 4/5 of the batches, checkpoint, validate on the rest.

    The batch count is derived from the number of rows in ``train_snippets``
    and `batch_size`. Training resumes from the position stored in table
    ``last_batch``; the epoch index comes from that stored position rather
    than from a module-level ``epoch`` variable, so the function works on its
    own and checkpoints are labelled with the epoch actually being resumed.

    NOTE(review): both loops request batches with a step and batch size of 1,
    so each "batch" is a single snippet and only the first `batches` snippets
    are visited - confirm whether `batch_size` should be used here.
    NOTE(review): `num_epochs` is increased by the stored epoch index before
    being shown in the log - confirm it is meant as "additional epochs".
    """
    with get_cursor() as cursor:
        epoch_size = next(cursor.execute('select count(*) from train_snippets'))[0]
    batches = (epoch_size - 1) // batch_size + 1   # ceil(epoch_size / batch_size)
    train_batchs = 4 * batches // 5
    i0, epoch = get_last_batch(train_batchs)
    num_epochs += epoch
    # If the training loop below yields nothing, `i + 1` is still `i0`.
    i = i0 - 1
    min_alpha = .2
    start_time = time.time()

    running_loss = 0.0
    # First weight is min_alpha + alpha == 1, so the running loss starts at
    # the first batch's loss.
    alpha = 1 - min_alpha

    model.train()
    sizes = (train_batchs, num_epochs, batches)
    tee('\nTraining\n')
    for (i, batch) in get_epoch_part(i0, train_batchs, 1):
        performance = (min_alpha, alpha, running_loss, start_time)
        running_loss, alpha = handle_train_batch(i, epoch, batch, performance, sizes)

    # Checkpoint at the first validation index; this is also what makes the
    # next call roll over to the following epoch.
    i0 = i + 1
    save(epoch, i0)

    model.eval()
    cum_loss = 0
    tee('\nValidation:\n')
    for (i, batch) in get_epoch_part(i0, batches, 1):
        cum_loss = handle_val_batch(i0, i, epoch, batch, start_time, cum_loss, sizes)
    
        
        


In [28]:
#set_last_batch(0, 0)

In [44]:
from transformers import RobertaTokenizerFast, RobertaModel
import torch
import torch.nn as nn
from torch.optim import AdamW
import time
from datetime import timedelta

# Scratch list; a later debug cell reads xd[0] as a (result, label) pair.
xd = []
bce = nn.BCEWithLogitsLoss()


def loss_fn(result, label):
    """Binary cross-entropy with logits between `result` and `label`.

    Both tensors have their first two dimensions merged before the loss is
    computed.

    NOTE(review): the debug cells at the end of the notebook show a label
    value of -3.3694, whereas this loss is normally given targets between 0
    and 1 - confirm the labels are scaled as intended.
    """
    flat_result = torch.flatten(result, end_dim=1)
    flat_label = torch.flatten(label, end_dim=1)
    return bce(flat_result, flat_label)

# Use the GPU when one is available and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 10
batch_size = 32

# Restart from batch 0 of epoch 0. Remove this call to resume from the
# position stored by the last checkpoint instead.
set_last_batch(0, 0)
for epoch in range(num_epochs):
    handle_epoch(batch_size, num_epochs)



Training

1 1
training: 0:00:01<0:03:04 Epoch 1/10 - Batch 1/160, Loss: 0.7263 0.7263
saving searchnet-bcemodel-0-0.pt

1 1
training: 0:00:02<0:03:53 Epoch 1/10 - Batch 2/160, Loss: 0.6616 0.6185
1 1
training: 0:00:03<0:03:12 Epoch 1/10 - Batch 3/160, Loss: 0.6695 0.6814
1 1
training: 0:00:04<0:02:50 Epoch 1/10 - Batch 4/160, Loss: 0.6795 0.7027
1 1
training: 0:00:05<0:02:36 Epoch 1/10 - Batch 5/160, Loss: 0.6869 0.7094
1 1
training: 0:00:05<0:02:27 Epoch 1/10 - Batch 6/160, Loss: 0.6841 0.6745
1 1
training: 0:00:06<0:02:21 Epoch 1/10 - Batch 7/160, Loss: 0.6885 0.7048
1 1
training: 0:00:07<0:02:17 Epoch 1/10 - Batch 8/160, Loss: 0.6349 0.4287
1 1
training: 0:00:07<0:02:12 Epoch 1/10 - Batch 9/160, Loss: 0.6523 0.7203
1 1
training: 0:00:08<0:02:08 Epoch 1/10 - Batch 10/160, Loss: 0.6260 0.5218
1 1
training: 0:00:09<0:02:05 Epoch 1/10 - Batch 11/160, Loss: 0.6373 0.6822
1 1
training: 0:00:09<0:02:02 Epoch 1/10 - Batch 12/160, Loss: 0.6503 0.7023
1 1
training: 0:00:10<0:02:00 Epoch 1/10

KeyboardInterrupt: 

In [33]:
# Reset the stored training position to batch 0 of epoch 0.
set_last_batch(0, 0)

In [1]:
from transformers import RobertaTokenizerFast, RobertaModel
import torch
# Use the GPU when one is available and fall back to the CPU otherwise, then
# load a fresh copy of the pretrained encoder onto that device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobertaModel.from_pretrained("microsoft/graphcodebert-base").to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Debug: take the first (result, label) pair collected in the scratch list
# `xd`. Nothing visible in this notebook appends to `xd`, so this only works
# with state left over in the kernel.
result, label = xd[0]

In [37]:
# Debug: index of the first value in label[0][1] that lies outside [-1, 1]
# (raises ValueError when there is none).
[bool(x > 1 or x < -1) for x in label[0][1].flatten()].index(True)

77

In [38]:
# Debug: show the out-of-range label value found at index 77 above.
label[0][1][77]

tensor(-3.3694)

In [29]:
# Scratch cell: a bare literal that only echoes its own value as output.
845

845