# After getting files

## Insert into database

In [1]:
from pathlib import Path
try:
    dataset
except:
    print('getting dataset location')
    dataset = Path('dataset').absolute()


getting dataset location


In [2]:
import os
os.chdir(dataset)

js_dir = dataset / 'javascript'


In [3]:
import xml.etree.ElementTree as ET

In [None]:
import json
omitted = []
output = []
for text_file in js_dir.glob('thread2-*.txt'):
    with open(text_file, 'r') as file:
        file_contents = file.read()
    parsed = simple_parse_xml(file_contents, nested=False)
    output.append({
        'file': str(text_file),
        'split': [p[0] for p in parsed[:-1]] + [parsed[-1]],
        'len': len(parsed),
        'ok': len(parsed) == 11
    })
    if len(parsed) != 11:
        omitted.append(text_file)
        continue
    with open(str.replace(str(text_file), '.txt', '.json'), 'w') as file:
        file.write(json.dumps([p[1:] for p in parsed]))

with open('parse2.log', 'w') as file:
    file.write(json.dumps(output))
for out in output:
    if not out['ok']:
        continue
    print(out['file'])
    print(out['split'])

In [49]:
import sqlite3, json
from tqdm import tqdm
from contextlib import contextmanager

train = js_dir / 'final' / 'jsonl' / 'train'
os.chdir(train)

@contextmanager
def get_cursor(database_name='rsn_train'):
    with sqlite3.connect(database_name) as conn:
        yield conn.cursor()




In [6]:
import regex

def simple_parse_xml(content, nested):
    pattern = r'\s*<([^\s]*?)>\s*'
    open = re.search(pattern, content)
    if not open:
        return [content]
    [open_start, open_end] = open.span()
    before = content[:open_start]
    tag = open.group(1)
    inner_and_after = content[open_end:]
    close = re.search(f'</{re.escape(tag)}>', inner_and_after)
    if close:
        [close_start, close_end] = close.span()
    else:
        close = re.search(pattern, inner_and_after)
        if close:
            [close_start, _] = close.span()
        else:
            close_start = len(inner_and_after)
        close_end = close_start
    inner = inner_and_after[:close_start]
    after = inner_and_after[close_end:]
    inner = simple_parse_xml(inner, nested) if nested else inner
    return [[before, tag, inner], *simple_parse_xml(after, nested)]

def atom_to_re(s):
    tokens = [token.strip() for token in re.split(r'\s+|(?=\W)|(?<=\W)', s.strip()) if token]
    escaped = [regex.escape(token, special_only=True) for token in tokens]
    return r'\s*' + r'\s*'.join(escaped) + r'\s*'

def str_to_re(s):
    codes = re.split(r'\s*(?://[^\n]*(?:\n|$)|/\*.*?\*/|\.{3,})\s*', s,  flags=re.DOTALL)
    return '(?:.*?)'.join(atom_to_re(code) for code in codes)

def node_to_re(node, c):
    if type(node) == str:
        return str_to_re(node), []
    before, tag, content = node
    before_re = str_to_re(before)
    c[0] += 1
    open_gr = c[0]
    content_re, content_tags = make_regex(content, c)
    c[0] += 1
    close_gr = c[0]
    open_re = '\s*(|<'+re.escape(tag)+'>)\s*'
    close_re = '\s*(|</'+re.escape(tag)+'>)\s*'
    return before_re+open_re+content_re+close_re, [(tag, open_gr, close_gr, content_tags)]


def make_regex(tree, c):
    regs, tags = zip(*(node_to_re(node, c) for node in tree))
    return re.sub(r'(\\s\*)+', r'\\s*', ''.join(regs)), [t for tag in tags for t in tag] 



In [830]:
with get_cursor() as cursor:
    cursor.execute('drop table if exists snippets')
    cursor.execute('drop table if exists region')
    cursor.execute('create table if not exists snippets (ID INTEGER PRIMARY KEY, idx INTEGER, code TEXT, locations JSON, regions JSON, SRP boolean)')
    cursor.execute('create table if not exists region (ID INTEGER PRIMARY KEY, code TEXT, vector JSON)')


In [831]:
def insert_region(code):
    with get_cursor() as cursor:
        cursor.execute('insert into region (code) values (?)', (code, ))
        return cursor.lastrowid

def insert_snippet(id, index, code, locations, regions, srp):
    with get_cursor() as cursor:
        cursor.execute(
            'insert into snippets (ID, idx, code, locations, regions, SRP) values (?, ?, ?, ?, ?, ?)',
            (id, index, code, json.dumps(locations), json.dumps(regions), srp))

def flat_wrong_tags(tags, code, m):
    clean_tags = []
    for tag in tags:
        clean_tags += flat_wrong_tag(tag, clean_tags, code, m)
    return clean_tags

def flat_wrong_tag(tag, clean_tags, code, m):
    name, open, close, sub_tags = tag
    clean_sub_tags = flat_wrong_tags(sub_tags, code, m)
    if m.group(open):
        if len(clean_tags):
            pname, popen, pclose, psub_tags = clean_tags[-1]
            if not m.group(pclose):
                clean_tags[-1] = (pname, popen, close, psub_tags)
        return clean_sub_tags
    return [(name, open, close, clean_sub_tags)]

def tag_to_json(tag, code, m, handle_region):
    name, open, close, sub_tags = tag
    start = m.span(open)[0]
    regions, body, end = tags_to_json(start, sub_tags, code, m, handle_region)
    body += code[end:m.span(close)[0]] 
    region_id = handle_region(f'function {name} () {{\n{body}\n}}')
    regions = [(start, region_id)] + regions
    return regions


def tags_to_json(outer_index, tags, code, m, handle_region):
    regions = []
    outer_body = ''
    for tag in tags:
        name, open, close, _ = tag
        outer_body += code[outer_index:m.span(open)[0]] + '\n' + name + '();\n'
        outer_index = m.span(close)[0]
        regions += tag_to_json(tag, code, m, handle_region)
    return regions, outer_body, outer_index


def to_json(tags, code, m, handle_region):
    regions, body, end = tags_to_json(0, tags, code, m, handle_region)
    body += code[end:] 
    region_id = handle_region(body)
    regions = [(0, region_id)] + regions
    if len(regions) > 1:
        regions.append((end, region_id))
    return regions


In [833]:
def strip_js_comments(js_code):
    js_code = re.sub(r'\n?//.*?\n', '\n', js_code)
    js_code = re.sub(r'/\*.*?\*/', '', js_code, flags=re.DOTALL)
    return js_code    

n_ok, n_all = 0, 0
for text_file in tqdm(list(js_dir.glob('thread2-*.json'))):
    index = int(re.match('.*thread2-(.*)\.json', str(text_file)).group(1))
    limit = 10
    with get_cursor() as cursor:
        codes = list(cursor.execute('select id, code from shuffled limit ? offset ?', (limit, limit * index + 1)))
    with open(text_file, 'r') as file:
        file_contents = file.read()
    for (id, code), obj in zip(codes, json.loads(file_contents)):
        original = strip_js_comments(code)
        xml = obj[1]
        reg_str, tags = make_regex(simple_parse_xml(xml, True), [0])
        reg = regex.compile(reg_str, flags = regex.DOTALL)
        m = reg.match(original)
        n_all += 1
        if m:
            n_ok += 1
            tags = flat_wrong_tags(tags, original, m)
            if len(tags) == 1 and not len(tags[0][3]):
                tags = []
            regions = to_json(tags, original, m, insert_region)
            insert_snippet(id, index, original, *zip(*regions), False)
            
            
print(f'{n_ok}/{n_all}')

100%|██████████| 677/677 [02:49<00:00,  3.99it/s]

6183/6770





In [7]:
with get_cursor() as cursor:
    print(list(cursor.execute('select idx < 650, count(*), SRP from snippets group by idx < 650, SRP')))
    print(list(cursor.execute('select count(*) from region')))

[(0, 533, 0), (0, 551, 1), (1, 2510, 0), (1, 2589, 1)]
[(13709,)]


In [8]:
from transformers import RobertaTokenizerFast, RobertaModel
import torch

# Load tokenizer and model
tokenizer = RobertaTokenizerFast.from_pretrained("microsoft/graphcodebert-base")
model = RobertaModel.from_pretrained("microsoft/graphcodebert-base")


Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
with get_cursor() as cursor:
    regions = cursor.execute('select id, code from region')
    for region in tqdm(list(regions)):
        id, code = region
        tokenized_inputs = tokenizer([code], padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**tokenized_inputs)
            last_hidden_states = outputs.last_hidden_state
        average_hidden_states = last_hidden_states.mean(dim=1)[0]
        print(last_hidden_states.shape)
        print(average_hidden_states.shape)
        cursor.execute('select id, vector from region where json_array_length(vector) > 768')
        print(len(json.loads(cursor.fetchone()[1])))
        break
        cursor.execute('update region set vector = ? where id = ?', 
                       (json.dumps([float(x) for x in average_hidden_states.numpy()]), id))

  0%|          | 0/13709 [00:00<?, ?it/s]

torch.Size([1, 70, 768])
torch.Size([768])


  0%|          | 0/13709 [00:00<?, ?it/s]


TypeError: 'NoneType' object is not subscriptable

In [893]:
tokenizer.encode_plus(code, return_offsets_mapping=True)

{'input_ids': [0, 35435, 120, 791, 1069, 40039, 43048, 25522, 50118, 1437, 1437, 50118, 16435, 15664, 40039, 791, 1069, 47006, 50118, 50118, 32845, 791, 1069, 40039, 48271, 47006, 50118, 24303, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 8), (9, 12), (12, 13), (13, 15), (15, 18), (18, 20), (21, 22), (22, 23), (24, 24), (25, 25), (25, 26), (26, 29), (29, 33), (33, 36), (36, 37), (37, 39), (39, 42), (42, 43), (43, 44), (44, 50), (50, 51), (51, 53), (53, 56), (56, 60), (60, 63), (63, 64), (64, 65), (0, 0)]}

In [957]:
with get_cursor() as cursor:
    cursor.execute('drop table if exists tokenized')
    cursor.execute('create table if not exists tokenized (ID INTEGER PRIMARY KEY, input_ids JSON, region_ids JSON)')


In [958]:
with get_cursor() as cursor:
    snippets = list(cursor.execute('select id, code, locations, regions from snippets where idx < 650'))
for (id, code, locations, regions) in tqdm(snippets):
    tokens = tokenizer.encode_plus(code, truncation=True, return_offsets_mapping=True)
    offset_mapping = tokens['offset_mapping']
    regions, locations = json.loads(regions),json.loads(locations)
    if not len(regions):
        continue
    locations.append(len(code))
    i = 0
    region_ids = []
    for (start, end) in offset_mapping[1:-1]:
        while start > locations[i+1]:
            i += 1
        region_ids.append(regions[i])
    region_ids = [0] + region_ids + [0]
    with get_cursor() as cursor:
        cursor.execute(
            'insert into tokenized (ID, input_ids, region_ids) values (?, ?, ?)',
            (id, json.dumps(tokens['input_ids']), json.dumps(region_ids)))


100%|██████████| 5099/5099 [00:11<00:00, 430.60it/s]


In [1]:
0

0

In [11]:
with get_cursor() as cursor:
    print(list(cursor.execute('select * from tokenized limit 1')))

[(40, '[0, 35435, 18134, 17894, 43048, 25522, 50118, 1437, 10759, 25522, 5521, 4376, 35, 4285, 45106, 42119, 29, 6, 3156, 35, 4285, 45106, 22429, 42119, 29, 6, 7721, 35, 4285, 48837, 42119, 29, 24303, 5457, 120, 42119, 29, 7605, 250, 1020, 45463, 47006, 50118, 1437, 10759, 25522, 5521, 4376, 35, 1029, 45106, 42119, 29, 6, 3156, 35, 1029, 45106, 22429, 42119, 29, 6, 7721, 35, 1029, 48837, 42119, 29, 24303, 5457, 120, 42119, 29, 7605, 41555, 23185, 47006, 50140, 1437, 10759, 17928, 45985, 5457, 8932, 45985, 1640, 1043, 45106, 42119, 29, 6, 1029, 45106, 42119, 29, 4397, 50118, 1437, 10759, 3156, 45985, 5457, 8932, 45985, 1640, 1043, 45106, 22429, 42119, 29, 6, 1029, 45106, 22429, 42119, 29, 4397, 50118, 1437, 10759, 7721, 45985, 5457, 8932, 45985, 1640, 1043, 48837, 42119, 29, 6, 1029, 48837, 42119, 29, 4397, 50118, 1437, 10759, 34, 45985, 5457, 43912, 1640, 5521, 4376, 45985, 4, 32278, 23329, 45056, 3156, 45985, 4, 32278, 23329, 45056, 7721, 45985, 4, 32278, 23329, 4397, 50140, 1437, 114

In [10]:
with get_cursor() as cursor:
    cursor.execute('drop table if exists train_snippets')
    cursor.execute('create table train_snippets as select * from snippets where idx < 650')

In [94]:
def handle_epoch(batch_size):
    with get_cursor() as cursor:
        epoch = list(cursor.execute('select count(*) from train_snippets'))[0][0]
        cursor.execute('create table if not exists train_snippets_shuffled as select * from train_snippets ORDER BY RANDOM()')
        for i in range(0, epoch, batch_size):
            ids = ','.join(
                str(x[0]) for x in cursor.execute('select id from train_snippets_shuffled limit ? offset ?', (batch_size, i))
            )
            tokens = cursor.execute(f'select input_ids, region_ids from tokenized where id in ({ids})')
            tokens = [(json.loads(input_ids), json.loads(region_ids)) for (input_ids, region_ids) in tokens]
            region_ids_str = ','.join(set(str(region_id) for (_, region_ids) in tokens for region_id in region_ids))
            regions = dict((id, json.loads(vector)) for (id, vector) in
                cursor.execute(f'select id, vector from region where id in ({region_ids_str})'))
            batch = []
            for (input_ids, region_ids) in tokens:
                label_size = 768
                size = len(input_ids)
                if size > 512:
                    print(size)
                empties = 512 - size
                null_vector = [0] * label_size
                for r_id in region_ids:
                    if r_id and not (r_id in regions):
                        print(r_id)
                label = [regions[r_id] if r_id else null_vector for r_id in region_ids]
                label += [null_vector] * empties
                input_ids += [0] * empties
                attention = [1] * size + [0] * empties
                batch.append((input_ids, attention, label))
            yield tuple(torch.FloatTensor(t) for t in zip(*batch))
    

In [98]:
for batch in handle_epoch(16):
    input_ids, attention, label = batch
    print(label[0])
    break

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2525, -0.2023, -0.0671,  ..., -0.4390, -0.1260, -0.0548],
        [ 0.2525, -0.2023, -0.0671,  ..., -0.4390, -0.1260, -0.0548],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])


In [15]:
dict([(1, 2)])

{1: 2}