# After getting files

## Insert into database

In [1]:
from pathlib import Path
# Reuse `dataset` when an earlier cell (or a previous run of this cell) already
# defined it; otherwise default to ./dataset relative to the current directory.
# Only NameError is caught so that unrelated failures and interrupts are not
# silently swallowed.
try:
    dataset
except NameError:
    print('getting dataset location')
    dataset = Path('dataset').absolute()


getting dataset location


In [2]:
import os
# Work from inside the dataset directory for the remainder of the notebook.
os.chdir(dataset)

# Directory that holds the JavaScript part of the dataset.
js_dir = dataset.joinpath('javascript')


In [6]:
import xml.etree.ElementTree as ET

In [None]:
import json
# Split every `thread2-*.txt` file into its top-level tagged parts and write the
# result next to it as `thread2-*.json`. Files that do not split into exactly 11
# parts are recorded in `omitted` and skipped.
omitted = []  # paths of files whose part count was not 11
output = []   # one log record per input file, dumped to parse2.log below
for text_file in js_dir.glob('thread2-*.txt'):
    with open(text_file, 'r') as file:
        file_contents = file.read()
    parsed = simple_parse_xml(file_contents, nested=False)
    ok = len(parsed) == 11
    output.append({
        'file': str(text_file),
        # Text found before each tag, plus the trailing text after the last tag.
        'split': [p[0] for p in parsed[:-1]] + [parsed[-1]],
        'len': len(parsed),
        'ok': ok
    })
    if not ok:
        omitted.append(text_file)
        continue
    # with_suffix only swaps the final extension; a plain str.replace would also
    # rewrite any '.txt' occurring elsewhere in the path (e.g. a directory name).
    with open(text_file.with_suffix('.json'), 'w') as file:
        # Drop the leading `before` text of each node, keeping [tag, inner].
        # NOTE(review): the last element is a plain string, so p[1:] drops its
        # first character; the consumer below only reads the first 10 entries.
        file.write(json.dumps([p[1:] for p in parsed]))

with open('parse2.log', 'w') as file:
    file.write(json.dumps(output))
# Show the split of every file that parsed successfully.
for out in output:
    if not out['ok']:
        continue
    print(out['file'])
    print(out['split'])

In [836]:
import sqlite3, json
from contextlib import contextmanager

# Training split directory; the SQLite database is opened relative to it.
train = js_dir.joinpath('final', 'jsonl', 'train')
os.chdir(train)

@contextmanager
def get_cursor(database_name='rsn_train'):
    """Yield a cursor on *database_name*, committing and closing on exit.

    The transaction is committed when the ``with`` body finishes normally and
    rolled back when it raises. The connection is always closed afterwards, so
    the yielded cursor must not be used once the ``with`` block has ended.

    Args:
        database_name: Path of the SQLite database file (or ``':memory:'``).

    Yields:
        A ``sqlite3.Cursor`` bound to a freshly opened connection.
    """
    conn = sqlite3.connect(database_name)
    try:
        # `with conn` only handles commit/rollback; it does not close the
        # connection, hence the explicit close in `finally`.
        with conn:
            yield conn.cursor()
    finally:
        conn.close()




In [800]:
import regex

def simple_parse_xml(content, nested):
    """Split *content* into tagged nodes followed by the trailing text.

    This is a tolerant regex-based splitter, not a real XML parser. A node
    starts at ``<tag>`` (any run of non-whitespace characters between angle
    brackets) and runs to the first ``</tag>`` after it. When that closing tag
    is missing, the node ends just before the next ``<...>`` marker, or at the
    end of the text if there is none. Whitespace directly around an opening
    marker is consumed and appears in neither ``before`` nor ``inner``.

    Args:
        content: Text to split.
        nested: When true, each node's inner text is itself split recursively;
            otherwise it is kept as a plain string.

    Returns:
        A list of ``[before, tag, inner]`` lists, one per node in order, always
        followed by one final string holding the text after the last node
        (possibly empty). ``before`` is the text preceding the node's tag.
    """
    pattern = r'\s*<([^\s]*?)>\s*'
    nodes = []
    # Sibling nodes are handled in a loop rather than by recursing on the
    # remainder, so inputs with very many tags cannot exhaust the call stack.
    while True:
        opening = re.search(pattern, content)
        if not opening:
            nodes.append(content)
            return nodes
        open_start, open_end = opening.span()
        before = content[:open_start]
        tag = opening.group(1)
        inner_and_after = content[open_end:]
        closing = re.search(f'</{re.escape(tag)}>', inner_and_after)
        if closing:
            close_start, close_end = closing.span()
        else:
            # Unclosed tag: stop right before the next marker (which is left in
            # place for the next iteration) or at the end of the text.
            next_marker = re.search(pattern, inner_and_after)
            close_start = next_marker.start() if next_marker else len(inner_and_after)
            close_end = close_start
        inner = inner_and_after[:close_start]
        if nested:
            inner = simple_parse_xml(inner, nested)
        nodes.append([before, tag, inner])
        content = inner_and_after[close_end:]

def atom_to_re(s):
    """Build a whitespace-insensitive regex fragment matching the code in *s*.

    The text is cut at whitespace and on both sides of every non-word
    character, each resulting token is escaped, and the tokens are joined with
    ``\\s*`` so any amount of whitespace (including none) may separate them.
    The fragment also starts and ends with ``\\s*``.
    """
    gap = r'\s*'
    pieces = re.split(r'\s+|(?=\W)|(?<=\W)', s.strip())
    # Empty pieces come from adjacent split points and carry no content.
    escaped = [regex.escape(piece.strip(), special_only=True) for piece in pieces if piece]
    return gap + gap.join(escaped) + gap

def str_to_re(s):
    """Turn a code string into a regex where comments and ellipses are wildcards.

    The string is cut at every ``//`` line comment, ``/* ... */`` block comment
    and run of three or more dots. Each remaining code fragment is converted
    with ``atom_to_re`` and the fragments are joined with a lazy
    ``(?:.*?)`` wildcard.
    """
    placeholder = r'\s*(?://[^\n]*(?:\n|$)|/\*.*?\*/|\.{3,})\s*'
    fragments = re.split(placeholder, s, flags=re.DOTALL)
    return '(?:.*?)'.join(map(atom_to_re, fragments))

def node_to_re(node, c):
    """Convert one parsed node into a regex fragment plus its tag descriptors.

    Args:
        node: Either a plain string, or a ``[before, tag, content]`` triple
            where ``content`` is itself a list of nodes.
        c: One-element list used as a shared, mutable capture-group counter;
            it is advanced by one for each opening and each closing group.

    Returns:
        ``(regex, tags)``. For a string node ``tags`` is empty. For a tagged
        node it is ``[(tag, open_group, close_group, child_tags)]`` where the
        group numbers index the capture groups wrapped around the optional
        ``<tag>`` / ``</tag>`` markers.
    """
    if isinstance(node, str):
        return str_to_re(node), []
    before, tag, content = node
    before_re = str_to_re(before)
    # Groups are numbered in order of their opening parenthesis: this tag's
    # opening group, then every group inside the content, then its closing group.
    c[0] += 1
    open_gr = c[0]
    content_re, content_tags = make_regex(content, c)
    c[0] += 1
    close_gr = c[0]
    escaped_tag = re.escape(tag)
    # The empty alternative comes first, so the marker is only consumed when
    # the rest of the pattern cannot match without it.
    open_re = r'\s*(|<' + escaped_tag + r'>)\s*'
    close_re = r'\s*(|</' + escaped_tag + r'>)\s*'
    return before_re + open_re + content_re + close_re, [(tag, open_gr, close_gr, content_tags)]


def make_regex(tree, c):
    """Build one regex for a list of parsed nodes and collect their tags.

    Args:
        tree: Iterable of nodes as accepted by ``node_to_re``.
        c: One-element list holding the running capture-group counter.

    Returns:
        ``(regex, tags)`` where ``regex`` is the concatenation of every node's
        fragment with consecutive ``\\s*`` runs collapsed into one, and ``tags``
        is the flat list of the nodes' tag descriptors. An empty tree yields
        ``('', [])``.
    """
    regs = []
    tags = []
    for node in tree:
        node_re, node_tags = node_to_re(node, c)
        regs.append(node_re)
        tags.extend(node_tags)
    return re.sub(r'(\\s\*)+', r'\\s*', ''.join(regs)), tags



In [830]:
# Start from a clean slate: drop both result tables, then recreate them empty.
with get_cursor() as cursor:
    for statement in (
        'drop table if exists snippets',
        'drop table if exists region',
        'create table if not exists snippets (ID INTEGER PRIMARY KEY, idx INTEGER, code TEXT, locations JSON, regions JSON, SRP boolean)',
        'create table if not exists region (ID INTEGER PRIMARY KEY, code TEXT, vector JSON)',
    ):
        cursor.execute(statement)


In [831]:
def insert_region(code):
    """Insert *code* as a new row of the ``region`` table and return its row id."""
    statement = 'insert into region (code) values (?)'
    with get_cursor() as cur:
        cur.execute(statement, (code, ))
        new_id = cur.lastrowid
        return new_id

def insert_snippet(id, index, code, locations, regions, srp):
    """Insert one row into the ``snippets`` table.

    ``locations`` and ``regions`` are stored as JSON text; ``index`` goes into
    the ``idx`` column and ``srp`` into the ``SRP`` column.
    """
    statement = 'insert into snippets (ID, idx, code, locations, regions, SRP) values (?, ?, ?, ?, ?, ?)'
    with get_cursor() as cur:
        row = (id, index, code, json.dumps(locations), json.dumps(regions), srp)
        cur.execute(statement, row)

def flat_wrong_tags(tags, code, m):
    """Return *tags* with every tag whose opening marker matched text removed.

    Tags are processed in order; each one is handed to ``flat_wrong_tag``
    together with the list built so far, which that function may adjust.
    ``code`` is only passed through.
    """
    kept = []
    for entry in tags:
        kept.extend(flat_wrong_tag(entry, kept, code, m))
    return kept

def flat_wrong_tag(tag, clean_tags, code, m):
    """Clean one tag descriptor and return the descriptors that replace it.

    Args:
        tag: ``(name, open_group, close_group, sub_tags)``.
        clean_tags: Descriptors already kept at this level; its last entry may
            be rewritten in place.
        code: Unused here, forwarded to the recursive call.
        m: Match object whose groups are addressed by the descriptors.

    Returns:
        ``[tag]`` (with cleaned sub-tags) when the opening group captured
        nothing. Otherwise the tag itself is dropped and its cleaned sub-tags
        are returned in its place; in that case, if the previously kept tag's
        closing group captured nothing, that tag is re-pointed at this tag's
        closing group.
    """
    name, open_gr, close_gr, children = tag
    kept_children = flat_wrong_tags(children, code, m)
    if not m.group(open_gr):
        return [(name, open_gr, close_gr, kept_children)]
    if clean_tags:
        prev_name, prev_open, prev_close, prev_children = clean_tags[-1]
        if not m.group(prev_close):
            clean_tags[-1] = (prev_name, prev_open, close_gr, prev_children)
    return kept_children

def tag_to_json(tag, code, m, handle_region):
    """Register the region for one tag and all tags nested inside it.

    The tag's body is the code between its opening and closing groups, with
    each nested tag replaced by a ``name();`` call. The body is wrapped as
    ``function <name> () {...}`` and passed to ``handle_region``, whose return
    value is used as the region id.

    Returns:
        A list of ``(start_offset, region_id)`` pairs: this tag first, then the
        pairs of its nested tags.
    """
    name, open_gr, close_gr, children = tag
    start = m.span(open_gr)[0]
    child_regions, inner_body, inner_end = tags_to_json(start, children, code, m, handle_region)
    # Append whatever code follows the last nested tag, up to this tag's close.
    full_body = inner_body + code[inner_end:m.span(close_gr)[0]]
    region_id = handle_region(f'function {name} () {{\n{full_body}\n}}')
    return [(start, region_id), *child_regions]


def tags_to_json(outer_index, tags, code, m, handle_region):
    """Process sibling *tags*, building the enclosing body with calls in their place.

    Args:
        outer_index: Offset in *code* where the enclosing body starts.
        tags: Sibling tag descriptors ``(name, open_group, close_group, sub_tags)``.
        code: The source text the match *m* was made against.
        m: Match object providing the span of every group.
        handle_region: Callable receiving a region's code and returning its id.

    Returns:
        ``(regions, outer_body, outer_index)``: the ``(offset, region_id)``
        pairs of all tags in order, the enclosing body text accumulated so far,
        and the offset at which the last tag's closing group starts (the
        original ``outer_index`` when there are no tags).
    """
    regions = []
    chunks = []
    position = outer_index
    for tag in tags:
        name, open_gr, close_gr, _ = tag
        chunks.append(code[position:m.span(open_gr)[0]] + '\n' + name + '();\n')
        position = m.span(close_gr)[0]
        regions.extend(tag_to_json(tag, code, m, handle_region))
    return regions, ''.join(chunks), position


def to_json(tags, code, m, handle_region):
    """Register the top-level region of *code* plus one region per tag.

    The top-level body is *code* with every top-level tag replaced by a
    ``name();`` call; it is passed to ``handle_region`` after all tag regions.

    Returns:
        A list of ``(offset, region_id)`` pairs starting with ``(0, top_id)``,
        followed by the tag regions. When there is at least one tag region, a
        final ``(end, top_id)`` pair is appended, where ``end`` is the offset at
        which the last top-level tag's closing group starts.
    """
    tag_regions, body, end = tags_to_json(0, tags, code, m, handle_region)
    top_id = handle_region(body + code[end:])
    regions = [(0, top_id), *tag_regions]
    if tag_regions:
        regions.append((end, top_id))
    return regions


In [833]:
def strip_js_comments(js_code):
    """Remove ``//`` line comments and ``/* ... */`` block comments from JS text.

    Each line comment, together with the newline before it (if any) and the
    newline that ends it, is replaced by a single newline. A line comment on
    the last line, with no trailing newline, is removed the same way. Block
    comments are deleted outright.

    The removal is purely textual: ``//`` or ``/*`` inside a string literal or
    a URL is treated as a comment too.

    Args:
        js_code: JavaScript source text.

    Returns:
        The text with comments removed.
    """
    # `(?:\n|$)` also catches a comment that runs to the end of the input.
    js_code = re.sub(r'\n?//[^\n]*(?:\n|$)', '\n', js_code)
    js_code = re.sub(r'/\*.*?\*/', '', js_code, flags=re.DOTALL)
    return js_code

# For every parsed `thread2-<index>.json` file, align each annotated snippet
# with its original code from the `shuffled` table and store the result.
n_ok, n_all = 0, 0  # snippets whose annotation matched the original / all seen
limit = 10          # number of `shuffled` rows consumed per json file
for text_file in tqdm(list(js_dir.glob('thread2-*.json'))):
    index = int(re.match(r'.*thread2-(.*)\.json', str(text_file)).group(1))
    with get_cursor() as cursor:
        # NOTE(review): the `+ 1` skips the first row of `shuffled`; confirm
        # this matches how the thread2 files were generated.
        codes = list(cursor.execute('select id, code from shuffled limit ? offset ?', (limit, limit * index + 1)))
    with open(text_file, 'r') as file:
        file_contents = file.read()
    for (id, code), obj in zip(codes, json.loads(file_contents)):
        original = strip_js_comments(code)
        xml = obj[1]  # inner text of the parsed node ([tag, inner])
        reg_str, tags = make_regex(simple_parse_xml(xml, True), [0])
        reg = regex.compile(reg_str, flags = regex.DOTALL)
        m = reg.match(original)
        n_all += 1
        if not m:
            continue
        n_ok += 1
        tags = flat_wrong_tags(tags, original, m)
        # No tags, or a single tag without nested tags: store the snippet as
        # SRP with no regions.
        if not tags or (len(tags) == 1 and not tags[0][3]):
            insert_snippet(id, index, original, [], [], True)
            continue
        regions = to_json(tags, original, m, insert_region)
        # Unzip the (offset, region_id) pairs into locations and region ids.
        insert_snippet(id, index, original, *zip(*regions), False)


print(f'{n_ok}/{n_all}')

100%|██████████| 677/677 [02:49<00:00,  3.99it/s]

6183/6770





In [838]:
# Report how many snippets fall into each SRP group and how many regions exist.
with get_cursor() as cursor:
    for query in (
        'select count(*) from snippets group by SRP',
        'select count(*) from region',
    ):
        print(list(cursor.execute(query)))

[(3043,), (3140,)]
[(13709,)]


<function io.open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None)>