# After getting files

## Insert into database

In [1]:
from pathlib import Path
try:
    dataset
except:
    print('getting dataset location')
    dataset = Path('dataset').absolute()


getting dataset location


In [2]:
import os
os.chdir(dataset)

js_dir = dataset / 'javascript'


In [6]:
import xml.etree.ElementTree as ET

In [172]:
import re

html_tags = '|'.join([
    # Basic HTML Document Structure
    "!DOCTYPE", "html", "head", "title", "body",

    # Text Formatting
    "h1", "h2", "h3", "h4", "h5", "h6", "p", "br", "hr", "abbr", "acronym", "address", "b", "bdi", "bdo",
    "blockquote", "cite", "code", "del", "dfn", "em", "i", "ins", "kbd", "mark", "meter", "pre", "progress",
    "q", "rp", "rt", "ruby", "s", "samp", "small", "strike", "strong", "sub", "sup", "template", "time",
    "tt", "u", "var", "wbr",

    # Links and Resources
    "a", "link", "nav", "ul", "ol", "li", "dir", "dl", "dt", "dd", "figure", "figcaption", "main",

    # Images and Multimedia
    "img", "map", "area", "canvas", "figcaption", "figure", "picture", "svg", "audio", "source", "track", "video",

    # Tables
    "table", "caption", "th", "tr", "td", "thead", "tbody", "tfoot", "col", "colgroup",

    # Forms
    "form", "input", "textarea", "button", "select", "optgroup", "option", "label", "fieldset", "legend",
    "datalist", "output",

    # Frames
    "frame", "frameset", "noframes", "iframe",

    # Styles and Semantics
    "style", "div", "span", "header", "footer", "section", "article", "aside",

    # Meta Info
    "meta", "base", "basefont",

    # Programming
    "script", "noscript", "embed", "object", "param"
])

def simple_parse_xml(content, nested, against = ''):
    pattern = r'<([^\s]*?)>'
    before = ''
    while True:
        open = re.search(pattern, content)
        if not open:
            return [content]
        [open_start, open_end] = open.span()
        before += content[:open_start]
        if open.group(0) in against:
            before += open.group(0)
            content = content[open_end:]
            continue
        break
    tag = open.group(1)
    inner_and_after = content[open_end:]
    close = re.search(f'</{re.escape(tag)}>', inner_and_after)
    if not close:
        return [content]
    [close_start, close_end] = close.span()
    inner = inner_and_after[:close_start]
    after = inner_and_after[close_end:]
    inner = simple_parse_xml(inner, nested, against) if nested else inner
    return [[before, tag, inner], *simple_parse_xml(after, nested, against)]



In [37]:
[p[1] for p in parsed]

['function1',
 'buildScales',
 'prepareProject',
 'initializeProperties',
 'defineTranslations',
 '}']

In [91]:
import json
omitted = []
output = []
for text_file in js_dir.glob('thread2-*.txt'):
    with open(text_file, 'r') as file:
        file_contents = file.read()
    parsed = simple_parse_xml(file_contents, nested=False)
    output.append({
        'file': str(text_file),
        'split': [p[0] for p in parsed[:-1]] + [parsed[-1]],
        'len': len(parsed),
        'ok': len(parsed) == 11
    })
    if len(parsed) != 11:
        omitted.append(text_file)
        continue
    with open(str.replace(str(text_file), '.txt', '.json'), 'w') as file:
        file.write(json.dumps([p[1:] for p in parsed]))

with open('parse1.log', 'w') as file:
    file.write(json.dumps(output))
for out in output:
    if not out['ok']:
        continue
    print(out['file'])
    print(out['split'])

/root/py/CodeBERT/CodeReviewer/dataset/javascript/thread2-35.txt
['', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '']
/root/py/CodeBERT/CodeReviewer/dataset/javascript/thread2-100.txt
['```javascript\n', '\n```\n\n```javascript\n', '\n```\n\n```javascript\n', '\n```\n\n```javascript\n', '\n```\n\n```javascript\n', '\n```\n\n```javascript\n', '\n```\n\n```javascript\n', '\n```\n\n```javascript\n', '\n```\n\n```javascript\n', '\n```\n\n```javascript\n', "\n```\n\nIn all these annotations, I've only tagged lines or blocks of code with distinct, identifiable responsibilities, following the balanced SRP approach, avoiding over-segmenting and under-segmenting. Each tagged responsibility represents a specific purpose within the function."]
/root/py/CodeBERT/CodeReviewer/dataset/javascript/thread2-23.txt
['', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '']
/root/py/CodeBERT/CodeReviewer/dataset/javascript/thread2-217.txt
['Here are the an

In [131]:
source = omitted
omitted = []
output2 = []
for text_file in source:
    with open(text_file, 'r') as file:
        file_contents = file.read()
    parsed = simple_parse_xml(file_contents, nested=False)
    output2.append({
        'file': str(text_file),
        'split': [p[0] for p in parsed[:-1]] + [parsed[-1]],
        'len': len(parsed),
        'ok': len(parsed) == 11
    })
    if len(parsed) != 11:
        omitted.append(text_file)
        continue
    with open(str.replace(str(text_file), '.txt', '.json'), 'w') as file:
        file.write(json.dumps([p[1:] for p in parsed]))


In [140]:
!pwd

/root/py/CodeBERT/CodeReviewer/dataset


In [150]:
import sqlite3, json
from contextlib import contextmanager

train = js_dir / 'final' / 'jsonl' / 'train'
os.chdir(train)

@contextmanager
def get_cursor(database_name='rsn_train'):
    with sqlite3.connect(database_name) as conn:
        yield conn.cursor()


with get_cursor() as cursor:
    print(list(cursor.execute('select count(*) from shuffled')))

[(123889,)]


In [392]:
def strip_js_comments(js_code):
    # Remove all occurrences of single line comments
    js_code = re.sub(r'\n?//.*?\n', '\n', js_code)

    # Remove all occurrences of multi-line comments
    js_code = re.sub(r'/\*.*?\*/', '', js_code, flags=re.DOTALL)

    return js_code


for text_file in js_dir.glob('thread2-*.json'):
    index = int(re.match('.*thread2-(.*)\.json', str(text_file)).group(1))
    print(index)
    limit = 10
    with get_cursor() as cursor:
        codes = list(x for x, *_ in cursor.execute('select code from shuffled limit ? offset ?', (limit, limit * index + 1)))
    with open(text_file, 'r') as file:
        file_contents = file.read()
    for code, obj in zip(codes, json.loads(file_contents)):
        original = strip_js_comments(code)
        xml = obj[1]
        reg_str, tags = make_regex(simple_parse_xml(xml, True))
        reg = regex.compile(reg_str, flags = regex.DOTALL)
        print(len(reg_str))
        if not reg.match(original):
            v = original, obj
            raise ''

166
1511
3711
502
729
1077
4266
1735
1650
2719
1996
217
176
2181
538
380


TypeError: exceptions must derive from BaseException

In [388]:
import regex

def simple_parse_xml(content, nested):
    pattern = r'\s*<([^\s]*?)>\s*'
    open = re.search(pattern, content)
    if not open:
        return [content]
    [open_start, open_end] = open.span()
    before = content[:open_start]
    tag = open.group(1)
    inner_and_after = content[open_end:]
    close = re.search(f'</{re.escape(tag)}>', inner_and_after)
    if close:
        [close_start, close_end] = close.span()
    else:
        close = re.search(pattern, inner_and_after)
        if close:
            [close_start, _] = close.span()
        else:
            close_start = len(inner_and_after)
        close_end = close_start
    inner = inner_and_after[:close_start]
    after = inner_and_after[close_end:]
    inner = simple_parse_xml(inner, nested) if nested else inner
    return [[before, tag, inner], *simple_parse_xml(after, nested)]

def atom_to_re(s):
    escaped = regex.escape(s.strip(), special_only=True)
    # remove tail slash
    escaped = re.sub('(?<!\\\\)\\\\(\\s)', '\\1', escaped)
    return r'\s*' + re.sub(r'\s+', r'\\s+', escaped) + r'\s*'

def str_to_re(s):
    codes = re.split(r'\s*(?://[^\n]*(?:\n|$)|/\*.*?\*/|\.{3,})\s*', s,  flags=re.DOTALL)
    return '(?:.*?)'.join(atom_to_re(code) for code in codes)

def node_to_re(node):
    if type(node) == str:
        return str_to_re(node), []
    before, tag, content = node
    before_re = str_to_re(before)
    content_re, content_tags = make_regex(content)
    open_re = '\s*(|<'+re.escape(tag)+'>)\s*'
    close_re = '\s*(?:|</'+re.escape(tag)+'>)\s*'
    return before_re+open_re+content_re+close_re, [tag]+content_tags


def make_regex(tree):
    regs, tags = zip(*(node_to_re(node) for node in tree))
    return re.sub(r'(\\s\*)+', r'\\s*', ''.join(regs)), [t for tag in tags for t in tag] 



In [393]:
reg_str, tags = make_regex(simple_parse_xml(v[1][1], True))
reg = regex.compile(reg_str, flags = regex.DOTALL)
print(len(reg_str))
if not reg.match(v[0]):
    last_j = 0
    print(0, len(reg_str), 100)
    for i in range(0, len(reg_str), 100):
        print('enter')
        for j in range(100):
            try:
                if regex.compile(reg_str[:i+j], flags = regex.DOTALL).match(v[0]):
                    last_j = j
                    print('ok', i, j)
                    break
            except:
                pass
        else:
            print('not ok')
            print(i, last_j, j)
            for j in range(100):
                try:
                    if regex.compile(reg_str[:i-j], flags = regex.DOTALL).match(v[0]):
                        print('ok', i, j)
                        break
                except:
                    pass
            break


380
0 380 100
enter
ok 0 0
enter
not ok
100 0 99
ok 100 72


In [397]:
regex.compile(reg_str[:100-72], flags = regex.DOTALL).match(v[0])
reg_str[:100-72 + 20]

'\\s*function\\s+setHandle\\s+\\(handleNumber,\\s+to,\\'

In [398]:
#v[1][1]
v[0][:30]

'function setHandle ( handleNum'

In [390]:
from tqdm import tqdm
import regex
for iv, v in enumerate(tqdm(l)):
    print(iv)
    reg_str, tags = make_regex(simple_parse_xml(v[1][1], True))
    reg = regex.compile(reg_str, flags = regex.DOTALL)
    if not reg.match(v[0]):
        last_j = 0
        for i in range(0, len(reg_str), 100):
            for j in range(100):
                try:
                    if regex.compile(reg_str[:i+j], flags = regex.DOTALL).match(v[0]):
                        last_j = j
                        break
                except:
                    pass
            else:
                print(i, last_j, j)
                print(reg_str[:i])
                print('"""""""""""')
                print(v[0])
                print('-------------')
                print(v[1][1])
                raise Exception('')

100%|██████████| 10/10 [00:00<00:00, 264.21it/s]

0
1
2
3
4
5
6
7
8
9





In [302]:
regex.compile(reg_str[:900+j], flags = regex.DOTALL).match(v[0])

In [243]:
re.match(reg, v[0])

KeyboardInterrupt: 