# After getting files

## Insert into database

In [1]:
from pathlib import Path
try:
    dataset
except:
    print('getting dataset location')
    dataset = Path('dataset').absolute()


getting dataset location


In [2]:
import os
os.chdir(dataset)

js_dir = dataset / 'javascript'


In [6]:
import xml.etree.ElementTree as ET

In [449]:
import re

def simple_parse_xml(content, nested, against = ''):
    pattern = r'<([^\s]*?)>'
    before = ''
    while True:
        open = re.search(pattern, content)
        if not open:
            return [content]
        [open_start, open_end] = open.span()
        before += content[:open_start]
        if open.group(0) in against:
            before += open.group(0)
            content = content[open_end:]
            continue
        break
    tag = open.group(1)
    inner_and_after = content[open_end:]
    close = re.search(f'</{re.escape(tag)}>', inner_and_after)
    if not close:
        return [content]
    [close_start, close_end] = close.span()
    inner = inner_and_after[:close_start]
    after = inner_and_after[close_end:]
    inner = simple_parse_xml(inner, nested, against) if nested else inner
    return [[before, tag, inner], *simple_parse_xml(after, nested, against)]



In [450]:
import json
omitted = []
output = []
for text_file in js_dir.glob('thread2-*.txt'):
    with open(text_file, 'r') as file:
        file_contents = file.read()
    parsed = simple_parse_xml(file_contents, nested=False)
    output.append({
        'file': str(text_file),
        'split': [p[0] for p in parsed[:-1]] + [parsed[-1]],
        'len': len(parsed),
        'ok': len(parsed) == 11
    })
    if len(parsed) != 11:
        omitted.append(text_file)
        continue
    with open(str.replace(str(text_file), '.txt', '.json'), 'w') as file:
        file.write(json.dumps([p[1:] for p in parsed]))

with open('parse2.log', 'w') as file:
    file.write(json.dumps(output))
for out in output:
    if not out['ok']:
        continue
    print(out['file'])
    print(out['split'])

/root/py/CodeBERT/CodeReviewer/dataset/javascript/thread2-35.txt
['', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '']
/root/py/CodeBERT/CodeReviewer/dataset/javascript/thread2-337.txt
['```javascript\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n```\n\nThe functions above are annotated following the Single Responsibility Principle. Each function is enclosed within tags that describe its responsibilities. There are no nested tags since each function, within the scope of this exercise, seems to exhibit a single cohesive behavior or purpose. Some functions combine several actions, but they are all contributing to a single responsibility within the context that the function operates.']
/root/py/CodeBERT/CodeReviewer/dataset/javascript/thread2-413.txt
['', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '\n\n', '']
/root/py/CodeBERT/CodeReviewer/dataset/javascript/thread2-244.txt
['```javascript\n', '\n\n', '\n\n',

In [131]:
source = omitted
omitted = []
output2 = []
for text_file in source:
    with open(text_file, 'r') as file:
        file_contents = file.read()
    parsed = simple_parse_xml(file_contents, nested=False)
    output2.append({
        'file': str(text_file),
        'split': [p[0] for p in parsed[:-1]] + [parsed[-1]],
        'len': len(parsed),
        'ok': len(parsed) == 11
    })
    if len(parsed) != 11:
        omitted.append(text_file)
        continue
    with open(str.replace(str(text_file), '.txt', '.json'), 'w') as file:
        file.write(json.dumps([p[1:] for p in parsed]))


In [140]:
!pwd

/root/py/CodeBERT/CodeReviewer/dataset


In [451]:
import sqlite3, json
from contextlib import contextmanager

train = js_dir / 'final' / 'jsonl' / 'train'
os.chdir(train)

@contextmanager
def get_cursor(database_name='rsn_train'):
    with sqlite3.connect(database_name) as conn:
        yield conn.cursor()


with get_cursor() as cursor:
    print(list(cursor.execute('select count(*) from shuffled')))

[(123889,)]


In [535]:
import regex

def simple_parse_xml(content, nested):
    pattern = r'\s*<([^\s]*?)>\s*'
    open = re.search(pattern, content)
    if not open:
        return [content]
    [open_start, open_end] = open.span()
    before = content[:open_start]
    tag = open.group(1)
    inner_and_after = content[open_end:]
    close = re.search(f'</{re.escape(tag)}>', inner_and_after)
    if close:
        [close_start, close_end] = close.span()
    else:
        close = re.search(pattern, inner_and_after)
        if close:
            [close_start, _] = close.span()
        else:
            close_start = len(inner_and_after)
        close_end = close_start
    inner = inner_and_after[:close_start]
    after = inner_and_after[close_end:]
    inner = simple_parse_xml(inner, nested) if nested else inner
    return [[before, tag, inner], *simple_parse_xml(after, nested)]

def atom_to_re(s):
    tokens = [token.strip() for token in re.split(r'\s+|(?=\W)|(?<=\W)', s.strip()) if token]
    escaped = [regex.escape(token, special_only=True) for token in tokens]
    return r'\s*' + r'\s*'.join(escaped) + r'\s*'

def str_to_re(s):
    codes = re.split(r'\s*(?://[^\n]*(?:\n|$)|/\*.*?\*/|\.{3,})\s*', s,  flags=re.DOTALL)
    return '(?:.*?)'.join(atom_to_re(code) for code in codes)

def node_to_re(node, c):
    if type(node) == str:
        return str_to_re(node), []
    before, tag, content = node
    before_re = str_to_re(before)
    c[0] += 1
    open_gr = c[0]
    content_re, content_tags = make_regex(content, c)
    c[0] += 1
    close_gr = c[0]
    open_re = '\s*(|<'+re.escape(tag)+'>)\s*'
    close_re = '\s*(|</'+re.escape(tag)+'>)\s*'
    return before_re+open_re+content_re+close_re, [(tag, open_gr, close_gr, content_tags)]


def make_regex(tree, c):
    regs, tags = zip(*(node_to_re(node, c) for node in tree))
    return re.sub(r'(\\s\*)+', r'\\s*', ''.join(regs)), [t for tag in tags for t in tag] 



In [794]:
with get_cursor() as cursor:
    cursor.execute('create table if not exists snippets (ID INTEGER PRIMARY KEY, code TEXT, locations JSON, regions JSON, SRP boolean)')

with get_cursor() as cursor:
    cursor.execute('create table if not exists region (ID INTEGER PRIMARY KEY, code TEXT, vector JSON)')


In [643]:
xx[4].tags

[('fileOperations', 1, 2, []), ('templateStringReplacement', 3, 4, [])]

In [795]:
region_id = 0
def insert_region(code):
    global region_id
    region_id += 1
    # print(f'inserting region {region_id}:\n {code}')
    return region_id

def insert_snippet(id, code, locations, regions, srp):
    with get_cursor() as cursor:
        cursor.execute(
            'insert into snippets (ID, code, locations, regions, SRP) values (?, ?, ?, ?, ?)',
            (id, code, json.dumps(locations), json.dumps(regions), srp))

def flat_wrong_tags(tags, code, m):
    clean_tags = []
    for tag in tags:
        clean_tags += flat_wrong_tag(tag, clean_tags, code, m)
    return clean_tags

def flat_wrong_tag(tag, clean_tags, code, m):
    name, open, close, sub_tags = tag
    clean_sub_tags = flat_wrong_tags(sub_tags, code, m)
    if m.group(open):
        if len(clean_tags):
            pname, popen, pclose, psub_tags = clean_tags[-1]
            if not m.group(pclose):
                clean_tags[-1] = (pname, popen, close, psub_tags)
        return clean_sub_tags
    return [(name, open, close, clean_sub_tags)]

def tag_to_json(tag, code, m, handle_region):
    name, open, close, sub_tags = tag
    start = m.span(open)[0]
    regions, body, end = tags_to_json(start, sub_tags, code, m, handle_region)
    body += code[end:m.span(close)[0]] 
    region_id = handle_region(f'function {name} () {{\n{body}\n}}')
    regions = [(start, region_id)] + regions
    return regions


def tags_to_json(outer_index, tags, code, m, handle_region):
    regions = []
    outer_body = ''
    for tag in tags:
        name, open, close, _ = tag
        outer_body += code[outer_index:m.span(open)[0]] + '\n' + name + '();\n'
        outer_index = m.span(close)[0]
        regions += tag_to_json(tag, code, m, handle_region)
    return regions, outer_body, outer_index


def to_json(tags, code, m, handle_region):
    regions, body, end = tags_to_json(0, tags, code, m, handle_region)
    body += code[end:] 
    region_id = handle_region(body)
    regions = [(0, region_id)] + regions
    if len(regions) > 1:
        regions.append((end, region_id))
    return regions


In [796]:
def strip_js_comments(js_code):
    js_code = re.sub(r'\n?//.*?\n', '\n', js_code)
    js_code = re.sub(r'/\*.*?\*/', '', js_code, flags=re.DOTALL)
    return js_code    

n_ok, n_all = 0, 0
for text_file in tqdm(list(js_dir.glob('thread2-*.json'))):
    index = int(re.match('.*thread2-(.*)\.json', str(text_file)).group(1))
    limit = 10
    with get_cursor() as cursor:
        codes = list(cursor.execute('select id, code from shuffled limit ? offset ?', (limit, limit * index + 1)))
    with open(text_file, 'r') as file:
        file_contents = file.read()
    for (id, code), obj in zip(codes, json.loads(file_contents)):
        original = strip_js_comments(code)
        xml = obj[1]
        reg_str, tags = make_regex(simple_parse_xml(xml, True), [0])
        reg = regex.compile(reg_str, flags = regex.DOTALL)
        m = reg.match(original)
        n_all += 1
        if m:
            n_ok += 1
            tags = flat_wrong_tags(tags, original, m)
            if not len(tags) or len(tags) == 1 and not len(tags[0][3]):
                insert_snippet(id, original, [], [], True)
                continue
            regions = to_json(tags, original, m, insert_region)
            insert_snippet(id, original, *zip(*regions), False)
            
            
print(f'{n_ok}/{n_all}')

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 524/524 [01:31<00:00,  5.76it/s]

4804/5240





In [767]:
tags = flat_wrong_tags(tags, original, m)
tags

[('initializeOutputs', 1, 2, []),
 ('iterateOverAllParts', 3, 4, []),
 ('createCurrentPartsList', 5, 6, []),
 ('skipNonUniqueOrInvalidParts', 7, 8, []),
 ('handleNonSortedArrays', 9, 10, []),
 ('checkExistenceAndSort', 11, 12, []),
 ('generateChain', 13, 14, []),
 ('checkAndHandleChildrenOfPart', 15, 18, []),
 ('handleLastPart', 19, 22, []),
 ('addToOutput', 23, 24, []),
 ('verifyAndHandleTodoAndCallback', 25, 32, []),
 ('generateChildren', 33, 34, []),
 ('returnTypes', 35, 36, []),
 ('assembleTypeBody', 37, 38, []),
 ('returnFinalInterface', 39, 46, [])]

In [763]:
open

<function io.open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None)>

In [768]:
to_json(tags, original, m, print)

function initializeOutputs () {
let output = '';
	let children = '';

	
}
function iterateOverAllParts () {
for (const part of allParts) {
		
}
function createCurrentPartsList () {
const parts = prefix.concat([part]);

		
}
function skipNonUniqueOrInvalidParts () {
if (prefix.indexOf(part) !== -1 || !verify(parts, true)) {
			
			continue;
		}

		
		
}
function handleNonSortedArrays () {
if (!isArraySorted(parts)) {
			
}
function checkExistenceAndSort () {
if (exists(parts)) {
				parts.sort();

				
}
function generateChain () {
let chain;
				
}
function checkAndHandleChildrenOfPart () {
if (hasChildren(parts)) {
					chain = parts.join('_') + '<T>';
				} else {
					
					
					
}
function handleLastPart () {
const last = parts.pop();
					const joined = parts.join('_');
					chain = `${joined}<T>['${last}']`;
				}

				
}
function addToOutput () {
output += `\t${part}: Register_${chain};\n`;
			}

			continue;
		}

		
		
		
		
}
function verifyAndHandleTodoAndCallback () {
if (v

[(0, None),
 (37, None),
 (76, None),
 (109, None),
 (149, None),
 (235, None),
 (267, None),
 (311, None),
 (326, None),
 (420, None),
 (534, None),
 (614, None),
 (1054, None),
 (1096, None),
 (1141, None),
 (1190, None),
 (1408, None)]

In [742]:
print(original)

function generatePrefixed(prefix) {
	let output = '';
	let children = '';

	for (const part of allParts) {
		const parts = prefix.concat([part]);

		if (prefix.indexOf(part) !== -1 || !verify(parts, true)) {
			
			continue;
		}

		
		if (!isArraySorted(parts)) {
			if (exists(parts)) {
				parts.sort();

				let chain;
				if (hasChildren(parts)) {
					chain = parts.join('_') + '<T>';
				} else {
					
					
					const last = parts.pop();
					const joined = parts.join('_');
					chain = `${joined}<T>['${last}']`;
				}

				output += `\t${part}: Register_${chain};\n`;
			}

			continue;
		}

		
		
		
		if (verify(parts, false)) {
			if (arrayHas(parts)('todo')) {
				
				output += `\t${part}: (name: string) => void;\n`;
			} else {
				if (arrayHas(parts)('cb')) {
					output += `\t${part}: CallbackRegisterBase<T>`;
				} else {
					output += `\t${part}: RegisterBase<T>`;
				}

				if (hasChildren(parts)) {
					
					const joined = parts.join('_');
					output += ` & Register_$

In [746]:
print(xml)


function generatePrefixed(prefix) {
	<initializeOutputs>
	let output = '';
	let children = '';
    <iterateOverAllParts>
	for (const part of allParts) {
		<createCurrentPartsList>
		const parts = prefix.concat([part]);
        <skipNonUniqueOrInvalidParts>
		if (prefix.indexOf(part) !== -1 || !verify(parts, true)) {
			
			continue;
		}
		<handleNonSortedArrays>
		if (!isArraySorted(parts)) {
			<checkExistenceAndSort>
			if (exists(parts)) {
				parts.sort();
                <generateChain>
				let chain;
                <checkAndHandleChildrenOfPart>
				if (hasChildren(parts)) {
					chain = parts.join('_') + '<T>';
				} else {
					<handleLastPart>
					const last = parts.pop();
					const joined = parts.join('_');
					chain = `${joined}<T>['${last}']`;
				}
                <addToOutput>
				output += `\t${part}: Register_${chain};\n`;
			}
			continue;
		}
        <verifyAndHandleTodoAndCallback>
		if (verify(parts, false)) {
			if (arrayHas(parts)('todo')) {
				
				output +

In [745]:
print(reg_str)

\s*function\s*generatePrefixed\s*\(\s*prefix\s*\)\s*\{\s*(|<initializeOutputs>)\s*let\s*output\s*=\s*'\s*'\s*;\s*let\s*children\s*=\s*'\s*'\s*;\s*(|</initializeOutputs>)\s*(|<iterateOverAllParts>)\s*for\s*\(\s*const\s*part\s*of\s*allParts\s*\)\s*\{\s*(|</iterateOverAllParts>)\s*(|<createCurrentPartsList>)\s*const\s*parts\s*=\s*prefix\s*\.\s*concat\s*\(\s*\[\s*part\s*\]\s*\)\s*;\s*(|</createCurrentPartsList>)\s*(|<skipNonUniqueOrInvalidParts>)\s*if\s*\(\s*prefix\s*\.\s*indexOf\s*\(\s*part\s*\)\s*!\s*=\s*=\s*\-\s*1\s*\|\s*\|\s*!\s*verify\s*\(\s*parts\s*,\s*true\s*\)\s*\)\s*\{\s*continue\s*;\s*\}\s*(|</skipNonUniqueOrInvalidParts>)\s*(|<handleNonSortedArrays>)\s*if\s*\(\s*!\s*isArraySorted\s*\(\s*parts\s*\)\s*\)\s*\{\s*(|</handleNonSortedArrays>)\s*(|<checkExistenceAndSort>)\s*if\s*\(\s*exists\s*\(\s*parts\s*\)\s*\)\s*\{\s*parts\s*\.\s*sort\s*\(\s*\)\s*;\s*(|</checkExistenceAndSort>)\s*(|<generateChain>)\s*let\s*chain\s*;\s*(|</generateChain>)\s*(|<checkAndHandleChildrenOfPart>)\s*if\s*\(

In [704]:
to_json(flat_wrong_tags(xx[7].tags, strip_js_comments(xx[7].code), xx[7].m), strip_js_comments(xx[7].code), xx[7].m)

inserting region 1:
 function sessionStorageCheck () {
if (_this.settings.sessionStorage === true && window.sessionStorage && window.sessionStorage.getItem('myGeo')){
				_this.writeDebug('Using Session Saved Values for GEO');
				_this.autoGeocodeQuery(JSON.parse(window.sessionStorage.getItem('myGeo')));
				return false;
			}
			
}
inserting region 2:
 function positionProcessing () {
var pos = {
						coords: {
							latitude : position.coords.latitude,
							longitude: position.coords.longitude,
							accuracy : position.coords.accuracy
						}
					};

					
					
}
inserting region 3:
 function sessionStorageUpdate () {
if (_this.settings.sessionStorage === true && window.sessionStorage) {
						window.sessionStorage.setItem('myGeo',JSON.stringify(pos));
					}

					
					
}
inserting region 4:
 function callbackExecution () {
if (_this.settings.callbackAutoGeoSuccess) {
						_this.settings.callbackAutoGeoSuccess.call(this, pos);
					}

					
}
inserting region 5:
 function

[(0, 6), (83, 1), (361, 5), (521, 2), (709, 3), (865, 4), (1089, 6)]

In [663]:
print(xx[7].obj[1])


function() {
  this.writeDebug('htmlGeocode',arguments);
  var _this = this;

  <sessionStorageCheck>
  if (_this.settings.sessionStorage === true && window.sessionStorage && window.sessionStorage.getItem('myGeo')){
      _this.writeDebug('Using Session Saved Values for GEO');
      _this.autoGeocodeQuery(JSON.parse(window.sessionStorage.getItem('myGeo')));
      return false;
  }
  </sessionStorageCheck>
  <geolocationCheck>
  else if (navigator.geolocation) {
      navigator.geolocation.getCurrentPosition(function(position){
          _this.writeDebug('Current Position Result');
          
          <positionProcessing>
          var pos = {
              coords: {
                  latitude : position.coords.latitude,
                  longitude: position.coords.longitude,
                  accuracy : position.coords.accuracy
              }
          };
          </positionProcessing>

          <sessionStorageUpdate>
          if (_this.settings.sessionStorage === true && window.

In [706]:
[x.code for x in xx if re.match(r'<\w.*>', x.code)]

[]

In [661]:
print(xx[7].code)

function() {
			this.writeDebug('htmlGeocode',arguments);
			var _this = this;

			if (_this.settings.sessionStorage === true && window.sessionStorage && window.sessionStorage.getItem('myGeo')){
				_this.writeDebug('Using Session Saved Values for GEO');
				_this.autoGeocodeQuery(JSON.parse(window.sessionStorage.getItem('myGeo')));
				return false;
			}
			else if (navigator.geolocation) {
				navigator.geolocation.getCurrentPosition(function(position){
					_this.writeDebug('Current Position Result');
					// To not break autoGeocodeQuery then we create the obj to match the geolocation format
					var pos = {
						coords: {
							latitude : position.coords.latitude,
							longitude: position.coords.longitude,
							accuracy : position.coords.accuracy
						}
					};

					// Have to do this to get around scope issues
					if (_this.settings.sessionStorage === true && window.sessionStorage) {
						window.sessionStorage.setItem('myGeo',JSON.stringify(pos));
					}

					// Callba

In [629]:
[x.tags for x in xx if x.reg]

[[],
 [],
 [],
 [],
 [('fileOperations', 1, 2, []), ('templateStringReplacement', 3, 4, [])],
 [('parameterHandling', 1, 2, []), ('propertyExtension', 3, 4, [])],
 [('bigQueryClientInitialization', 1, 2, []),
  ('bigQueryDataLoading', 3, 4, []),
  ('errorHandling', 5, 6, [])],
 [('sessionStorageCheck', 1, 2, []),
  ('geolocationCheck',
   3,
   10,
   [('positionProcessing', 4, 5, []),
    ('sessionStorageUpdate', 6, 7, []),
    ('callbackExecution', 8, 9, [])])],
 [('protocolDetermination', 1, 2, []),
  ('httpRequestCreation', 3, 4, []),
  ('requestBodyHandling', 5, 6, [])],
 []]

In [572]:
xx[1].reg_str[:448]
#regex.compile(xx[1].reg_str[:500], flags = regex.DOTALL).match(xx[1].code)

'\\s*function\\s*check\\s*\\(\\s*node\\s*\\)\\s*\\{\\s*(|<analyzeNode>)\\s*if\\s*\\(\\s*node\\s*\\.\\s*arguments\\s*\\.\\s*length\\s*!\\s*=\\s*=\\s*1\\s*\\&\\s*\\&\\s*node\\s*\\.\\s*callee\\s*\\.\\s*type\\s*=\\s*=\\s*=\\s*"\\s*Identifier\\s*"\\s*\\&\\s*\\&\\s*node\\s*\\.\\s*callee\\s*\\.\\s*name\\s*=\\s*=\\s*=\\s*"\\s*Array\\s*"\\s*\\&\\s*\\&\\s*!\\s*node\\s*\\.\\s*typeParameters\\s*\\)\\s*\\{\\s*(|<reportProblem>)\\s*context\\s*\\.\\s*report\\s*\\(\\s*\\{\\s*node\\s*,\\s*message\\s*:\\s*"\\s*The\\s*array\\s*literal\\s*notation\\s*\\[\\'

In [432]:
reg_str, tags = make_regex(simple_parse_xml(v[1][1], True))
reg = regex.compile(reg_str, flags = regex.DOTALL)
print(len(reg_str))
if not reg.match(v[0]):
    last_j = 0
    print(0, len(reg_str), 100)
    for i in range(0, len(reg_str), 100):
        print('enter')
        for j in range(100):
            try:
                if regex.compile(reg_str[:i+j], flags = regex.DOTALL).match(v[0]):
                    last_j = j
                    print('ok', i, j)
                    break
            except:
                pass
        else:
            print('not ok')
            print(i, last_j, j)
            for j in range(100):
                try:
                    if regex.compile(reg_str[:i-j], flags = regex.DOTALL).match(v[0]):
                        print('ok', i, j)
                        break
                except:
                    pass
            break


2470
0 2470 100
enter
ok 0 0
enter
ok 100 0
enter
ok 200 2
enter
ok 300 0
enter
ok 400 0
enter
ok 500 2
enter
ok 600 0
enter
ok 700 0
enter
ok 800 0
enter
ok 900 19
enter
ok 1000 0
enter
ok 1100 1
enter
ok 1200 0
enter
ok 1300 0
enter
ok 1400 2
enter
not ok
1500 2 99
ok 1500 86


In [442]:
regex.compile(reg_str[:1500-86], flags = regex.DOTALL).match(v[0])
reg_str#[:1500-86 + 50]

'\\s*function\\s*DefaultArrayItem\\s*\\(\\s*props\\s*\\)\\s*\\{\\s*(|<DefineButtonStyle>)\\s*const\\s*btnStyle\\s*=\\s*\\{\\s*flex\\s*:\\s*1\\s*,\\s*paddingLeft\\s*:\\s*6\\s*,\\s*paddingRight\\s*:\\s*6\\s*,\\s*fontWeight\\s*:\\s*"\\s*bold\\s*",\\s*\\};\\s*(?:|</DefineButtonStyle>)\\s*return\\s*\\(\\s*(|<RenderComponentStructure>)\\s*<\\s*div\\s*key\\s*=\\{\\s*props\\s*\\.\\s*index\\s*\\}\\s*className\\s*=\\{\\s*props\\s*\\.\\s*className\\s*\\}>\\s*<\\s*div\\s*className\\s*=\\{\\s*props\\s*\\.\\s*hasToolbar\\s*\\?\\s*"\\s*col\\s*\\-\\s*xs\\s*\\-\\s*9\\s*"\\s*:\\s*"\\s*col\\s*\\-\\s*xs\\s*\\-\\s*12\\s*"\\}>\\s*\\{\\s*props\\s*\\.\\s*children\\s*\\}\\s*(|</div>)\\s*\\{\\s*props\\s*\\.\\s*hasToolbar\\s*\\&\\&\\s*\\(\\s*(?:|<//div>)\\s*(|<RenderToolbox>)\\s*<\\s*div\\s*className\\s*="\\s*col\\s*\\-\\s*xs\\s*\\-\\s*3\\s*array\\s*\\-\\s*item\\s*\\-\\s*toolbox\\s*">\\s*<\\s*div\\s*className\\s*="\\s*btn\\s*\\-\\s*group\\s*"\\s*style\\s*=\\{\\{\\s*display\\s*:\\s*"\\s*flex\\s*",\\s*justifyConte

In [441]:
#v[1][1]
v[0][:1100]

'function DefaultArrayItem(props) {\n  const btnStyle = {\n    flex: 1,\n    paddingLeft: 6,\n    paddingRight: 6,\n    fontWeight: "bold",\n  };\n  return (\n    <div key={props.index} className={props.className}>\n      <div className={props.hasToolbar ? "col-xs-9" : "col-xs-12"}>\n        {props.children}\n      </div>\n\n      {props.hasToolbar && (\n        <div className="col-xs-3 array-item-toolbox">\n          <div\n            className="btn-group"\n            style={{\n              display: "flex",\n              justifyContent: "space-around",\n            }}>\n            {(props.hasMoveUp || props.hasMoveDown) && (\n              <IconButton\n                icon="arrow-up"\n                className="array-item-move-up"\n                tabIndex="-1"\n                style={btnStyle}\n                disabled={props.disabled || props.readonly || !props.hasMoveUp}\n                onClick={props.onReorderClick(props.index, props.index - 1)}\n              />\n           

In [390]:
from tqdm import tqdm
import regex
for iv, v in enumerate(tqdm(l)):
    print(iv)
    reg_str, tags = make_regex(simple_parse_xml(v[1][1], True))
    reg = regex.compile(reg_str, flags = regex.DOTALL)
    if not reg.match(v[0]):
        last_j = 0
        for i in range(0, len(reg_str), 100):
            for j in range(100):
                try:
                    if regex.compile(reg_str[:i+j], flags = regex.DOTALL).match(v[0]):
                        last_j = j
                        break
                except:
                    pass
            else:
                print(i, last_j, j)
                print(reg_str[:i])
                print('"""""""""""')
                print(v[0])
                print('-------------')
                print(v[1][1])
                raise Exception('')

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 10/10 [00:00<00:00, 264.21it/s]

0
1
2
3
4
5
6
7
8
9





In [302]:
regex.compile(reg_str[:900+j], flags = regex.DOTALL).match(v[0])

In [243]:
re.match(reg, v[0])

KeyboardInterrupt: 