### Print Node Frequency

In [1]:
import os, re

def read_out_dep_freq(data_set_name):
    """Aggregate per-node frequencies from a data set's ``de2freq.txt``.

    Every non-blank line of ``srcxml/<data_set_name>_out/de2freq.txt`` is
    read as ``<freq><whitespace><dep_path>``. The path is split on ``-``,
    ``|``, ``↓`` and ``↑``; each resulting non-empty node name is credited
    with the line's frequency (a node occurring twice in one path is
    credited twice).

    Args:
        data_set_name: Prefix of the ``<data_set_name>_out`` directory.

    Returns:
        A list of ``(node, total_frequency)`` tuples sorted by frequency,
        highest first; nodes with equal totals keep first-seen order.
    """
    field_sep = re.compile(r'\s+')
    node_sep = re.compile(r'[-|↓↑]')
    node_freq = {}

    path = os.path.join('srcxml', f'{data_set_name}_out', 'de2freq.txt')
    # Dependency paths contain the non-ASCII arrows, so the encoding is
    # pinned instead of relying on the platform default.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on sp[1].
                continue

            sp = field_sep.split(line)
            freq = int(sp[0])

            for node in node_sep.split(sp[1]):
                if node:
                    node_freq[node] = node_freq.get(node, 0) + freq

    # sorted() is stable, so ties stay in insertion order.
    return sorted(node_freq.items(), key=lambda item: item[1], reverse=True)
                
        
# Show the node frequencies of the 'juliet' data set, most frequent first.
read_out_dep_freq('juliet')

[('name', 19246),
 ('expr', 4746),
 ('decl', 2954),
 ('literal', 2802),
 ('type', 2800),
 ('if', 1644),
 ('try', 1552),
 ('function', 1272),
 ('block_content', 1204),
 ('call', 1136),
 ('condition', 1104),
 ('catch', 1090),
 ('throws', 1002),
 ('block', 773),
 ('init', 738),
 ('specifier', 722),
 ('if_stmt', 461),
 ('operator', 392),
 ('parameter_list', 388),
 ('parameter', 388),
 ('finally', 348),
 ('argument', 238),
 ('decl_stmt', 212),
 ('index', 116),
 ('for', 110),
 ('else', 101),
 ('return', 75),
 ('control', 60),
 ('expr_stmt', 20),
 ('incr', 20)]

### Get Filtered Data

In [14]:
def read_out_data(data_set_name):
    """Decode ``data.txt`` of a data set into readable dependency triples.

    Three files are read from ``srcxml/<data_set_name>_out``:

    * ``voc2id.txt`` and ``de2id.txt``: one mapping per line, where the first
      whitespace-separated field is the token / dependency path and the last
      field is its integer id.
    * ``data.txt``: one sentence per line, laid out as
      ``<num_words> <num_dep_rels> <token ids...> <src|dst|dep ...>``, where
      ``src`` and ``dst`` index into that line's token-id list and ``dep`` is
      a dependency-path id.

    Args:
        data_set_name: Prefix of the ``<data_set_name>_out`` directory. All
            three files are loaded from this data set.

    Returns:
        A list with one entry per sentence; each entry is a list of dicts
        with the keys ``"srcT"``, ``"desT"`` and ``"path"``.
    """
    out_dir = os.path.join('srcxml', f'{data_set_name}_out')
    field_sep = re.compile(r'\s+')

    def read_id_map(file_name):
        """Return ``{id: first_field}`` for one ``<text> ... <id>`` file."""
        id_map = {}
        # Dependency paths contain non-ASCII arrows; pin the encoding.
        with open(os.path.join(out_dir, file_name), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                sp = field_sep.split(line)
                id_map[int(sp[-1])] = sp[0]
        return id_map

    id2de = read_id_map('de2id.txt')
    id2voc = read_id_map('voc2id.txt')

    data_list = []
    with open(os.path.join(out_dir, 'data.txt'), encoding='utf-8') as f:
        for line in f:
            sp = line.split()
            if not sp:
                # Skip blank lines rather than failing on int('').
                continue

            # sp[1] holds the relation count; it is not needed because the
            # relations are simply everything after the token ids.
            num_words = int(sp[0])
            token_id_list = sp[2:2 + num_words]
            dep_path_list = sp[2 + num_words:]

            decode_dep_path = []
            for dep_path in dep_path_list:
                srcT, desT, dep = dep_path.split('|')
                decode_dep_path.append(
                    {
                        "srcT": id2voc[int(token_id_list[int(srcT)])],
                        "desT": id2voc[int(token_id_list[int(desT)])],
                        "path": id2de[int(dep)],
                    }
                )

            data_list.append(decode_dep_path)

    return data_list
        
from tqdm import tqdm

def filter_decoded_data_list(data_set_name, removed_node_list):
    """Mask every dependency path that contains an unwanted node.

    The decoded triples of ``data_set_name`` are loaded with
    ``read_out_data``. A path is split on ``-``, ``|``, ``↓`` and ``↑``; if
    any resulting node is found in ``removed_node_list`` the triple's
    ``'path'`` is overwritten with the placeholder ``'-***-'``. Triples are
    never dropped, only their path is replaced.

    Args:
        data_set_name: Data set whose decoded data is filtered.
        removed_node_list: Node names that cause a path to be masked.

    Returns:
        A list with one list of triple dicts per sentence, in input order.
    """
    node_sep = re.compile(r'[-|↓↑]')
    decoded_data_list = read_out_data(data_set_name)

    filtered_decoded_data_list = []
    for decoded_data in tqdm(decoded_data_list):
        for dep_path in decoded_data:
            current_nodes = (s for s in node_sep.split(dep_path['path']) if s != '')
            if any(node in removed_node_list for node in current_nodes):
                dep_path['path'] = '-***-'

        # Each sentence gets its own list object; the triple dicts themselves
        # are the (freshly decoded) ones that were updated above.
        filtered_decoded_data_list.append(list(decoded_data))

    return filtered_decoded_data_list

def get_rs_obj_from_filtered_data_list(data_set_name, removed_node_list):
    """Re-index the filtered triples of a data set.

    The triples returned by ``filter_decoded_data_list`` are walked in order.
    Tokens and dependency paths receive consecutive integer ids on first
    appearance (the placeholder path ``'-***-'`` is pre-assigned id 0), and
    every occurrence is counted.

    Args:
        data_set_name: Passed through to ``filter_decoded_data_list``.
        removed_node_list: Passed through to ``filter_decoded_data_list``.

    Returns:
        A dict with the keys ``token_id_map``, ``token_frequecy_map``,
        ``dep_id_map``, ``dep_frequecy_map`` and ``sentence_triples``; the
        last one holds, per sentence, a list of
        ``(src_token_id, dst_token_id, dep_id)`` tuples.
    """
    token_ids = {}
    token_counts = {}
    dep_ids = {'-***-': 0}
    dep_counts = {}
    all_sentences = []

    for sentence in filter_decoded_data_list(data_set_name, removed_node_list):
        encoded = []
        for entry in sentence:
            src, dst, path = entry['srcT'], entry['desT'], entry['path']

            # Source first, then destination: this fixes the id order.
            for token in (src, dst):
                token_ids.setdefault(token, len(token_ids))
                token_counts[token] = token_counts.get(token, 0) + 1

            dep_ids.setdefault(path, len(dep_ids))
            dep_counts[path] = dep_counts.get(path, 0) + 1

            encoded.append((token_ids[src], token_ids[dst], dep_ids[path]))

        all_sentences.append(encoded)

    return dict(
        token_id_map=token_ids,
        token_frequecy_map=token_counts,
        dep_id_map=dep_ids,
        dep_frequecy_map=dep_counts,
        sentence_triples=all_sentences,
    )

In [15]:
# Preview the decoded 'juliet' triples; every path that contains a
# 'specifier' node is shown as the placeholder '-***-'.
filter_decoded_data_list('juliet', ['specifier'])

100%|██████████| 118/118 [00:00<00:00, 4649.00it/s]


[[{'srcT': 'public', 'desT': 'void', 'path': '-***-'},
  {'srcT': 'public', 'desT': 'action', 'path': '-***-'},
  {'srcT': 'public', 'desT': 'throws', 'path': '-***-'},
  {'srcT': 'void', 'desT': 'public', 'path': '-***-'},
  {'srcT': 'void', 'desT': 'action', 'path': 'name↑type↑-function-name↓'},
  {'srcT': 'void', 'desT': 'throws', 'path': 'name↑type↑-function-throws↓'},
  {'srcT': 'action', 'desT': 'public', 'path': '-***-'},
  {'srcT': 'action', 'desT': 'void', 'path': 'name↑-function-type↓name↓'},
  {'srcT': 'action', 'desT': 'throws', 'path': 'name↑-function-throws↓'},
  {'srcT': 'string', 'desT': 'data', 'path': 'name↑type↑-decl-name↓'},
  {'srcT': 'data', 'desT': 'string', 'path': 'name↑-decl-type↓name↓'},
  {'srcT': 'throws', 'desT': 'public', 'path': '-***-'},
  {'srcT': 'throws', 'desT': 'void', 'path': 'throws↑-function-type↓name↓'},
  {'srcT': 'throws', 'desT': 'action', 'path': 'throws↑-function-name↓'},
  {'srcT': 'throws',
   'desT': 'throwable',
   'path': '-throws-arg

In [16]:
# Replace every dependency path that contains a 'specifier' node with the
# placeholder '-***-', then build the id / frequency maps from the result.
rs_obj = get_rs_obj_from_filtered_data_list('juliet', ['specifier'])

100%|██████████| 118/118 [00:00<00:00, 4114.80it/s]


In [17]:
def outputData(data_set_name, rs_obj):
    """Write a re-indexed data set to ``srcxml/<data_set_name>_out``.

    The directory (including missing parents) is created when needed and
    five files are written, each line terminated by ``\\r\\n``:

    * ``voc2id.txt``  -- ``<token>\\t<id>``
    * ``id2freq.txt`` -- ``<token id>\\t<frequency>``
    * ``de2id.txt``   -- ``<dep path>\\t<id>``
    * ``de2freq.txt`` -- ``<frequency>\\t\\t<dep path>``
    * ``data.txt``    -- per sentence:
      ``<num tokens> <num triples> <token ids...> <src|dst|dep ...>`` where
      ``src`` / ``dst`` are positions in that line's token-id list.

    Args:
        data_set_name: Prefix of the output directory name.
        rs_obj: Dict with the keys ``token_id_map``, ``token_frequecy_map``,
            ``dep_id_map``, ``dep_frequecy_map`` and ``sentence_triples``.
    """
    out_dir = os.path.join('srcxml', f'{data_set_name}_out')
    # makedirs also creates a missing 'srcxml' parent, which os.mkdir cannot.
    os.makedirs(out_dir, exist_ok=True)

    def open_for_write(file_name):
        # utf-8: dependency paths contain non-ASCII arrows.
        # newline='': keep the explicit '\r\n' untouched on every platform
        # (text-mode translation would turn it into '\r\r\n' on Windows).
        return open(os.path.join(out_dir, file_name), 'w',
                    encoding='utf-8', newline='')

    with open_for_write('voc2id.txt') as f:
        for _tk, _id in rs_obj['token_id_map'].items():
            f.write(f'{_tk}\t{_id}\r\n')

    with open_for_write('id2freq.txt') as f:
        for _tk, _feq in rs_obj['token_frequecy_map'].items():
            _id = rs_obj['token_id_map'][_tk]
            f.write(f'{_id}\t{_feq}\r\n')

    with open_for_write('de2id.txt') as f:
        for _dep, _id in rs_obj['dep_id_map'].items():
            f.write(f'{_dep}\t{_id}\r\n')

    with open_for_write('de2freq.txt') as f:
        for _dep, _feq in rs_obj['dep_frequecy_map'].items():
            f.write(f'{_feq}\t\t{_dep}\r\n')

    with open_for_write('data.txt') as f:
        for sentence_triples in rs_obj['sentence_triples']:
            # Distinct token ids used by this sentence.
            tk_set = set()
            for srcT_id, desT_id, _dep_path_id in sentence_triples:
                tk_set.add(srcT_id)
                tk_set.add(desT_id)

            tk_id_list = list(tk_set)
            # One dict lookup per token instead of a linear list.index scan.
            position = {tk_id: pos for pos, tk_id in enumerate(tk_id_list)}

            dep_path_list = [
                f"{position[srcT_id]}|{position[desT_id]}|{dep_path_id}"
                for srcT_id, desT_id, dep_path_id in sentence_triples
            ]

            f.write(f'{len(tk_id_list)} ')
            f.write(f'{len(dep_path_list)} ')
            f.write(' '.join(str(_id) for _id in tk_id_list))
            f.write(' ')
            f.write(' '.join(dep_path_list))
            f.write('\r\n')
        

In [18]:
# Write the re-indexed data to srcxml/juliet_new_out.
outputData('juliet_new', rs_obj)