In [42]:
import json
import glob
import os
import nltk
from random import sample, shuffle, seed

In [43]:
caveat_files_dir = './output/java_12_spec_caveat_sentences/'
caveats_dict = {}

files = glob.glob(caveat_files_dir + '*.json')
for file in files:
    with open(file) as f:
        arr = json.load(f)
        for caveat in arr:
            caveats_dict[caveat['id']] = caveat

In [44]:
# Print statistics about the caveats found using keywords

caveat_type_count = {} # map caveat types to their counts
print("Number of classes: {}".format(len(files)))

class_level_sentences = 0
caveat_sentences = 0
caveat_misc_objects = 0
deprecated = 0
num_caveats = len(caveats_dict)

for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if "class_level_caveat_sentences" in caveat:
        class_level_sentences += len(caveat["class_level_caveat_sentences"])
    else:
        caveat_sentences += len(caveat['caveat_sentences'])
        caveat_misc_objects += len(caveat['caveat_misc'])

    if caveat['deprecated']:
        deprecated += 1
    
    if 'type' in caveat:
        if caveat['type'] == '':
            print(caveat)
            break
        
        if not caveat['type'] in caveat_type_count:
            caveat_type_count[caveat['type']] = 1
        else:
            caveat_type_count[caveat['type']] += 1

print("Number of caveats: {}".format(num_caveats))
print("Number of class level caveat sentences: {}".format(class_level_sentences))
print("Number of api level caveat sentences: {}".format(caveat_sentences))  
print("Number of misc caveat objects: {}".format(caveat_misc_objects))
print("Number of deprecated caveats: {}".format(deprecated))
print(caveat_type_count)

Number of classes: 4712
Number of caveats: 49023
Number of class level caveat sentences: 10589
Number of api level caveat sentences: 38270
Number of misc caveat objects: 26426
Number of deprecated caveats: 1522
{'constructor': 4172, 'method': 33827, 'field': 6403}


In [45]:
def write_to_doccano_import_file(file, arr):
    """ Write the dict objects within an input array to a json file in doccano import format """
    with open(file, 'w+') as f:
        for obj in arr:
            f.write(json.dumps(obj) + '\n')

In [109]:
caveat_sentences_array = []
sentence_id_to_caveat_id = {}

for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if caveat['deprecated']:
        continue
 
    if 'class_level_caveat_sentences' in caveat:
        for sentence in caveat['class_level_caveat_sentences']:
            if len(sentence) > 0:
                sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                caveat_sentences_array.append({
                    'text': sentence, 
                     'id': len(caveat_sentences_array),
                     'type': 'class'
                })
    else:
        for sentence in caveat['caveat_sentences']:
            if len(sentence) > 0:
                sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                caveat_sentences_array.append({
                    'text': sentence, 
                     'id': len(caveat_sentences_array),
                     'type': caveat['type']
                })

            # add sentences for throws, returns etc...
            if caveat['type'] in ['method', 'constructor']:
                for misc in caveat['caveat_misc']:
                    for misc_sentence in misc['text_list']:
                        if len(misc_sentence) > 0:
                            sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                            caveat_sentences_array.append({
                                'text': misc_sentence, 
                                 'id': len(caveat_sentences_array),
                                'type': 'misc@' + misc['name']
                            })

class_sentences = [o for o in caveat_sentences_array if o['type'] == 'class' and len(o['text']) <= 400]
method_sentences = [o for o in caveat_sentences_array if o['type'] == 'method' and len(o['text']) <= 400]
field_sentences = [o for o in caveat_sentences_array if o['type'] == 'field' and len(o['text']) <= 400]
constructor_sentences = [o for o in caveat_sentences_array if o['type'] == 'constructor' and len(o['text']) <= 400]
misc_sentences = [o for o in caveat_sentences_array if 'misc' in o['type'] and len(o['text']) <= 400]
        
print("Number of class sentences: {}".format(len(class_sentences)))
print("Number of method sentences: {}".format(len(method_sentences)))
print("Number of field sentences: {}".format(len(field_sentences)))
print("Number of constructor sentences: {}".format(len(constructor_sentences)))

exception_sentences = [o for o in misc_sentences if 'Throws:' == o['type'].split('@')[1]]
return_sentences = [o for o in misc_sentences if 'Returns:' == o['type'].split('@')[1]]
parameter_sentences = [o for o in misc_sentences if 'Parameters:' == o['type'].split('@')[1]]

print('-----------------------------------')
print('Number of exception sentences: {}'.format(len(exception_sentences)))
print('Number of return sentences: {}'.format(len(return_sentences)))
print('Number of parameter sentences: {}'.format(len(parameter_sentences)))


Number of class sentences: 10125
Number of method sentences: 33155
Number of field sentences: 1844
Number of constructor sentences: 2184
Number of misc sentences: 55216
{'text': 'the approximate accumulated elapsed time in milliseconds that a thread entered the FIELD_0 state; -1 if thread contention monitoring is disabled.', 'id': 5, 'type': 'misc@Returns:'}
-----------------------------------
Number of exception sentences: 31679
Number of return sentences: 11279
Number of parameter sentences: 11376


In [110]:
def get_unique_text_objs(arr):
    seen_text = set()
    filtered = []
    for obj in arr:
        if not obj['text'] in seen_text:
            filtered.append(obj)
            seen_text.add(obj['text'])
        
    return filtered

seed(42)

filtered_class_sentences = get_unique_text_objs(class_sentences)
filtered_method_sentences = get_unique_text_objs(method_sentences)
filtered_field_sentences = get_unique_text_objs(field_sentences)
filtered_constructor_sentences = get_unique_text_objs(constructor_sentences)
# misc sentences
filtered_exception_sentences = get_unique_text_objs(exception_sentences)
filtered_return_sentences = get_unique_text_objs(return_sentences)
filtered_parameter_sentences = get_unique_text_objs(parameter_sentences)

print("Number of unique class sentences: {}".format(len(filtered_class_sentences)))
print("Number of unique method sentences: {}".format(len(filtered_method_sentences)))
print("Number of unique field sentences: {}".format(len(filtered_field_sentences)))
print("Number of unique constructor sentences: {}".format(len(filtered_constructor_sentences)))
print('-----------------------------------')
print('Number of exception sentences: {}'.format(len(filtered_exception_sentences)))
print('Number of return sentences: {}'.format(len(filtered_return_sentences)))
print('Number of parameter sentences: {}'.format(len(filtered_parameter_sentences)))

# sample 384 of each list for 95% confidence interval with 5% error margin
filtered_class_sentences = sample(filtered_class_sentences, 384)
filtered_method_sentences = sample(filtered_method_sentences, 384)
filtered_field_sentences = sample(filtered_field_sentences, 384)
filtered_constructor_sentences = sample(filtered_constructor_sentences, 384)

Number of unique class sentences: 9089
Number of unique method sentences: 22309
Number of unique field sentences: 1580
Number of unique constructor sentences: 1302
-----------------------------------
Number of exception sentences: 6098
Number of return sentences: 4076
Number of parameter sentences: 2767


In [52]:
# write data to annotate to different files
write_to_doccano_import_file('./output/method_sentences_doccano.jsonl', filtered_method_sentences)
write_to_doccano_import_file('./output/constructor_sentences_doccano.jsonl', filtered_constructor_sentences)

In [93]:
# modify labelled data from doccano
def get_labelled_doccano_data(labelled_file, metadata):
    with open(labelled_file) as f, open(metadata) as metadata_f:
        id_to_label = {}
        arr = json.load(metadata_f)
        for e in arr:
            id_to_label[e['id']] = e['text']

        data = []
        for line in f:
            line = line.strip()
            obj = json.loads(line)
            obj['labels'] = [id_to_label[i['label']] for i in obj['annotations']]
            obj.pop('annotations', None)
            obj.pop('meta', None)
            obj.pop('annotation_approver', None)
            data.append(obj)
    
        return data

In [99]:
def get_labelled_data(file):
    with open(file) as f:
        return json.load(f)
    
def get_label_counts(arr):
    counts = {}
    
    for e in arr:
        for label in e['labels']:
            if not label in counts:
                counts[label] = 1
            else:
                counts[label] += 1
                
    return counts

In [120]:
labelled_methods_sents = get_labelled_data('labelled_method_sentences.json')
labelled_constructor_sents = get_labelled_data('labelled_constructor_sentences.json')

In [123]:
arr = get_labelled_doccano_data('./output/method_sentences_doccano.jsonl.jsonl', './output/method_metadata.json')
    
for e in arr:
    if 'Temporal' in e['labels']:
        e['labels'] = ['Other']

with open('labelled_method_sentences.json', 'w+') as f:
    json.dump(arr, f)

print(get_label_counts(labelled_methods_sents))
print(get_label_counts(labelled_constructor_sents))

FileNotFoundError: [Errno 2] No such file or directory: './output/method_sentences.jsonl'