In [37]:
import json
import glob
import os
import nltk
from random import sample, shuffle, seed

In [38]:
caveat_files_dir = './output/java_12_spec_caveat_sentences/'
caveats_dict = {}

files = glob.glob(caveat_files_dir + '*.json')
for file in files:
    with open(file) as f:
        arr = json.load(f)
        for caveat in arr:
            caveats_dict[caveat['id']] = caveat

In [39]:
# Print statistics about the caveats found using keywords

caveat_type_count = {} # map caveat types to their counts
print("Number of classes: {}".format(len(files)))

class_level_sentences = 0
caveat_sentences = 0
caveat_misc_objects = 0
deprecated = 0
num_caveats = len(caveats_dict)

for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if "class_level_caveat_sentences" in caveat:
        class_level_sentences += len(caveat["class_level_caveat_sentences"])
    else:
        caveat_sentences += len(caveat['caveat_sentences'])
        caveat_misc_objects += len(caveat['caveat_misc'])

    if caveat['deprecated']:
        deprecated += 1
    
    if 'type' in caveat:
        if caveat['type'] == '':
            print(caveat)
            break
        
        if not caveat['type'] in caveat_type_count:
            caveat_type_count[caveat['type']] = 1
        else:
            caveat_type_count[caveat['type']] += 1

print("Number of caveats: {}".format(num_caveats))
print("Number of class level caveat sentences: {}".format(class_level_sentences))
print("Number of api level caveat sentences: {}".format(caveat_sentences))  
print("Number of misc caveat objects: {}".format(caveat_misc_objects))
print("Number of deprecated caveats: {}".format(deprecated))
print(caveat_type_count)

Number of classes: 4712
Number of caveats: 49023
Number of class level caveat sentences: 10526
Number of api level caveat sentences: 38270
Number of misc caveat objects: 26426
Number of deprecated caveats: 1522
{'constructor': 4172, 'method': 33827, 'field': 6403}


In [40]:
def write_to_doccano_import_file(file, arr):
    """ Write the dict objects within an input array to a json file in doccano import format """
    with open(file, 'w+') as f:
        for obj in arr:
            f.write(json.dumps(obj) + '\n')

In [41]:
caveat_sentences_array = []
sentence_id_to_caveat_id = {}

for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if caveat['deprecated']:
        continue
 
    if 'class_level_caveat_sentences' in caveat:
        for sentence in caveat['class_level_caveat_sentences']:
            sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
            caveat_sentences_array.append({
                'text': sentence, 
                 'id': len(caveat_sentences_array),
                 'type': 'class'
            })
    else:
        for sentence in caveat['caveat_sentences']:
            sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
            caveat_sentences_array.append({
                'text': sentence, 
                 'id': len(caveat_sentences_array),
                 'type': caveat['type']
            })

            # add sentences for throws, returns etc...
            if caveat['type'] in ['method', 'constructor']:
                for misc in caveat['caveat_misc']:
                    for sentence in misc['text_list']:
                        sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                        caveat_sentences_array.append({
                            'text': sentence, 
                             'id': len(caveat_sentences_array),
                            'type': 'misc@' + misc['name'] 
                        })

class_sentences = [o for o in caveat_sentences_array if o['type'] == 'class']
method_sentences = [o for o in caveat_sentences_array if o['type'] == 'method']
field_sentences = [o for o in caveat_sentences_array if o['type'] == 'field']
constructor_sentences = [o for o in caveat_sentences_array if o['type'] == 'constructor']
misc_sentences = [o for o in caveat_sentences_array if 'misc' in o['type']]

print("Number of class sentences: {}".format(len(class_sentences)))
print("Number of method sentences: {}".format(len(method_sentences)))
print("Number of field sentences: {}".format(len(field_sentences)))
print("Number of constructor sentences: {}".format(len(constructor_sentences)))
print("Number of misc sentences: {}".format(len(misc_sentences)))

Number of class sentences: 10364
Number of method sentences: 33398
Number of field sentences: 1845
Number of constructor sentences: 2189
Number of misc sentences: 56385


In [42]:
def get_unique_text_objs(arr):
    seen_text = set()
    filtered = []
    for obj in arr:
        if not obj['text'] in seen_text:
            filtered.append(obj)
            seen_text.add(obj['text'])
        
    return filtered

seed(42)

filtered_class_sentences = get_unique_text_objs(class_sentences)
filtered_method_sentences = get_unique_text_objs(method_sentences)
filtered_field_sentences = get_unique_text_objs(field_sentences)
filtered_constructor_sentences = get_unique_text_objs(constructor_sentences)
filtered_misc_sentences = get_unique_text_objs(misc_sentences)

shuffle(filtered_class_sentences)
shuffle(filtered_method_sentences)
shuffle(filtered_field_sentences)
shuffle(filtered_constructor_sentences)
shuffle(filtered_misc_sentences)

print("Number of unique class sentences: {}".format(len(filtered_class_sentences)))
print("Number of unique method sentences: {}".format(len(filtered_method_sentences)))
print("Number of unique field sentences: {}".format(len(filtered_field_sentences)))
print("Number of unique constructor sentences: {}".format(len(filtered_constructor_sentences)))
print("Number of unique misc sentences: {}".format(len(filtered_misc_sentences)))

Number of unique class sentences: 9465
Number of unique method sentences: 22531
Number of unique field sentences: 1581
Number of unique constructor sentences: 1307
Number of unique misc sentences: 13369


In [43]:
# write data to annotate to different files

write_to_doccano_import_file('./output/class_sentences_doccano.jsonl', filtered_class_sentences)
write_to_doccano_import_file('./output/method_sentences_doccano.jsonl', filtered_method_sentences)
write_to_doccano_import_file('./output/field_sentences_doccano.jsonl', filtered_field_sentences)
write_to_doccano_import_file('./output/constructor_sentences_doccano.jsonl', filtered_constructor_sentences)
write_to_doccano_import_file('./output/misc_sentences_doccano.jsonl', filtered_misc_sentences)