In [30]:
import json
import glob
import os
import nltk
from random import sample 

In [18]:
caveat_files_dir = './output/java_12_spec_caveat_sentences/'
caveats_dict = {}

files = glob.glob(caveat_files_dir + '*.json')
for file in files:
    with open(file) as f:
        arr = json.load(f)
        for caveat in arr:
            caveats_dict[caveat['id']] = caveat

In [24]:
# Print statistics about the caveats found using keywords

caveat_type_count = {} # map caveat types to their counts
print("Number of classes: {}".format(len(files)))

class_level_sentences = 0
caveat_sentences = 0
caveat_misc_objects = 0
deprecated = 0
num_caveats = len(caveats_dict)

for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if "class_level_caveat_sentences" in caveat:
        class_level_sentences += len(caveat["class_level_caveat_sentences"])
    else:
        caveat_sentences += len(caveat['caveat_sentences'])
        caveat_misc_objects += len(caveat['caveat_misc'])

    if caveat['deprecated']:
        deprecated += 1
    
    if 'type' in caveat:
        if not caveat['type'] in caveat_type_count:
            caveat_type_count[caveat['type']] = 1
        else:
            caveat_type_count[caveat['type']] += 1

print("Number of caveats: {}".format(num_caveats))
print("Number of class level caveat sentences: {}".format(class_level_sentences))
print("Number of api level caveat sentences: {}".format(caveat_sentences))  
print("Number of misc caveat objects: {}".format(caveat_misc_objects))
print("Number of deprecated caveats: {}".format(deprecated))
print(caveat_type_count)

Number of classes: 4712
Number of caveats: 49114
Number of class level caveat sentences: 10552
Number of api level caveat sentences: 38382
Number of misc caveat objects: 26490
Number of deprecated caveats: 1522
{'constructor': 4172, 'method': 33827, 'field': 6403}


In [13]:
def write_to_doccano_import_file(file, arr):
    """ Write the dict objects within an input array to a json file in doccano import format """
    with open(file, 'w+') as f:
        for obj in arr:
            f.write(json.dumps(obj) + '\n')

In [28]:
caveat_sentences_array = []
sentence_id_to_caveat_id = {}

for key in caveats_dict:
    caveat = caveats_dict[key]
 
    if 'class_level_caveat_sentences' in caveat:
        for sentence in caveat['class_level_caveat_sentences']:
            sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
            caveat_sentences_array.append({
                'text': sentence, 
                 'id': len(caveat_sentences_array),
                 'type': 'class'
            })
    else:
        for sentence in caveat['caveat_sentences']:
            sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
            caveat_sentences_array.append({
                'text': sentence, 
                 'id': len(caveat_sentences_array),
                 'type': caveat['type']
            })

            # add sentences for throws, returns etc...
            if caveat['type'] in ['method', 'constructor']:
                for misc in caveat['caveat_misc']:
                    for sentence in misc['text']:
                        sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                        caveat_sentences_array.append({
                            'text': sentence, 
                             'id': len(caveat_sentences_array),
                            'type': 'misc@' + misc['misc_name'] 
                        })

print("Number of method sentences: {}".format(
    len([o for o in caveat_sentences_array if o['type'] == 'method'])))
print("Number of field sentences: {}".format(
    len([o for o in caveat_sentences_array if o['type'] == 'field'])))
print("Number of constructor sentences: {}".format(
    len([o for o in caveat_sentences_array if o['type'] == 'constructor'])))
print("Number of misc sentences: {}".format(
    len([o for o in caveat_sentences_array if 'misc' in o['type']])))

Number of method sentences: 34183
Number of field sentences: 1930
Number of constructor sentences: 2269
Number of misc sentences: 57454


In [36]:
# write data to annotate to file
non_misc_sentences = [obj for obj in caveat_sentences_array if 'misc' not in obj['type']]
misc_sentences = [obj for obj in caveat_sentences_array if 'misc' in obj['type']]

num_non_misc_to_annotate = 500
num_misc_to_annotate = 500

print(num_non_misc_to_annotate)
print(num_misc_to_annotate)

data_to_annotate = sample(non_misc_sentences, num_non_misc_to_annotate) + sample(misc_sentences, num_misc_to_annotate)
write_to_doccano_import_file('./output/caveat_sentences_doccano.jsonl', data_to_annotate)

500
500
