In [106]:
import ujson
import glob
import os
import nltk
import re
from random import sample, shuffle, seed

In [114]:
# load the java document caveats
caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
caveats_dict = {}
class_to_caveat_ids = {}

files = glob.glob(caveat_files_dir + '*.json')
for file in files:
    with open(file) as f:
        arr = ujson.load(f)
        full_class_name = os.path.splitext(os.path.basename(file))[0]
        simple_class_name = full_class_name.split('.')[-1]
        for caveat in arr:
            caveats_dict[caveat['id']] = caveat
            
            if simple_class_name not in class_to_caveat_ids:
                class_to_caveat_ids[simple_class_name] = []
            class_to_caveat_ids[simple_class_name].append(caveat['id'])

In [115]:
# Print statistics about the caveats found using keywords

caveat_type_count = {} # map caveat types to their counts

class_level_sentences = 0
caveat_sentences = 0
caveat_misc_sentences = 0
deprecated = 0
num_caveats = len(caveats_dict)
sent_deprecated = 0

for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if caveat['deprecated']:
        if "class_caveat_sentences" in caveat:
            sent_deprecated += len(caveat["class_caveat_sentences"])
        else:        
            sent_deprecated += len(caveat['sentences'])
            for misc in caveat['caveat_misc']:
                if 'parameters' in misc:
                    for obj in misc['parameters']:
                        sent_deprecated += len(obj['sentences'])
                elif 'exceptions' in misc:
                    for obj in misc['exceptions']:
                        sent_deprecated += len(obj['sentences'])
                else:
                     sent_deprecated += len(misc['list'])

    if "class_caveat_sentences" in caveat:
        class_level_sentences += len(caveat["class_caveat_sentences"])
    else:        
        caveat_sentences += len(caveat['sentences'])
        for misc in caveat['caveat_misc']:
            if 'parameters' in misc:
                for obj in misc['parameters']:
                    caveat_misc_sentences += len(obj['sentences'])
            elif 'exceptions' in misc:
                for obj in misc['exceptions']:
                    caveat_misc_sentences += len(obj['sentences'])
            else:
                 caveat_misc_sentences += len(misc['list'])

    if caveat['deprecated']:
        deprecated += 1
    
    if 'type' in caveat:        
        if not caveat['type'] in caveat_type_count:
            caveat_type_count[caveat['type']] = 1
        else:
            caveat_type_count[caveat['type']] += 1

print("-------------Overall Statistics-------------")
print("Number of classes: {}".format(len(files)))
print("Number of class level caveat sentences: {}".format(class_level_sentences))
print("Number of api level caveat sentences: {}".format(caveat_sentences))  
print("Number of misc sentences: {}".format(caveat_misc_sentences))
print("Number of deprecated caveats: {}".format(deprecated))
print("Number of sentences for deprecated caveats: {}".format(sent_deprecated))
print(caveat_type_count)
print(sum([caveat_type_count[k] for k in caveat_type_count]))
print("Total sentences: {}".format(class_level_sentences + caveat_sentences + caveat_misc_sentences))

-------------Overall Statistics-------------
Number of classes: 4712
Number of class level caveat sentences: 9964
Number of api level caveat sentences: 37578
Number of misc sentences: 67701
Number of deprecated caveats: 1522
Number of sentences for deprecated caveats: 2562
{'constructor': 4172, 'method': 33827, 'field': 6403}
44402
Total sentences: 115243


In [116]:
def write_to_doccano_import_file(file, arr):
    """ Write the dict objects within an input array to a json file in doccano import format """
    with open(file, 'w+') as f:
        for obj in arr:
            f.write(ujson.dumps(obj) + '\n')

In [130]:
caveat_sentences_array = []
sentence_id_to_caveat_id = {}
c = 0

for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if caveat['deprecated']:
        continue
 
    if 'class_caveat_sentences' in caveat:
        for sentence in caveat['class_caveat_sentences']:
            if len(sentence) > 0:
                sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                if '// Encode a String into bytes String inputString = "blahblahblah"' in sentence:
                    print(caveat)
                caveat_sentences_array.append({
                    'text': sentence, 
                     'id': len(caveat_sentences_array),
                     'type': 'class'
                })
    else:
        for sentence in caveat['sentences']:
            if len(sentence) > 0:
                sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                caveat_sentences_array.append({
                    'text': sentence, 
                     'id': len(caveat_sentences_array),
                     'type': caveat['type']
                })

            # add sentences for throws, returns etc...
            if caveat['type'] in ['method', 'constructor']:
                for misc in caveat['caveat_misc']:
                    for misc_obj in misc['list']:
                        if type(misc_obj) == str:
                            sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                            caveat_sentences_array.append({
                                'text': misc_obj, 
                                 'id': len(caveat_sentences_array),
                                'type': 'misc@' + misc['name']})
                        else: # parameters and exception objects
                            for misc_sentence in misc_obj['sentences']:
                                sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                                caveat_sentences_array.append({
                                    'text': misc_sentence, 
                                     'id': len(caveat_sentences_array),
                                    'type': 'misc@' + misc['name']})

# restrict text length
# class_sentences = [o for o in caveat_sentences_array if o['type'] == 'class' and len(o['text']) <= 400]
# method_sentences = [o for o in caveat_sentences_array if o['type'] == 'method' and len(o['text']) <= 400]
# field_sentences = [o for o in caveat_sentences_array if o['type'] == 'field' and len(o['text']) <= 400]
# constructor_sentences = [o for o in caveat_sentences_array if o['type'] == 'constructor' and len(o['text']) <= 400]
# misc_sentences = [o for o in caveat_sentences_array if 'misc' in o['type'] and len(o['text']) <= 400]

class_sentences = [o for o in caveat_sentences_array if o['type'] == 'class']
method_sentences = [o for o in caveat_sentences_array if o['type'] == 'method']
field_sentences = [o for o in caveat_sentences_array if o['type'] == 'field']
constructor_sentences = [o for o in caveat_sentences_array if o['type'] == 'constructor']
misc_sentences = [o for o in caveat_sentences_array if 'misc' in o['type']]

print("-------------Filtered Sentence Statistics-------------")
print("Number of class sentences: {}".format(len(class_sentences)))
print("Number of method sentences: {}".format(len(method_sentences)))
print("Number of field sentences: {}".format(len(field_sentences)))
print("Number of constructor sentences: {}".format(len(constructor_sentences)))

exception_sentences = [o for o in misc_sentences if 'Throws:' == o['type'].split('@')[1]]
return_sentences = [o for o in misc_sentences if 'Returns:' == o['type'].split('@')[1]]
parameter_sentences = [o for o in misc_sentences if 'Parameters:' == o['type'].split('@')[1]]

print('Number of exception sentences: {}'.format(len(exception_sentences)))
print('Number of return sentences: {}'.format(len(return_sentences)))
print('Number of parameter sentences: {}'.format(len(parameter_sentences)))
print('Total: {}'.format(len(caveat_sentences_array)))


{'class_caveat_sentences': ['try { // Encode a String into bytes String inputString = "blahblahblah"; byte[] input = inputString.getBytes("UTF-8"); // Compress the bytes byte[] output = new byte[100]; Deflater compresser = new Deflater(); compresser.setInput(input); compresser.finish(); int compressedDataLength = compresser.deflate(output); compresser.end(); // Decompress the bytes Inflater decompresser = new Inflater(); decompresser.setInput(output, 0, compressedDataLength); byte[] result = new byte[100]; int resultLength = decompresser.inflate(result); decompresser.end(); // Decode the bytes into a String String outputString = new String(result, 0, resultLength, "UTF-8"); } catch (java.io.UnsupportedEncodingException ex) { // handle } catch (java.util.zip.DataFormatException ex) { // handle }'], 'deprecated': False, 'id': 28877}
-------------Filtered Sentence Statistics-------------
Number of class sentences: 9819
Number of method sentences: 32190
Number of field sentences: 1779
Numb

In [118]:
a = [o for o in caveat_sentences_array if o['type'] == 'class' and len(o['text']) > 400]
print(a[10])

{'text': "Examples: Creating and using text boundaries: public static void main(String args[]) { if (args.length == 1) { String stringToExamine = args[0]; //print each word in order BreakIterator boundary = BreakIterator.getWordInstance(); boundary.setText(stringToExamine); printEachForward(boundary, stringToExamine); //print each sentence in reverse order boundary = BreakIterator.getSentenceInstance(Locale.US); boundary.setText(stringToExamine); printEachBackward(boundary, stringToExamine); printFirst(boundary, stringToExamine); printLast(boundary, stringToExamine); } } Print each element in order: public static void printEachForward(BreakIterator boundary, String source) { int start = boundary.first(); for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) { System.out.println(source.substring(start,end)); } } Print each element in reverse order: public static void printEachBackward(BreakIterator boundary, String source) { int end = boundary.la

In [119]:
sent_id = 3619
print(sentence_id_to_caveat_id[sent_id])

for k in class_to_caveat_ids:
    if caveats_dict[sentence_id_to_caveat_id[sent_id]]['id'] in class_to_caveat_ids[k]:
        print(k)
        break

7645
BreakIterator


In [102]:
def get_unique_text_objs(arr):
    seen_text = set()
    filtered = []
    for obj in arr:
        if not obj['text'] in seen_text:
            filtered.append(obj)
            seen_text.add(obj['text'])
        
    return filtered

seed(42)


filtered_class_sentences = get_unique_text_objs(class_sentences)
filtered_method_sentences = get_unique_text_objs(method_sentences)
filtered_field_sentences = get_unique_text_objs(field_sentences)
filtered_constructor_sentences = get_unique_text_objs(constructor_sentences)
# misc sentences
filtered_exception_sentences = get_unique_text_objs(exception_sentences)
filtered_return_sentences = get_unique_text_objs(return_sentences)
filtered_parameter_sentences = get_unique_text_objs(parameter_sentences)

print("-------------Unique Filtered Sentence Statistics-------------")
print("Number of unique class sentences: {}".format(len(filtered_class_sentences)))
print("Number of unique method sentences: {}".format(len(filtered_method_sentences)))
print("Number of unique field sentences: {}".format(len(filtered_field_sentences)))
print("Number of unique constructor sentences: {}".format(len(filtered_constructor_sentences)))
print('Number of unique exception sentences: {}'.format(len(filtered_exception_sentences)))
print('Number of unique return sentences: {}'.format(len(filtered_return_sentences)))
print('Number of unique parameter sentences: {}'.format(len(filtered_parameter_sentences)))

# # sample 384 of each list for 95% confidence interval with 5% error margin
# filtered_class_sentences = sample(filtered_class_sentences, 384)
# filtered_method_sentences = sample(filtered_method_sentences, 384)
# filtered_field_sentences = sample(filtered_field_sentences, 384)
# filtered_constructor_sentences = sample(filtered_constructor_sentences, 384)
# # misc sentences
# filtered_exception_sentences = sample(filtered_exception_sentences, 384)
# filtered_return_sentences = sample(filtered_return_sentences, 384)
# filtered_parameter_sentences = sample(filtered_parameter_sentences, 384)

-------------Unique Filtered Sentence Statistics-------------
Number of unique class sentences: 8718
Number of unique method sentences: 21984
Number of unique field sentences: 1535
Number of unique constructor sentences: 1958
Number of unique exception sentences: 4852
Number of unique return sentences: 4306
Number of unique parameter sentences: 2653


In [107]:
# write data to annotate to different files
shuffle(filtered_exception_sentences)
shuffle(filtered_parameter_sentences)
write_to_doccano_import_file('./output/exception_sentences_doccano.jsonl', filtered_exception_sentences)
write_to_doccano_import_file('./output/parameter_sentences_doccano.jsonl', filtered_parameter_sentences)

In [120]:
# modify labelled data from doccano
def get_modified_doccano_data(labelled_file, metadata):
    with open(labelled_file) as f, open(metadata) as metadata_f:
        id_to_label = {}
        arr = ujson.load(metadata_f)
        for e in arr:
            id_to_label[e['id']] = e['text']

        data = []
        for line in f:
            line = line.strip()
            obj = ujson.loads(line)
            obj['labels'] = [id_to_label[i['label']] for i in obj['annotations']]
            obj.pop('annotations', None)
            obj.pop('meta', None)
            obj.pop('annotation_approver', None)
            data.append(obj)
    
        return data

In [121]:
def get_labelled_data(file):
    with open(file) as f:
        return ujson.load(f)
    
def get_label_counts(arr):
    counts = {}
    
    for e in arr:
        for label in e['labels']:
            if not label in counts:
                counts[label] = 1
            else:
                counts[label] += 1
                
    return counts

def write_modified_doccano_data(doccano_file, metadata_file, output_file):
    with open(output_file, 'w+') as f:
        arr = get_modified_doccano_data(doccano_file, metadata_file)
        for obj in arr:
            f.write(ujson.dumps(obj) + '\n')

In [122]:
write_modified_doccano_data('./output/labelled/new_labelled_param.jsonl' ,
                           './output/labelled/new_param_labels.json',
                           './labelled_data/new_parameter_full.jsonl')

write_modified_doccano_data('./output/labelled/new_labelled_exception.jsonl' ,
                           './output/labelled/new_except_labels.json',
                           './labelled_data/new_exception_full.jsonl')

In [133]:
# load all labelled sentences
constructor_sents = get_labelled_data('./labelled_data/labelled_constructor_sentences.json')[:321]
# exception_sents = get_labelled_data('./labelled_exception_sentences.json')[:]
method_sents = get_labelled_data('./labelled_data/labelled_method_sentences.json')[:377]
return_sents = get_labelled_data('./labelled_data/labelled_return_sentences.json')[:353]
parameter_sents = get_labelled_data('./labelled_data/labelled_parameter_sentences.json')[:336]

print(get_label_counts(constructor_sents))
# print(get_label_counts(exception_sents))
print(get_label_counts(method_sents))
print(get_label_counts(return_sents))
print(get_label_counts(parameter_sents))

{'Exception': 21, 'Guard': 32, 'Other': 264, 'Temporal': 6}
{'Other': 360, 'Temporal': 4, 'Exception': 9, 'Guard': 5}
{'Other': 351, 'Temporal': 1}
{'Other': 257, 'Temporal': 2, 'Guard': 75, 'Exception': 2}


In [23]:
# POS and dependency parsing
from nltk.parse.corenlp import CoreNLPParser
parser = CoreNLPParser(url='http://localhost:9010')
next(parser.raw_parse('The quick brown fox sucks at jumping.'))

                ROOT                          
                 |                             
                 S                            
       __________|__________________________   
      |                     VP              | 
      |                 ____|___            |  
      |                |        PP          | 
      |                |     ___|_____      |  
      |                |    |         S     | 
      |                |    |         |     |  
      NP               |    |         VP    | 
  ____|__________      |    |         |     |  
 DT   JJ    JJ   NN   VBZ   IN       VBG    . 
 |    |     |    |     |    |         |     |  
The quick brown fox  sucks  at     jumping  . 



In [124]:
def get_label_counts_doccano_file(path):
    counts = {}
    with open(path) as f:
        for line in f:
            obj = ujson.loads(line)
            if len(obj['labels']) > 0:
                for label in obj['labels']:
                    if label not in counts:
                        counts[label] = 0
                    counts[label] += 1
                    
    return counts

In [126]:
print(get_label_counts_doccano_file('./labelled_data/new_parameter_full.jsonl'))
print(get_label_counts_doccano_file('./labelled_data/new_exception_full.jsonl'))

{'ambiguous': 291, 'not null': 26, 'range limitation': 19}
{'ambiguous': 242, 'not null': 73, 'range limitation': 46, 'type restriction': 1}


In [128]:
# calculate how many exception sentences have more than 1 label
with open('./labelled_data/new_exception_full.jsonl') as f:
    c = 0
    for line in f:
        obj = ujson.loads(line)
        if len(obj['labels']) > 1:
            c += 1
    print(c)

6


In [30]:
with open('./repos.txt') as f:
    s = set()
    for line in f:
        s.add(line)
        
    print(len(s))

291152
