In [6]:
import json
import glob
import os
import nltk
import re
from random import sample, shuffle, seed

In [2]:
caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
caveats_dict = {}

files = glob.glob(caveat_files_dir + '*.json')
for file in files:
    with open(file) as f:
        arr = json.load(f)
        for caveat in arr:
            caveats_dict[caveat['id']] = caveat

In [3]:
# Print statistics about the caveats found using keywords

caveat_type_count = {} # map caveat types to their counts
print("Number of classes: {}".format(len(files)))

class_level_sentences = 0
caveat_sentences = 0
caveat_misc_sentences = 0
deprecated = 0
num_caveats = len(caveats_dict)

for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if "class_level_caveat_sentences" in caveat:
        class_level_sentences += len(caveat["class_level_caveat_sentences"])
    else:
        caveat_sentences += len(caveat['caveat_sentences'])
        for misc in caveat['caveat_misc']:
            caveat_misc_sentences += len(misc['text_list'])

    if caveat['deprecated']:
        deprecated += 1
    
    if 'type' in caveat:        
        if not caveat['type'] in caveat_type_count:
            caveat_type_count[caveat['type']] = 1
        else:
            caveat_type_count[caveat['type']] += 1

print("Number of caveats: {}".format(num_caveats))
print("Number of class level caveat sentences: {}".format(class_level_sentences))
print("Number of api level caveat sentences: {}".format(caveat_sentences))  
print("Number of misc sentences: {}".format(caveat_misc_sentences))
print("Number of deprecated caveats: {}".format(deprecated))
print(caveat_type_count)

Number of classes: 4712
Number of caveats: 49023
Number of class level caveat sentences: 9784
Number of api level caveat sentences: 36447
Number of misc sentences: 35567
Number of deprecated caveats: 1522
{'constructor': 4172, 'method': 33827, 'field': 6403}


In [4]:
def write_to_doccano_import_file(file, arr):
    """ Write the dict objects within an input array to a json file in doccano import format """
    with open(file, 'w+') as f:
        for obj in arr:
            f.write(json.dumps(obj) + '\n')

In [12]:
caveat_sentences_array = []
sentence_id_to_caveat_id = {}
 
def normalize_sentence(sentence):
    

for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if caveat['deprecated']:
        continue
 
    if 'class_level_caveat_sentences' in caveat:
        for sentence in caveat['class_level_caveat_sentences']:
            if len(sentence) > 0:
                sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                caveat_sentences_array.append({
                    'text': sentence, 
                     'id': len(caveat_sentences_array),
                     'type': 'class'
                })
    else:
        for sentence in caveat['caveat_sentences']:
            if len(sentence) > 0:
                sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                caveat_sentences_array.append({
                    'text': sentence, 
                     'id': len(caveat_sentences_array),
                     'type': caveat['type']
                })

            # add sentences for throws, returns etc...
            if caveat['type'] in ['method', 'constructor']:
                for misc in caveat['caveat_misc']:
                    for misc_sentence in misc['text_list']:
                        if len(misc_sentence) > 0:
                            sentence_id_to_caveat_id[len(caveat_sentences_array)] = caveat['id']
                            caveat_sentences_array.append({
                                'text': misc_sentence, 
                                 'id': len(caveat_sentences_array),
                                'type': 'misc@' + misc['name']
                            })

class_sentences = [o for o in caveat_sentences_array if o['type'] == 'class' and len(o['text']) <= 400]
method_sentences = [o for o in caveat_sentences_array if o['type'] == 'method' and len(o['text']) <= 400]
field_sentences = [o for o in caveat_sentences_array if o['type'] == 'field' and len(o['text']) <= 400]
constructor_sentences = [o for o in caveat_sentences_array if o['type'] == 'constructor' and len(o['text']) <= 400]
misc_sentences = [o for o in caveat_sentences_array if 'misc' in o['type'] and len(o['text']) <= 400]
        
print("Number of class sentences: {}".format(len(class_sentences)))
print("Number of method sentences: {}".format(len(method_sentences)))
print("Number of field sentences: {}".format(len(field_sentences)))
print("Number of constructor sentences: {}".format(len(constructor_sentences)))

exception_sentences = [o for o in misc_sentences if 'Throws:' == o['type'].split('@')[1]]
return_sentences = [o for o in misc_sentences if 'Returns:' == o['type'].split('@')[1]]
parameter_sentences = [o for o in misc_sentences if 'Parameters:' == o['type'].split('@')[1]]

print('-----------------------------------')
print('Number of exception sentences: {}'.format(len(exception_sentences)))
print('Number of return sentences: {}'.format(len(return_sentences)))
print('Number of parameter sentences: {}'.format(len(parameter_sentences)))


Number of class sentences: 9333
Number of method sentences: 31403
Number of field sentences: 1726
Number of constructor sentences: 2224
-----------------------------------
Number of exception sentences: 29841
Number of return sentences: 10800
Number of parameter sentences: 10875


In [14]:
def get_unique_text_objs(arr):
    seen_text = set()
    filtered = []
    for obj in arr:
        if not obj['text'] in seen_text:
            filtered.append(obj)
            seen_text.add(obj['text'])
        
    return filtered

seed(42)

filtered_class_sentences = get_unique_text_objs(class_sentences)
filtered_method_sentences = get_unique_text_objs(method_sentences)
filtered_field_sentences = get_unique_text_objs(field_sentences)
filtered_constructor_sentences = get_unique_text_objs(constructor_sentences)
# misc sentences
filtered_exception_sentences = get_unique_text_objs(exception_sentences)
filtered_return_sentences = get_unique_text_objs(return_sentences)
filtered_parameter_sentences = get_unique_text_objs(parameter_sentences)

print("Number of unique class sentences: {}".format(len(filtered_class_sentences)))
print("Number of unique method sentences: {}".format(len(filtered_method_sentences)))
print("Number of unique field sentences: {}".format(len(filtered_field_sentences)))
print("Number of unique constructor sentences: {}".format(len(filtered_constructor_sentences)))
print('-----------------------------------')
print('Number of unique exception sentences: {}'.format(len(filtered_exception_sentences)))
print('Number of unique return sentences: {}'.format(len(filtered_return_sentences)))
print('Number of unique parameter sentences: {}'.format(len(filtered_parameter_sentences)))

# # sample 384 of each list for 95% confidence interval with 5% error margin
# filtered_class_sentences = sample(filtered_class_sentences, 384)
# filtered_method_sentences = sample(filtered_method_sentences, 384)
# filtered_field_sentences = sample(filtered_field_sentences, 384)
# filtered_constructor_sentences = sample(filtered_constructor_sentences, 384)
# # misc sentences
# filtered_exception_sentences = sample(filtered_exception_sentences, 384)
# filtered_return_sentences = sample(filtered_return_sentences, 384)
# filtered_parameter_sentences = sample(filtered_parameter_sentences, 384)

Number of unique class sentences: 8558
Number of unique method sentences: 21321
Number of unique field sentences: 1471
Number of unique constructor sentences: 1439
-----------------------------------
Number of unique exception sentences: 5961
Number of unique return sentences: 4043
Number of unique parameter sentences: 2732


In [15]:
# write data to annotate to different files
# write_to_doccano_import_file('./output/method_sentences_doccano.jsonl', filtered_method_sentences)
# write_to_doccano_import_file('./output/constructor_sentences_doccano.jsonl', filtered_constructor_sentences)
write_to_doccano_import_file('./output/exception_sentences_doccano.jsonl', filtered_exception_sentences)
# write_to_doccano_import_file('./output/return_sentences_doccano.jsonl', filtered_return_sentences)
write_to_doccano_import_file('./output/parameter_sentences_doccano.jsonl', filtered_parameter_sentences)

In [7]:
# modify labelled data from doccano
def get_labelled_doccano_data(labelled_file, metadata):
    with open(labelled_file) as f, open(metadata) as metadata_f:
        id_to_label = {}
        arr = json.load(metadata_f)
        for e in arr:
            id_to_label[e['id']] = e['text']

        data = []
        for line in f:
            line = line.strip()
            obj = json.loads(line)
            obj['labels'] = [id_to_label[i['label']] for i in obj['annotations']]
            obj.pop('annotations', None)
            obj.pop('meta', None)
            obj.pop('annotation_approver', None)
            data.append(obj)
    
        return data

In [11]:
def get_labelled_data(file):
    with open(file) as f:
        return json.load(f)
    
def get_label_counts(arr):
    counts = {}
    
    for e in arr:
        for label in e['labels']:
            if not label in counts:
                counts[label] = 1
            else:
                counts[label] += 1
                
    return counts

def write_labelled_doccano_data(doccano_file, metadata_file, output_file):
    with open(output_file, 'w+') as f:
        json.dump(get_labelled_doccano_data(doccano_file, metadata_file), output_file)

In [14]:
# load all labelled sentences
constructor_sents = get_labelled_data('./labelled_constructor_sentences.json')
exception_sents = get_labelled_data('./labelled_exception_sentences.json')
method_sents = get_labelled_data('./labelled_method_sentences.json')
parameter_sents = get_labelled_data('./labelled_parameter_sentences.json')
return_sents = get_labelled_data('./labelled_return_sentences.json')

print(get_label_counts(constructor_sents))
print(get_label_counts(exception_sents))
print(get_label_counts(method_sents))
print(get_label_counts(parameter_sents))
print(get_label_counts(return_sents))

{'Exception': 21, 'Guard': 37, 'Other': 322, 'Temporal': 6}
{'Explicit': 78, 'Implicit': 301, 'Temporal': 5}
{'Other': 367, 'Temporal': 4, 'Exception': 9, 'Guard': 5}
{'Other': 293, 'Temporal': 2, 'Guard': 87, 'Exception': 2}
{'Other': 382, 'Temporal': 1}


In [23]:
# POS and dependency parsing
from nltk.parse.corenlp import CoreNLPParser
parser = CoreNLPParser(url='http://localhost:9010')
next(parser.raw_parse('The quick brown fox sucks at jumping.'))

                ROOT                          
                 |                             
                 S                            
       __________|__________________________   
      |                     VP              | 
      |                 ____|___            |  
      |                |        PP          | 
      |                |     ___|_____      |  
      |                |    |         S     | 
      |                |    |         |     |  
      NP               |    |         VP    | 
  ____|__________      |    |         |     |  
 DT   JJ    JJ   NN   VBZ   IN       VBG    . 
 |    |     |    |     |    |         |     |  
The quick brown fox  sucks  at     jumping  . 



In [46]:
regex_strs = []
with open('./heuristic_regex_exception.txt') as f:
    for line in f:
        if line == '\n' or line[0] == '#':
            continue
        else:
            regex_strs.append(line.strip())
            
important = []
for obj in exception_sents:
    for s in regex_strs:
        m = re.search(s, obj['text'], re.IGNORECASE)
        if m:
            important.append(obj['id'])
            break

print(len(important))
for obj in exception_sents:
    if 'Explicit' in obj['labels'] and obj['id'] not in important:
        print(obj['text'])
        break

156
CLASS_0 - if defaultSelection is null


In [11]:
regex = r"example"
import re
if re.search(regex, "This is an Example sentence", re.IGNORECASE):
    print("FOUND")

FOUND


In [16]:
print(filtered_exception_sentences[18])

{'text': 'CLASS_1 - if PARAMETER_0 is METHOD_0 and METHOD_3 returns false', 'id': 222, 'type': 'misc@Throws:'}


In [20]:
for key in caveats_dict:
    caveat = caveats_dict[key]
    
    if caveat['id'] == sentence_id_to_caveat_id[222]:
        print(caveat)

{'name': 'setLookAndFeel', 'type': 'method', 'signature': 'public static void setLookAndFeel(LookAndFeel newLookAndFeel) throws UnsupportedLookAndFeelException', 'deprecated': False, 'caveat_sentences': ['If the current look and feel is METHOD_0 uninitialize is invoked on it.', 'If PARAMETER_0 is METHOD_0, METHOD_1 is invoked on it followed by METHOD_2.', 'If the PARAMETER_0 is null, the look and feel defaults are set to null.', 'A value of null can be used to set the look and feel to null.', 'As the CLASS_0 is required for most of Swing to function, setting the CLASS_0 to null is strongly discouraged.'], 'caveat_misc': [{'name': 'Throws:', 'text_list': ['CLASS_1 - if PARAMETER_0 is METHOD_0 and METHOD_3 returns false']}], 'id': 48468, 'mappings': {'parameters': ['newLookAndFeel'], 'fields': [], 'methods': ['non-null', 'initialize', 'getDefaults', 'newLookAndFeel.isSupportedLookAndFeel()', 'getLookAndFeel()'], 'primitives': [], 'classes': ['LookAndFeel', 'UnsupportedLookAndFeelExceptio

In [28]:
print(' '.join([]) == '')

True
