In [1]:
import ujson
import glob
import os
import re
import pandas as pd
from nltk.parse.corenlp import CoreNLPParser

# Client for a Stanford CoreNLP server addressed at localhost:9010.
# NOTE(review): the server is presumably expected to be running already before
# any parse call is made -- confirm how it is started for this notebook.
parser = CoreNLPParser(url='http://localhost:9010')

In [2]:
# Load the non-deprecated Javadoc caveat sentences of every method, plus the
# sentences attached to its parameters and to the exceptions it throws.
caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
method_caveats, parameter_caveats, exception_caveats = [], [], []


def _caveat_record(target, kind, sentence, caveat, simple_name, full_name, param_names):
    """Build one flat sentence record; the key order fixes the DataFrame column order."""
    return {
        'obj': target,
        'simple_class_name': simple_name,
        'full_class_name': full_name,
        'api': caveat['name'],
        'signature': caveat['signature'],
        'sentence': sentence,
        'parameters': param_names,
        'type': kind,
    }


files = sorted(glob.glob(caveat_files_dir + '*.json'))
for json_path in files:
    with open(json_path) as json_file:
        class_caveats = ujson.load(json_file)
        # The file name (without extension) is the fully qualified class name.
        full_class_name = os.path.splitext(os.path.basename(json_path))[0]
        simple_class_name = full_class_name.split('.')[-1]

        for caveat in class_caveats:
            # Only named, non-deprecated entries are kept.
            if caveat['deprecated'] or 'name' not in caveat:
                continue

            # Parameter names come from the first 'Parameters:' section, if any.
            param_section = next(
                (misc for misc in caveat['caveat_misc'] if misc['name'] == 'Parameters:'),
                None,
            )
            has_param_section = param_section is not None
            parameters = (
                [entry['parameter'] for entry in param_section['list']]
                if has_param_section
                else []
            )

            # Method-level sentences are always recorded.
            method_caveats.extend(
                _caveat_record('', 'method', sentence, caveat,
                               simple_class_name, full_class_name, parameters)
                for sentence in caveat['sentences']
            )

            # Parameter- and exception-level sentences are only recorded for
            # entries that have a 'Parameters:' section.
            if not has_param_section:
                continue

            for misc in caveat['caveat_misc']:
                section_name = misc['name']
                if section_name == 'Parameters:':
                    sink, target_key, kind = parameter_caveats, 'parameter', 'parameter'
                elif section_name == 'Throws:':
                    sink, target_key, kind = exception_caveats, 'exception', 'exception'
                else:
                    continue

                for entry in misc['list']:
                    for misc_sentence in entry['sentences']:
                        sink.append(
                            _caveat_record(entry[target_key], kind, misc_sentence, caveat,
                                           simple_class_name, full_class_name, parameters)
                        )

method_caveat_df = pd.DataFrame(method_caveats)
parameter_caveat_df = pd.DataFrame(parameter_caveats)
exception_caveat_df = pd.DataFrame(exception_caveats)

In [3]:
def filter_by_pattern(sentence, patterns, lowercase):
    """Tell whether any of the regex patterns occurs in the sentence.

    Parameters:
        sentence: text to inspect. Anything that is not a non-empty string
            (None, NaN from a DataFrame cell, '') is treated as "no match".
        patterns: iterable of regular expressions, tried in order with
            ``re.search``.
        lowercase: when true, the sentence is lowercased before matching, so
            the patterns themselves must be written in lowercase.

    Returns:
        True as soon as one pattern matches, otherwise False.
    """
    # Guard first: calling .lower() (or re.search) on None / NaN would raise.
    if not isinstance(sentence, str) or not sentence:
        return False

    text = sentence.lower() if lowercase else sentence
    return any(re.search(pattern, text) for pattern in patterns)

# Regex patterns for filtering.
# Each list is handed to filter_by_pattern, which tries the patterns one by one
# with re.search; a single hit is enough for the sentence to pass the filter.

# Exception sentences / nullness: any occurrence of "null" counts.
# The commented-out entries are narrower alternatives that are disabled.
exception_not_null_patterns = [
#     '(be|equal|equals|is|are) null',
#     'be (equal|equivalent) to null',
#     'non-null',
     'null'
]

# Exception sentences / range limitation. Matched against the lowercased
# sentence (exception_range_limitation_filter passes lowercase=True), hence the
# lowercase "nan".
exception_range_limitation_patterns = [
    r'<|>|=',  # comparison symbols
    r'equal|equal to|equivalent to|illegal value| is (nan|infinite|empty)',
    r'\b(less|smaller|greater|larger)\b',  # comparison words, whole-word only
    r'\b(range|negative|positive|non-negative|non-positive)\b'
]

# Exception sentences / type restriction. Matched case-sensitively
# (exception_type_restriction_filter passes lowercase=False), so the first
# pattern requires a capitalised word after "is a" / "is an" / "is not a(n)".
exception_type_restriction_patterns = [
    r'is( not)? an? [A-Z][a-z]+([A-Za-z_0-9\.]*)*',
    r'instance of|return type'
]

# Parameter sentences / nullness: "not null", "not be null" or "non-null".
parameter_not_null_patterns = [
    r'not( be)? null',
    r'non-null',
]

# Parameter sentences / range limitation. Unlike the exception variant, the
# word patterns here are not anchored with \b.
parameter_range_limitation_patterns = [
     r'<|>|=',
     r'(less|smaller|greater|larger) than',
     r'negative|positive|non-negative|non-positive'
]

def exception_not_null_filter(sentence):
    """Check a lowercased exception sentence against exception_not_null_patterns."""
    return filter_by_pattern(
        sentence,
        patterns=exception_not_null_patterns,
        lowercase=True,
    )

def exception_range_limitation_filter(sentence):
    """Check a lowercased exception sentence against exception_range_limitation_patterns."""
    return filter_by_pattern(
        sentence,
        patterns=exception_range_limitation_patterns,
        lowercase=True,
    )

def exception_type_restriction_filter(sentence):
    """Check an exception sentence, with its case left intact, against exception_type_restriction_patterns."""
    return filter_by_pattern(
        sentence,
        patterns=exception_type_restriction_patterns,
        lowercase=False,
    )

def parameter_not_null_filter(sentence):
    """Check a lowercased parameter sentence against parameter_not_null_patterns."""
    return filter_by_pattern(
        sentence,
        patterns=parameter_not_null_patterns,
        lowercase=True,
    )

def parameter_range_limitation_filter(sentence):
    """Check a lowercased parameter sentence against parameter_range_limitation_patterns."""
    return filter_by_pattern(
        sentence,
        patterns=parameter_range_limitation_patterns,
        lowercase=True,
    )

In [4]:
# Rule tables: short rule key -> (sentence predicate, label that predicate targets).
# analyse_labelled_results reads element 0 as the predicate and element 1 as the label.
exception_filters = dict(
    not_null=(exception_not_null_filter, 'NullnessNotAllowed'),
    range=(exception_range_limitation_filter, 'RangeLimitation'),
    type=(exception_type_restriction_filter, 'TypeRestriction'),
)

parameter_filters = dict(
    not_null=(parameter_not_null_filter, 'NullnessNotAllowed'),
    range=(parameter_range_limitation_filter, 'RangeLimitation'),
)

def analyse_labelled_results(file, filters, max_labelled=384):
    """Print label counts and per-filter hit statistics for a labelled JSONL file.

    Every non-blank line of ``file`` is parsed as a JSON object with a
    ``labels`` list and a ``text`` string. Objects whose ``labels`` is empty
    are ignored. Only the first ``max_labelled`` labelled objects are used.

    Parameters:
        file: path of the JSONL file holding the labelled sentences.
        filters: mapping of rule key -> sequence whose element 0 is a
            predicate taking the sentence text and whose element 1 is the
            label that predicate is meant to retrieve.
        max_labelled: cap on the number of labelled objects considered
            (default 384); pass None to use every labelled object.

    Prints:
        The label frequency dict, then for each rule key the number of
        retrieved sentences that carry the expected label ("Correct") and the
        total number of sentences the predicate accepted ("Retrieved").
    """
    # Read and parse the file once; every filter is evaluated on this sample.
    labelled = []
    with open(file) as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            obj = ujson.loads(line)
            if not obj['labels']:
                continue
            if max_labelled is not None and len(labelled) >= max_labelled:
                break
            labelled.append(obj)

    counts = {}
    for obj in labelled:
        for label in obj['labels']:
            counts[label] = counts.get(label, 0) + 1
    print(counts)

    for key in filters:
        # Index rather than unpack so longer tuples remain acceptable.
        filter_func, expected_label = filters[key][0], filters[key][1]
        print('Filter results for {}'.format(key))

        correct, retrieved = 0, 0
        for obj in labelled:
            if filter_func(obj['text']):
                retrieved += 1
                if expected_label in obj['labels']:
                    correct += 1

        print('Correct: {}'.format(correct))
        print('Retrieved: {}\n'.format(retrieved))
  
# Evaluate the filtering rules against the manually labelled samples.
for heading, labelled_path, rule_filters in (
    ('Exception caveat sentences', './labelled_data/labelled_exception_full.jsonl', exception_filters),
    ('Parameter caveat sentences', './labelled_data/labelled_parameter_full.jsonl', parameter_filters),
):
    print(heading)
    analyse_labelled_results(labelled_path, rule_filters)

Exception caveat sentences
{'Ambiguous': 256, 'NullnessNotAllowed': 92, 'RangeLimitation': 27, 'TypeRestriction': 9, 'Dependent': 9}
Filter results for not_null
Correct: 92
Retrieved: 92

Filter results for range
Correct: 27
Retrieved: 35

Filter results for type
Correct: 9
Retrieved: 10

Parameter caveat sentences
{'NullnessNotAllowed': 36, 'Ambiguous': 222, 'ExpectedValue': 3, 'RangeLimitation': 3}
Filter results for not_null
Correct: 36
Retrieved: 37

Filter results for range
Correct: 3
Retrieved: 7



In [7]:
# Build one DataFrame per caveat level from the collected record lists.
method_caveat_df, parameter_caveat_df, exception_caveat_df = map(
    pd.DataFrame,
    (method_caveats, parameter_caveats, exception_caveats),
)

def get_unique_filtered_df(df, filter_func, random_state=None):
    """Keep the rows whose sentence passes the filter, de-duplicate, and shuffle.

    Parameters:
        df: DataFrame with a 'sentence' column.
        filter_func: predicate called on each sentence; truthy keeps the row.
        random_state: optional seed (or random state object) forwarded to
            ``DataFrame.sample`` so the shuffle can be reproduced. The default
            None keeps the previous, unseeded behaviour.

    Returns:
        DataFrame holding the first occurrence of every distinct accepted
        sentence, in shuffled row order, with the original index labels.

    Prints:
        The number of accepted rows and the number of distinct sentences.
    """
    # Coerce to bool so the result is always used as a row mask.
    keep_mask = df['sentence'].apply(filter_func).astype(bool)
    filtered_df = df[keep_mask]
    # frac=1 draws every row once, i.e. a plain shuffle.
    unique_df = (
        filtered_df
        .drop_duplicates(subset='sentence')
        .sample(frac=1, random_state=random_state)
    )
    print('Filtered results: {}'.format(len(filtered_df.index)))
    print('Unique results: {}\n'.format(len(unique_df.index)))
    return unique_df

# Exception sentences that forbid null
not_null_exception_df = get_unique_filtered_df(
    df=exception_caveat_df, filter_func=exception_not_null_filter)

# Parameter sentences that forbid null
not_null_parameter_df = get_unique_filtered_df(
    df=parameter_caveat_df, filter_func=parameter_not_null_filter)

# Exception sentences that limit a value range
range_limit_exception_df = get_unique_filtered_df(
    df=exception_caveat_df, filter_func=exception_range_limitation_filter)

# Parameter sentences that limit a value range
range_limit_parameter_df = get_unique_filtered_df(
    df=parameter_caveat_df, filter_func=parameter_range_limitation_filter)

# Exception sentences that restrict a type
type_restrict_exception_df = get_unique_filtered_df(
    df=exception_caveat_df, filter_func=exception_type_restriction_filter)

def create_annotation_file(df, size, out_path, include_obj=False):
    """Dump the first ``size`` rows of ``df`` into a plain-text annotation file.

    Each row is written as its index label, its 'signature', optionally its
    'obj', and its 'sentence' followed by a blank line. A dashed separator is
    written once before the first row and once after every row.

    Parameters:
        df: DataFrame with 'signature' and 'sentence' columns (and 'obj' when
            ``include_obj`` is set); rows are taken in index order.
        size: maximum number of rows to write.
        out_path: destination file; it is truncated if it already exists.
        include_obj: also write the 'obj' column between signature and sentence.

    NOTE(review): the separator carries no trailing newline, so each row's
    index label lands on the same line as the preceding separator. Kept as-is;
    confirm whether that layout is intended.
    """
    divider = '-----------------------------'
    if include_obj:
        columns = ['signature', 'obj', 'sentence']
    else:
        columns = ['signature', 'sentence']

    with open(out_path, 'w+') as out_f:
        for ordinal, row_label in enumerate(df.index, start=1):
            # The opening separator is only emitted when there is at least one row.
            if ordinal == 1:
                out_f.write(divider)

            if ordinal > size:
                break

            out_f.write(str(row_label) + '\n')
            for column in columns:
                # The sentence closes the record with an extra blank line.
                ending = '\n\n' if column == 'sentence' else '\n'
                out_f.write(df.loc[row_label, column] + ending)
            out_f.write(divider)
            

# Write up to 100 shuffled sentences per rule for manual annotation.
# The type-restriction export is currently disabled:
# create_annotation_file(type_restrict_exception_df, 100, './output/labelled_caveat_rules/type_restrict_exception.txt')
create_annotation_file(
    df=not_null_exception_df, size=100,
    out_path='./output/labelled_caveat_rules/not_null_exception.txt')
create_annotation_file(
    df=not_null_parameter_df, size=100,
    out_path='./output/labelled_caveat_rules/not_null_parameter.txt', include_obj=True)
create_annotation_file(
    df=range_limit_exception_df, size=100,
    out_path='./output/labelled_caveat_rules/range_limit_exception.txt')
create_annotation_file(
    df=range_limit_parameter_df, size=100,
    out_path='./output/labelled_caveat_rules/range_limit_parameter.txt', include_obj=True)

Filtered results: 3844
Unique results: 1447

Filtered results: 1202
Unique results: 495

Filtered results: 1946
Unique results: 834

Filtered results: 335
Unique results: 193

Filtered results: 208
Unique results: 149



In [8]:
# Parse one sample sentence and draw the first tree the parser yields.
sample_tree = next(parser.raw_parse('The quick brown fox sucks at jumping.'))
sample_tree.pretty_print()

                ROOT                          
                 |                             
                 S                            
       __________|__________________________   
      |                     VP              | 
      |                 ____|___            |  
      |                |        PP          | 
      |                |     ___|_____      |  
      |                |    |         S     | 
      |                |    |         |     |  
      NP               |    |         VP    | 
  ____|__________      |    |         |     |  
 DT   JJ    JJ   NN   VBZ   IN       VBG    . 
 |    |     |    |     |    |         |     |  
The quick brown fox  sucks  at     jumping  . 

