In [3]:
import ujson
import glob
import os
import re
import pandas as pd
from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser

In [4]:
# load the non-dreprecated java doc caveat sentences of methods (parameters or exception level)
caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
caveats = []
method_caveats = []
parameter_caveats = []
exception_caveats = []

files = sorted(glob.glob(caveat_files_dir + '*.json'))
for file in files:
    with open(file) as f:
        arr = ujson.load(f)
        full_class_name = os.path.splitext(os.path.basename(file))[0]
        simple_class_name = full_class_name.split('.')[-1]
        
        for caveat in arr:
            if not caveat['deprecated'] and 'name' in caveat:
                
                # collect the name of parameters
                collected = False
                parameters = []
                for misc_obj in caveat['caveat_misc']:
                    if misc_obj['name'] == 'Parameters:':
                        for obj in misc_obj['list']:
                            parameters.append(obj['parameter'])
                        collected = True
                        break
                        
                for sentence in caveat['sentences']:
                    e = {
                        'obj': '',
                        'simple_class_name': simple_class_name,
                        'full_class_name': full_class_name,
                        'api': caveat['name'],
                        'signature': caveat['signature'],
                        'sentence': sentence,
                        'parameters': parameters,
                        'type': 'method'
                    }
                    method_caveats.append(e)
                    caveats.append(e)

                if collected:
                    # add all parameter and exception level sentences
                    for misc_obj in caveat['caveat_misc']:
                        if misc_obj['name'] in ['Parameters:', 'Throws:']:
                            for obj in misc_obj['list']:
                                for misc_sentence in obj['sentences']:
                                    if misc_obj['name'] == 'Parameters:':
                                        e = {
                                            'obj': obj['parameter'],
                                            'simple_class_name': simple_class_name,
                                            'full_class_name': full_class_name,
                                            'api': caveat['name'],
                                            'signature': caveat['signature'],
                                            'sentence': misc_sentence,
                                            'parameters': parameters,
                                            'type': 'parameter'
                                        }
                                        parameter_caveats.append(e)
                                        caveats.append(e)
                                    else:
                                        e = {
                                            'obj': obj['exception'],
                                            'simple_class_name': simple_class_name,
                                            'full_class_name': full_class_name,
                                            'api': caveat['name'],
                                            'signature': caveat['signature'],
                                            'sentence': misc_sentence,
                                            'parameters': parameters,
                                            'type': 'exception'
                                        }
                                        exception_caveats.append(e)
                                        caveats.append(e)

caveats_df = pd.DataFrame(caveats)
method_caveat_df = pd.DataFrame(method_caveats)
parameter_caveat_df = pd.DataFrame(parameter_caveats)
exception_caveat_df = pd.DataFrame(exception_caveats)

In [5]:
def filter_by_pattern(sentence, patterns, lowercase):
    if lowercase:
        sentence = sentence.lower()
    if sentence:
        for pattern in patterns:
            matches = re.search(pattern, sentence)
            if matches:
                return True
    
    return False

# Regex patterns for filtering
exception_not_null_patterns = [
     'null'
]

exception_range_limitation_patterns = [
    r'<|>|=',
    r'equal|equal to|equivalent to|illegal value| is (nan|infinite|empty)',
    r'\b(less|smaller|greater|larger)\b',
    r'\b(range|negative|positive|non-negative|non-positive)\b'
]

exception_type_restriction_patterns = [
    r'is( not)? an? [A-Z][a-z]+([A-Za-z_0-9\.]*)*',
    r'instance of|return type'
]

parameter_not_null_patterns = [
    r'not( be)? null',
    r'non-null',
]

parameter_range_limitation_patterns = [
     r'<|>|=',
     r'(less|smaller|greater|larger) than',
     r'negative|positive|non-negative|non-positive'
]

def exception_not_null_filter(sentence):
    return filter_by_pattern(sentence, exception_not_null_patterns, lowercase=True)

def exception_range_limitation_filter(sentence):
    return filter_by_pattern(sentence, exception_range_limitation_patterns, lowercase=True)

def exception_type_restriction_filter(sentence):
    return filter_by_pattern(sentence, exception_type_restriction_patterns, lowercase=False)

def parameter_not_null_filter(sentence):
    return filter_by_pattern(sentence, parameter_not_null_patterns, lowercase=True)

def parameter_range_limitation_filter(sentence):
    return filter_by_pattern(sentence, parameter_range_limitation_patterns, lowercase=True)

In [6]:
exception_filters = {
    'not_null': (exception_not_null_filter, 'NullnessNotAllowed'),
    'range': (exception_range_limitation_filter, 'RangeLimitation'),
    'type': (exception_type_restriction_filter, 'TypeRestriction')
}

parameter_filters = {
    'not_null': (parameter_not_null_filter, 'NullnessNotAllowed'),
    'range': (parameter_range_limitation_filter, 'RangeLimitation')
}

def analyse_labelled_results(file, filters):
    with open(file) as f:
        counts = {}
        c = 0
        for line in f:
            obj = ujson.loads(line)

            if obj['labels']:
                c += 1
                if c > 384:
                    break

                for label in obj['labels']:
                    if not label in counts:
                        counts[label] = 1
                    else:
                        counts[label] += 1
        print(counts)

        for key in filters:
            print('Filter results for {}'.format(key))
            correct, retrieved = 0, 0
            c = 0
            f.seek(0)
            for line in f:
                obj = ujson.loads(line)

                if obj['labels']:
                    c += 1
                    if c > 384:
                        break

                    if filters[key][0](obj['text']):
                        retrieved += 1
                        if filters[key][1] in obj['labels']:
                            correct += 1
            print('Correct: {}'.format(correct))
            print('Retrieved: {}\n'.format(retrieved))

print('Exception caveat sentences')
analyse_labelled_results('./labelled_data/labelled_exception_full.jsonl', exception_filters)
print('Parameter caveat sentences')
analyse_labelled_results('./labelled_data/labelled_parameter_full.jsonl', parameter_filters)

Exception caveat sentences
{'Ambiguous': 256, 'NullnessNotAllowed': 92, 'RangeLimitation': 27, 'TypeRestriction': 9, 'Dependent': 9}
Filter results for not_null
Correct: 92
Retrieved: 92

Filter results for range
Correct: 27
Retrieved: 35

Filter results for type
Correct: 9
Retrieved: 10

Parameter caveat sentences
{'NullnessNotAllowed': 36, 'Ambiguous': 222, 'ExpectedValue': 3, 'RangeLimitation': 3}
Filter results for not_null
Correct: 36
Retrieved: 37

Filter results for range
Correct: 3
Retrieved: 7



In [7]:
method_caveat_df = pd.DataFrame(method_caveats)
parameter_caveat_df = pd.DataFrame(parameter_caveats)
exception_caveat_df = pd.DataFrame(exception_caveats)

def get_unique_filtered_df(df, filter_func):
    filtered_df = df[df['sentence'].apply(filter_func)]
    unique_df = filtered_df.drop_duplicates('sentence').sample(frac=1)
    print('Filtered results: {}'.format(len(filtered_df.index)))
    print('Unique results: {}\n'.format(len(unique_df.index)))
    return unique_df

# Not null exception
not_null_exception_df = get_unique_filtered_df(exception_caveat_df, exception_not_null_filter)

# Not null paramater
not_null_parameter_df = get_unique_filtered_df(parameter_caveat_df, parameter_not_null_filter)

# Range limitation exception
range_limit_exception_df = get_unique_filtered_df(exception_caveat_df, exception_range_limitation_filter)

# Range limitation parameter
range_limit_parameter_df = get_unique_filtered_df(parameter_caveat_df, parameter_range_limitation_filter)

# Type restriction exception
type_restrict_exception_df = get_unique_filtered_df(exception_caveat_df, exception_type_restriction_filter)

def create_annotation_file(df, size, out_path, include_obj=False):
    with open(out_path, 'w+') as out_f:
        c = 0
        for i in df.index:
            c += 1
            if c == 1:
                out_f.write('-----------------------------')
            
            if c > size:
                break
            
            out_f.write(str(i) + '\n')
            out_f.write(df.loc[i, 'signature'] + '\n')
            if include_obj:
                out_f.write(df.loc[i,'obj'] + '\n')
            out_f.write(df.loc[i, 'sentence'] + '\n\n')
            out_f.write('-----------------------------')
            

# create_annotation_file(type_restrict_exception_df, 100, './output/labelled_caveat_rules/type_restrict_exception.txt')
# create_annotation_file(not_null_exception_df, 100, './output/labelled_caveat_rules/not_null_exception.txt')
# create_annotation_file(not_null_parameter_df, 100, './output/labelled_caveat_rules/not_null_parameter.txt', True)
# create_annotation_file(range_limit_exception_df, 100, './output/labelled_caveat_rules/range_limit_exception.txt')
# create_annotation_file(range_limit_parameter_df, 100, './output/labelled_caveat_rules/range_limit_parameter.txt', True)

Filtered results: 3844
Unique results: 1447

Filtered results: 1202
Unique results: 495

Filtered results: 1946
Unique results: 834

Filtered results: 335
Unique results: 193

Filtered results: 208
Unique results: 149



In [8]:
def read_labelled_rule_results(path, parser, rule_start=3, offset=4):
    with open(path) as f:
        lines = f.readlines()
        
        indices = [re.sub("[^0-9]", "", line) for line in lines[::offset]]
        rules = [line.strip() for line in lines[rule_start::offset]]
        
        if len(indices) > len(rules):
            indices = indices[:len(rules)]
        
        results = {}
        for i, val in enumerate(indices):
            results[val] = parser(rules[i])
            
        return results
    
def or_parser(expression):
    if expression == '':
        return None
    return expression.split('||')

def type_parser(expression):
    if expression == '':
        return None
    
    expressions = expression.split(' & ')
    results = []
    for exp in expressions:
        components = exp.split(' -> ')
        types = components[1].split('||')
        negated = components[0][0] == '!'
        
        obj = components[0]
        if components[0][0] == '!':
            obj = components[0][1:]
        results.append({'obj': obj, 'types': types, 'negated':negated})
    
    return results
        
not_null_exception_results = read_labelled_rule_results('./labelled_data/not_null_exception.txt', or_parser)
print(len([x for x in not_null_exception_results if not_null_exception_results[x]]))

not_null_parameter_results = read_labelled_rule_results('./labelled_data/not_null_parameter.txt', or_parser, 4, 5)
print(len([x for x in not_null_parameter_results if not_null_parameter_results[x]]))

range_limit_exception_results = read_labelled_rule_results('./labelled_data/range_limit_exception.txt', or_parser)
print(len([x for x in range_limit_exception_results if range_limit_exception_results[x]]))

range_limit_parameter_results = read_labelled_rule_results('./labelled_data/range_limit_parameter.txt', or_parser, 4, 5)
print(len([x for x in range_limit_parameter_results if range_limit_parameter_results[x]]))

type_restrict_exception_results = read_labelled_rule_results('./labelled_data/type_restrict_exception.txt', type_parser)
print(len([x for x in type_restrict_exception_results if type_restrict_exception_results[x]]))

87
90
72
49
33


In [9]:
def get_type_rules(api_caveat_obj):
    skip_patterns = [
        r'subclass|at the specified',
        r'any .*element of .* is( not)? an?',
        r'and ([a-z]\w+) is( not)? an? ((?:[A-Z]\w+(?:, )?)+)(?:or(?: a)? ([A-Z]\w+))?',
        r' ([a-z]\w+) is( not)? an? ((?:[A-Z]\w+(?:, )?)+)(?:or(?: a)? ([A-Z]\w+))? and'
    ]
    for pattern in skip_patterns:
        if re.search(pattern, api_caveat_obj['sentence']):
            return
    matches = re.search(r' ([a-z]\w+) is( not)? a Class that implements interface ([A-Z]\w+)', api_caveat_obj['sentence'])
    if matches:
        results = []
        negated = matches.group(1) != None
        results.append({'obj': matches.group(0), 'types': [matches.group(2)], 'negated': negated})
    
    else:
        matches = re.findall(r' ([a-z]\w+) is( not)? an? ((?:[A-Z]\w+,? ?)+)(?:or ([A-Z]\w+))?', api_caveat_obj['sentence'])
        if matches:
            results = []
            for match in matches:
                if match[0] in api_caveat_obj['parameters']:
                    types = [e.strip() for e in match[2].split(', ') if len(e) > 0]
                    if match[3]:
                        types.append(match[3])
                    # only look at class names (first char is uppercase)
                    types = [t for t in types if t[0].isupper()]
                    if len(types) == 0:
                        continue

                    negated = match[1] == '' # not we check the reversed negation
                    results.append({'obj': match[0], 'types': types, 'negated': negated})

            return results
    
def is_not_null_parameter_caveat(api_caveat_obj):
    sentence = api_caveat_obj['sentence'].lower()
    
    # skip conditionals
    if r'if\b' in sentence:
        return False
    
    patterns = [
        r'not( be)? null',
        r'non-null'
    ]
    
    for pattern in patterns:
        if re.search(pattern, sentence):
            return True
    return False

def get_not_null_exception_rules(api_caveat_obj):
    # skip additional condition patterns
    skip_patterns = [
        r' and .* is null',
        r' is null and',
        r' any of the arguments',
        r' contains null element'
    ]
    
    types = []
    
    for pattern in skip_patterns:
        if re.search(pattern, api_caveat_obj['sentence']):
            return types
    
    # assume the caveat is about single parameters
    if len(api_caveat_obj['parameters']) == 1 and re.search(' null', api_caveat_obj['sentence']):
        return api_caveat_obj['parameters']
    
    match = re.search(r'([Ii]f|[Ww]hen) (.*) null', api_caveat_obj['sentence'])
    if match:
        tokens = re.sub(', ', ' ', match.group(2)).split()
        types = [t for t in tokens if t in api_caveat_obj['parameters']]
    return list(set(types))

In [287]:
# Sentence normalisation functions

def normalise_operators(sentence):
    # list of expression substitutions for logical phrases, note ordering is used
    patterns = [
        (r'(not? (less|shorter) than)', '>='),
        (r'(not? (greater|larger|longer) than)', '<='),
        (r'((greater|larger) than or equal to)', '>='),
        (r'((less|shorter) than or equal to)', '<='),
        (r'((less|shorter) than)', '<'),
        (r'((greater|larger|longer) than)', '>'),
        (r'((is|are)? not negative)', '>= 0'),
        (r'((is|are)? not positive)', '<= 0'),
        (r'((is|are)? negative)', '< 0'), (r'((be)? non-negative)', '>= 0'),
        (r'((is|are)? positive)', '< 0'), (r'((be)? non-positive)', '<= 0'),
        (r'(not equal( to)?)', '!='),
        (r'(equal to)', '==')
    ]

    # normalise all equality phrases
    for t in patterns:
        sentence = re.sub(t[0], t[1], sentence)
                
    return sentence

def normalise_args(sentence, parameters):
    # variable substitutions
    variable_subs = [
        '\W(-?[0-9]+(?:,[0-9]+)*(?:(?:\.[0-9]+)?[a-z]*))', # specific numeric value
        r'\W("[^"]+")', # simple string
        r'\b([A-Za-z_]+[A-Za-z_0-9]*\.[A-Za-z_][A-Za-z_0-9]*\([^\)]*\))', # class method
        r'\b((^(java\.|javax\.|org\.))?([A-Za-z_]\w*\.)+\w+)\b', # member value of object/Class
        r'\b([A-Z]+_[A-Z]+(_[A-Z]+)*)\b', # all uppercase and at least 1 underscore
        r'\W([a-z_][A-Za-z_0-9]*\([^\)]*\))' # standalone methods 
    ]
    
    param_prefix = '@PARAM'
    param_counter = 'a'        

    variable_prefix = '@ARG'
    variable_counter = 'a'
    variable_dict = {}
    
    # add default mappings
    for var in ['null']:
        key = variable_prefix + variable_counter
        variable_dict[key] = var
        variable_counter = chr(ord(variable_counter) + 1)
    
        # replace all occurences of the default var
        sentence = re.sub(r'\b' + var + r'\b', key, sentence)
    
    # normalise numerical values first, # specific numeric value
    match = re.search('\W(-?[0-9]+(\.[0-9]+)+[a-z]?)\W', sentence)
    while match:
        key = variable_prefix + str(variable_counter)
        variable_dict[key] = match.group(1)
        next_pattern = r'\b' + re.escape(match.group(1)) + r'\b'
        sentence = re.sub(next_pattern, key, sentence)
        variable_counter = chr(ord(variable_counter) + 1)

        match = re.search('\W(-?[0-9]+(\.[0-9]+)+[a-z]?)\W', sentence)

    # normalise all variables/methods/fields that match predefined regex patterns
    for pattern in variable_subs:
        match = re.search(pattern, sentence)
        while match:
            key = variable_prefix + str(variable_counter)
            variable_dict[key] = match.group(1)
            next_pattern = re.escape(match.group(1))
            sentence = re.sub(next_pattern, key, sentence)
            variable_counter = chr(ord(variable_counter) + 1)

            match = re.search(pattern, sentence)
     
     # normalise all API call parameters
    for parameter in parameters:
        key = param_prefix + param_counter
        variable_dict[key] = parameter
        sentence = re.sub(r'\b'+parameter+r'\b', key, sentence)
        param_counter = chr(ord(param_counter) + 1)

    return sentence, variable_dict

def normalise_enum_lists(sentence):
    # locate enumerations (lists of items)
    list_dict = {}
    list_counter = 0
    list_prefix = '@LIST'
    pattern = '\W((@ARG\w|)(?:,\s*(@ARG\w)\s*)+,?\s*or\s*(@ARG\w))'
    match = re.search(pattern, sentence)

    while match:
        key = list_prefix + str(list_counter)
        sentence = re.sub(match.group(1), key, sentence)
        list_dict[key] = match.group(1)

        match = re.search(pattern, sentence)
        list_counter += 1
        
    return sentence, list_dict

def normalise_explicit_expressions(sentence):
    expressions_dict = {}
    expression_prefix = '@EEXPR'
    expression_counter = 0
    start_index = 0
    bracket_counter = 0
    old_len = 0
    first = True

    while first or len(expressions_dict) != old_len:
        first = False
        old_len = len(expressions_dict)
        for i, c in enumerate(sentence):
            if c == '(': 
                if bracket_counter == 0:
                    start_index = i
                
                bracket_counter += 1
            elif c == ')':
                bracket_counter -= 1
                if bracket_counter == 0:
                    substr = sentence[start_index:i+1]
                    start_index = i + 1

                    if re.search(r'-|\+|\*|/|<|<=|>|>=|==|!=|&&|\|\||\.\.', substr):
                        escaped = re.escape(substr)
                        key = expression_prefix + str(expression_counter)
                        expressions_dict[key] = substr
                        sentence = re.sub(escaped, key, sentence)
                        expression_counter += 1

    return sentence, expressions_dict

def normalise_spaces(sentence):
    # remove left-side spaces around operators if adjacent to VAR* or EXPR* modifiers
    seen_expressions = set()
    match = re.search(r'(@ARG\w|@PARAM\w|@IEXPR\w)\s+(-|\+|\*|/|\.\.)', sentence)
    while match and not match in seen_expressions:
        sub = match.group(1) + match.group(2)
        escaped = re.escape(match.group(0))
        sentence = re.sub(escaped, sub, sentence)
        
        seen_expressions.add(match)
        match = re.search(r'(@ARG\w|@PARAM\w|@IEXPR\w)\s+(-|\+|\*|/|\.\.)', sentence)

    # remove right-side spaces around operators if adjacent to VAR*, PARAM* EXPR* modifiers
    match = re.search(r'(-|\+|\*|/|<|<=|>|>=|==|!=|&&|\|\||&|\.\.)\s+(@ARG\w|@PARAM\w|@IEXPR\w)', sentence)
    while match and not match in seen_expressions:
        sub = match.group(1) + match.group(2)
        escaped = re.escape(match.group(0))
        sentence = re.sub(escaped, sub, sentence)

        seen_expressions.add(match)
        match = re.search(r'(-|\+|\*|/|<|<=|>|>=|==|!=|&&|\|\||&|\.\.)\s+(@ARG\w|@PARAM\w|@IEXPR\w)', sentence)
    
    return sentence

def normalise_ranges(sentence):
    patterns = [
        r'\b((?:range )?(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)\.\.(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w))\b',
        r'\b(\[?(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)\.\.(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)\]?)\b',
        r'\b((?:[Ff]rom|range) (@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w) to (@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w))\b',
        r'between ((@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w) and (@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w))',
        r'((@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)\s*<=?\s*(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)\s*<=?\s*(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w))',
        r'((@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)\s*>=?\s*(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)\s*>=?\s*(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w))'
    ]
    
    range_dict = {}
    range_prefix = '@RANGE'
    counter = 0

    for pattern in patterns:
        match = re.search(pattern, sentence)
        while match:
            key = range_prefix + str(counter)
            range_dict[key] = match.group(1)
            next_pattern = re.escape(match.group(1))
            sentence = re.sub(next_pattern, key, sentence)
            counter += 1
            match = re.search(pattern, sentence)
    
    return sentence, range_dict    
    
def tokenize(sentence): 
    # substitute '#' to separate composite sentences
    sentence = re.sub(r'[^A-Za-z0-9<>=! @]', ' ', sentence)
    return sentence.split()

def filter_tokens(tokens):
    return [x for x in tokens if re.search('^(@\w+|<|<=|>|>=|==|!=|&&|\|\||if|then|and|or|either|is|not|true|false)$', x)]

def normalise_implicit_expressions(tokens):
    expr = {}
    prefix = '@IEXPR'
    counter = 0
    new_tokens = []
    
    for token in tokens:
        match = re.search(r'(-|\+|\*|/|<|<=|>|>=|==|!=|&&|\|\||&|\.\.)@', token)
        if match:
            key = prefix + str(counter)
            expr[key] = token
            token = key
            counter += 1
            
        new_tokens.append(token)

    return new_tokens, expr

def normalise(sentence, parameters):
    if sentence[-1] == '.':
        sentence = sentence[:-1]
    
    # apply all sentence normalisations
    sentence = normalise_operators(sentence)
    sentence, args = normalise_args(sentence, parameters)
    sentence, lists = normalise_enum_lists(sentence)
    sentence, explicit_exprs = normalise_explicit_expressions(sentence)
    sentence = normalise_spaces(sentence)
    sentence, ranges = normalise_ranges(sentence)
        
    tokens = tokenize(sentence)
    tokens, implicit_exprs = normalise_implicit_expressions(tokens)
    
    for arg in args:
        for expr in explicit_exprs:
            if arg in explicit_exprs[expr]:
                explicit_exprs[expr] = re.sub(arg, args[arg], explicit_exprs[expr])
        
        for expr in implicit_exprs:
            if arg in implicit_exprs[expr]:
                implicit_exprs[expr] = re.sub(arg, args[arg], implicit_exprs[expr])

    placeholders = {}
    placeholders.update(args)
    placeholders.update(lists)
    placeholders.update(ranges)
    placeholders.update(explicit_exprs)
    placeholders.update(implicit_exprs)
        
    return tokens, placeholders

def resolve_rule_placeholders(rules, placeholders):
    resolved_rules = []
    
    # substitute all placeholders with original value
    for rule in rules:
        resolved_param = rule['param']
        
        if resolved_param in placeholders:
            resolved_param = placeholders[resolved_param]
        
        resolved_constraint = rule['constraint']         
        match = re.search(r'(@\w+)', resolved_constraint)
        while match:
            resolved_constraint = re.sub(re.escape(match.group(1)), placeholders[match.group(1)], resolved_constraint)
            match = re.search(r'(@\w+)', resolved_constraint)
        
        op = None
        if 'op' in rule:
            op = rule['op']
        else:
            if len(resolved_constraint) > 1:
                if resolved_constraint[:2] in ['<=', '>=', '==', '!=']:
                    op = resolved_constraint[:2]
                    resolved_constraint = resolved_constraint[2:]
                elif resolved_constraint[:1] in ['<', '>', '=']:
                    op = resolved_constraint[:1]
                    resolved_constraint = resolved_constraint[1:]
        
        if op:        
            resolved_rules.append({'param': resolved_param, 'op': op, 'constraint': resolved_constraint})
    
    return resolved_rules

In [288]:
reverse_rules = {
    '<': '>',
    '>': '<',
    '>=': '<=',
    '<=': '>='
}

negate_rules = {
    '>': '<=',
    '<': '>=',
    '=': '!=',
    '==': '!=',
    '>=': '<',
    '<=': '>'
}

def get_range_rules(api_caveat_obj):
    try:
        tokens, placeholders = normalise(api_caveat_obj['sentence'], api_caveat_obj['parameters'])

        tokens = filter_tokens(tokens)

        unresolved_rules = []
        prev_params = []
        params = []
        negate = False
        last_op = ''
        cc = '' # last coordinating conjunction
                
        for i, token in enumerate(tokens):
            if token in ['not', 'false']:
                negate = True
            elif token.startswith('@PARAM') or token.startswith('@ARG'):
                is_param = token.startswith('@PARAM')
                if not is_param:
                    for param in api_caveat_obj['parameters']:
                         if param in placeholders[token]:
                            is_param = True
                            break
                if is_param:
                    params.append(token)
            elif token.startswith('@IEXPR'):
                if re.search(r'@PARAM', token):
                    params.append(token)
                else:
                    if params:
                        for param in params:
                            unresolved_rules.append({'param': param, 'constraint': token})

                        prev_params = params.copy()
                        params.clear()
                    elif prev_params:
                        for param in prev_params:
                            unresolved_rules.append({'param': param, 'constraint': token})
                        prev_params.clear()
                    elif len(api_caveat_obj['parameters']) == 1:
                        unresolved_rules.append({'param': api_caveat_obj['parameters'][0], 'constraint': token})

                negate = False
            elif token.startswith('@EEXPR'):
                if re.search(r'(<|!|>|=|&|\|)', placeholders[token]):
                    for k in placeholders:
                        if k.startswith('@PARAM') and placeholders[k] in placeholders[token]:
                            unresolved_rules.append({'param': placeholders[k], 'constraint': placeholders[token]})
                else:
                    params.append(token)
            elif token.startswith('@RANGE'):
                match = re.search(r'(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)\s*(<=?|>=?)\s*(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)\s*(<=?|>=?)\s*(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)', placeholders[token])
                if match:
                    unresolved_rules.append({'param': match.group(3), 
                                             'op': reverse_rules[match.group(2)] if not negate else negate_rules[reverse_rules[match.group(2)]], 
                                             'constraint': match.group(1)})
                    unresolved_rules.append({'param': match.group(3), 
                                             'op': match.group(4) if not negate else negate_rules[match.group(4)], 
                                             'constraint': match.group(5)})
                else:
                    match = re.search(r'(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w).*(@ARG\w|@PARAM\w|@IEXPR\w|@EEXPR\w)', placeholders[token])
                    if match and match.group(1) != '' and match.group(2) != '':
                        if params:
                            for param in params:
                                unresolved_rules.append({'param': param, 'op': '>=' if not negate else '<', 'constraint': match.group(1)})
                                unresolved_rules.append({'param': param, 'op': '<=' if not negate else '>', 'constraint': match.group(2)})

                            prev_params = params.copy()
                            params.clear()
                        elif prev_params:
                            for param in prev_params:
                                unresolved_rules.append({'param': param, 'op': '>=' if not negate else '<', 'constraint': match.group(1)})
                                unresolved_rules.append({'param': param, 'op': '<=' if not negate else '>', 'constraint': match.group(2)})
                            prev_params.clear()
                        elif len(api_caveat_obj['parameters']) == 1:
                            unresolved_rules.append({'param': api_caveat_obj['parameters'][0], 'op': '>=' if not negate else '<', 'constraint': match.group(1)})
                            unresolved_rules.append({'param': api_caveat_obj['parameters'][0], 'op': '<=' if not negate else '>', 'constraint': match.group(2)})
                        
                negate = False
            elif token.startswith('@LIST'):
                list_str = lists[token]
                list_items = [e for e in re.split(r'(or|,| )', list_str) if e.startswith('@ARG')]

                if params:
                    for param in params:
                        for l in list_items:
                            unresolved_rules.append({'param': param, 
                                                     'op': last_op if not negate else negate_rules[last_op], 
                                                     'constraint': l})

                    prev_params = params.copy()
                    params.clear()
                else:
                    for param in prev_params:
                        for l in list_items:
                            unresolved_rules.append({'param': param, 
                                                     'op': last_op if not negate else negate_rules[last_op], 
                                                     'constraint': l})
                    prev_params.clear()
                negate = False
            elif token == 'is' and (params or len(api_caveat_obj['parameters']) == 1) and \
                    i + 1 < len(tokens) and tokens[i+1].startswith('@ARG'):
                if params:
                    for param in params:
                        unresolved_rules.append({'param': param, 
                                                 'op': '=' if not negate else '!=', 
                                                 'constraint': tokens[i+1]})
                    prev_params = params.copy()
                else:
                    unresolved_rules.append({'param': api_caveat_obj['parameters'][0], 
                                                 'op': '=' if not negate else '!=', 
                                                 'constraint': tokens[i+1]})
                params.clear()
            elif re.search(r'^[<!>=]+', token):
                last_op = token
            elif token in ['and', 'or']:
                cc = token
        return resolve_rule_placeholders(unresolved_rules, placeholders)
    except:
        return None

In [290]:
normalise('if the given key material, starting at offset inclusive, is shorter than 24 bytes', ['key'])

(['if',
  'the',
  'given',
  '@PARAMa',
  'material',
  'starting',
  'at',
  'offset',
  'inclusive',
  'is',
  '@IEXPR0',
  'bytes'],
 {'@ARGa': 'null', '@ARGb': '24', '@PARAMa': 'key', '@IEXPR0': '<24'})

In [258]:
obj = exception_caveat_df.loc[4258].to_dict()
print(obj['parameters'])
print(obj['sentence'] + '\n')
print(get_range_rules(obj))

['downstreamHandler', 'bufferSize']
if bufferSize <= 0

['if', '@PARAMb', '@IEXPR0']
[{'param': 'bufferSize', 'op': '<=', 'constraint': '0'}]


In [237]:
# evaluate type rules generation via regex
retrieved, correct, total = 0, 0, 0
for key in type_restrict_exception_results:
    index = int(key)
    caveat_obj = type_restrict_exception_df.loc[index].to_dict()
    
    rules = get_type_rules(caveat_obj)
    if rules:
        retrieved += 1
        
        if not type_restrict_exception_results[key]:
            continue
        
        is_correct = True
        if not type_restrict_exception_results[key]:
            is_correct = False
        
        has_match = False
        for rule in rules:
            match = [e for e in type_restrict_exception_results[key] if e['obj'] == rule['obj']]
            if len(match) > 0 and match[0]['types'] == rule['types'] and match[0]['negated'] == rule['negated']:
                has_match = True
                break
        
        if is_correct and has_match:
            correct += 1
            
    if type_restrict_exception_results[key]:
        total += 1
print('retrieved:{}, correct:{}, total: {}'.format(retrieved, correct, total))

retrieved:15, correct:14, total: 33


In [123]:
# evaluate not null rules (parameter sentences) generation via regex
retrieved, correct, total = 0, 0, 0
for key in not_null_parameter_results:
    index = int(key)
    caveat_obj = not_null_parameter_df.loc[index].to_dict()
    
    rule = is_not_null_parameter_caveat(caveat_obj)
    if rule:
        retrieved += 1
        
        if not not_null_parameter_results[key]:
            continue
        
        correct += 1
            
    if not_null_parameter_results[key]:
        if not rule:
            print(index)
        total += 1
print('retrieved:{}, correct:{}, total: {}'.format(retrieved, correct, total))

retrieved:91, correct:90, total: 90


In [434]:
# evaluate not null rules (exception sentences) generation via regex
retrieved, correct, total = 0, 0, 0

for key in not_null_exception_results:
    index = int(key)
    caveat_obj = not_null_exception_df.loc[index].to_dict()
    
    rules = get_not_null_exception_rules(caveat_obj)
    if rules:
        retrieved += 1
        
        if not not_null_exception_results[key] or set(not_null_exception_results[key]) != set(rules):
            continue
        
        correct += 1
            
    if not_null_exception_results[key]:
        total += 1
print('retrieved:{}, correct:{}, total: {}'.format(retrieved, correct, total))

retrieved:81, correct:75, total: 84


In [144]:
exception_caveat_df[exception_caveat_df['simple_class_name'] == "Date"].head()

Unnamed: 0,api,full_class_name,obj,parameters,sentence,signature,simple_class_name,type
6484,valueOf,java.sql.Date,IllegalArgumentException,[s],if the date given is not in the JDBC date esca...,public static Date valueOf(String s),Date,exception
6485,valueOf,java.sql.Date,NullPointerException,[date],if date is null,public static Date valueOf(LocalDate date),Date,exception
8670,before,java.util.Date,NullPointerException,[when],if when is null.,public boolean before(Date when),Date,exception
8671,after,java.util.Date,NullPointerException,[when],if when is null.,public boolean after(Date when),Date,exception
8672,compareTo,java.util.Date,NullPointerException,[anotherDate],if anotherDate is null.,public int compareTo(Date anotherDate),Date,exception


In [155]:
c = 0
for i in exception_caveat_df.index:
    obj = exception_caveat_df.iloc[i]
    rules = get_range_rules(obj)
    if rules:
        c += 1
print(c)

4101


In [291]:
def get_params(signature):
    signature = re.sub(r'.*\b(public|private|protected)\b', '', signature)
    params = re.search(r'\(([^\)]*)\)', signature)
    if params:
        params = params.group(1)
        params = params.split(', ')
        params = map(lambda x: re.sub(r'<[^>]*>', '', x), params)
        return [x.split() for x in params]
    
def get_single_param_index(params, param):
    l = [x[1] for x in params]
    for i, e in enumerate(l):
        if e == param:
            return i
    
def get_param_indices(params, rules):
    l = [x[1] for x in params]
    indices = []
    for i, e in enumerate(l):
        if e in rules:
            indices.append(i)
            
    return indices
    
get_params('public abstract XMLSignature newXMLSignature(SignedInfo si, KeyInfo ki, List<? extends XMLObject> objects, String id, String signatureValueId)')

[['SignedInfo', 'si'],
 ['KeyInfo', 'ki'],
 ['List', 'objects'],
 ['String', 'id'],
 ['String', 'signatureValueId']]

In [292]:
# write all non-null exceptions caveats
with open('./output/non_null_rules.json', 'w+') as f:
    d = {}
    
    for i in exception_caveat_df.index:
        obj = exception_caveat_df.iloc[i]
        rules = get_not_null_exception_rules(obj)
        if rules:
            params = get_params(obj['signature'])
            types = [x[0] for x in params]
            
            indices = get_param_indices(params, rules)
            if indices:
                res = {
                    'className': obj['full_class_name'], 
                    'api': obj['api'], 
                    'signature': obj['signature'],
                    'paramTypes': types,
                    'notNullIndices': indices
                }         
                d[i] = res            
                
    for i in parameter_caveat_df.index:
        obj = parameter_caveat_df.iloc[i]
        rules = is_not_null_parameter_caveat(obj)
        if rules:
            params = get_params(obj['signature'])
            types = [x[0] for x in params]
            index = get_param_indices(params, [rules])
            
            if index:
                if i in d:
                    c += 1
                    d[i]['notNullIndices'].append(index)
                else:
                    res = {
                        'className': obj['full_class_name'], 
                        'api': obj['api'], 
                        'signature': obj['signature'],
                        'paramTypes': types,
                        'notNullIndices': [index]
                    }
                    d[i] = res
    
    ujson.dump([d[i] for i in d], f)

In [293]:
# write all exception range rules
with open('./output/exception_range_rules.json', 'w+') as f:
    res = []
    c = 0
    for i in exception_caveat_df.index:
        obj = exception_caveat_df.iloc[i]
        rules = get_range_rules(obj)
        
        # skip all constraints with and conditions
        if ' and ' in obj['sentence']:
            continue
        
        if rules:
            params = get_params(obj['signature'])
            types = [x[0] for x in params]
            
            s = set()
            valid_rules = []
            for rule in rules:
                if 'param' in rule and 'op' in rule and 'constraint' in rule:
                    rule['param'] = get_single_param_index(params, rule['param'])
                    valid_rules.append(rule)
            
            if valid_rules:
                res.append({
                    'className': obj['full_class_name'], 
                    'api': obj['api'], 
                    'signature': obj['signature'],
                    'paramTypes': types,
                    'rangeRules': valid_rules
                })
            c += 1
            
    ujson.dump(res, f)
    print(c)

4119


In [294]:
with open('./output/exception_range_rules.json') as f, open('./output/exception_range_rules_filtered.json', 'w+') as f_out:
    arr = ujson.load(f)
    filtered = []
    c, n = 0, 0
    for e in arr: 
        filtered_rules = []
        for rule in e['rangeRules']:
            try:
                if rule['constraint'] == 'null':
                    n += 1
                else:
                    int(rule['constraint'])
                    c += 1
                filtered_rules.append(rule)
            except:
                continue
            
        if filtered_rules:
            e['rangeRules'] = filtered_rules
            filtered.append(e)
    print(c)
    print(n)
    ujson.dump(filtered, f_out)

1230
3464


In [108]:
is_not_null_parameter_caveat(parameter_caveat_df.iloc[7587])
print(parameter_caveat_df.iloc[7587])

api                                                   getOverrideStyle
full_class_name                            org.w3c.dom.css.DocumentCSS
obj                                                                elt
parameters                                            [elt, pseudoElt]
sentence                                This parameter cannot be null.
signature            CSSStyleDeclaration getOverrideStyle(Element e...
simple_class_name                                          DocumentCSS
type                                                         parameter
Name: 7587, dtype: object
