In [325]:
import ujson
import glob
import os
import re
import pandas as pd
from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9010', tagtype='pos')

In [2]:
# load the non-dreprecated java doc caveat sentences of methods (parameters or exception level)
caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
method_caveats = []
parameter_caveats = []
exception_caveats = []

files = sorted(glob.glob(caveat_files_dir + '*.json'))
for file in files:
    with open(file) as f:
        arr = ujson.load(f)
        full_class_name = os.path.splitext(os.path.basename(file))[0]
        simple_class_name = full_class_name.split('.')[-1]
        
        for caveat in arr:
            if not caveat['deprecated'] and 'name' in caveat:
                
                # collect the name of parameters
                collected = False
                parameters = []
                for misc_obj in caveat['caveat_misc']:
                    if misc_obj['name'] == 'Parameters:':
                        for obj in misc_obj['list']:
                            parameters.append(obj['parameter'])
                        collected = True
                        break
                        
                for sentence in caveat['sentences']:
                    method_caveats.append({
                                    'obj': '',
                                    'simple_class_name': simple_class_name,
                                    'full_class_name': full_class_name,
                                    'api': caveat['name'],
                                    'signature': caveat['signature'],
                                    'sentence': sentence,
                                    'parameters': parameters,
                                    'type': 'method'
                                })

                if collected:
                    # add all parameter and exception level sentences
                    for misc_obj in caveat['caveat_misc']:
                        if misc_obj['name'] in ['Parameters:', 'Throws:']:
                            for obj in misc_obj['list']:
                                for misc_sentence in obj['sentences']:
                                    if misc_obj['name'] == 'Parameters:':
                                        parameter_caveats.append({
                                            'obj': obj['parameter'],
                                            'simple_class_name': simple_class_name,
                                            'full_class_name': full_class_name,
                                            'api': caveat['name'],
                                            'signature': caveat['signature'],
                                            'sentence': misc_sentence,
                                            'parameters': parameters,
                                            'type': 'parameter'
                                        })
                                    else:
                                        exception_caveats.append({
                                            'obj': obj['exception'],
                                            'simple_class_name': simple_class_name,
                                            'full_class_name': full_class_name,
                                            'api': caveat['name'],
                                            'signature': caveat['signature'],
                                            'sentence': misc_sentence,
                                            'parameters': parameters,
                                            'type': 'exception'
                                        })
                                        
method_caveat_df = pd.DataFrame(method_caveats)
parameter_caveat_df = pd.DataFrame(parameter_caveats)
exception_caveat_df = pd.DataFrame(exception_caveats)

In [3]:
def filter_by_pattern(sentence, patterns, lowercase):
    if lowercase:
        sentence = sentence.lower()
    if sentence:
        for pattern in patterns:
            matches = re.search(pattern, sentence)
            if matches:
                return True
    
    return False

# Regex patterns for filtering
exception_not_null_patterns = [
#     '(be|equal|equals|is|are) null',
#     'be (equal|equivalent) to null',
#     'non-null',
     'null'
]

exception_range_limitation_patterns = [
    r'<|>|=',
    r'equal|equal to|equivalent to|illegal value| is (nan|infinite|empty)',
    r'\b(less|smaller|greater|larger)\b',
    r'\b(range|negative|positive|non-negative|non-positive)\b'
]

exception_type_restriction_patterns = [
    r'is( not)? an? [A-Z][a-z]+([A-Za-z_0-9\.]*)*',
    r'instance of|return type'
]

parameter_not_null_patterns = [
    r'not( be)? null',
    r'non-null',
]

parameter_range_limitation_patterns = [
     r'<|>|=',
     r'(less|smaller|greater|larger) than',
     r'negative|positive|non-negative|non-positive'
]

def exception_not_null_filter(sentence):
    return filter_by_pattern(sentence, exception_not_null_patterns, lowercase=True)

def exception_range_limitation_filter(sentence):
    return filter_by_pattern(sentence, exception_range_limitation_patterns, lowercase=True)

def exception_type_restriction_filter(sentence):
    return filter_by_pattern(sentence, exception_type_restriction_patterns, lowercase=False)

def parameter_not_null_filter(sentence):
    return filter_by_pattern(sentence, parameter_not_null_patterns, lowercase=True)

def parameter_range_limitation_filter(sentence):
    return filter_by_pattern(sentence, parameter_range_limitation_patterns, lowercase=True)

In [4]:
exception_filters = {
    'not_null': (exception_not_null_filter, 'NullnessNotAllowed'),
    'range': (exception_range_limitation_filter, 'RangeLimitation'),
    'type': (exception_type_restriction_filter, 'TypeRestriction')
}

parameter_filters = {
    'not_null': (parameter_not_null_filter, 'NullnessNotAllowed'),
    'range': (parameter_range_limitation_filter, 'RangeLimitation')
}

def analyse_labelled_results(file, filters):
    with open(file) as f:
        counts = {}
        c = 0
        for line in f:
            obj = ujson.loads(line)

            if obj['labels']:
                c += 1
                if c > 384:
                    break

                for label in obj['labels']:
                    if not label in counts:
                        counts[label] = 1
                    else:
                        counts[label] += 1
        print(counts)

        for key in filters:
            print('Filter results for {}'.format(key))
            correct, retrieved = 0, 0
            c = 0
            f.seek(0)
            for line in f:
                obj = ujson.loads(line)

                if obj['labels']:
                    c += 1
                    if c > 384:
                        break

                    if filters[key][0](obj['text']):
                        retrieved += 1
                        if filters[key][1] in obj['labels']:
                            correct += 1
#                         else:
#                             print(obj)
#                     elif filters[key][1] in obj['labels']:
#                         print('MISS')
#                         print(obj)

            print('Correct: {}'.format(correct))
            print('Retrieved: {}\n'.format(retrieved))
  
# Test filtering rules
print('Exception caveat sentences')
analyse_labelled_results('./labelled_data/labelled_exception_full.jsonl', exception_filters)
print('Parameter caveat sentences')
analyse_labelled_results('./labelled_data/labelled_parameter_full.jsonl', parameter_filters)

Exception caveat sentences
{'Ambiguous': 256, 'NullnessNotAllowed': 92, 'RangeLimitation': 27, 'TypeRestriction': 9, 'Dependent': 9}
Filter results for not_null
Correct: 92
Retrieved: 92

Filter results for range
Correct: 27
Retrieved: 35

Filter results for type
Correct: 9
Retrieved: 10

Parameter caveat sentences
{'NullnessNotAllowed': 36, 'Ambiguous': 222, 'ExpectedValue': 3, 'RangeLimitation': 3}
Filter results for not_null
Correct: 36
Retrieved: 37

Filter results for range
Correct: 3
Retrieved: 7



In [7]:
method_caveat_df = pd.DataFrame(method_caveats)
parameter_caveat_df = pd.DataFrame(parameter_caveats)
exception_caveat_df = pd.DataFrame(exception_caveats)

def get_unique_filtered_df(df, filter_func):
    filtered_df = df[df['sentence'].apply(filter_func)]
    unique_df = filtered_df.drop_duplicates('sentence').sample(frac=1)
    print('Filtered results: {}'.format(len(filtered_df.index)))
    print('Unique results: {}\n'.format(len(unique_df.index)))
    return unique_df

# Not null exception
not_null_exception_df = get_unique_filtered_df(exception_caveat_df, exception_not_null_filter)

# Not null paramater
not_null_parameter_df = get_unique_filtered_df(parameter_caveat_df, parameter_not_null_filter)

# Range limitation exception
range_limit_exception_df = get_unique_filtered_df(exception_caveat_df, exception_range_limitation_filter)

# Range limitation parameter
range_limit_parameter_df = get_unique_filtered_df(parameter_caveat_df, parameter_range_limitation_filter)

# Type restriction exception
type_restrict_exception_df = get_unique_filtered_df(exception_caveat_df, exception_type_restriction_filter)

def create_annotation_file(df, size, out_path, include_obj=False):
    with open(out_path, 'w+') as out_f:
        c = 0
        for i in df.index:
            c += 1
            if c == 1:
                out_f.write('-----------------------------')
            
            if c > size:
                break
            
            out_f.write(str(i) + '\n')
            out_f.write(df.loc[i, 'signature'] + '\n')
            if include_obj:
                out_f.write(df.loc[i,'obj'] + '\n')
            out_f.write(df.loc[i, 'sentence'] + '\n\n')
            out_f.write('-----------------------------')
            

# create_annotation_file(type_restrict_exception_df, 100, './output/labelled_caveat_rules/type_restrict_exception.txt')
create_annotation_file(not_null_exception_df, 100, './output/labelled_caveat_rules/not_null_exception.txt')
create_annotation_file(not_null_parameter_df, 100, './output/labelled_caveat_rules/not_null_parameter.txt', True)
create_annotation_file(range_limit_exception_df, 100, './output/labelled_caveat_rules/range_limit_exception.txt')
create_annotation_file(range_limit_parameter_df, 100, './output/labelled_caveat_rules/range_limit_parameter.txt', True)

Filtered results: 3844
Unique results: 1447

Filtered results: 1202
Unique results: 495

Filtered results: 1946
Unique results: 834

Filtered results: 335
Unique results: 193

Filtered results: 208
Unique results: 149



In [725]:
def read_labelled_rule_results(path, parser, rule_start=3, offset=4):
    with open(path) as f:
        lines = f.readlines()
        
        indices = [re.sub("[^0-9]", "", line) for line in lines[::offset]]
        rules = [line.strip() for line in lines[rule_start::offset]]
        
        if len(indices) > len(rules):
            indices = indices[:len(rules)]
        
        results = {}
        for i, val in enumerate(indices):
            results[val] = parser(rules[i])
            
        return results
    
def or_parser(expression):
    if expression == '':
        return None
    return expression.split('||')

def type_parser(expression):
    if expression == '':
        return None
    
    expressions = expression.split(' & ')
    results = []
    for exp in expressions:
        components = exp.split(' -> ')
        types = components[1].split('||')
        negated = components[0][0] == '!'
        
        obj = components[0]
        if components[0][0] == '!':
            obj = components[0][1:]
        results.append({'obj': obj, 'types': types, 'negated':negated})
    
    return results
        
not_null_exception_results = read_labelled_rule_results('./labelled_data/not_null_exception.txt', or_parser)
print(len([x for x in not_null_exception_results if not_null_exception_results[x]]))

not_null_parameter_results = read_labelled_rule_results('./labelled_data/not_null_parameter.txt', or_parser, 4, 5)
print(len([x for x in not_null_parameter_results if not_null_parameter_results[x]]))

range_limit_exception_results = read_labelled_rule_results('./labelled_data/range_limit_exception.txt', or_parser)
print(len([x for x in range_limit_exception_results if range_limit_exception_results[x]]))

range_limit_parameter_results = read_labelled_rule_results('./labelled_data/range_limit_parameter.txt', or_parser, 4, 5)
print(len([x for x in range_limit_parameter_results if range_limit_parameter_results[x]]))

type_restrict_exception_results = read_labelled_rule_results('./labelled_data/type_restrict_exception.txt', type_parser)
print(len([x for x in type_restrict_exception_results if type_restrict_exception_results[x]]))

87
90
72
49
33


In [877]:
def get_type_rules(api_caveat_obj):
    skip_patterns = [
        r'subclass|at the specified',
        r'any .*element of .* is( not)? an?',
        r'and ([a-z]\w+) is( not)? an? ((?:[A-Z]\w+(?:, )?)+)(?:or(?: a)? ([A-Z]\w+))?',
        r' ([a-z]\w+) is( not)? an? ((?:[A-Z]\w+(?:, )?)+)(?:or(?: a)? ([A-Z]\w+))? and'
    ]
    for pattern in skip_patterns:
        if re.search(pattern, api_caveat_obj['sentence']):
            return
    matches = re.search(r' ([a-z]\w+) is( not)? a Class that implements interface ([A-Z]\w+)', api_caveat_obj['sentence'])
    if matches:
        results = []
        negated = matches.group(1) != None
        results.append({'obj': matches.group(0), 'types': [matches.group(2)], 'negated': negated})
    
    else:
        matches = re.findall(r' ([a-z]\w+) is( not)? an? ((?:[A-Z]\w+,? ?)+)(?:or ([A-Z]\w+))?', api_caveat_obj['sentence'])
        if matches:
            results = []
            for match in matches:
                if match[0] in api_caveat_obj['parameters']:
                    types = [e.strip() for e in match[2].split(', ') if len(e) > 0]
                    if match[3]:
                        types.append(match[3])
                    # only look at class names (first char is uppercase)
                    types = [t for t in types if t[0].isupper()]
                    if len(types) == 0:
                        continue

                    negated = match[1] == '' # not we check the reversed negation
                    results.append({'obj': match[0], 'types': types, 'negated': negated})

            return results
    
def is_not_null_parameter_caveat(api_caveat_obj):
    sentence = api_caveat_obj['sentence'].lower()
    
    # skip conditional
    if 'if ' in sentence:
        return False
    
    patterns = [
        r'not( be)? null',
        r'non-null'
    ]
    
    for pattern in patterns:
        if re.search(pattern, sentence):
            return True
    return False

def get_not_null_exception_rules(api_caveat_obj):
    # skip additional condition patterns
    skip_patterns = [
        r' and .* is null',
        r' is null and',
        r' any of the arguments',
        r' contains null element'
    ]
    
    types = []
    
    for pattern in skip_patterns:
        if re.search(pattern, api_caveat_obj['sentence']):
            return types
    
    # assume the caveat is about single parameters
    if len(api_caveat_obj['parameters']) == 1 and re.search(' null', api_caveat_obj['sentence']):
        return api_caveat_obj['parameters']
    
    match = re.search(r'([Ii]f|[Ww]hen) (.*) null', api_caveat_obj['sentence'])
    if match:
        tokens = re.sub(', ', ' ', match.group(2)).split()
        types = [t for t in tokens if t in api_caveat_obj['parameters']]
    return set(types)

def get_range_limitation_parameter_rules(api_caveat_obj):
    patterns = [
        r'\W([A-Za-z0-9\.()+-*/]+)(?:(?:\s+-)|(?:-\s+)|(?:\s+-\s+))([A-Za-z0-9\.()\+-\*/]+)\W', # subtract
        r'\W([A-Za-z0-9\.()\+\-\*\/]+)(?:\s\+\s+)([A-Za-z0-9\.()\+\-\*\/]+)\W' # add,
        r'\W([A-Za-z0-9\.()\+\-\*\/]+)(?:\s\*\s+)([A-Za-z0-9\.()\+\-\*\/]+)\W' # multiply,
        r'\W\(?\s*\w+\s*\)?\s*\.\s*\.\s*\(?\s*\w+\s*\)?\W', # exclusive range
        r'\W\[\s*([A-Za-z0-9\.()\+\-\*\/]+)\s*(\.\s*\.\s*)\s*([A-Za-z0-9\.()\+\-\*\/]+)\s*\]\W', # inclusive range
        r'\W\[\s*([A-Za-z0-9\.()\+\-\*\/]+)\s*,\s*([A-Za-z0-9\.()\+\-\*\/]+)\s*\]\W', # inclusive range 2
        r'\W([A-Za-z0-9\.()\+\-\*\/]+)\s*<=?\s*([A-Za-z0-9\.()\+\-\*\/]+)\s*<=?\s*([A-Za-z0-9\.()\+\-\*\/]+)\W', # equality range
        r'\W([A-Za-z0-9\.()\+\-\*\/]+)\s*>=?\s*([A-Za-z0-9\.()\+\-\*\/]+)\s*>=?\s*([A-Za-z0-9\.()\+\-\*\/]+)\W', # equality range
    ]
    
    rules = []
    for pattern in patterns:
        if re.search(pattern, api_caveat_obj['sentence']):
#             rules.append(patterns[pattern])
            print('HERE')
    
    return rules

In [905]:
print(re.search(r'\W (\s*\w+\s*)(,\s*\w+\s*)+,?\s*or\s*\w+\W', ' what in the world, ok here I have a list, one, two, one or three ').groups())

('one', ', one ')


In [836]:
k = 12403
print(get_not_null_exception_rules(not_null_exception_df.loc[k].to_dict()))
print(not_null_exception_results[str(k)])

<_sre.SRE_Match object; span=(0, 40), match='if type, source, or connectionId is null'>
{'source', 'connectionId', 'type'}
['type', 'source', 'connectionId']


In [678]:
print(get_type_rules(type_restrict_exception_df.loc[12403].to_dict()))
print(type_restrict_exception_results['1762'])

[{'obj': 'src', 'types': ['IndexColorModel'], 'negated': True}]
None


In [684]:
# evaluate type rules generation via regex
retrieved, correct, total = 0, 0, 0
for key in type_restrict_exception_results:
    index = int(key)
    caveat_obj = type_restrict_exception_df.loc[index].to_dict()
    
    rules = get_type_rules(caveat_obj)
    if rules:
        retrieved += 1
        
        if not type_restrict_exception_results[key]:
            print('MISFIND')
            print(index)
            continue
        
        is_correct = True
        if not type_restrict_exception_results[key]:
            is_correct = False
        
        has_match = False
        for rule in rules:
            match = [e for e in type_restrict_exception_results[key] if e['obj'] == rule['obj']]
            if len(match) > 0 and match[0]['types'] == rule['types'] and match[0]['negated'] == rule['negated']:
                has_match = True
                break
        
        if is_correct and has_match:
            print(index)
            correct += 1
            
    if type_restrict_exception_results[key]:
        total += 1
print('retrieved:{}, correct:{}, total: {}'.format(retrieved, correct, total))

16414
16423
7928
5174
8009
16388
16411
7990
10139
14367
12594
12676
16398
12607
MISFIND
1762
retrieved:15, correct:14, total: 33


In [727]:
# evaluate not null rules (parameter sentences) generation via regex
retrieved, correct, total = 0, 0, 0
for key in not_null_parameter_results:
    index = int(key)
    caveat_obj = not_null_parameter_df.loc[index].to_dict()
    
    rule = is_not_null_parameter_caveat(caveat_obj)
    if rule:
        retrieved += 1
        
        if not not_null_parameter_results[key]:
            print(index)
            continue
        
        correct += 1
            
    if not_null_parameter_results[key]:
        if not rule:
            print(index)
        total += 1
print('retrieved:{}, correct:{}, total: {}'.format(retrieved, correct, total))

4227
retrieved:91, correct:90, total: 90


In [840]:
# evaluate not null rules (exception sentences) generation via regex
retrieved, correct, total = 0, 0, 0

for key in not_null_exception_results:
    index = int(key)
    caveat_obj = not_null_exception_df.loc[index].to_dict()
    
    rules = get_not_null_exception_rules(caveat_obj)
    if rules:
        retrieved += 1
        
        if not not_null_exception_results[key] or set(not_null_exception_results[key]) != set(rules):
            print(index)
            continue
        
        correct += 1
            
    if not_null_exception_results[key]:
        if not rules:
            print(index)
        total += 1
print('retrieved:{}, correct:{}, total: {}'.format(retrieved, correct, total))

3078
8702
12013
3994
8651
12091
12587
12519
3112
14946
949
8146
15643
3364
12088
retrieved:81, correct:75, total: 84


In [848]:
def preprocess(sentence):
    if sentence[-1] == '.':
        sentence = sentence[:-1]
    
    sentence = re.sub(r'[^\w\. -]', '', sentence)
    # remove brackets surrounding text
    matches = re.findall(r' ([^)])', sentence)
    for match in matches:
        sentence = re.sub(match, match[0], sentence)
    return sentence

def tokenize(sentence):    
    return sentence.split()

test = 'The maximum number of bytes to be written to the given array; must be non-negative and no larger than dst.length - offset'
processed_text = tokenize(preprocess(test))
# print(parser.tag(processed_text))
print(next(next(parser.parse_sents([processed_text]))).pretty_print())
# next(parser.raw_parse('if number is null or not an instance of Number')).pretty_print()

                                                          ROOT                                                                               
                                                           |                                                                                  
                                                           S                                                                                 
               ____________________________________________|_________________________________________                                         
              NP                                                                                     |                                       
        ______|__________                                                                            |                                        
       |                 PP                                                                          |                                       
   