In [7]:
import ujson
import glob
import os
import re
import pandas as pd
from nltk.parse.corenlp import CoreNLPParser

parser = CoreNLPParser(url='http://localhost:9010')

In [11]:
# load the non-dreprecated java doc caveat sentences of methods (parameters or exception level)
caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
caveats = []

files = sorted(glob.glob(caveat_files_dir + '*.json'))
for file in files:
    with open(file) as f:
        arr = ujson.load(f)
        full_class_name = os.path.splitext(os.path.basename(file))[0]
        simple_class_name = full_class_name.split('.')[-1]
        
        for caveat in arr:
            if not caveat['deprecated'] and 'name' in caveat:
                
                # collect the name of parameters
                collected = False
                parameters = []
                for misc_obj in caveat['caveat_misc']:
                    if misc_obj['name'] == 'Parameters:':
                        for obj in misc_obj['list']:
                            parameters.append(obj['parameter'])
                        collected = True
                        break
                        
                for sentence in caveat['sentences']:
                    caveats.append({
                                    'obj': '',
                                    'simple_class_name': simple_class_name,
                                    'full_class_name': full_class_name,
                                    'api': caveat['name'],
                                    'signature': caveat['signature'],
                                    'sentence': sentence,
                                    'parameters': parameters,
                                    'type': 'method'
                                })

                if collected:
                    # add all parameter and exception level sentences
                    for misc_obj in caveat['caveat_misc']:
                        if misc_obj['name'] in ['Parameters:', 'Throws:']:
                            for obj in misc_obj['list']:
                                for misc_sentence in obj['sentences']:
                                    if misc_obj['name'] == 'Parameters:':
                                        caveats.append({
                                            'obj': obj['parameter'],
                                            'simple_class_name': simple_class_name,
                                            'full_class_name': full_class_name,
                                            'api': caveat['name'],
                                            'signature': caveat['signature'],
                                            'sentence': misc_sentence,
                                            'parameters': parameters,
                                            'type': 'parameter'
                                        })
                                    else:
                                        caveats.append({
                                            'obj': obj['exception'],
                                            'simple_class_name': simple_class_name,
                                            'full_class_name': full_class_name,
                                            'api': caveat['name'],
                                            'signature': caveat['signature'],
                                            'sentence': misc_sentence,
                                            'parameters': parameters,
                                            'type': 'exception'
                                        })
                                        
caveat_df = pd.DataFrame(caveats)

In [92]:
def filter_by_pattern(sentence, patterns):
    sentence = sentence.lower()
    if sentence:
        for pattern in patterns:
            matches = re.search(pattern, sentence)
            if matches:
                return True
    
    return False

# Regex patterns for filtering
not_null_patterns = [
    '(be|equal|equals|is) null',
    'be (equal|equivalent) to null',
    'not-null'
]

range_limitation_patterns = [
    '<|>|=',
    'less|greater|larger|equals|equal to|equivalent to|the same',
    'be( not)? (in|out of|outside of).* range',
    'negative|positive|non-negative|non-positive',
]

type_restriction_patterns = [
    'is( not)? a'
    'type'
]

def not_null_filter(sentence):
    return filter_by_pattern(sentence, not_null_patterns)

def range_limitation_filter(sentence):
    return filter_by_pattern(sentence, range_limitation_patterns)

def type_restriction_filter(sentence):
    return filter_by_pattern(sentence, type_restriction_patterns)

In [82]:
match = re.search('instance of|derived from', "this is a test str, and a is derived from")
if match:
    print("YES")

YES


In [93]:
# not null caveats
not_null_df = caveat_df[caveat_df['sentence'].apply(not_null_filter)]

# range limitation caveats
range_df = caveat_df[caveat_df['sentence'].apply(range_limitation_filter)]

# type restriction caveats
type_df = caveat_df[caveat_df['sentence'].apply(type_restriction_filter)]

In [94]:
print('Number of not null candidates: {}'.format(len(not_null_df.index)))
print('Number of range limitation candidates: {}'.format(len(range_df.index)))
print('Number of type restriction candidates: {}'.format(len(type_df.index)))

Number of not null candidates: 5939
Number of range limitation candidates: 6677
Number of type restriction candidates: 0


In [91]:
with open('./labelled_data/labelled_exception_full.jsonl') as f:
    counts = {}
    for line in f:
        obj = ujson.loads(line)
        if obj['labels']:
            if obj['labels'][0] not in counts:
                counts[obj['labels'][0]] = 0
                
            if obj['labels'][0] == 'RangeLimitation':
                print(obj)
            counts[obj['labels'][0]] += 1
    print(counts)

{'id': 11512, 'text': 'if the index is greater than or equal to frameCount() or is negative.', 'labels': ['RangeLimitation']}
{'id': 11514, 'text': 'That is, the exception is thrown if any of the following are true: start < 0 start >= frameCount() length < 0 (start+length) > frameCount()', 'labels': ['RangeLimitation']}
{'id': 11605, 'text': 'if size <= 0', 'labels': ['RangeLimitation']}
{'id': 11692, 'text': 'if the specified offset is less than the first text boundary or greater than the last text boundary.', 'labels': ['RangeLimitation']}
{'id': 11696, 'text': 'if the port parameter is outside the specified range of valid port values, which is between 0 and 65535, inclusive.', 'labels': ['RangeLimitation']}
{'id': 11740, 'text': 'if there is an error accessing the BLOB value or if pos is less than 1', 'labels': ['RangeLimitation']}
{'id': 11741, 'text': 'if there is an error accessing the BLOB value or if len is less than 0', 'labels': ['RangeLimitation']}
{'id': 11802, 'text': 'if 

In [57]:
unique_not_null_df = not_null_df.drop_duplicates('sentence').sample(frac=1)
print(len(unique_not_null_df.index))

with open('./labelled_data/not_null_rules.txt', 'w+') as f:
    for i in unique_not_null_df.index:
        f.write(unique_not_null_df.loc[i, 'sentence'] + '\n')

645


In [8]:
next(parser.raw_parse('The quick brown fox sucks at jumping.')).pretty_print()

                ROOT                          
                 |                             
                 S                            
       __________|__________________________   
      |                     VP              | 
      |                 ____|___            |  
      |                |        PP          | 
      |                |     ___|_____      |  
      |                |    |         S     | 
      |                |    |         |     |  
      NP               |    |         VP    | 
  ____|__________      |    |         |     |  
 DT   JJ    JJ   NN   VBZ   IN       VBG    . 
 |    |     |    |     |    |         |     |  
The quick brown fox  sucks  at     jumping  . 

