In [237]:
import ujson
import glob
import os
import re
import pandas as pd
from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser
from nltk.tree import Tree
# Both clients talk to the same CoreNLP server; keep its address in one place.
CORENLP_URL = 'http://localhost:9010'
# Dependency parser client. The cell that calls parser.raw_parse(...) needs this name,
# so it has to be defined here for a fresh-kernel "Run All" to succeed.
parser = CoreNLPDependencyParser(url=CORENLP_URL)
# CoreNLPParser client configured with tagtype='pos'; used by the raw_parse_sents(...) cell.
posparser = CoreNLPParser(url=CORENLP_URL, tagtype='pos')

In [224]:
# Load the non-deprecated Javadoc caveat sentences of methods (parameter or exception level).
caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
parameter_caveats = []
exception_caveats = []

# `caveat_misc` section heading -> (key holding the documented object's name,
# value stored under 'type', list that receives the records of that section).
section_targets = {
    'Parameters:': ('parameter', 'parameter', parameter_caveats),
    'Throws:': ('exception', 'exception', exception_caveats),
}

files = sorted(glob.glob(caveat_files_dir + '*.json'))
for file in files:
    # Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(file, encoding='utf-8') as f:
        arr = ujson.load(f)

    # The file's base name (without extension) is used as the fully qualified class name.
    full_class_name = os.path.splitext(os.path.basename(file))[0]
    simple_class_name = full_class_name.split('.')[-1]

    for caveat in arr:
        if caveat['deprecated'] or 'name' not in caveat:
            continue

        # Only entries that have a 'Parameters:' section are kept; its first occurrence
        # supplies the parameter names attached to every record of this entry.
        parameter_section = next(
            (misc_obj for misc_obj in caveat['caveat_misc'] if misc_obj['name'] == 'Parameters:'),
            None,
        )
        if parameter_section is None:
            continue
        parameters = [obj['parameter'] for obj in parameter_section['list']]

        # Emit one record per sentence of every parameter- and exception-level item.
        for misc_obj in caveat['caveat_misc']:
            if misc_obj['name'] not in section_targets:
                continue
            obj_key, caveat_type, destination = section_targets[misc_obj['name']]
            for obj in misc_obj['list']:
                for misc_sentence in obj['sentences']:
                    # Key order determines the DataFrame column order below.
                    destination.append({
                        'obj': obj[obj_key],
                        'simple_class_name': simple_class_name,
                        'full_class_name': full_class_name,
                        'api': caveat['name'],
                        'signature': caveat['signature'],
                        'sentence': misc_sentence,
                        'parameters': parameters,
                        'type': caveat_type
                    })

parameter_caveat_df = pd.DataFrame(parameter_caveats)
exception_caveat_df = pd.DataFrame(exception_caveats)

In [133]:
# List every parameter-level sentence that contains a '#' character, to see
# which kinds of '#' usages the normalisation patterns have to cope with.
for row_label in parameter_caveat_df.index:
    text = parameter_caveat_df.at[row_label, 'sentence']
    if '#' not in text:
        continue
    print(text)

This is the end of the string or the position of the "#" character, if present.
the DriverAction implementation to be used when DriverManager#deregisterDriver is called
whether local cipher suites order in #getCipherSuites should be honored during SSL/TLS/DTLS handshaking.
A string representing the attribute defaulting mode ("#IMPLIED", "#REQUIRED", or "#FIXED") or null if none of these applies.


In [217]:
def normalise_values(sentence):
    """Replace literal values in a sentence with ``@VAL<n>`` placeholders.

    Two kinds of tokens are recognised, in this order: numeric literals and
    dotted member references such as ``Integer.MAX_VALUE``.

    Each pattern requires a neighbouring character on both sides of the token,
    so a token at the very start or end of the string is not matched; the
    examples in this notebook pad the sentence with spaces for that reason.

    Only the matched span is rewritten. Identical text found again by a
    pattern receives the placeholder it was given the first time, and text
    that merely contains a matched value (``11`` vs ``1``, or an existing
    ``@MEMBER0`` vs ``0``) is left alone.

    Parameters:
        sentence: the text to normalise.

    Returns:
        A tuple ``(normalised_sentence, value_dict)`` where ``value_dict``
        maps each placeholder to the original text it replaced.
    """
    patterns = [
        r'\W(-?[0-9]+(?:,[0-9]+)*(?:(?:\.[0-9]+)?[a-z]*))\W', # specific numeric value
        r'\W((^(java\.|javax\.|org\.))?([A-Za-z_]\w*\.)+\w+)[^A-Za-z0-9_\(\)]', # member value of object/Class
    ]

    value_prefix = '@VAL'
    value_dict = {}
    # Reverse lookup so that a repeated value keeps a single placeholder.
    key_by_text = {}

    for pattern in patterns:
        compiled = re.compile(pattern)
        match = compiled.search(sentence)
        while match:
            text = match.group(1)
            key = key_by_text.get(text)
            if key is None:
                key = value_prefix + str(len(value_dict))
                key_by_text[text] = key
                value_dict[key] = text

            # Splice the placeholder over the matched span only; a global
            # substitution of the raw text would also rewrite other tokens.
            start, end = match.span(1)
            sentence = sentence[:start] + key + sentence[end:]

            match = compiled.search(sentence)

    return sentence, value_dict

# Sentence normalisation functions
def normalise_members(sentence):
    """Replace code-like tokens in a sentence with ``@MEMBER<n>`` placeholders.

    The patterns are tried in order: quoted strings, method calls, ``#``
    member references, all-uppercase identifiers, standalone method calls and
    dotted variable fields.

    Most patterns require a neighbouring character on both sides of the token,
    so a token at the very start or end of the string is not matched; the
    examples in this notebook pad the sentence with spaces for that reason.

    Only the matched span is rewritten. Identical text found again by a
    pattern receives the placeholder it was given the first time, and words
    that merely contain a matched token (``Arrays`` vs ``A``) are left alone.

    Parameters:
        sentence: the text to normalise.

    Returns:
        A tuple ``(normalised_sentence, member_dict)`` where ``member_dict``
        maps each placeholder to the original text it replaced.
    """
    # variable substitutions
    # NOTE(review): the qualified-name segment `(\.[A-Za-z_]+[A-Za-z_0-9])*` in the
    # "class methods" and "static method" patterns ends without a `*`, so one-letter
    # segments and segments with a digit before their last character do not match.
    # This looks like a typo; confirm before changing the pattern.
    patterns = [
        r'\W("[^"]+")\W', # simple string
        r'\W([A-Za-z_]+[A-Za-z_0-9]*(\.[A-Za-z_]+[A-Za-z_0-9])*(#[A-Za-z_]+[A-Za-z_0-9]*)?\([^\(\)]*\))\W', # class methods
        r'\W([A-Za-z_]+[A-Za-z_0-9]*(\.[A-Za-z_]+[A-Za-z_0-9])*#[A-Za-z_]+[A-Za-z_0-9]*)[^A-Za-z0-9_\(\)]', # static method
        r'(#[A-Za-z_]+[A-Za-z_0-9]*)[^A-Za-z0-9_\(\)]', # static member by itself
        r'\W((\w+\.)*([A-Z]+_)*[A-Z]+)\W', # all uppercase
        r'\W([a-z_][A-Za-z_0-9]*\([^\)]*\))\W', # standalone methods
        r'\W([a-z_]+[A-Za-z_0-9]*\.[a-z_]+[A-Za-z_0-9]*)\W', # variable fields
    ]

    member_prefix = '@MEMBER'
    member_dict = {}
    # Reverse lookup so that a repeated token keeps a single placeholder.
    key_by_text = {}

    # normalise all variables/methods/fields that match predefined regex patterns
    for pattern in patterns:
        compiled = re.compile(pattern)
        match = compiled.search(sentence)
        while match:
            text = match.group(1)
            key = key_by_text.get(text)
            if key is None:
                key = member_prefix + str(len(member_dict))
                key_by_text[text] = key
                member_dict[key] = text

            # Splice the placeholder over the matched span only; a global
            # substitution of the raw text would also rewrite other words.
            start, end = match.span(1)
            sentence = sentence[:start] + key + sentence[end:]

            match = compiled.search(sentence)

    return sentence, member_dict

def normalise_expressions(sentence):
    """Replace simple expressions in a sentence with ``@EXPR<n>`` placeholders.

    The patterns are tried in order: subtraction, addition, multiplication,
    range notations (``[a..b]``, ``[a, b]``, ``a..b``, ``from a to b``),
    chained comparisons, ``!=`` and comma-separated ``or`` enumerations.

    Each pattern requires a neighbouring character on both sides of the
    expression, so an expression at the very start or end of the string is not
    matched; the examples in this notebook pad the sentence with spaces for
    that reason.

    Only the matched span is rewritten. Identical text found again by a
    pattern receives the placeholder it was given the first time, and longer
    text that merely contains a matched expression (``aa+bb`` vs ``a+b``) is
    left alone.

    Parameters:
        sentence: the text to normalise.

    Returns:
        A tuple ``(normalised_sentence, expr_dict)`` where ``expr_dict`` maps
        each placeholder to the original text it replaced.
    """
    patterns = [
        r'\W(\w+((\s+-)|(-\s+)|(\s+-\s+))\w+)\W', # subtraction
        r'\W(\w+\s*\+\s*\w+)\W', # addition
        r'\W(\w+\s*\*\s*\w+)\W', # multiplication
        # ranges
        r'\W(\[\s*\w+\s*(\.\s*\.\s*)\s*\w+\s*\])\W',
        r'\W(\[\s*\w+\s*,\s*\w+\s*\])\W',
        r'\W(\(?\s*\w+\s*\)?\s*\.\s*\.\s*\(?\s*\w+\s*\)?)\W',
        r'\W(([Ff]rom\s+)?\w+\s+to\s+\w+)\W',
        # equality
        r'\W(\w+\s*<=?\s*\w+\s*<=?\s*\w+)\W',
        r'\W(\w+\s*>=?\s*\w+\s*>=?\s*\w+)\W',
        r'\W(\w+\s*!=\s*\w+)\W',
        # NOTE(review): the literal space after \W means two separator characters
        # are required before the enumeration - confirm this is intended.
        r'\W ((\s*\w+\s*)(,\s*\w+\s*)+,?\s*or\s*\w+)\W'
    ]

    expr_prefix = "@EXPR"
    expr_dict = {}
    # Reverse lookup so that a repeated expression keeps a single placeholder.
    key_by_text = {}

    for pattern in patterns:
        compiled = re.compile(pattern)
        match = compiled.search(sentence)

        while match:
            text = match.group(1)
            key = key_by_text.get(text)
            if key is None:
                key = expr_prefix + str(len(expr_dict))
                key_by_text[text] = key
                expr_dict[key] = text

            # Splice the placeholder over the matched span only; a global
            # substitution of the raw text would also rewrite other tokens.
            start, end = match.span(1)
            sentence = sentence[:start] + key + sentence[end:]

            match = compiled.search(sentence)

    return sentence, expr_dict

def normalise(sentence):
    """Run the three normalisation passes over a sentence.

    Members are replaced first, then expressions, then values; each pass
    works on the output of the previous one.

    Returns:
        A tuple ``(normalised_sentence, placeholders)`` where ``placeholders``
        merges the per-pass dictionaries (placeholder -> original text).
    """
    normalised, members = normalise_members(sentence)
    normalised, expressions = normalise_expressions(normalised)
    normalised, values = normalise_values(normalised)

    # Merge order: values, then members, then expressions.
    return normalised, {**values, **members, **expressions}

In [207]:
# Dependency-parse an already normalised example sentence and list its
# (governor, relation, dependent) triples.
(parse,) = parser.raw_parse(" if component is less than 0 or greater than @EXPR0 ")
for triple in parse.triples():
    print(*triple)

('0', 'CD') mark ('if', 'IN')
('0', 'CD') nsubj ('component', 'NN')
('0', 'CD') cop ('is', 'VBZ')
('0', 'CD') advmod ('less', 'JJR')
('less', 'JJR') mwe ('than', 'IN')
('0', 'CD') cc ('or', 'CC')
('0', 'CD') conj ('greater', 'JJR')
('0', 'CD') nmod ('@EXPR0', 'NN')
('@EXPR0', 'NN') case ('than', 'IN')


In [296]:
# raw_parse_sents is consumed with two nested next() calls: the outer one takes
# the result for the only sentence, the inner one takes its first parse tree.
parses_per_sentence = posparser.raw_parse_sents([' if Component is less than 0 or greater than EXPRESSION '])
a = next(next(parses_per_sentence))

def recurse(node):
    """Print the siblings of every coordinating conjunction (CC) in a parse tree.

    The tree is walked depth-first. Whenever a child labelled ``'CC'`` is
    found, ``'HERE'`` is printed, followed by the flattened form of every
    sibling subtree that is not itself a ``'CC'``. The ``'CC'`` child is not
    descended into; all other subtrees are.

    Leaves (plain tokens, which have no ``label()``) are skipped explicitly,
    so errors raised by the tree objects themselves are no longer swallowed.

    Parameters:
        node: an iterable tree node whose subtree children provide
            ``label()`` and ``flatten()`` (e.g. an ``nltk.tree.Tree``).
    """
    for child in node:
        # Leaves are bare tokens: nothing to inspect and nothing to descend into.
        if not hasattr(child, 'label'):
            continue

        if child.label() == 'CC':
            print('HERE')
            for c in node:
                # Skip leaf siblings and the conjunction(s) themselves.
                if hasattr(c, 'label') and c.label() != 'CC':
                    print(c.flatten())
        else:
            recurse(child)

# Show the siblings of each conjunction found in the parsed example sentence ...
recurse(a)

# ... followed by the (token, POS tag) pairs of the whole sentence.
tagged_tokens = a.pos()
print(tagged_tokens)

HERE
(ADJP less than 0)
(ADJP greater)
[('if', 'IN'), ('Component', 'NNP'), ('is', 'VBZ'), ('less', 'JJR'), ('than', 'IN'), ('0', 'CD'), ('or', 'CC'), ('greater', 'JJR'), ('than', 'IN'), ('EXPRESSION', 'NN')]


In [222]:
# Sanity check: a sentence with no code-like member token (camelCase identifier, negative number)
# should come back unchanged with an empty placeholder dictionary.
normalise_members(' if maxPathLength is set to a value less than -1 ')

(' if maxPathLength is set to a value less than -1 ', {})