In [83]:
import ujson
import glob
import os
import re
import stanfordnlp
import pandas as pd

# stanfordnlp.download('en')   # This downloads the English models for the neural pipeline
nlp = stanfordnlp.Pipeline() # This sets up a default neural pipeline in English

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/thien/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/thien/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/home/thien/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/thien/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/thien/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/thien/stanfordnlp_resources/en_ewt_models/en_ewt.pr

In [94]:
# load the non-dreprecated java doc caveat sentences of methods (parameters or exception level)
caveat_files_dir = './output/java_12_spec_caveat_sentences_revised/'
caveats = []

files = sorted(glob.glob(caveat_files_dir + '*.json'))
for file in files:
    with open(file) as f:
        arr = ujson.load(f)
        full_class_name = os.path.splitext(os.path.basename(file))[0]
        simple_class_name = full_class_name.split('.')[-1]
        
        for caveat in arr:
            if not caveat['deprecated'] and 'name' in caveat:
                
                # collect the name of parameters
                collected = False
                parameters = []
                for misc_obj in caveat['caveat_misc']:
                    if misc_obj['name'] == 'Parameters:':
                        for obj in misc_obj['list']:
                            parameters.append(obj['parameter'])
                        collected = True
                        break
                
                if collected:
                    # add all parameter and exception level sentences
                    for misc_obj in caveat['caveat_misc']:
                        if misc_obj['name'] in ['Parameters:', 'Throws:']:
                            for obj in misc_obj['list']:
                                for misc_sentence in obj['sentences']:
                                    if misc_obj['name'] == 'Parameters:':
                                        parameter_caveats.append({
                                            'obj': obj['parameter'],
                                            'simple_class_name': simple_class_name,
                                            'full_class_name': full_class_name,
                                            'api': caveat['name'],
                                            'signature': caveat['signature'],
                                            'sentence': misc_sentence,
                                            'parameters': parameters,
                                            'type': 'parameter'
                                        })
                                    else:
                                        exception_caveats.append({
                                            'obj': obj['exception'],
                                            'simple_class_name': simple_class_name,
                                            'full_class_name': full_class_name,
                                            'api': caveat['name'],
                                            'signature': caveat['signature'],
                                            'sentence': misc_sentence,
                                            'parameters': parameters,
                                            'type': 'exception'
                                        })
                                        
caveat_df = pd.DataFrame(caveats)

In [95]:
def filter_by_pattern(sentence, pattern):
    sentence = sentence.lower()
    if sentence:
        matches = re.search(pattern, sentence)
        if matches:
            return True
    
    return False

def not_null_filter(sentence):
    return filter_by_pattern(sentence, 'null')

def range_limitation_filter(sentence):
    return filter_by_pattern(sentence, '(<|>|=|less|greater|larger|equal|equivalent than|to)')

def type_restriction_filter(sentence):
    return filter_by_pattern(sentence, '(equal|equivalent|class|derived|instance of|be|is)')

In [76]:
# not null caveats
not_null_df = caveat_df[caveat_df['sentence'].apply(not_null_filter)]
print(len(not_null_df.index))

7667


In [77]:
# range limitation caveats
range_df = caveat_df[caveat_df['sentence'].apply(range_limitation_filter)]
print(len(range_df.index))

8347


In [78]:
# type restriction caveats
type_df = caveat_df[caveat_df['sentence'].apply(type_restriction_filter)]
print(len(type_df.index))

20365


In [80]:
# dependency parsing
doc = nlp('if the specified element is null and this queue does not permit null elements')



In [81]:
doc.sentences[0].print_dependencies()

('if', '6', 'mark')
('the', '4', 'det')
('specified', '4', 'amod')
('element', '6', 'nsubj')
('is', '6', 'cop')
('null', '0', 'root')
('and', '12', 'cc')
('this', '9', 'det')
('queue', '12', 'nsubj')
('does', '12', 'aux')
('not', '12', 'advmod')
('permit', '6', 'conj')
('null', '14', 'amod')
('elements', '12', 'obj')
