diff --git a/cliner/features_dir/features.py b/cliner/features_dir/features.py index 4f293e3..2b0141b 100755 --- a/cliner/features_dir/features.py +++ b/cliner/features_dir/features.py @@ -12,7 +12,7 @@ from wordshape import getWordShapes -from utilities import prose_sentence +from utilities import is_prose_sentence from sentence_features import SentenceFeatures @@ -31,14 +31,19 @@ def __init__(self, data=None): - # IOB_features() - # - # input: A sentence - # output: A hash table of features def extract_IOB_features(self, sentence): + """ + extract_IOB_features() + @param sentence. A list of chunks + @return tuple: boolean (Prose or not), a list of dictionaries of features + + >>> fw = FeatureWrapper() + >>> fw.extract_IOB_features(['this', 'is', 'a', 'test']) is not None + True + """ # Different features depending on whether sentence is 'prose' - isProse = prose_sentence(sentence) + isProse = is_prose_sentence(sentence) if isProse: features_list = self.feat_sent.IOB_prose_features(sentence) @@ -50,13 +55,18 @@ def extract_IOB_features(self, sentence): - # concept_features() - # - # input: A sentence/line from a medical text file (list of chunks) - # An list of indices into the sentence for each important chunk - # output: A list of hash tables of features def concept_features(self, sentence, chunk_inds): + """ + concept_features() + + @param sentence. a list of chunks + @param chunk_inds. 
a list of important indices of the sentence + @return a list of dictionaries of features + >>> fw = FeatureWrapper() + >>> fw.concept_features(['this', 'is', 'an', 'important', 'test'], [3, 4]) is not None + True + """ # FIXME - move all of this work to SentenceFeatures object ''' diff --git a/cliner/features_dir/genia_dir/genia_features.py b/cliner/features_dir/genia_dir/genia_features.py index b9274aa..2ffac42 100644 --- a/cliner/features_dir/genia_dir/genia_features.py +++ b/cliner/features_dir/genia_dir/genia_features.py @@ -24,7 +24,7 @@ def __init__(self, tagger, data): """ # Filter out nonprose sentences - prose = [ sent for sent in data if utilities.prose_sentence(sent) ] + prose = [ sent for sent in data if utilities.is_prose_sentence(sent) ] # Process prose sentences with GENIA tagger self.GENIA_features = iter(interface_genia.genia(tagger, prose)) diff --git a/cliner/features_dir/read_config.py b/cliner/features_dir/read_config.py index 055688b..9515e33 100644 --- a/cliner/features_dir/read_config.py +++ b/cliner/features_dir/read_config.py @@ -12,16 +12,17 @@ import os - -# -# enabled_modules -# -# @return dictionary of (name,resource path) pairs. -# -# ex. {'UMLS': None, 'GENIA': 'genia/geniatagger-3.0.1/geniatagger'} -# def enabled_modules(): + """ + enabled_modules() + + @return a dictionary of {name, resource} pairs. + + ex. 
{'UMLS': None, 'GENIA': 'genia/geniatagger-3.0.1/geniatagger'} + >>> enabled_modules() is not None + True + """ # Open config file filename = os.path.join( os.getenv('CLINER_DIR'), 'config.txt' ) f = open(filename, 'r') diff --git a/cliner/features_dir/sentence_features.py b/cliner/features_dir/sentence_features.py index e3b2991..d828815 100755 --- a/cliner/features_dir/sentence_features.py +++ b/cliner/features_dir/sentence_features.py @@ -77,12 +77,15 @@ def __init__(self, data): self.enabled_IOB_prose_sentence_features.append('UMLS') - # IOB_prose_features() - # - # input: A sentence - # output: A list of hash tables of features + def IOB_prose_features(self, sentence): + """ + IOB_prose_features + + @param sentence. A list of strings + @return A list of dictionaries of features + """ features_list = [] # Get a feature set for each word in the sentence @@ -224,14 +227,15 @@ def IOB_prose_features(self, sentence): return features_list - - # IOB_nonprose_features() - # - # input: A sentence - # output: A hash table of features def IOB_nonprose_features(self, sentence): - - + """ + IOB_nonprose_features + + @param sentence. A list of strings + @return A list of dictionaries of features + + """ + # Get a feature set for each word in the sentence features_list = [] for i,word in enumerate(sentence): diff --git a/cliner/features_dir/utilities.py b/cliner/features_dir/utilities.py index 85703cd..bc89fb8 100755 --- a/cliner/features_dir/utilities.py +++ b/cliner/features_dir/utilities.py @@ -11,6 +11,7 @@ import cPickle as pickle import os + # used as a default path for stashing pos tagger. 
pos_tagger_path = os.path.join( os.environ['CLINER_DIR'], "cliner/features_dir/nltk_tagger.p") @@ -64,11 +65,24 @@ def load_pos_tagger(path_to_obj=pos_tagger_path): return tagger -# prose_sentence() -# -# input: A sentence -# output: Boolean yes/no -def prose_sentence(sentence): +def is_prose_sentence(sentence): + """ + is_prose_sentence() + + Purpose: Determine if a sentence of text is 'prose' + + @param sentence A list of words + @return A boolean + + >>> is_prose_sentence(['Admission', 'Date', ':']) + False + >>> is_prose_sentence(['Hello', 'World', '.']) + True + >>> is_prose_sentence(['What', 'do', 'you', 'think', '?']) + True + >>> is_prose_sentence(['Short', 'sentence']) + False + """ # Empty sentence is not prose if not sentence: @@ -80,20 +94,32 @@ return False elif len(sentence) <= 5: return False - elif at_least_half_nonprose(sentence): + elif is_at_least_half_nonprose(sentence): return True else: return False -# at_least_half_nonprose() -# -# input: A sentence -# output: A bollean yes/no -def at_least_half_nonprose(sentence): +def is_at_least_half_nonprose(sentence): + """ + is_at_least_half_nonprose(sentence) - count = len( [ w for w in sentence if prose_word(w) ] ) + Purpose: Checks whether at least half of the sentence's words are 'prose' words (NOTE: True means mostly prose, despite the 'nonprose' name) + + @param sentence. 
A list of words + @return A boolean + + >>> is_at_least_half_nonprose(['1','2','and','some','words']) + True + >>> is_at_least_half_nonprose(['1', '2', '3', '4', 'and', 'some', 'words', '5']) + False + >>> is_at_least_half_nonprose(['word']) + True + >>> is_at_least_half_nonprose([' ']) + True + """ + count = len( [ w for w in sentence if is_prose_word(w) ] ) if count >= len(sentence)/2: return True @@ -101,13 +127,24 @@ return False +def is_prose_word(word): + """ + is_prose_word(word) + + Purpose: Checks if the given word is 'prose' -# prose_word() -# -# input: A word -# output: Boolean yes/no -def prose_word(word): + @param word. A word + @return A boolean + >>> is_prose_word('word') + True + >>> is_prose_word('99') + False + >>> is_prose_word('question?') + False + >>> is_prose_word('ALLCAPS') + False + """ # Punctuation for punc in ".?,!:\"'": if punc in word: diff --git a/cliner/features_dir/word_features.py b/cliner/features_dir/word_features.py index 7e244b2..79d7a8c 100755 --- a/cliner/features_dir/word_features.py +++ b/cliner/features_dir/word_features.py @@ -36,12 +36,19 @@ def __init__(self): pass - # IOB_prose_features_for_word() - # - # input: A single word - # output: A dictionary of features def IOB_prose_features(self, word): - + """ + IOB_prose_features() + + Purpose: Creates a dictionary of prose features for the given word. + + @param word. 
A string + @return A dictionary of features + + >>> wf = WordFeatures() + >>> wf.IOB_prose_features('test') is not None + True + """ # Feature: features = {('dummy', None): 1} # always have >0 dimensions @@ -85,15 +92,20 @@ def IOB_prose_features(self, word): return features - - - - # IOB_nonprose_features_for_word() - # - # input: A single word - # output: A dictionary of features def IOB_nonprose_features(self, word): - + """ + IOB_nonprose_features() + + Purpose: Creates a dictionary of nonprose features for the given word. + + @param word. A string + @return A dictionary of features + + >>> wf = WordFeatures() + >>> wf.IOB_nonprose_features('test') is not None + True + """ + features = {} # Feature: The word, itself @@ -123,14 +135,20 @@ def IOB_nonprose_features(self, word): - + # Note: most of this function is currently commented out so the doctests should be fixed if this is ever changed def concept_features_for_word(self, word): """ concept_features_for_word() + Purpose: Creates a dictionary of concept features for the given word. + @param word. A word to generate features for @return A dictionary of features + + >>> wf = WordFeatures() + >>> wf.concept_features_for_word('test') is not None + True """ features = {} @@ -209,8 +227,7 @@ def concept_features_for_word(self, word): return features - - + #FIXME The documentation for this is incorrect, not 100% sure how it works. def concept_features_for_chunk(self, sentence, ind): """ @@ -218,6 +235,7 @@ def concept_features_for_chunk(self, sentence, ind): @param word. A chunk from the sentence @return A dictionary of features + """ features = {'dummy':1} @@ -281,6 +299,19 @@ def concept_features_for_chunk(self, sentence, ind): # Try to get QANN features def QANN_features(self, word): + """ + QANN_features() + + Purpose: Creates a dictionary of QANN features for the given word. + + @param word. 
A string + @return A dictionary of features + + >>> wf = WordFeatures() + >>> wf.QANN_features('test') is not None + True + """ + features = {} # Feature: test result @@ -315,48 +346,282 @@ return features - + # note: make spaces optional? + # Check about the documentation for this. def is_test_result(self, context): - # note: make spaces optional? + """ + is_test_result() + + Purpose: Checks if the context is a test result. + + @param context. A string. + @return True if the context matches '<word> <operator> <number>'; otherwise the match + object for '<word> was positive/negative', or None if neither pattern matches. + + >>> wf = WordFeatures() + >>> print wf.is_test_result('test was 10%') + True + >>> print wf.is_test_result('random string of words') + None + >>> print wf.is_test_result('Test') + None + >>> print wf.is_test_result('patient less than 30') + True + >>> print wf.is_test_result(' ') + None + """ regex = r"^[A-Za-z]+( )*(-|--|:|was|of|\*|>|<|more than|less than)( )*[0-9]+(%)*" if not re.search(regex, context): return re.search(r"^[A-Za-z]+ was (positive|negative)", context) return True def is_measurement(self, word): - regex = r"^[0-9]*(unit(s)|cc|L|mL|dL)$" + """ + is_measurement() + + Purpose: Checks if the word is a measurement. + + @param word. A string. + @return the matched object if it is a measurement, otherwise None. 
+ + >>> wf = WordFeatures() + >>> wf.is_measurement('10units') is not None + True + >>> wf.is_measurement('7 units') is not None + True + >>> wf.is_measurement('10cc') is not None + True + >>> wf.is_measurement('300 L') is not None + True + >>> wf.is_measurement('20mL') is not None + True + >>> wf.is_measurement('400000 dL') is not None + True + >>> wf.is_measurement('30000') is not None + False + >>> wf.is_measurement('20dl') is not None + False + >>> wf.is_measurement('units') is not None + True + """ + regex = r"^[0-9]*( )?(unit(s)|cc|L|mL|dL)$" return re.search(regex, word) def is_directive(self, word): + """ + is_directive() + + Purpose: Checks if the word is a directive. + + @param word. A string. + @return the matched object if it is a directive, otherwise None. + + >>> wf = WordFeatures() + >>> wf.is_directive('q.abc') is not None + True + >>> wf.is_directive('qAD') is not None + True + >>> wf.is_directive('PRM') is not None + True + >>> wf.is_directive('bid') is not None + True + >>> wf.is_directive('prm') is not None + True + >>> wf.is_directive('p.abc') is not None + True + >>> wf.is_directive('qABCD') is not None + False + >>> wf.is_directive('BID') is not None + False + """ regex = r"^(q\..*|q..|PRM|bid|prm|p\..*)$" return re.search(regex, word) def is_date(self, word): + """ + is_date() + + Purpose: Checks if word is a date. + + @param word. A string. + @return the matched object if it is a date, otherwise None. + + >>> wf = WordFeatures() + >>> wf.is_date('2015-03-1') is not None + True + >>> wf.is_date('2014-02-19') is not None + True + >>> wf.is_date('03-27-1995') is not None + True + >>> wf.is_date('201') is not None + False + >>> wf.is_date('0') is not None + False + """ regex= r'^(\d\d\d\d-\d\d-\d|\d\d?-\d\d?-\d\d\d\d?|\d\d\d\d-\d\d?-\d\d?)$' return re.search(regex,word) def is_volume(self, word): - regex = r"^[0-9]*(ml|mL|dL)$" + """ + is_volume() + + Purpose: Checks if word is a volume. + + @param word. A string. 
+ @return the matched object if it is a volume, otherwise None. + + >>> wf = WordFeatures() + >>> wf.is_volume('9ml') is not None + True + >>> wf.is_volume('10 mL') is not None + True + >>> wf.is_volume('552 dL') is not None + True + >>> wf.is_volume('73') is not None + False + >>> wf.is_volume('ml') is not None + True + """ + regex = r"^[0-9]*( )?(ml|mL|dL)$" return re.search(regex, word) def is_weight(self, word): - regex = r"^[0-9]*(mg|g|mcg|milligrams|grams)$" + """ + is_weight() + + Purpose: Checks if word is a weight. + + @param word. A string. + @return the matched object if it is a weight, otherwise None. + + >>> wf = WordFeatures() + >>> wf.is_weight('1mg') is not None + True + >>> wf.is_weight('10 g') is not None + True + >>> wf.is_weight('78 mcg') is not None + True + >>> wf.is_weight('10000 milligrams') is not None + True + >>> wf.is_weight('14 grams') is not None + True + >>> wf.is_weight('-10 g') is not None + False + >>> wf.is_weight('grams') is not None + True + """ + regex = r"^[0-9]*( )?(mg|g|mcg|milligrams|grams)$" return re.search(regex, word) def is_size(self, word): - regex = r"^[0-9]*(mm|cm|millimeters|centimeters)$" + """ + is_size() + + Purpose: Checks if the word is a size. + + @param word. A string. + @return the matched object if it is a weight, otheriwse None. + + >>> wf = WordFeatures() + >>> wf.is_size('1mm') is not None + True + >>> wf.is_size('10 cm') is not None + True + >>> wf.is_size('36 millimeters') is not None + True + >>> wf.is_size('423 centimeters') is not None + True + >>> wf.is_size('328') is not None + False + >>> wf.is_size('22 meters') is not None + False + >>> wf.is_size('millimeters') is not None + True + """ + regex = r"^[0-9]*( )?(mm|cm|millimeters|centimeters)$" return re.search(regex, word) def is_prognosis_location(self, word): + """ + is_prognosis_location() + + Purpose: Checks if the word is a prognosis location + + @param word. A string. 
+ @return the matched object if it is a prognosis location, otherwise None. + + >>> wf = WordFeatures() + >>> wf.is_prognosis_location('c9-c5') is not None + True + >>> wf.is_prognosis_location('C5-C9') is not None + True + >>> wf.is_prognosis_location('test') is not None + False + >>> wf.is_prognosis_location('c-9-C5') is not None + False + """ regex = r"^(c|C)[0-9]+(-(c|C)[0-9]+)*$" return re.search(regex, word) def has_problem_form(self, word): + """ + has_problem_form() + + Purpose: Checks if the word has problem form. + + @param word. A string + @return the matched object if it has problem form, otheriwse None. + + >>> wf = WordFeatures() + >>> wf.has_problem_form('prognosis') is not None + True + >>> wf.has_problem_form('diagnosis') is not None + True + >>> wf.has_problem_form('diagnostic') is not None + True + >>> wf.has_problem_form('arachnophobic') is not None + True + >>> wf.has_problem_form('test') is not None + False + >>> wf.has_problem_form('ice') is not None + False + """ regex = r".*(ic|is)$" return re.search(regex, word) - # checks for a definitive classification at the word level def get_def_class(self, word): + """ + get_def_class() + + Purpose: Checks for a definitive classification at the word level. + + @param word. A string + @return 1 if the word is a test term, + 2 if the word is a problem term, + 3 if the word is a treatment term, + 0 otherwise. 
+ >>> wf = WordFeatures(); + >>> wf.get_def_class('eval') + 1 + >>> wf.get_def_class('rate') + 1 + >>> wf.get_def_class('tox') + 1 + >>> wf.get_def_class('swelling') + 2 + >>> wf.get_def_class('mass') + 2 + >>> wf.get_def_class('broken') + 2 + >>> wf.get_def_class('therapy') + 3 + >>> wf.get_def_class('vaccine') + 3 + >>> wf.get_def_class('treatment') + 3 + >>> wf.get_def_class('unrelated') + 0 + """ test_terms = { "eval", "evaluation", "evaluations", "sat", "sats", "saturation", @@ -394,7 +659,7 @@ def get_def_class(self, word): "dose", "doses", "shot", "shots", "medication", "medicine", - "treament", "treatments" + "treatment", "treatments" } if word.lower() in test_terms: return 1 diff --git a/tests/test_features_dir.py b/tests/test_features_dir.py new file mode 100644 index 0000000..b45f932 --- /dev/null +++ b/tests/test_features_dir.py @@ -0,0 +1,25 @@ + + +if __name__ == '__main__': + import doctest + + import os, sys + home = os.path.join( os.getenv('CLINER_DIR') , 'cliner' ) + if home not in sys.path: sys.path.append(home) + + #from features_dir import * + + import features_dir.features + doctest.testmod(features_dir.features) + + import features_dir.read_config + doctest.testmod(features_dir.read_config) + + import features_dir.sentence_features + doctest.testmod(features_dir.sentence_features) + + import features_dir.utilities + doctest.testmod(features_dir.utilities) + + import features_dir.word_features + doctest.testmod(features_dir.word_features)