Merge branch 'doctests'

* doctests: (23 commits) fixed failing doctests expanded on current doctests added more doctests added doctests removed unnecessary doctests added more doctests cleaned up some documentation removed excess test files concatenated tests into one file added structure for doctests added test file for sentence_features added test file for read_config added basic doctests added test file for features added basic doctests Fixed doctests, modified regexes added more doctests/documentation added more doctests and general documentation fixed failing doctest added test file for word features ... Conflicts: cliner/features_dir/features.py cliner/features_dir/sentence_features.py cliner/features_dir/utilities.py cliner/features_dir/word_features.py
text-machine-lab · Apr 7, 2015 · 5e03a5a · 5e03a5a
2 parents ec07da9 + 853a818
commit 5e03a5a
Show file tree

Hide file tree

Showing 7 changed files with 417 additions and 72 deletions.
diff --git a/cliner/features_dir/features.py b/cliner/features_dir/features.py
@@ -12,7 +12,7 @@
 
 
 from wordshape import getWordShapes
-from utilities import prose_sentence
+from utilities import is_prose_sentence
 
 from sentence_features import SentenceFeatures
 
@@ -31,14 +31,19 @@ def __init__(self, data=None):
 
 
 
-    # IOB_features()
-    #
-    # input:  A sentence
-    # output: A hash table of features
     def extract_IOB_features(self, sentence):
+        """
+        extract_IOB_features()
 
+        @param sentence. A list of chunks
+        @return          tuple: boolean (Prose or not), a list of dictionaries of features
+
+        >>> fw = FeatureWrapper()
+        >>> fw.extract_IOB_features(['this', 'is', 'a', 'test']) is not None
+        True
+        """
         # Different features depending on whether sentence is 'prose'
-        isProse = prose_sentence(sentence)
+        isProse = is_prose_sentence(sentence)
 
         if isProse:
             features_list = self.feat_sent.IOB_prose_features(sentence)
@@ -50,13 +55,18 @@ def extract_IOB_features(self, sentence):
 
 
 
-    # concept_features()
-    #
-    # input:  A sentence/line from a medical text file (list of chunks)
-    #         An list of indices into the sentence for each important chunk
-    # output: A list of hash tables of features
     def concept_features(self, sentence, chunk_inds):
+        """
+        concept_features()
+
+        @param sentence.   a list of chunks
+        @param chunk_inds. a list of important indices of the sentence
+        @return            a list of dictionaries of features
 
+        >>> fw = FeatureWrapper()
+        >>> fw.concept_features(['this', 'is', 'an', 'important', 'test'], [3, 4]) is not None
+        True
+        """
         # FIXME - move all of this work to SentenceFeatures object
 
         '''

diff --git a/cliner/features_dir/genia_dir/genia_features.py b/cliner/features_dir/genia_dir/genia_features.py
@@ -24,7 +24,7 @@ def __init__(self, tagger, data):
         """
 
         # Filter out nonprose sentences
-        prose = [ sent  for  sent  in  data  if  utilities.prose_sentence(sent) ]
+        prose = [ sent  for  sent  in  data  if  utilities.is_prose_sentence(sent) ]
 
         # Process prose sentences with GENIA tagger
         self.GENIA_features = iter(interface_genia.genia(tagger, prose))

diff --git a/cliner/features_dir/read_config.py b/cliner/features_dir/read_config.py
@@ -12,16 +12,17 @@
 import os
 
 
-
-#
-# enabled_modules
-#
-# @return dictionary of (name,resource path) pairs.
-#
-#   ex. {'UMLS': None, 'GENIA': 'genia/geniatagger-3.0.1/geniatagger'}
-#
 def enabled_modules():
+    """
+    enabled_modules()
+
+    @return a dictionary of {name, resource} pairs.
+
+    ex. {'UMLS': None, 'GENIA': 'genia/geniatagger-3.0.1/geniatagger'}
 
+    >>> enabled_modules() is not None
+    True
+    """
     # Open config file
     filename = os.path.join( os.getenv('CLINER_DIR'), 'config.txt' )
     f = open(filename, 'r')

diff --git a/cliner/features_dir/sentence_features.py b/cliner/features_dir/sentence_features.py
@@ -77,12 +77,15 @@ def __init__(self, data):
         self.enabled_IOB_prose_sentence_features.append('UMLS')
 
 
-    # IOB_prose_features()
-    #
-    # input:  A sentence
-    # output: A list of hash tables of features
+
     def IOB_prose_features(self, sentence):
+        """
+        IOB_prose_features
+
+        @param sentence. A list of strings
+        @return          A list of dictionaries of features
 
+        """
         features_list = []
 
         # Get a feature set for each word in the sentence
@@ -224,14 +227,15 @@ def IOB_prose_features(self, sentence):
         return features_list
 
 
-
-    # IOB_nonprose_features()
-    #
-    # input:  A sentence
-    # output: A hash table of features
     def IOB_nonprose_features(self, sentence):
-
-
+        """
+        IOB_nonprose_features
+        
+        @param sentence. A list of strings
+        @return          A list of dictionaries of features
+        
+        """
+
         # Get a feature set for each word in the sentence
         features_list = []
         for i,word in enumerate(sentence):

diff --git a/cliner/features_dir/utilities.py b/cliner/features_dir/utilities.py
@@ -11,6 +11,7 @@
 import cPickle as pickle
 import os
 
+
 # used as a default path for stashing pos tagger.
 pos_tagger_path = os.path.join( os.environ['CLINER_DIR'], "cliner/features_dir/nltk_tagger.p")
 
@@ -64,11 +65,24 @@ def load_pos_tagger(path_to_obj=pos_tagger_path):
 
     return tagger
 
-# prose_sentence()
-#
-# input:  A sentence
-# output: Boolean yes/no
-def prose_sentence(sentence):
+def is_prose_sentence(sentence):
+    """
+    is_prose_sentence()
+
+    Purpose: Determine if a sentence of text is 'prose'
+
+    @param sentence A list of words
+    @return         A boolean
+
+    >>> is_prose_sentence(['Admission', 'Date', ':'])
+    False
+    >>> is_prose_sentence(['Hello', 'World', '.'])
+    True
+    >>> is_prose_sentence(['What', 'do', 'you', 'think', '?'])
+    True
+    >>> is_prose_sentence(['Short', 'sentence'])
+    False
+    """
 
     # Empty sentence is not prose
     if not sentence:
@@ -80,34 +94,57 @@ def prose_sentence(sentence):
         return False
     elif len(sentence) <= 5:
         return False
-    elif at_least_half_nonprose(sentence):
+    elif is_at_least_half_nonprose(sentence):
         return True
     else:
         return False
 
 
 
-# at_least_half_nonprose()
-#
-# input:  A sentence
-# output: A bollean yes/no
-def at_least_half_nonprose(sentence):
+def is_at_least_half_nonprose(sentence):
+    """
+    is_at_least_half_nonprose(sentence)
 
-    count = len(  [ w  for  w  in  sentence  if prose_word(w) ]  )
+    Purpose: Checks if at least half of the words in the sentence are 'prose' words (as judged by is_prose_word), despite the function's name
+
+    @param sentence. A list of words
+    @return          A boolean
+
+    >>> is_at_least_half_nonprose(['1','2','and','some','words'])
+    True
+    >>> is_at_least_half_nonprose(['1', '2', '3', '4', 'and', 'some', 'words', '5'])   
+    False
+    >>> is_at_least_half_nonprose(['word'])
+    True
+    >>> is_at_least_half_nonprose([' '])
+    True
+    """
+    count = len(  [ w  for  w  in  sentence  if is_prose_word(w) ]  )
 
     if count >= len(sentence)/2:
         return True
     else:
         return False
 
 
+def is_prose_word(word):
+    """
+    is_prose_word(word)
+
+    Purpose: Checks if the given word is 'prose'
 
-# prose_word()
-#
-# input:  A word
-# output: Boolean yes/no
-def prose_word(word):
+    @param word. A word
+    @return      A boolean
 
+    >>> is_prose_word('word')
+    True
+    >>> is_prose_word('99') 
+    False
+    >>> is_prose_word('question?')
+    False
+    >>> is_prose_word('ALLCAPS')
+    False
+    """
     # Punctuation
     for punc in ".?,!:\"'":
         if punc in word:
@@ -123,6 +160,9 @@ def prose_word(word):
 
     # Else
     return True
+<<<<<<< HEAD
 
 
 #EOF
+=======
+>>>>>>> doctests