Skip to content
This repository has been archived by the owner on Aug 15, 2020. It is now read-only.

Commit

Permalink
Merge branch 'doctests'
Browse files Browse the repository at this point in the history
* doctests: (23 commits)
  fixed failing doctests
  expanded on current doctests
  added more doctests
  added doctests
  removed unnecessary doctests
  added more doctests
  cleaned up some documentation
  removed excess test files
  concatenated tests into one file
  added structure for doctests
  added test file for sentence_features
  added test file for read_config
  added basic doctests
  added test file for features
  added basic doctests
  Fixed doctests, modified regexes
  added more doctests/documentation
  added more doctests and general documentation
  fixed failing doctest
  added test file for word features
  ...

Conflicts:
	cliner/features_dir/features.py
	cliner/features_dir/sentence_features.py
	cliner/features_dir/utilities.py
	cliner/features_dir/word_features.py
  • Loading branch information
tnaumann committed Apr 7, 2015
2 parents ec07da9 + 853a818 commit 5e03a5a
Show file tree
Hide file tree
Showing 7 changed files with 417 additions and 72 deletions.
32 changes: 21 additions & 11 deletions cliner/features_dir/features.py
Expand Up @@ -12,7 +12,7 @@


from wordshape import getWordShapes
from utilities import prose_sentence
from utilities import is_prose_sentence

from sentence_features import SentenceFeatures

Expand All @@ -31,14 +31,19 @@ def __init__(self, data=None):



# IOB_features()
#
# input: A sentence
# output: A hash table of features
def extract_IOB_features(self, sentence):
"""
extract_IOB_features()
@param sentence. A list of chunks
@return tuple: boolean (Prose or not), a list of dictionaries of features
>>> fw = FeatureWrapper()
>>> fw.extract_IOB_features(['this', 'is', 'a', 'test']) is not None
True
"""
# Different features depending on whether sentence is 'prose'
isProse = prose_sentence(sentence)
isProse = is_prose_sentence(sentence)

if isProse:
features_list = self.feat_sent.IOB_prose_features(sentence)
Expand All @@ -50,13 +55,18 @@ def extract_IOB_features(self, sentence):



# concept_features()
#
# input: A sentence/line from a medical text file (list of chunks)
# A list of indices into the sentence for each important chunk
# output: A list of hash tables of features
def concept_features(self, sentence, chunk_inds):
"""
concept_features()
@param sentence. a list of chunks
@param chunk_inds. a list of important indices of the sentence
@return a list of dictionaries of features
>>> fw = FeatureWrapper()
>>> fw.concept_features(['this', 'is', 'an', 'important', 'test'], [3, 4]) is not None
True
"""
# FIXME - move all of this work to SentenceFeatures object

'''
Expand Down
2 changes: 1 addition & 1 deletion cliner/features_dir/genia_dir/genia_features.py
Expand Up @@ -24,7 +24,7 @@ def __init__(self, tagger, data):
"""

# Filter out nonprose sentences
prose = [ sent for sent in data if utilities.prose_sentence(sent) ]
prose = [ sent for sent in data if utilities.is_prose_sentence(sent) ]

# Process prose sentences with GENIA tagger
self.GENIA_features = iter(interface_genia.genia(tagger, prose))
Expand Down
17 changes: 9 additions & 8 deletions cliner/features_dir/read_config.py
Expand Up @@ -12,16 +12,17 @@
import os



#
# enabled_modules
#
# @return dictionary of (name,resource path) pairs.
#
# ex. {'UMLS': None, 'GENIA': 'genia/geniatagger-3.0.1/geniatagger'}
#
def enabled_modules():
"""
enabled_modules()
@return a dictionary of {name, resource} pairs.
ex. {'UMLS': None, 'GENIA': 'genia/geniatagger-3.0.1/geniatagger'}
>>> enabled_modules() is not None
True
"""
# Open config file
filename = os.path.join( os.getenv('CLINER_DIR'), 'config.txt' )
f = open(filename, 'r')
Expand Down
26 changes: 15 additions & 11 deletions cliner/features_dir/sentence_features.py
Expand Up @@ -77,12 +77,15 @@ def __init__(self, data):
self.enabled_IOB_prose_sentence_features.append('UMLS')


# IOB_prose_features()
#
# input: A sentence
# output: A list of hash tables of features

def IOB_prose_features(self, sentence):
"""
IOB_prose_features
@param sentence. A list of strings
@return A list of dictionaries of features
"""
features_list = []

# Get a feature set for each word in the sentence
Expand Down Expand Up @@ -224,14 +227,15 @@ def IOB_prose_features(self, sentence):
return features_list



# IOB_nonprose_features()
#
# input: A sentence
# output: A hash table of features
def IOB_nonprose_features(self, sentence):


"""
IOB_nonprose_features
@param sentence. A list of strings
@return A list of dictionaries of features
"""

# Get a feature set for each word in the sentence
features_list = []
for i,word in enumerate(sentence):
Expand Down
74 changes: 57 additions & 17 deletions cliner/features_dir/utilities.py
Expand Up @@ -11,6 +11,7 @@
import cPickle as pickle
import os


# used as a default path for stashing pos tagger.
pos_tagger_path = os.path.join( os.environ['CLINER_DIR'], "cliner/features_dir/nltk_tagger.p")

Expand Down Expand Up @@ -64,11 +65,24 @@ def load_pos_tagger(path_to_obj=pos_tagger_path):

return tagger

# prose_sentence()
#
# input: A sentence
# output: Boolean yes/no
def prose_sentence(sentence):
def is_prose_sentence(sentence):
"""
is_prose_sentence()
Purpose: Determine if a sentence of text is 'prose'
@param sentence A list of words
@return A boolean
>>> is_prose_sentence(['Admission', 'Date', ':'])
False
>>> is_prose_sentence(['Hello', 'World', '.'])
True
>>> is_prose_sentence(['What', 'do', 'you', 'think', '?'])
True
>>> is_prose_sentence(['Short', 'sentence'])
False
"""

# Empty sentence is not prose
if not sentence:
Expand All @@ -80,34 +94,57 @@ def prose_sentence(sentence):
return False
elif len(sentence) <= 5:
return False
elif at_least_half_nonprose(sentence):
elif is_at_least_half_nonprose(sentence):
return True
else:
return False



# at_least_half_nonprose()
#
# input: A sentence
# output: A boolean yes/no
def at_least_half_nonprose(sentence):
def is_at_least_half_nonprose(sentence):
"""
is_at_least_half_nonprose(sentence)
count = len( [ w for w in sentence if prose_word(w) ] )
Purpose: Checks if at least half of the words in the sentence are 'prose' words (note: despite the function's name, it counts the words for which is_prose_word() is true)
@param sentence. A list of words
@return A boolean
>>> is_at_least_half_nonprose(['1','2','and','some','words'])
True
>>> is_at_least_half_nonprose(['1', '2', '3', '4', 'and', 'some', 'words', '5'])
False
>>> is_at_least_half_nonprose(['word'])
True
>>> is_at_least_half_nonprose([' '])
True
"""
count = len( [ w for w in sentence if is_prose_word(w) ] )

if count >= len(sentence)/2:
return True
else:
return False


def is_prose_word(word):
"""
is_prose_word(word)
Purpose: Checks if the given word is 'prose'
# prose_word()
#
# input: A word
# output: Boolean yes/no
def prose_word(word):
@param word. A word
@return A boolean
>>> is_prose_word('word')
True
>>> is_prose_word('99')
False
>>> is_prose_word('question?')
False
>>> is_prose_word('ALLCAPS')
False
"""
# Punctuation
for punc in ".?,!:\"'":
if punc in word:
Expand All @@ -123,6 +160,9 @@ def prose_word(word):

# Else
return True


#EOF

0 comments on commit 5e03a5a

Please sign in to comment.