Skip to content
This repository has been archived by the owner on Aug 15, 2020. It is now read-only.

Commit

Permalink
make cliner python3 compatible. add validation data arguments
Browse files Browse the repository at this point in the history
  • Loading branch information
wboag committed Oct 13, 2017
1 parent c5af26c commit 78ebda2
Show file tree
Hide file tree
Showing 19 changed files with 295 additions and 379 deletions.
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -87,4 +87,4 @@ This allows us to evaluate how well CliNER does by comparing it against a gold s

cliner evaluate --txt data/examples/ex_doc.txt --gold examples --predictions data/test_predictions/ --format i2b2

Evaluate how well the system predictions did for given discharge summaries. The prediction and reference directories are provided with the --predictions and --gold flags, respectively. Both sets of data must be in the same format, and that format must be specified - in this case, they are both i2b2. This means that both the examples and data/test_predictions directories contain the file pretend.con.
Evaluate how well the system predictions did. Both sets of data must be in the same format, and that format must be specified. This means that both the examples and data/test_predictions directories contain the file pretend.con.
9 changes: 2 additions & 7 deletions cliner
Expand Up @@ -9,15 +9,13 @@
######################################################################



import sys
import os



def main():

commands = ['train', 'predict', 'evaluate', 'error']
commands = ['train', 'predict', 'evaluate']

help_msg = \
'''
Expand All @@ -32,7 +30,7 @@ def main():

# Is argument correct?
if len(sys.argv)<2 or sys.argv[1] not in commands or sys.argv[1] == '--help':
print >>sys.stderr, help_msg, '\n'
sys.stderr.write('%s\n\n'%(help_msg))
exit(1)

# select appropriate sub-command
Expand All @@ -55,9 +53,6 @@ def main():
elif subcmd == 'evaluate':
import evaluate
evaluate.main()
elif subcmd == 'error':
import error
error.main()



Expand Down
25 changes: 12 additions & 13 deletions code/evaluate.py
Expand Up @@ -12,7 +12,7 @@
import glob
import random
import shutil
import commands
import subprocess

import tools

Expand All @@ -37,33 +37,33 @@ def main():


if not args.pred:
print '\n\tERROR: must provide --pred argument\n'
sys.stderr.write('\n\tERROR: must provide --pred argument\n\n')
parser.print_help(sys.stderr)
print >>sys.stderr, ''
sys.stderr.write('\n')
exit(1)

if not args.gold:
print '\n\tERROR: must provide --gold argument\n'
sys.stderr.write('\n\tERROR: must provide --gold argument\n\n')
parser.print_help(sys.stderr)
print >>sys.stderr, ''
sys.stderr.write('\n')
exit(1)

if args.format:
format = args.format
else:
print '\n\tERROR: must provide --format argument\n'
sys.stderr.write('\n\tERROR: must provide --format argument\n\n')
parser.print_help(sys.stderr)
print >>sys.stderr, ''
sys.stderr.write('\n')
exit(1)


# Must specify output format
if format not in ['i2b2']:
print >>sys.stderr, '\n\tError: Must specify output format'
print >>sys.stderr, '\tAvailable formats: con'
print >>sys.stderr, ''
sys.stderr.write('\n\tError: Must specify output format\n')
sys.stderr.write('\tAvailable formats: i2b2\n')
sys.stderr.write('\n')
parser.print_help(sys.stderr)
print >>sys.stderr, ''
sys.stderr.write('\n')
exit(1)


Expand Down Expand Up @@ -112,8 +112,7 @@ def main():
eval_jar = os.path.join(eval_dir, 'i2b2va-eval.jar')

cmd = 'java -jar %s -rcp %s -scp %s -ft con -ex all' % (eval_jar, gold_dir, pred_dir)
status,output = commands.getstatusoutput(cmd)
print output
status = subprocess.call(cmd, shell=True, stdout=sys.stdout)

# cleanup after yourself
shutil.rmtree(tempdir_name)
Expand Down
40 changes: 20 additions & 20 deletions code/feature_extraction/features.py
Expand Up @@ -8,9 +8,9 @@


# What modules are available
from utilities import load_pos_tagger
from read_config import enabled_modules
import word_features as feat_word
from .utils import load_pos_tagger
from .read_config import enabled_modules
from . import word_features as feat_word



Expand Down Expand Up @@ -158,12 +158,12 @@ def extract_features_sentence(sentence):
genia_feat_list = feat_genia.features(sentence)

'''
print '\t', sentence
print '\n\n'
print( '\t', sentence)
print( '\n\n')
for gf in genia_feat_list:
print '\t', gf
print
print '\n\n'
print( '\t', gf)
print()
print ('\n\n')
'''

for i,feat_dict in enumerate(genia_feat_list):
Expand All @@ -184,7 +184,7 @@ def extract_features_sentence(sentence):
ngram_features = [{} for i in range(len(features_list))]
if "prev" in enabled_IOB_prose_sentence_features:
prev = lambda f: {("prev_"+k[0], k[1]): v for k,v in f.items()}
prev_list = map(prev, features_list)
prev_list = list(map(prev, features_list))
for i in range(len(features_list)):
if i == 0:
ngram_features[i][("prev", "*")] = 1
Expand All @@ -193,7 +193,7 @@ def extract_features_sentence(sentence):

if "prev2" in enabled_IOB_prose_sentence_features:
prev2 = lambda f: {("prev2_"+k[0], k[1]): v/2.0 for k,v in f.items()}
prev_list = map(prev2, features_list)
prev_list = list(map(prev2, features_list))
for i in range(len(features_list)):
if i == 0:
ngram_features[i][("prev2", "*")] = 1
Expand All @@ -204,7 +204,7 @@ def extract_features_sentence(sentence):

if "next" in enabled_IOB_prose_sentence_features:
next = lambda f: {("next_"+k[0], k[1]): v for k,v in f.items()}
next_list = map(next, features_list)
next_list = list(map(next, features_list))
for i in range(len(features_list)):
if i < len(features_list) - 1:
ngram_features[i].update(next_list[i+1])
Expand All @@ -213,7 +213,7 @@ def extract_features_sentence(sentence):

if "next2" in enabled_IOB_prose_sentence_features:
next2 = lambda f: {("next2_"+k[0], k[1]): v/2.0 for k,v in f.items()}
next_list = map(next2, features_list)
next_list = list(map(next2, features_list))
for i in range(len(features_list)):
if i < len(features_list) - 2:
ngram_features[i].update(next_list[i+2])
Expand All @@ -222,26 +222,26 @@ def extract_features_sentence(sentence):
else:
ngram_features[i][("next2", "*")] = 1

merged = lambda d1, d2: dict(d1.items() + d2.items())
merged = lambda d1, d2: dict(list(d1.items()) + list(d2.items()))
features_list = [merged(features_list[i], ngram_features[i])
for i in range(len(features_list))]

'''
for f in features_list:
print sorted(f.items())
print
print '\n\n\n'
print (sorted(f.items()))
print ()
print ('\n\n\n')
'''

return features_list



def display_enabled_modules():
    """Print one line per optional feature module, marked ENABLED/DISABLED.

    Reads the module-level `enabled` dict (module name -> truthy status);
    output goes to stdout, framed by blank lines.
    """
    print()
    for module, status in enabled.items():
        if status:
            print('\t', module, '\t', ' ENABLED')
        else:
            print('\t', module, '\t', 'DISABLED')
    print()
2 changes: 1 addition & 1 deletion code/feature_extraction/read_config.py
Expand Up @@ -55,4 +55,4 @@ def enabled_modules():


if __name__ == "__main__":
    # Ad-hoc smoke test: print the enabled-modules configuration.
    print(enabled_modules())
Empty file modified code/feature_extraction/sentence_features.py 100755 → 100644
Empty file.
5 changes: 1 addition & 4 deletions code/feature_extraction/umls_dir/umls_cache.py
@@ -1,4 +1,4 @@
import cPickle as pickle
import pickle
import sys
import os

Expand All @@ -15,9 +15,6 @@
umls_tables = enabled['UMLS']


features_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if features_dir not in sys.path:
sys.path.append(features_dir)
from utilities import load_pickled_obj

class UmlsCache:
Expand Down
24 changes: 9 additions & 15 deletions code/feature_extraction/utilities.py → code/feature_extraction/utils.py 100755 → 100644
Expand Up @@ -8,48 +8,43 @@


import re
import cPickle as pickle
import pickle
import os
import sys


# used as a default path for stashing pos tagger.
dname = os.path.dirname
CLINER_DIR = dname(dname(dname(os.path.abspath(__file__))))
pos_tagger_path = os.path.join( CLINER_DIR, 'code', 'feature_extraction', 'taggers', 'maxent_treebank_pos_tagger.pickle')
tagger_name = 'py%d_maxent_treebank_pos_tagger.pickle' % sys.version_info.major
pos_tagger_path = os.path.join(CLINER_DIR, 'tools', tagger_name)

def load_pickled_obj(path_to_pickled_obj):
    """Unpickle and return the object stored at *path_to_pickled_obj*.

    NOTE(review): unpickling is only safe on trusted files — never call
    this on data from an untrusted source.
    """
    # Stream directly from the file instead of slurping the whole blob
    # into memory and calling pickle.loads() on it.
    with open(path_to_pickled_obj, "rb") as f:
        return pickle.load(f)

def pickle_dump(obj, path_to_obj):
    """Pickle *obj* to the file at *path_to_obj*.

    NOTE: protocol -1 (highest available) makes loading TRAINED models
    very slow; use this for anything BUT that — it is mainly meant for
    stashing the POS tagger.
    """
    # 'with' guarantees the handle is closed even if pickling raises;
    # the original open()/close() pair leaked the handle on error.
    with open(path_to_obj, "wb") as f:
        pickle.dump(obj, f, -1)

def dump_pos_tagger(path_to_obj):

def dump_pos_tagger(path_to_obj):
    """Pickle NLTK's maxent treebank POS tagger to *path_to_obj*.

    NOTE(review): relies on `nltk`, which is not among this module's
    visible imports (re, pickle, os, sys) — confirm it is imported
    elsewhere in the file. Also, `nltk.tag._POS_TAGGER` is a private
    name that was removed in newer NLTK releases; confirm the pinned
    NLTK version supports it.
    """
    tagger = nltk.data.load(nltk.tag._POS_TAGGER)

    pickle_dump(tagger, path_to_obj)

def load_pos_tagger(path_to_obj=pos_tagger_path):
    """Load the pickled POS tagger (faster than rebuilding it via NLTK).

    `path_to_obj` defaults to the module-level `pos_tagger_path`, which
    points at a per-Python-major-version pickle under tools/.
    """
    return load_pickled_obj(path_to_obj)


def is_prose_sentence(sentence):
"""
is_prose_sentence()
Expand All @@ -68,7 +63,6 @@ def is_prose_sentence(sentence):
>>> is_prose_sentence(['Short', 'sentence'])
False
"""

# Empty sentence is not prose
if not sentence:
return False
Expand Down
30 changes: 15 additions & 15 deletions code/feature_extraction/word_features.py 100755 → 100644
Expand Up @@ -14,7 +14,7 @@
import os
import sys

from wordshape import getWordShapes
from .wordshape import getWordShapes
from nltk import LancasterStemmer, PorterStemmer

lancaster_st = LancasterStemmer()
Expand All @@ -34,7 +34,7 @@ def feature_last_two_letters(word):
return {('last_two_letters', word[-2:]): 1}

def feature_length(word):
    """Feature: character count of *word*, keyed ('length', '')."""
    # '' (not None) as the second key element keeps the key fully
    # string-typed, matching the other feature functions in this file.
    return {('length', ''): len(word)}

def feature_stem_porter(word):
    """Feature: the Porter stem of *word*, as a one-hot entry."""
    stem = porter_st.stem(word)
    return {('stem_porter', stem): 1}
Expand All @@ -54,7 +54,7 @@ def feature_word_shape(word):
return features

def feature_metric_unit(word):
unit = None
unit = ''
if is_weight(word):
unit = 'weight'
elif is_size(word):
Expand Down Expand Up @@ -83,34 +83,34 @@ def QANN_features(word):
features = {}

# Feature: test result
if is_test_result(word): features[('test_result',None)] = 1
if is_test_result(word): features[('test_result','')] = 1

# Feature: measurements
if is_measurement(word): features[('measurement',None)] = 1
if is_measurement(word): features[('measurement','')] = 1

# Feature: directive
if is_directive(word): features[('directive', None)] = 1
if is_directive(word): features[('directive', '')] = 1

# Feature: date
if is_date(word): features[('date', None)] = 1
if is_date(word): features[('date', '')] = 1

# Feature: volume
if is_volume(word): features[('volume', None)] = 1
if is_volume(word): features[('volume', '')] = 1

# Feature: weight
if is_weight(word): features[('weight', None)] = 1
if is_weight(word): features[('weight', '')] = 1

# Feature: size
if is_size(word): features[('size', None)] = 1
if is_size(word): features[('size', '')] = 1

# Feature: prognosis location
if is_prognosis_location: features[('prog_location', None)] = 1
if is_prognosis_location: features[('prog_location', '')] = 1

# Feature: problem form
if has_problem_form(word): features[('problem_form', None)] = 1
if has_problem_form(word): features[('problem_form', '')] = 1

# Feature: concept class
if is_weight(word): features[('weight', None)] = 1
if is_weight(word): features[('weight', '')] = 1

return features

Expand Down Expand Up @@ -149,7 +149,7 @@ def IOB_prose_features(word):
"""

# Feature: <dummy>
features = {('dummy', None): 1} # always have >0 dimensions
features = {('dummy', ''): 1} # always have >0 dimensions

# Extract all enabled features
for feature in enabled_IOB_prose_word_features:
Expand All @@ -175,7 +175,7 @@ def IOB_nonprose_features(word):
"""

# Feature: <dummy>
features = {('dummy', None): 1} # always have >0 dimensions
features = {('dummy', ''): 1} # always have >0 dimensions

# Extract all enabled features
for feature in enabled_IOB_nonprose_word_features:
Expand Down
4 changes: 0 additions & 4 deletions code/feature_extraction/wordshape.py
@@ -1,5 +1,3 @@
import func_cache

import re

BOUNDARY_SIZE = 2
Expand Down Expand Up @@ -417,8 +415,6 @@ def wordShapeChris1 (s):
else:
return "SYMBOL"


# gets Chris1, Dan1, Jenny1, Chris2 and Dan2 word shapes
def getWordShapes(word):
    """Return the Chris1, Dan1, Jenny1, Chris2 and Dan2 shapes of *word*.

    The old `@func_cache.func_cache(False)` decorator is dropped: its
    `import func_cache` was removed from this file, so keeping the
    decorator would raise NameError at import time.
    """
    return [
        wordShapeChris1(word),
        wordShapeDan1(word),
        wordShapeJenny1(word),
        wordShapeChris2(word, False, None),
        wordShapeDan2(word, None),
    ]
Empty file modified code/format.py 100755 → 100644
Empty file.

0 comments on commit 78ebda2

Please sign in to comment.