Skip to content
This repository has been archived by the owner on Aug 15, 2020. It is now read-only.

Commit

Permalink
make cliner python3 compatible. add validation data arguments
Browse files Browse the repository at this point in the history
  • Loading branch information
wboag committed Oct 13, 2017
1 parent c5af26c commit 78ebda2
Show file tree
Hide file tree
Showing 19 changed files with 295 additions and 379 deletions.
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -87,4 +87,4 @@ This allows us to evaluate how well CliNER does by comparing it against a gold s

cliner evaluate --txt data/examples/ex_doc.txt --gold examples --predictions data/test_predictions/ --format i2b2

Evaluate how well the system predictions did for given discharge summaries. The prediction and reference directories are provided with the --predictions and --gold flags, respectively. Both sets of data must be in the same format, and that format must be specified - in this case, they are both i2b2. This means that both the examples and data/test_predictions directories contain the file pretend.con.
Evaluate how well the system predictions did. Both sets of data must be in the same format, and that format must be specified. This means that both the examples and data/test_predictions directories contain the file pretend.con.
9 changes: 2 additions & 7 deletions cliner
Expand Up @@ -9,15 +9,13 @@
######################################################################



import sys
import os



def main():

commands = ['train', 'predict', 'evaluate', 'error']
commands = ['train', 'predict', 'evaluate']

help_msg = \
'''
Expand All @@ -32,7 +30,7 @@ def main():

# Is argument correct?
if len(sys.argv)<2 or sys.argv[1] not in commands or sys.argv[1] == '--help':
print >>sys.stderr, help_msg, '\n'
sys.stderr.write('%s\n\n'%(help_msg))
exit(1)

# select appropriate sub-command
Expand All @@ -55,9 +53,6 @@ def main():
elif subcmd == 'evaluate':
import evaluate
evaluate.main()
elif subcmd == 'error':
import error
error.main()



Expand Down
25 changes: 12 additions & 13 deletions code/evaluate.py
Expand Up @@ -12,7 +12,7 @@
import glob
import random
import shutil
import commands
import subprocess

import tools

Expand All @@ -37,33 +37,33 @@ def main():


if not args.pred:
print '\n\tERROR: must provide --pred argument\n'
sys.stderr.write('\n\tERROR: must provide --pred argument\n\n')
parser.print_help(sys.stderr)
print >>sys.stderr, ''
sys.stderr.write('\n')
exit(1)

if not args.gold:
print '\n\tERROR: must provide --gold argument\n'
sys.stderr.write('\n\tERROR: must provide --gold argument\n\n')
parser.print_help(sys.stderr)
print >>sys.stderr, ''
sys.stderr.write('\n')
exit(1)

if args.format:
format = args.format
else:
print '\n\tERROR: must provide --format argument\n'
sys.stderr.write('\n\tERROR: must provide --format argument\n\n')
parser.print_help(sys.stderr)
print >>sys.stderr, ''
sys.stderr.write('\n')
exit(1)


# Must specify output format
if format not in ['i2b2']:
print >>sys.stderr, '\n\tError: Must specify output format'
print >>sys.stderr, '\tAvailable formats: con'
print >>sys.stderr, ''
sys.stderr.write('\n\tError: Must specify output format\n')
sys.stderr.write('\tAvailable formats: i2b2\n')
sys.stderr.write('\n')
parser.print_help(sys.stderr)
print >>sys.stderr, ''
sys.stderr.write('\n')
exit(1)


Expand Down Expand Up @@ -112,8 +112,7 @@ def main():
eval_jar = os.path.join(eval_dir, 'i2b2va-eval.jar')

cmd = 'java -jar %s -rcp %s -scp %s -ft con -ex all' % (eval_jar, gold_dir, pred_dir)
status,output = commands.getstatusoutput(cmd)
print output
status = subprocess.call(cmd, shell=True, stdout=sys.stdout)

# cleanup after yourself
shutil.rmtree(tempdir_name)
Expand Down
40 changes: 20 additions & 20 deletions code/feature_extraction/features.py
Expand Up @@ -8,9 +8,9 @@


# What modules are available
from utilities import load_pos_tagger
from read_config import enabled_modules
import word_features as feat_word
from .utils import load_pos_tagger
from .read_config import enabled_modules
from . import word_features as feat_word



Expand Down Expand Up @@ -158,12 +158,12 @@ def extract_features_sentence(sentence):
genia_feat_list = feat_genia.features(sentence)

'''
print '\t', sentence
print '\n\n'
print( '\t', sentence)
print( '\n\n')
for gf in genia_feat_list:
print '\t', gf
print
print '\n\n'
print( '\t', gf)
print()
print ('\n\n')
'''

for i,feat_dict in enumerate(genia_feat_list):
Expand All @@ -184,7 +184,7 @@ def extract_features_sentence(sentence):
ngram_features = [{} for i in range(len(features_list))]
if "prev" in enabled_IOB_prose_sentence_features:
prev = lambda f: {("prev_"+k[0], k[1]): v for k,v in f.items()}
prev_list = map(prev, features_list)
prev_list = list(map(prev, features_list))
for i in range(len(features_list)):
if i == 0:
ngram_features[i][("prev", "*")] = 1
Expand All @@ -193,7 +193,7 @@ def extract_features_sentence(sentence):

if "prev2" in enabled_IOB_prose_sentence_features:
prev2 = lambda f: {("prev2_"+k[0], k[1]): v/2.0 for k,v in f.items()}
prev_list = map(prev2, features_list)
prev_list = list(map(prev2, features_list))
for i in range(len(features_list)):
if i == 0:
ngram_features[i][("prev2", "*")] = 1
Expand All @@ -204,7 +204,7 @@ def extract_features_sentence(sentence):

if "next" in enabled_IOB_prose_sentence_features:
next = lambda f: {("next_"+k[0], k[1]): v for k,v in f.items()}
next_list = map(next, features_list)
next_list = list(map(next, features_list))
for i in range(len(features_list)):
if i < len(features_list) - 1:
ngram_features[i].update(next_list[i+1])
Expand All @@ -213,7 +213,7 @@ def extract_features_sentence(sentence):

if "next2" in enabled_IOB_prose_sentence_features:
next2 = lambda f: {("next2_"+k[0], k[1]): v/2.0 for k,v in f.items()}
next_list = map(next2, features_list)
next_list = list(map(next2, features_list))
for i in range(len(features_list)):
if i < len(features_list) - 2:
ngram_features[i].update(next_list[i+2])
Expand All @@ -222,26 +222,26 @@ def extract_features_sentence(sentence):
else:
ngram_features[i][("next2", "*")] = 1

merged = lambda d1, d2: dict(d1.items() + d2.items())
merged = lambda d1, d2: dict(list(d1.items()) + list(d2.items()))
features_list = [merged(features_list[i], ngram_features[i])
for i in range(len(features_list))]

'''
for f in features_list:
print sorted(f.items())
print
print '\n\n\n'
print (sorted(f.items()))
print ()
print ('\n\n\n')
'''

return features_list



def display_enabled_modules():
    """Print one line per optional feature module, marked ENABLED/DISABLED.

    Reads the module-level `enabled` dict (module name -> truthy status);
    output goes to stdout, framed by blank lines.
    """
    print()
    for module, status in enabled.items():
        if status:
            print('\t', module, '\t', ' ENABLED')
        else:
            print('\t', module, '\t', 'DISABLED')
    print()
2 changes: 1 addition & 1 deletion code/feature_extraction/read_config.py
Expand Up @@ -55,4 +55,4 @@ def enabled_modules():


if __name__ == "__main__":
    # Ad-hoc smoke test: print the enabled-modules configuration.
    print(enabled_modules())
Empty file modified code/feature_extraction/sentence_features.py 100755 → 100644
Empty file.
5 changes: 1 addition & 4 deletions code/feature_extraction/umls_dir/umls_cache.py
@@ -1,4 +1,4 @@
import cPickle as pickle
import pickle
import sys
import os

Expand All @@ -15,9 +15,6 @@
umls_tables = enabled['UMLS']


features_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if features_dir not in sys.path:
sys.path.append(features_dir)
from utilities import load_pickled_obj

class UmlsCache:
Expand Down
24 changes: 9 additions & 15 deletions code/feature_extraction/utilities.py → code/feature_extraction/utils.py 100755 → 100644
Expand Up @@ -8,48 +8,43 @@


import re
import cPickle as pickle
import pickle
import os
import sys


# used as a default path for stashing pos tagger.
dname = os.path.dirname
CLINER_DIR = dname(dname(dname(os.path.abspath(__file__))))
pos_tagger_path = os.path.join( CLINER_DIR, 'code', 'feature_extraction', 'taggers', 'maxent_treebank_pos_tagger.pickle')
tagger_name = 'py%d_maxent_treebank_pos_tagger.pickle' % sys.version_info.major
pos_tagger_path = os.path.join(CLINER_DIR, 'tools', tagger_name)

def load_pickled_obj(path_to_pickled_obj):
    """Unpickle and return the object stored at *path_to_pickled_obj*.

    NOTE(review): unpickling is only safe on trusted files — never call
    this on data from an untrusted source.
    """
    # Stream directly from the file instead of slurping the whole blob
    # into memory and calling pickle.loads() on it.
    with open(path_to_pickled_obj, "rb") as f:
        return pickle.load(f)

def pickle_dump(obj, path_to_obj):
    """Pickle *obj* to the file at *path_to_obj*.

    NOTE: protocol -1 (highest available) makes loading TRAINED models
    very slow; use this for anything BUT that — it is mainly meant for
    stashing the POS tagger.
    """
    # 'with' guarantees the handle is closed even if pickling raises;
    # the original open()/close() pair leaked the handle on error.
    with open(path_to_obj, "wb") as f:
        pickle.dump(obj, f, -1)

def dump_pos_tagger(path_to_obj):

def dump_pos_tagger(path_to_obj):
    """Pickle NLTK's maxent treebank POS tagger to *path_to_obj*.

    NOTE(review): relies on `nltk`, which is not among this module's
    visible imports (re, pickle, os, sys) — confirm it is imported
    elsewhere in the file. Also, `nltk.tag._POS_TAGGER` is a private
    name that was removed in newer NLTK releases; confirm the pinned
    NLTK version supports it.
    """
    tagger = nltk.data.load(nltk.tag._POS_TAGGER)

    pickle_dump(tagger, path_to_obj)

def load_pos_tagger(path_to_obj=pos_tagger_path):
    """Load the pickled POS tagger (faster than rebuilding it via NLTK).

    `path_to_obj` defaults to the module-level `pos_tagger_path`, which
    points at a per-Python-major-version pickle under tools/.
    """
    return load_pickled_obj(path_to_obj)


def is_prose_sentence(sentence):
"""
is_prose_sentence()
Expand All @@ -68,7 +63,6 @@ def is_prose_sentence(sentence):
>>> is_prose_sentence(['Short', 'sentence'])
False
"""

# Empty sentence is not prose
if not sentence:
return False
Expand Down
30 changes: 15 additions & 15 deletions code/feature_extraction/word_features.py 100755 → 100644
Expand Up @@ -14,7 +14,7 @@
import os
import sys

from wordshape import getWordShapes
from .wordshape import getWordShapes
from nltk import LancasterStemmer, PorterStemmer

lancaster_st = LancasterStemmer()
Expand All @@ -34,7 +34,7 @@ def feature_last_two_letters(word):
return {('last_two_letters', word[-2:]): 1}

def feature_length(word):
    """Feature: character count of *word*, keyed ('length', '')."""
    # '' (not None) as the second key element keeps the key fully
    # string-typed, matching the other feature functions in this file.
    return {('length', ''): len(word)}

def feature_stem_porter(word):
    """Feature: the Porter stem of *word*, as a one-hot entry."""
    stem = porter_st.stem(word)
    return {('stem_porter', stem): 1}
Expand All @@ -54,7 +54,7 @@ def feature_word_shape(word):
return features

def feature_metric_unit(word):
unit = None
unit = ''
if is_weight(word):
unit = 'weight'
elif is_size(word):
Expand Down Expand Up @@ -83,34 +83,34 @@ def QANN_features(word):
features = {}

# Feature: test result
if is_test_result(word): features[('test_result',None)] = 1
if is_test_result(word): features[('test_result','')] = 1

# Feature: measurements
if is_measurement(word): features[('measurement',None)] = 1
if is_measurement(word): features[('measurement','')] = 1

# Feature: directive
if is_directive(word): features[('directive', None)] = 1
if is_directive(word): features[('directive', '')] = 1

# Feature: date
if is_date(word): features[('date', None)] = 1
if is_date(word): features[('date', '')] = 1

# Feature: volume
if is_volume(word): features[('volume', None)] = 1
if is_volume(word): features[('volume', '')] = 1

# Feature: weight
if is_weight(word): features[('weight', None)] = 1
if is_weight(word): features[('weight', '')] = 1

# Feature: size
if is_size(word): features[('size', None)] = 1
if is_size(word): features[('size', '')] = 1

# Feature: prognosis location
if is_prognosis_location: features[('prog_location', None)] = 1
if is_prognosis_location: features[('prog_location', '')] = 1

# Feature: problem form
if has_problem_form(word): features[('problem_form', None)] = 1
if has_problem_form(word): features[('problem_form', '')] = 1

# Feature: concept class
if is_weight(word): features[('weight', None)] = 1
if is_weight(word): features[('weight', '')] = 1

return features

Expand Down Expand Up @@ -149,7 +149,7 @@ def IOB_prose_features(word):
"""

# Feature: <dummy>
features = {('dummy', None): 1} # always have >0 dimensions
features = {('dummy', ''): 1} # always have >0 dimensions

# Extract all enabled features
for feature in enabled_IOB_prose_word_features:
Expand All @@ -175,7 +175,7 @@ def IOB_nonprose_features(word):
"""

# Feature: <dummy>
features = {('dummy', None): 1} # always have >0 dimensions
features = {('dummy', ''): 1} # always have >0 dimensions

# Extract all enabled features
for feature in enabled_IOB_nonprose_word_features:
Expand Down
4 changes: 0 additions & 4 deletions code/feature_extraction/wordshape.py
@@ -1,5 +1,3 @@
import func_cache

import re

BOUNDARY_SIZE = 2
Expand Down Expand Up @@ -417,8 +415,6 @@ def wordShapeChris1 (s):
else:
return "SYMBOL"


# gets Chris1, Dan1, Jenny1, Chris2 and Dan2 word shapes
def getWordShapes(word):
    """Return the Chris1, Dan1, Jenny1, Chris2 and Dan2 shapes of *word*.

    The old `@func_cache.func_cache(False)` decorator is dropped: its
    `import func_cache` was removed from this file, so keeping the
    decorator would raise NameError at import time.
    """
    return [
        wordShapeChris1(word),
        wordShapeDan1(word),
        wordShapeJenny1(word),
        wordShapeChris2(word, False, None),
        wordShapeDan2(word, None),
    ]
Empty file modified code/format.py 100755 → 100644
Empty file.

0 comments on commit 78ebda2

Please sign in to comment.