diff --git a/drafttopic/feature_lists/__init__.py b/drafttopic/feature_lists/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/drafttopic/feature_lists/tests/__init__.py b/drafttopic/feature_lists/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/drafttopic/feature_lists/w2v.py b/drafttopic/feature_lists/w2v.py
new file mode 100644
index 0000000..cb92c63
--- /dev/null
+++ b/drafttopic/feature_lists/w2v.py
@@ -0,0 +1,43 @@
+import os
+
+from gensim.models.keyedvectors import KeyedVectors
+import numpy as np
+
+from revscoring.languages import english
+from revscoring.features import FeatureVector
+from revscoring.datasources import Datasource
+
+word2vec = None
+VECTORS_DIM = 300
+w2v_path = os.environ.get('WORD2VEC')
+
+
+def load_word2vec(filepath):
+    global word2vec
+    if word2vec is not None:
+        return word2vec
+    word2vec = KeyedVectors.load_word2vec_format(
+        filepath, binary=True, limit=100000)
+    return word2vec
+
+
+def get_word_vectors(non_stop_tokens):
+    word2vec = load_word2vec(w2v_path)
+    return np.mean(
+        [word2vec[w] for w in non_stop_tokens if w in word2vec] or
+        [np.zeros(VECTORS_DIM)], axis=0
+    )
+
+
+word_vectors = Datasource(
+    "word_vectors", get_word_vectors,
+    depends_on=[english.stopwords.revision.datasources.non_stopwords]
+)
+
+w2v = FeatureVector(
+    "word2vec", get_word_vectors,
+    depends_on=[english.stopwords.revision.datasources.non_stopwords],
+    returns=np.float32
+)
+
+drafttopic = [w2v]
diff --git a/drafttopic/utilities/extract_from_text.py b/drafttopic/utilities/extract_from_text.py
new file mode 100644
index 0000000..5d6f1d6
--- /dev/null
+++ b/drafttopic/utilities/extract_from_text.py
@@ -0,0 +1,138 @@
+"""
+``$ drafttopic extract_from_text -h``
+::
+    Extracts dependents from a labeling doc containing a text label and
+    writes a new set of labeling docs that is compatible as observations
+    for `revscoring`'s cv_train and tune utilities.
+    Input: { ... "mid-level-categories": ..., "text": ..., ... }
+    Output: { ... "mid-level-categories": ..., "cache": ..., ... }
+    Usage:
+        extract_from_text <dependent>...
+                          [--input=<path>]
+                          [--output=<path>]
+                          [--extractors=<num>]
+                          [--verbose]
+                          [--debug]
+
+    Options:
+        -h --help               Print this documentation
+        <dependent>             Classpath to a single dependent or list of
+                                dependent values to solve
+        --input=<path>          Path to a file containing observations
+                                [default: <stdin>]
+        --output=<path>         Path to a file to write new observations to
+                                [default: <stdout>]
+        --extractors=<num>      The number of parallel extractors to
+                                start [default: <cpu count>]
+        --verbose               Print dots and stuff to stderr
+        --debug                 Print debug logs
+"""
+import logging
+import sys
+from multiprocessing import Pool, cpu_count
+
+import docopt
+import yamlconf
+from revscoring import Dependent
+from revscoring.datasources import revision_oriented
+from revscoring.dependencies import solve
+from revscoring.utilities.util import dump_observation, read_observations
+
+
+def main(argv=None):
+    args = docopt.docopt(__doc__, argv=argv)
+
+    logging.basicConfig(
+        level=logging.WARNING if not args['--debug'] else logging.DEBUG,
+        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
+    )
+
+    dependents = []
+    for dependent_path in args['<dependent>']:
+        dependent_or_list = yamlconf.import_path(dependent_path)
+        if isinstance(dependent_or_list, Dependent):
+            dependents.append(dependent_or_list)
+        else:
+            dependents.extend(dependent_or_list)
+
+    if args['--input'] == "<stdin>":
+        observations = read_observations(sys.stdin)
+    else:
+        observations = read_observations(open(args['--input']))
+
+    if args['--output'] == "<stdout>":
+        output = sys.stdout
+    else:
+        output = open(args['--output'], 'w')
+
+    if args['--extractors'] == "<cpu count>":
+        extractors = cpu_count()
+    else:
+        extractors = int(args['--extractors'])
+
+    verbose = args['--verbose']
+
+    run(observations, dependents, output, extractors, verbose)
+
+
+def run(labelings, dependents, output, extractors, verbose=False):
+    extractor_pool = Pool(processes=extractors)
+
+    extractor = LabelingDependentExtractor(dependents)
+
+    for observation in extractor_pool.imap(
+            extractor.extract_and_cache, labelings):
+        if observation is not None:
+            if verbose:
+                sys.stderr.write(".")
+                sys.stderr.flush()
+
+            dump_observation(observation, output)
+        else:
+            if verbose:
+                sys.stderr.write("-")
+                sys.stderr.flush()
+
+    if verbose:
+        sys.stderr.write("\n")
+
+
+class LabelingDependentExtractor:
+
+    def __init__(self, dependents):
+        self.dependents = dependents
+
+    def extract_and_cache(self, observation):
+        if observation['text'] is None:
+            return None
+
+        values = extract_from_text(
+            self.dependents, observation['text'],
+            cache=observation.get('cache'))
+        dependent_cache = {str(d): val
+                           for d, val in zip(self.dependents, values)}
+
+        del observation['text']
+        updated_cache = observation.get('cache', {})
+        updated_cache.update(dependent_cache)
+        observation['cache'] = updated_cache
+
+        return observation
+
+
+def extract_from_text(dependents, text, cache=None, context=None):
+    """
+    Extracts a set of values from a text and returns a cache containing just
+    those values.
+    :Parameters:
+        dependents : `list`( :class:`revscoring.Dependent` )
+            A list of dependents to extract values for
+        text : `str`
+            A text from which to extract features
+    :Returns:
+        A list of extracted feature values
+    """
+    cache = cache if cache is not None else {}
+    cache[revision_oriented.revision.text] = text
+
+    return list(solve(dependents, cache=cache, context=context))
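
Reviewer note (not part of the patch): the sketch below is a minimal, self-contained illustration of the pooling that get_word_vectors() in w2v.py implements, i.e. the element-wise mean of the word2vec vectors of the non-stopword tokens, with a zero-vector fallback when no token is in the vocabulary. A plain dict and made-up 4-dimensional vectors stand in for the gensim KeyedVectors model, so it runs without the 300-dimensional binary pointed to by the WORD2VEC environment variable.

# Toy sketch of the mean-of-word-vectors pooling used by get_word_vectors().
# The dict and its 4-dimensional vectors are invented for illustration only.
import numpy as np

VECTORS_DIM = 4
toy_model = {
    "draft": np.array([0.1, 0.2, 0.3, 0.4]),
    "topic": np.array([0.5, 0.6, 0.7, 0.8]),
}


def mean_word_vector(non_stop_tokens, model, dim=VECTORS_DIM):
    """Mean of the vectors of in-vocabulary tokens, else a zero vector."""
    vectors = [model[w] for w in non_stop_tokens if w in model]
    return np.mean(vectors or [np.zeros(dim)], axis=0)


print(mean_word_vector(["draft", "topic", "unseen"], toy_model))
# element-wise mean of the two known vectors -> [0.3 0.4 0.5 0.6]
print(mean_word_vector(["unseen"], toy_model))
# zero-vector fallback -> [0. 0. 0. 0.]

Per the docstring above, the new utility would then presumably be invoked along the lines of `drafttopic extract_from_text drafttopic.feature_lists.w2v.drafttopic --input=<path> --output=<path>`, writing the solved feature values into each observation's cache for later use by cv_train and tune.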