wikimedia · codez266 · Nov 21, 2017 · Dec 20, 2017 · Dec 20, 2017
diff --git a/drafttopic/feature_lists/__init__.py b/drafttopic/feature_lists/__init__.py
diff --git a/drafttopic/feature_lists/tests/__init__.py b/drafttopic/feature_lists/tests/__init__.py
diff --git a/drafttopic/feature_lists/w2v.py b/drafttopic/feature_lists/w2v.py
@@ -0,0 +1,43 @@
+import os
+
+from gensim.models.keyedvectors import KeyedVectors
+import numpy as np
+
+from revscoring.languages import english
+from revscoring.features import FeatureVector
+from revscoring.datasources import Datasource
+
+word2vec = None
+VECTORS_DIM = 300
+w2v_path = os.environ.get('WORD2VEC')
+
+
+def load_word2vec(filepath):
+    global word2vec
+    if word2vec is not None:
+        return word2vec
+    word2vec = KeyedVectors.load_word2vec_format(
+        filepath, binary=True, limit=100000)
+    return word2vec
+
+
+def get_word_vectors(non_stop_tokens):
+    word2vec = load_word2vec(w2v_path)
+    return np.mean(
+        [word2vec[w] for w in non_stop_tokens if w in word2vec] or
+        [np.zeros(VECTORS_DIM)], axis=0
+    )
+
+
+word_vectors = Datasource(
+    "word_vectors", get_word_vectors,
+    depends_on=[english.stopwords.revision.datasources.non_stopwords]
+)
+
+w2v = FeatureVector(
+    "word2vec", get_word_vectors,
+    depends_on=[english.stopwords.revision.datasources.non_stopwords],
+    returns=np.float32
+)
+
+drafttopic = [w2v]
diff --git a/drafttopic/utilities/extract_from_text.py b/drafttopic/utilities/extract_from_text.py
@@ -0,0 +1,138 @@
+"""
+``$ drafttopic extract_from_text -h``
+::
+    Extracts dependents from a labeling doc containing text label and
+    writes a new set of labeling docs that is compatible as observations
+    for `revscoring`'s cv_train and tune utilities.
+    Input: { ... "mid-level-categories": ...,"text": ..., ... }
+    Output: { ... "mid-level-categories": ..., "cache": ..., ... }
+    Usage:
+        extract_from_text <dependent>...
+                          [--input=<path>]
+                          [--output=<path>]
+                          [--extractors=<num>]
+                          [--verbose]
+                          [--debug]
+
+    Options:
+        -h --help               Print this documentation
+        <dependent>             Classpath to a single dependent or list of
+                                dependent values to solve
+        --input=<path>          Path to a file containing observations
+                                [default: <stdin>]
+        --output=<path>         Path to a file to write new observations to
+                                [default: <stdout>]
+        --extractors=<num>      The number of parallel extractors to
+                                start [default: <cpu count>]
+        --verbose               Print dots and stuff to stderr
+        --debug                 Print debug logs
+"""
+import logging
+import sys
+from multiprocessing import Pool, cpu_count
+
+import docopt
+import yamlconf
+from revscoring import Dependent
+from revscoring.datasources import revision_oriented
+from revscoring.dependencies import solve
+from revscoring.utilities.util import dump_observation, read_observations
+
+
+def main(argv=None):
+    args = docopt.docopt(__doc__, argv=argv)
+
+    logging.basicConfig(
+        level=logging.WARNING if not args['--debug'] else logging.DEBUG,
+        format='%(asctime)s %(levelname)s:%(name)s -- %(message)s'
+    )
+
+    dependents = []
+    for dependent_path in args['<dependent>']:
+        dependent_or_list = yamlconf.import_path(dependent_path)
+        if isinstance(dependent_or_list, Dependent):
+            dependents.append(dependent_or_list)
+        else:
+            dependents.extend(dependent_or_list)
+
+    if args['--input'] == "<stdin>":
+        observations = read_observations(sys.stdin)
+    else:
+        observations = read_observations(open(args['--input']))
+
+    if args['--output'] == "<stdout>":
+        output = sys.stdout
+    else:
+        output = open(args['--output'], 'w')
+
+    if args['--extractors'] == "<cpu count>":
+        extractors = cpu_count()
+    else:
+        extractors = int(args['--extractors'])
+
+    verbose = args['--verbose']
+
+    run(observations, dependents, output, extractors, verbose)
+
+
+def run(labelings, dependents, output, extractors, verbose=False):
+    extractor_pool = Pool(processes=extractors)
+
+    extractor = LabelingDependentExtractor(dependents)
+
+    for observation in extractor_pool.imap(
+            extractor.extract_and_cache, labelings):
+        if observation is not None:
+            if verbose:
+                sys.stderr.write(".")
+                sys.stderr.flush()
+
+            dump_observation(observation, output)
+        else:
+            if verbose:
+                sys.stderr.write("-")
+                sys.stderr.flush()
+
+    if verbose:
+        sys.stderr.write("\n")
+
+
+class LabelingDependentExtractor:
+
+    def __init__(self, dependents):
+        self.dependents = dependents
+
+    def extract_and_cache(self, observation):
+        if observation['text'] is None:
+            return None
+
+        values = extract_from_text(
+            self.dependents, observation['text'],
+            cache=observation.get('cache'))
+        dependent_cache = {str(d): val
+                           for d, val in zip(self.dependents, values)}
+
+        del observation['text']
+        updated_cache = observation.get('cache', {})
+        updated_cache.update(dependent_cache)
+        observation['cache'] = updated_cache
+
+        return observation
+
+
+def extract_from_text(dependents, text, cache=None, context=None):
+    """
+    Extracts a set of values from a text an returns a cache containing just
+    those values.
+    :Parameters:
+        dependents : `list`( :class:`revscoring.Dependent` )
+            A list of dependents to extract values for
+        text : `str`
+            A text from which to extract features
+    :Returns:
+        A list of extracted feature values
+    """
+    cache = cache if cache is not None else {}
+    cache[revision_oriented.revision.text] = text
+
+    return list(solve(dependents, cache=cache, context=context))