From 8fe45ae6eefabd04a1867e42a40d076c4ddd505d Mon Sep 17 00:00:00 2001 From: halfak Date: Fri, 27 Nov 2015 11:20:18 -0600 Subject: [PATCH 01/12] (WIP) work towards a grid model tuning script. --- config/all_classifiers.yaml | 49 +++++ revscoring/scorer_models/scorer_model.py | 184 ----------------- .../scorer_models/sklearn_classifier.py | 194 ++++++++++++++++++ revscoring/utilities/tune.py | 129 ++++++++++++ 4 files changed, 372 insertions(+), 184 deletions(-) create mode 100644 config/all_classifiers.yaml create mode 100644 revscoring/scorer_models/sklearn_classifier.py create mode 100644 revscoring/utilities/tune.py diff --git a/config/all_classifiers.yaml b/config/all_classifiers.yaml new file mode 100644 index 00000000..6474d1c3 --- /dev/null +++ b/config/all_classifiers.yaml @@ -0,0 +1,49 @@ + +GradientBoostingClassifier: + class: sklearn.ensemble.GradientBoostingClassifier + params: + n_estimators: [150, 250, 500] + max_depth: [4, 5, 6] + max_features: [log2] + learning_rate: [0.01] +RandomForestClassifier: + class: sklearn.ensemble.RandomForestClassifier + params: + n_estimators: [10, 20, 40, 80, 160, 320, 640], + min_samples_leaf: [1, 2, 4, 8, 16], + max_features: [auto, log2, None], + criterion: [gini, entropy] +LogisticRegression: + class: sklearn.ensemble.GradientBoostingClassifier + params: + - + penalty: "l1", + C: [0.1, 1, 10, 100], + solver: [liblinear] + - + penalty: "l2", + C: [0.1, 1, 10, 100], + solver: [newton-cg, lbfgs, sag] +SVC: + class: sklearn.svm.SVC + params: + - + kernel: [rbf], + probability: True, + gamma: [1e-3, 1e-4, auto], + C: [0.1, 1, 10, 100] + - + kernel: [linear], + probability: True, + C: [0.1, 1, 10, 100] +GaussianNB: + class: sklearn.naive_bayes.GaussianNB + params: + alpha: [0.1, 1, 10] +BernoulliNB: + class: sklearn.naive_bayes.BernouliNB + params: {} +MultinomialNB: + class: sklearn.naive_bayes.MultinomialNB + params: + alpha: [0.1, 1, 10] diff --git a/revscoring/scorer_models/scorer_model.py b/revscoring/scorer_models/scorer_model.py index 278903b0..df6f8269 100644 --- a/revscoring/scorer_models/scorer_model.py +++ b/revscoring/scorer_models/scorer_model.py @@ -184,187 +184,3 @@ def from_config(cls, config, name, section_key="scorer_models"): return cls.load(open(section['model_file'], 'rb')) else: return cls(**{k: v for k, v in section.items() if k != "class"}) - - -class ScikitLearnClassifier(MLScorerModel): - - def __init__(self, features, classifier_model, version=None): - super().__init__(features, version=version) - self.classifier_model = classifier_model - self.stats = None - - def __getattr__(self, attr): - if attr is "stats": - return None - else: - raise AttributeError(attr) - - def train(self, values_labels): - """ - - :Returns: - A dictionary with the fields: - - * seconds_elapsed -- Time in seconds spent fitting the model - """ - start = time.time() - - values, labels = zip(*values_labels) - - # Fit SVC model - self.classifier_model.fit(values, labels) - self.trained = time.time() - - return { - 'seconds_elapsed': time.time() - start - } - - def score(self, feature_values): - """ - Generates a score for a single revision based on a set of extracted - feature_values. - - :Parameters: - feature_values : collection(`mixed`) - an ordered collection of values that correspond to the - `Feature` s provided to the constructor - - :Returns: - A dict with the fields: - - * predicion -- The most likely class - * probability -- A mapping of probabilities for input classes - corresponding to the classes the classifier was - trained on. 
Generating this probability is - slower than a simple prediction. - """ - prediction = self.classifier_model.predict([feature_values])[0] - labels = self.classifier_model.classes_ - probas = self.classifier_model.predict_proba([feature_values])[0] - probability = {label: proba for label, proba in zip(labels, probas)} - - doc = { - 'prediction': prediction, - 'probability': probability - } - return normalize_json(doc) - - def test(self, values_labels): - """ - :Returns: - A dictionary of test statistics with the fields: - - * accuracy -- The mean accuracy of classification - * table -- A truth table for classification - * roc - * auc -- The area under the ROC curve - """ - values, labels = zip(*values_labels) - - scores = [self.score(feature_values) for feature_values in values] - - self.stats = { - 'table': self._label_table(scores, labels), - 'accuracy': self.classifier_model.score(values, labels), - 'roc': self._roc_stats(scores, labels, - self.classifier_model.classes_) - } - return self.stats - - def info(self): - return normalize_json({ - 'type': self.__class__.__name__, - 'version': self.version, - 'trained': self.trained, - 'stats': self.stats - }) - - def format_info(self): - info = self.info() - formatted = io.StringIO() - formatted.write("ScikitLearnClassifier\n") - formatted.write(" - type: {0}\n".format(info.get('type'))) - formatted.write(" - version: {0}\n".format(info.get('version'))) - if isinstance(info['trained'], float): - date_string = datetime.fromtimestamp(info['trained']).isoformat() - formatted.write(" - trained: {0}\n".format(date_string)) - else: - formatted.write(" - trained: {0}\n".format(info.get('trained'))) - - formatted.write("\n") - formatted.write(self.format_stats()) - return formatted.getvalue() - - def format_stats(self): - if self.stats is None: - return "No stats available" - else: - formatted = io.StringIO() - predicted_actuals = self.stats['table'].keys() - possible = list(set(actual for _, actual in predicted_actuals)) - possible.sort() - - formatted.write("Accuracy: {0}\n\n".format(self.stats['accuracy'])) - if 'auc' in self.stats['roc']: - formatted.write("ROC-AUC: {0}\n\n" - .format(self.stats['roc']['auc'])) - else: - formatted.write("ROC-AUC:\n") - - table_data = [[comparison_label, - self.stats['roc'][comparison_label]['auc']] - for comparison_label in possible] - formatted.write(tabulate(table_data)) - formatted.write("\n\n") - - table_data = [] - - for actual in possible: - table_data.append( - [(str(actual))] + - [self.stats['table'].get((predicted, actual), 0) - for predicted in possible] - ) - formatted.write(tabulate( - table_data, - headers=["~{0}".format(p) for p in possible])) - - return formatted.getvalue() - - @classmethod - def _roc_stats(cls, scores, labels, possible_labels): - - if len(possible_labels) <= 2: - # Binary classification, class choice doesn't matter. 
- comparison_label = possible_labels[0] - return cls._roc_single_class(scores, labels, comparison_label) - else: - roc_stats = {} - for comparison_label in possible_labels: - roc_stats[comparison_label] = \ - cls._roc_single_class(scores, labels, comparison_label) - - return roc_stats - - @classmethod - def _roc_single_class(cls, scores, labels, comparison_label): - probabilities = [s['probability'][comparison_label] - for s in scores] - - true_positives = [l == comparison_label for l in labels] - fpr, tpr, thresholds = roc_curve(true_positives, probabilities) - - return { - 'auc': auc(fpr, tpr) - } - - @staticmethod - def _label_table(scores, labels): - - predicteds = [s['prediction'] for s in scores] - - table = {} - for pair in zip(labels, predicteds): - table[pair] = table.get(pair, 0) + 1 - - return table diff --git a/revscoring/scorer_models/sklearn_classifier.py b/revscoring/scorer_models/sklearn_classifier.py new file mode 100644 index 00000000..47fdd212 --- /dev/null +++ b/revscoring/scorer_models/sklearn_classifier.py @@ -0,0 +1,194 @@ +import io +import time +from datetime import datetime + +from sklearn.grid_search import GridSearchCV +from sklearn.metrics import auc, roc_curve +from tabulate import tabulate + +from .scorer_model import MLScorerModel +from .util import normalize_json + + +class ScikitLearnClassifier(MLScorerModel): + + def __init__(self, features, classifier_model, version=None): + super().__init__(features, version=version) + self.classifier_model = classifier_model + self.stats = None + + def __getattr__(self, attr): + if attr is "stats": + return None + else: + raise AttributeError(attr) + + def train(self, values_labels): + """ + + :Returns: + A dictionary with the fields: + + * seconds_elapsed -- Time in seconds spent fitting the model + """ + start = time.time() + + values, labels = zip(*values_labels) + + # Fit SVC model + self.classifier_model.fit(values, labels) + self.trained = time.time() + + return { + 'seconds_elapsed': time.time() - start + } + + def score(self, feature_values): + """ + Generates a score for a single revision based on a set of extracted + feature_values. + + :Parameters: + feature_values : collection(`mixed`) + an ordered collection of values that correspond to the + `Feature` s provided to the constructor + + :Returns: + A dict with the fields: + + * predicion -- The most likely class + * probability -- A mapping of probabilities for input classes + corresponding to the classes the classifier was + trained on. Generating this probability is + slower than a simple prediction. 
+ """ + prediction = self.classifier_model.predict([feature_values])[0] + labels = self.classifier_model.classes_ + probas = self.classifier_model.predict_proba([feature_values])[0] + probability = {label: proba for label, proba in zip(labels, probas)} + + doc = { + 'prediction': prediction, + 'probability': probability + } + return normalize_json(doc) + + def test(self, values_labels): + """ + :Returns: + A dictionary of test statistics with the fields: + + * accuracy -- The mean accuracy of classification + * table -- A truth table for classification + * roc + * auc -- The area under the ROC curve + """ + values, labels = zip(*values_labels) + + scores = [self.score(feature_values) for feature_values in values] + + self.stats = { + 'table': self._label_table(scores, labels), + 'accuracy': self.classifier_model.score(values, labels), + 'roc': self._roc_stats(scores, labels, + self.classifier_model.classes_) + } + return self.stats + + def info(self): + return normalize_json({ + 'type': self.__class__.__name__, + 'version': self.version, + 'trained': self.trained, + 'stats': self.stats + }) + + def format_info(self): + info = self.info() + formatted = io.StringIO() + formatted.write("ScikitLearnClassifier\n") + formatted.write(" - type: {0}\n".format(info.get('type'))) + formatted.write(" - version: {0}\n".format(info.get('version'))) + if isinstance(info['trained'], float): + date_string = datetime.fromtimestamp(info['trained']).isoformat() + formatted.write(" - trained: {0}\n".format(date_string)) + else: + formatted.write(" - trained: {0}\n".format(info.get('trained'))) + + formatted.write("\n") + formatted.write(self.format_stats()) + return formatted.getvalue() + + def format_stats(self): + if self.stats is None: + return "No stats available" + else: + formatted = io.StringIO() + predicted_actuals = self.stats['table'].keys() + possible = list(set(actual for _, actual in predicted_actuals)) + possible.sort() + + formatted.write("Accuracy: {0}\n\n".format(self.stats['accuracy'])) + if 'auc' in self.stats['roc']: + formatted.write("ROC-AUC: {0}\n\n" + .format(self.stats['roc']['auc'])) + else: + formatted.write("ROC-AUC:\n") + + table_data = [[comparison_label, + self.stats['roc'][comparison_label]['auc']] + for comparison_label in possible] + formatted.write(tabulate(table_data)) + formatted.write("\n\n") + + table_data = [] + + for actual in possible: + table_data.append( + [(str(actual))] + + [self.stats['table'].get((predicted, actual), 0) + for predicted in possible] + ) + formatted.write(tabulate( + table_data, + headers=["~{0}".format(p) for p in possible])) + + return formatted.getvalue() + + @classmethod + def _roc_stats(cls, scores, labels, possible_labels): + + if len(possible_labels) <= 2: + # Binary classification, class choice doesn't matter. 
+ comparison_label = possible_labels[0] + return cls._roc_single_class(scores, labels, comparison_label) + else: + roc_stats = {} + for comparison_label in possible_labels: + roc_stats[comparison_label] = \ + cls._roc_single_class(scores, labels, comparison_label) + + return roc_stats + + @classmethod + def _roc_single_class(cls, scores, labels, comparison_label): + probabilities = [s['probability'][comparison_label] + for s in scores] + + true_positives = [l == comparison_label for l in labels] + fpr, tpr, thresholds = roc_curve(true_positives, probabilities) + + return { + 'auc': auc(fpr, tpr) + } + + @staticmethod + def _label_table(scores, labels): + + predicteds = [s['prediction'] for s in scores] + + table = {} + for pair in zip(labels, predicteds): + table[pair] = table.get(pair, 0) + 1 + + return table diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py new file mode 100644 index 00000000..89e3a893 --- /dev/null +++ b/revscoring/utilities/tune.py @@ -0,0 +1,129 @@ +""" +Tunes a set of models against a training set to identify the best +model/configuration. + +Usage: + tune [--observations=] + [--scoring=] + [--test-prop=] + [--folds=] + [--report=] + [--label-type=] + [--verbose] + [--debug] + +""" +import logging +import multiprocessing +import random + +import numpy as np +import yamlconf +from sklearn import cross_validation, grid_search +from sklearn.cross_validation import StratifiedKFold +from sklearn.metrics import (accuracy_score, auc, f1_score, + precision_recall_curve, + precision_recall_fscore_support, precision_score, + recall_score, roc_auc_score, roc_curve) + +logger = logging.getLogger(__name__) + + +def run(params_config, observations, scoring, test_prop, folds, report, + processes, verbose): + + # Split train and test + train_set, test_set = train_test_split(observations, test_prop=test_prop) + + best_fits = [] + + # For each estimator, run gridsearch. + for name, config in params_config: + logger.info("Running gridsearch for {0}".format(name)) + EstimatorClass = yamlconf.import_module(config['class']) + estimator = EstimatorClass() + if not hasattr(estimator, "fit"): + raise RuntimeError("Estimator {0} does not have a fit() method." + .format(config['class'])) + + logger.info("Running gridsearch for {0}...".format(name)) + grid_model = gridsearch(train_set, estimator, config['params'], + scoring=scoring, folds=folds, + processes=processes) + + logger.info("Completed gridsearch for {0}.".format(name)) + best_params, best_score, _ = max(grid_model.grid_scores_, + key=lambda x: x[1]) + logger.info("\tBest fit: {0}={1} with {2}" + .format(scoring, best_score, best_params)) + + f1, roc_auc = test_model(test_set, grid_model) + logger.info("\tTest set fit: f1={0}, roc_auc={1}\n" + .format(f1, roc_auc)) + + best_fits.append((name, best_params, best_score, f1, roc_auc)) + + # TODO: should be tabular + logger.info("\tGrid scores:") + for params, mean_score, scores in grid_model.grid_scores_: + logger.info("\t - %0.3f (+/-%0.03f) for %r" + % (mean_score, scores.std(), params)) + + +def train_test_split(observations, test_prop=0.25): + # Split train and test set from obs. 
+ observations = list(observations) + random.shuffle(observations) + + test_set_size = int(len(observations) * test_prop) + test_set = observations[:test_set_size] + logger.debug("Test set: {0}".format(len(test_set))) + + train_set = observations[test_set_size:] + logger.debug("Train set: {0}".format(len(train_set))) + + return train_set, test_set + + +def gridsearch(observations, estimator, param_grid=None, + scoring='roc_auc', folds=5, processes=None): + """ + Determine the best model via cross validation. This should be run on + training data with test data withheld. + """ + param_grid = param_grid or {} + + processes = processes or multiprocessing.cpu_count() + + stratified_cv = cross_validation.StratifiedKFold(labels, n_folds=folds) + + grid_model = grid_search.GridSearchCV( + cv=stratified_cv, + estimator=estimator, + param_grid=param_grid, + scoring=scoring, + n_jobs=processes + ) + + # This line actually performs the gridsearch + feature_values, labels = (list(vals) for vals in zip(*observations)) + grid_model.fit(feature_values, labels) + + return grid_model + +def test_model(observations, grid_model): + + feature_values, labels = (list(vals) for vals in zip(*observations)) + predictions = model_grid.predict(feature_values) + scores = get_scores(model_grid, feature_values) + + return f1_score(labels, predictions), roc_auc_score(labels, scores) + +# To compute an ROC score, you need scores for each example, either a class probability +# of a distance from the decision boundary +def get_scores(model, X): + try: + scores = model.decision_function(X) + except: + scores = model.predict_proba(X)[:, 1] + return scores From b922e30e3f61d7fcea8380a066c3d4d90bbf8b7f Mon Sep 17 00:00:00 2001 From: halfak Date: Sat, 28 Nov 2015 13:42:33 -0600 Subject: [PATCH 02/12] (WIP) Working tune utility. Still testing for performance. 
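
For context, the gridsearch() helper introduced in PATCH 01 is a thin wrapper
around scikit-learn's grid search over one entry of a params config file. A
minimal, self-contained sketch of that pattern (not taken from this series:
the toy dataset, the parameter values and n_jobs=2 are illustrative, and it
assumes the pre-0.18 sklearn.grid_search API used throughout these patches):

    from sklearn.datasets import make_classification
    from sklearn.svm import SVC
    # GridSearchCV moved to sklearn.model_selection in scikit-learn 0.18+;
    # this sketch sticks to the sklearn.grid_search API used in the patch.
    from sklearn.grid_search import GridSearchCV

    # Stand-in for the (feature_values, label) observations read from the
    # observations file.
    X, y = make_classification(n_samples=200, n_features=5, random_state=0)

    # Roughly the linear-kernel SVC grid from config/linear_svc.params.yaml.
    param_grid = {"kernel": ["linear"], "probability": [True],
                  "C": [0.1, 1, 10]}

    grid_model = GridSearchCV(estimator=SVC(), param_grid=param_grid,
                              scoring="roc_auc", cv=5, n_jobs=2)
    grid_model.fit(X, y)

    # tune.py selects its "best fit" the same way:
    best_params, best_score, _ = max(grid_model.grid_scores_,
                                     key=lambda score: score[1])
    print(best_params, round(best_score, 3))
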
--- config/linear_svc.params.yaml | 8 + ...s.yaml => sklearn_classifiers.params.yaml} | 24 +-- revscoring/scorer_models/__init__.py | 3 +- revscoring/scorer_models/nb.py | 2 +- revscoring/scorer_models/rf.py | 2 +- revscoring/scorer_models/svc.py | 2 +- revscoring/utilities/train_test.py | 65 ++---- revscoring/utilities/tune.py | 185 +++++++++++++----- revscoring/utilities/util.py | 60 ++++-- 9 files changed, 219 insertions(+), 132 deletions(-) create mode 100644 config/linear_svc.params.yaml rename config/{all_classifiers.yaml => sklearn_classifiers.params.yaml} (70%) diff --git a/config/linear_svc.params.yaml b/config/linear_svc.params.yaml new file mode 100644 index 00000000..8c4d2e93 --- /dev/null +++ b/config/linear_svc.params.yaml @@ -0,0 +1,8 @@ + +SVC: + class: sklearn.svm.SVC + params: + - + kernel: ['linear'] + probability: [true] + C: [0.1, 1, 10] diff --git a/config/all_classifiers.yaml b/config/sklearn_classifiers.params.yaml similarity index 70% rename from config/all_classifiers.yaml rename to config/sklearn_classifiers.params.yaml index 6474d1c3..277d65a5 100644 --- a/config/all_classifiers.yaml +++ b/config/sklearn_classifiers.params.yaml @@ -9,32 +9,32 @@ GradientBoostingClassifier: RandomForestClassifier: class: sklearn.ensemble.RandomForestClassifier params: - n_estimators: [10, 20, 40, 80, 160, 320, 640], - min_samples_leaf: [1, 2, 4, 8, 16], - max_features: [auto, log2, None], + n_estimators: [10, 20, 40, 80, 160, 320, 640] + min_samples_leaf: [1, 2, 4, 8, 16] + max_features: [auto, log2, None] criterion: [gini, entropy] LogisticRegression: class: sklearn.ensemble.GradientBoostingClassifier params: - - penalty: "l1", - C: [0.1, 1, 10, 100], + penalty: "l1" + C: [0.1, 1, 10, 100] solver: [liblinear] - - penalty: "l2", - C: [0.1, 1, 10, 100], + penalty: "l2" + C: [0.1, 1, 10, 100] solver: [newton-cg, lbfgs, sag] SVC: class: sklearn.svm.SVC params: - - kernel: [rbf], - probability: True, - gamma: [1e-3, 1e-4, auto], + kernel: [rbf] + probability: True + gamma: [1e-3, 1e-4, auto] C: [0.1, 1, 10, 100] - - kernel: [linear], - probability: True, + kernel: [linear] + probability: True C: [0.1, 1, 10, 100] GaussianNB: class: sklearn.naive_bayes.GaussianNB diff --git a/revscoring/scorer_models/__init__.py b/revscoring/scorer_models/__init__.py index deb40379..55565735 100644 --- a/revscoring/scorer_models/__init__.py +++ b/revscoring/scorer_models/__init__.py @@ -27,7 +27,8 @@ from .svc import SVC, SVCModel, LinearSVC, LinearSVCModel, RBFSVC, RBFSVCModel from .nb import (NB, NBModel, GaussianNB, GaussianNBModel, MultinomialNB, MultinomialNBModel, BernoulliNB, BernoulliNBModel) -from .scorer_model import ScorerModel, MLScorerModel, ScikitLearnClassifier +from .scorer_model import ScorerModel, MLScorerModel +from .sklearn_classifier import ScikitLearnClassifier from .rf import RF, RFModel __all__ = [ diff --git a/revscoring/scorer_models/nb.py b/revscoring/scorer_models/nb.py index 2aeb6216..10bf3afa 100644 --- a/revscoring/scorer_models/nb.py +++ b/revscoring/scorer_models/nb.py @@ -17,7 +17,7 @@ from sklearn import naive_bayes -from .scorer_model import ScikitLearnClassifier +from .sklearn_classifier import ScikitLearnClassifier logger = logging.getLogger("revscoring.scorers.nb") diff --git a/revscoring/scorer_models/rf.py b/revscoring/scorer_models/rf.py index d97b09f9..185074b6 100644 --- a/revscoring/scorer_models/rf.py +++ b/revscoring/scorer_models/rf.py @@ -9,7 +9,7 @@ from sklearn.ensemble import RandomForestClassifier -from .scorer_model import ScikitLearnClassifier 
+from .sklearn_classifier import ScikitLearnClassifier logger = logging.getLogger("revscoring.scorers.rf") diff --git a/revscoring/scorer_models/svc.py b/revscoring/scorer_models/svc.py index d8a588e3..bb4e6c51 100644 --- a/revscoring/scorer_models/svc.py +++ b/revscoring/scorer_models/svc.py @@ -20,7 +20,7 @@ from sklearn import svm -from .scorer_model import ScikitLearnClassifier +from .sklearn_classifier import ScikitLearnClassifier class SVC(ScikitLearnClassifier): diff --git a/revscoring/utilities/train_test.py b/revscoring/utilities/train_test.py index 8175a4a2..7e82dde5 100644 --- a/revscoring/utilities/train_test.py +++ b/revscoring/utilities/train_test.py @@ -40,8 +40,9 @@ import sys import docopt +import yamlconf -from .util import import_from_path +from . import util logger = logging.getLogger(__name__) @@ -54,8 +55,8 @@ def main(argv=None): format='%(asctime)s %(levelname)s:%(name)s -- %(message)s' ) - ScorerModel = import_from_path(args['']) - features = import_from_path(args['']) + ScorerModel = yamlconf.import_module(args['']) + features = yamlconf.import_module(args['']) version = args['--version'] @@ -67,55 +68,29 @@ def main(argv=None): scorer_model = ScorerModel(features, version=version, **model_kwargs) if args['--values-labels'] == "": - values_labels_file = sys.stdin + observations_f = sys.stdin else: - values_labels_file = open(args['--values-labels'], 'r') + observations_f = open(args['--values-labels'], 'r') if args['--model-file'] == "": model_file = sys.stdout.buffer else: model_file = open(args['--model-file'], 'wb') - decode_label = DECODERS[args['--label-type']] + decode_label = util.DECODERS[args['--label-type']] - feature_labels = read_value_labels(values_labels_file, - scorer_model.features, - decode_label) + observations = util.read_observations(observations_f, + scorer_model.features, + decode_label) test_prop = float(args['--test-prop']) - run(feature_labels, model_file, scorer_model, test_prop) + run(observations, model_file, scorer_model, test_prop) -DECODERS = { - 'int': lambda v: int(v), - 'float': lambda v: float(v), - 'str': lambda v: str(v), - 'bool': lambda v: v in ("True", "true", "1", "T", "y", "Y") -} +def run(observations, model_file, scorer_model, test_prop): -def read_value_labels(f, features, decode_label): - for line in f: - parts = line.strip().split("\t") - values = parts[:-1] - label = parts[-1] - - label = decode_label(label) - - feature_values = [] - for feature, value in zip(features, values): - - if feature.returns == bool: - feature_values.append(value == "True") - else: - feature_values.append(feature.returns(value)) - - yield feature_values, label - - -def run(feature_labels, model_file, scorer_model, test_prop): - - scorer_model = _train_test(scorer_model, feature_labels, test_prop) + scorer_model = _train_test(scorer_model, observations, test_prop) sys.stderr.write(scorer_model.format_info()) @@ -124,19 +99,17 @@ def run(feature_labels, model_file, scorer_model, test_prop): scorer_model.dump(model_file) -def _train_test(scorer_model, feature_labels, test_prop): - feature_labels = list(feature_labels) - random.shuffle(feature_labels) - - test_set_size = int(len(feature_labels) * test_prop) - test_set = feature_labels[:test_set_size] +def _train_test(scorer_model, observations, test_prop): + train_set, test_set = util.train_test_split(observations, + test_prop=test_prop) + logger.debug("Test set: {0}".format(len(test_set))) - - train_set = feature_labels[test_set_size:] logger.debug("Train set: {0}".format(len(train_set))) + 
logger.info("Training model...") scorer_model.train(train_set) + logger.info("Testing model...") scorer_model.test(test_set) return scorer_model diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py index 89e3a893..bfd72afb 100644 --- a/revscoring/utilities/tune.py +++ b/revscoring/utilities/tune.py @@ -3,43 +3,108 @@ model/configuration. Usage: - tune [--observations=] - [--scoring=] - [--test-prop=] - [--folds=] - [--report=] - [--label-type=] - [--verbose] - [--debug] + tune [--observations=] + [--scoring=] + [--test-prop=] + [--folds=] + [--report=] + [--label-type=] + [--processes=] + [--verbose] + [--debug] + +Options: + The path to a YAML configuration file containing the + models and parameter values to search when tuning + The classpath to a feature_list to use when + interpreting the feature values of the observations + --observations= The path to a file containing observations to train + and test against. [default: ] + --scoring= The type of scoring strategy to optimize for when + choosing parameter sets [default: roc_auc] + --test-prop= The proportion of observations that should be held + asside for testing. [default: 0.25] + --folds= The number of cross-validation folds to try + [default: 5] + --report= Path to a file to write the tuning report to + [default: ] + --label-type= A type describing the value to expect as a label + [default: str] + --processes= The number of parallel processes to start for + model building [default: ] + --verbose Print progress information to stderr + --debug Print debug information to stderr """ +import datetime +import json import logging import multiprocessing -import random +import sys -import numpy as np +import docopt import yamlconf from sklearn import cross_validation, grid_search -from sklearn.cross_validation import StratifiedKFold -from sklearn.metrics import (accuracy_score, auc, f1_score, - precision_recall_curve, - precision_recall_fscore_support, precision_score, - recall_score, roc_auc_score, roc_curve) +from sklearn.metrics import f1_score, roc_auc_score +from tabulate import tabulate + +from . 
import util logger = logging.getLogger(__name__) -def run(params_config, observations, scoring, test_prop, folds, report, - processes, verbose): +def main(argv=None): + args = docopt.docopt(__doc__, argv=argv) + + logging.basicConfig( + level=logging.INFO if not args['--debug'] else logging.DEBUG, + format='%(asctime)s %(levelname)s:%(name)s -- %(message)s' + ) + + params_config = yamlconf.load(open(args[''])) + + features = yamlconf.import_module(args['']) + + label_decoder = util.DECODERS[args['--label-type']] + if args['--observations'] == "": + observations_f = sys.stdin + else: + observations_f = open(args['--observations']) + + observations = util.read_observations(observations_f, features, + label_decoder) + + scoring = args['--scoring'] + test_prop = float(args['--test-prop']) + folds = int(args['--folds']) + + if args['--report'] == "": + report = sys.stdout + else: + report = open(args['--report'], "w") + + if args['--processes'] == "": + processes = multiprocessing.cpu_count() + else: + processes = int(args['--processes']) + + verbose = args['--verbose'] + + run(params_config, features, observations, scoring, test_prop, folds, + report, processes, verbose) + + +def run(params_config, features, observations, scoring, test_prop, folds, + report, processes, verbose): # Split train and test - train_set, test_set = train_test_split(observations, test_prop=test_prop) + train_set, test_set = util.train_test_split(observations, + test_prop=test_prop) best_fits = [] # For each estimator, run gridsearch. - for name, config in params_config: - logger.info("Running gridsearch for {0}".format(name)) + for name, config in params_config.items(): EstimatorClass = yamlconf.import_module(config['class']) estimator = EstimatorClass() if not hasattr(estimator, "fit"): @@ -49,7 +114,7 @@ def run(params_config, observations, scoring, test_prop, folds, report, logger.info("Running gridsearch for {0}...".format(name)) grid_model = gridsearch(train_set, estimator, config['params'], scoring=scoring, folds=folds, - processes=processes) + processes=processes, verbose=verbose) logger.info("Completed gridsearch for {0}.".format(name)) best_params, best_score, _ = max(grid_model.grid_scores_, @@ -57,40 +122,56 @@ def run(params_config, observations, scoring, test_prop, folds, report, logger.info("\tBest fit: {0}={1} with {2}" .format(scoring, best_score, best_params)) - f1, roc_auc = test_model(test_set, grid_model) - logger.info("\tTest set fit: f1={0}, roc_auc={1}\n" - .format(f1, roc_auc)) + test_f1, test_auc = test_model(test_set, grid_model) + logger.info("\tTest fit: f1={0}, roc_auc={1}\n" + .format(test_f1, test_auc)) - best_fits.append((name, best_params, best_score, f1, roc_auc)) + best_fits.append((name, best_params, best_score, test_f1, test_auc)) - # TODO: should be tabular logger.info("\tGrid scores:") - for params, mean_score, scores in grid_model.grid_scores_: - logger.info("\t - %0.3f (+/-%0.03f) for %r" - % (mean_score, scores.std(), params)) - - -def train_test_split(observations, test_prop=0.25): - # Split train and test set from obs. 
- observations = list(observations) - random.shuffle(observations) - - test_set_size = int(len(observations) * test_prop) - test_set = observations[:test_set_size] - logger.debug("Test set: {0}".format(len(test_set))) - - train_set = observations[test_set_size:] - logger.debug("Train set: {0}".format(len(train_set))) - - return train_set, test_set + table = tabulate( + ((round(mean_score, 3), round(scores.std(), 3), + format_params(params)) + for params, mean_score, scores in + grid_model.grid_scores_), + headers=["mean(score)", "std(score)", "params"] + ) + for line in table.split("\n"): + logger.info("\t\t" + line) + + # Sort the results by the best fit + best_fits.sort(key=lambda r: r[2]) + possible_labels = set(label for _, label in observations) + + # Write out the report + report.write("# Model tuning report\n") + report.write("- Date: {0}\n".format(datetime.datetime().isoformat())) + report.write("- Train set: {0}\n".format(len(train_set))) + report.write("- Test set: {0}\n".format(len(test_set))) + report.write("- Labels: {0}\n".format(tuple(possible_labels))) + report.write("\n") + report.write("# Best fits\n") + report.write(tabulate( + ((name, format_params(par), round(score, 3), round(test_f1, 3), + round(test_auc, 3)) + for name, par, score, test_f1, test_auc in best_fits), + headers=["model", "parameters", "score", "test_f1", "test_auc"] + )) + + report.close() + + +def format_params(doc): + return ", ".join("{0}={1}".format(k, json.dumps(v)) for k, v in doc) def gridsearch(observations, estimator, param_grid=None, - scoring='roc_auc', folds=5, processes=None): + scoring='roc_auc', folds=5, processes=None, verbose=False): """ Determine the best model via cross validation. This should be run on training data with test data withheld. """ + feature_values, labels = (list(vals) for vals in zip(*observations)) param_grid = param_grid or {} processes = processes or multiprocessing.cpu_count() @@ -102,7 +183,8 @@ def gridsearch(observations, estimator, param_grid=None, estimator=estimator, param_grid=param_grid, scoring=scoring, - n_jobs=processes + n_jobs=processes, + verbose=verbose ) # This line actually performs the gridsearch @@ -111,16 +193,17 @@ def gridsearch(observations, estimator, param_grid=None, return grid_model -def test_model(observations, grid_model): +def test_model(observations, grid_model): feature_values, labels = (list(vals) for vals in zip(*observations)) - predictions = model_grid.predict(feature_values) - scores = get_scores(model_grid, feature_values) + predictions = grid_model.predict(feature_values) + scores = get_scores(grid_model, feature_values) return f1_score(labels, predictions), roc_auc_score(labels, scores) -# To compute an ROC score, you need scores for each example, either a class probability -# of a distance from the decision boundary + +# To compute an ROC score, you need scores for each example, either a class +# probability or a distance from the decision boundary def get_scores(model, X): try: scores = model.decision_function(X) diff --git a/revscoring/utilities/util.py b/revscoring/utilities/util.py index 077daa3a..d41e3693 100644 --- a/revscoring/utilities/util.py +++ b/revscoring/utilities/util.py @@ -1,23 +1,5 @@ +import random import sys -from importlib import import_module - -sys.path.insert(0, ".") # Necessary for working in other modules - - -def import_from_path(path): - try: - module = import_module(path) - return module - except ImportError: - parts = path.split(".") - module_path = ".".join(parts[:-1]) - attribute_name = 
parts[-1] - - module = import_module(module_path) - - attribute = getattr(module, attribute_name) - - return attribute def encode(val, none_val="NULL"): @@ -29,3 +11,43 @@ def encode(val, none_val="NULL"): val = str(val) return val.replace("\t", "\\t").replace("\n", "\\n") + + +DECODERS = { + 'int': lambda v: int(v), + 'float': lambda v: float(v), + 'str': lambda v: str(v), + 'bool': lambda v: v in ("True", "true", "1", "T", "y", "Y") +} + + +def read_observations(f, features, decode_label): + for line in f: + parts = line.strip().split("\t") + values = parts[:-1] + label = parts[-1] + + label = decode_label(label) + + feature_values = [] + for feature, value in zip(features, values): + + if feature.returns == bool: + feature_values.append(value == "True") + else: + feature_values.append(feature.returns(value)) + + yield feature_values, label + + +def train_test_split(observations, test_prop=0.25): + # Split train and test set from obs. + observations = list(observations) + random.shuffle(observations) + + test_set_size = int(len(observations) * test_prop) + + test_set = observations[:test_set_size] + train_set = observations[test_set_size:] + + return train_set, test_set From f5c9ee963f5df044477f32559e8633baaf04f251 Mon Sep 17 00:00:00 2001 From: halfak Date: Sat, 28 Nov 2015 16:22:32 -0600 Subject: [PATCH 03/12] Minor fixes to tuning. Complete test run. --- config/linear_svc.params.yaml | 2 +- config/sklearn_classifiers.params.yaml | 24 ++++++++--------- revscoring/utilities/tune.py | 36 ++++++++++++++++---------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/config/linear_svc.params.yaml b/config/linear_svc.params.yaml index 8c4d2e93..b6c91bc9 100644 --- a/config/linear_svc.params.yaml +++ b/config/linear_svc.params.yaml @@ -5,4 +5,4 @@ SVC: - kernel: ['linear'] probability: [true] - C: [0.1, 1, 10] + C: [0.1, 1] diff --git a/config/sklearn_classifiers.params.yaml b/config/sklearn_classifiers.params.yaml index 277d65a5..aaf1a718 100644 --- a/config/sklearn_classifiers.params.yaml +++ b/config/sklearn_classifiers.params.yaml @@ -4,37 +4,37 @@ GradientBoostingClassifier: params: n_estimators: [150, 250, 500] max_depth: [4, 5, 6] - max_features: [log2] + max_features: ["log2"] learning_rate: [0.01] RandomForestClassifier: class: sklearn.ensemble.RandomForestClassifier params: n_estimators: [10, 20, 40, 80, 160, 320, 640] min_samples_leaf: [1, 2, 4, 8, 16] - max_features: [auto, log2, None] - criterion: [gini, entropy] + max_features: ["auto", "log2", null] + criterion: ["gini", "entropy"] LogisticRegression: class: sklearn.ensemble.GradientBoostingClassifier params: - - penalty: "l1" + penalty: ["l1"] C: [0.1, 1, 10, 100] - solver: [liblinear] + solver: ["liblinear"] - - penalty: "l2" + penalty: ["l2"] C: [0.1, 1, 10, 100] - solver: [newton-cg, lbfgs, sag] + solver: ["newton-cg", "lbfgs", "sag"] SVC: class: sklearn.svm.SVC params: - - kernel: [rbf] - probability: True - gamma: [1e-3, 1e-4, auto] + kernel: ["rbf"] + probability: [true] + gamma: [0.001, 0.0001, "auto"] C: [0.1, 1, 10, 100] - - kernel: [linear] - probability: True + kernel: ["linear"] + probability: [true] C: [0.1, 1, 10, 100] GaussianNB: class: sklearn.naive_bayes.GaussianNB diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py index bfd72afb..ec6ecee2 100644 --- a/revscoring/utilities/tune.py +++ b/revscoring/utilities/tune.py @@ -41,6 +41,7 @@ import logging import multiprocessing import sys +import time import docopt import yamlconf @@ -111,16 +112,25 @@ def run(params_config, 
features, observations, scoring, test_prop, folds, raise RuntimeError("Estimator {0} does not have a fit() method." .format(config['class'])) + parameter_grid = grid_search.ParameterGrid(config['params']) logger.info("Running gridsearch for {0}...".format(name)) + logger.debug("{0} parameter sets:".format(len(parameter_grid))) + for params in parameter_grid: + logger.debug(" - {0}".format(format_params(params))) + logger.debug("{0} folds per parameter set".format(folds)) + + start = time.time() grid_model = gridsearch(train_set, estimator, config['params'], scoring=scoring, folds=folds, processes=processes, verbose=verbose) - logger.info("Completed gridsearch for {0}.".format(name)) + logger.info("Completed gridsearch for {0} in {1} hours." + .format(name, round((time.time() - start) / (60 * 60), 3))) best_params, best_score, _ = max(grid_model.grid_scores_, key=lambda x: x[1]) logger.info("\tBest fit: {0}={1} with {2}" - .format(scoring, best_score, best_params)) + .format(scoring, round(best_score, 3), + format_params(best_params))) test_f1, test_auc = test_model(test_set, grid_model) logger.info("\tTest fit: f1={0}, roc_auc={1}\n" @@ -140,15 +150,16 @@ def run(params_config, features, observations, scoring, test_prop, folds, logger.info("\t\t" + line) # Sort the results by the best fit - best_fits.sort(key=lambda r: r[2]) - possible_labels = set(label for _, label in observations) + best_fits.sort(key=lambda r: r[2], reverse=True) + possible_labels = set(label for _, label in train_set) # Write out the report report.write("# Model tuning report\n") - report.write("- Date: {0}\n".format(datetime.datetime().isoformat())) + report.write("- Date: {0}\n".format(datetime.datetime.now().isoformat())) report.write("- Train set: {0}\n".format(len(train_set))) report.write("- Test set: {0}\n".format(len(test_set))) - report.write("- Labels: {0}\n".format(tuple(possible_labels))) + report.write("- Labels: {0}\n".format(json.dumps(list(possible_labels)))) + report.write("- Scoring: {0}\n".format(scoring)) report.write("\n") report.write("# Best fits\n") report.write(tabulate( @@ -157,12 +168,14 @@ def run(params_config, features, observations, scoring, test_prop, folds, for name, par, score, test_f1, test_auc in best_fits), headers=["model", "parameters", "score", "test_f1", "test_auc"] )) + report.write("\n") report.close() def format_params(doc): - return ", ".join("{0}={1}".format(k, json.dumps(v)) for k, v in doc) + return ", ".join("{0}={1}".format(k, json.dumps(v)) + for k, v in doc.items()) def gridsearch(observations, estimator, param_grid=None, @@ -176,18 +189,15 @@ def gridsearch(observations, estimator, param_grid=None, processes = processes or multiprocessing.cpu_count() - stratified_cv = cross_validation.StratifiedKFold(labels, n_folds=folds) - grid_model = grid_search.GridSearchCV( - cv=stratified_cv, + cv=folds, estimator=estimator, param_grid=param_grid, scoring=scoring, - n_jobs=processes, - verbose=verbose + n_jobs=processes ) - # This line actually performs the gridsearch + # To perform the gridsearch, we run fit() feature_values, labels = (list(vals) for vals in zip(*observations)) grid_model.fit(feature_values, labels) From fc37462bcf6155f55f2e092445f32b066d081b00 Mon Sep 17 00:00:00 2001 From: halfak Date: Sat, 28 Nov 2015 19:06:05 -0600 Subject: [PATCH 04/12] Removes old config ptwiki config files that were never used. 
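
As a reference for how the corrected *.params.yaml files from the previous
patch are consumed: tune.py hands each model's params entry to sklearn's
ParameterGrid, which accepts either a single mapping of parameter names to
candidate values or a list of such mappings (the SVC entry uses the list form
so that rbf-only options like gamma stay out of the linear grid). A small
sketch, with the YAML already loaded into the plain Python structures that
yamlconf.load() would produce; the value lists are trimmed for brevity:

    # ParameterGrid moved to sklearn.model_selection in scikit-learn 0.18+.
    from sklearn.grid_search import ParameterGrid

    # A trimmed-down "params" value for the SVC entry: two sub-grids,
    # one per kernel.
    svc_params = [
        {"kernel": ["rbf"], "probability": [True],
         "gamma": [0.001, 0.0001], "C": [0.1, 1, 10]},
        {"kernel": ["linear"], "probability": [True], "C": [0.1, 1, 10]},
    ]

    grid = ParameterGrid(svc_params)
    print(len(grid))   # 9 sets: 6 rbf (2 gammas x 3 Cs) + 3 linear
    for params in grid:
        print(params)  # e.g. {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf', ...}
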
--- config/ptwiki_api.yaml | 4 ---- config/ptwiki_svc.yaml | 4 ---- 2 files changed, 8 deletions(-) delete mode 100644 config/ptwiki_api.yaml delete mode 100644 config/ptwiki_svc.yaml diff --git a/config/ptwiki_api.yaml b/config/ptwiki_api.yaml deleted file mode 100644 index cdb94539..00000000 --- a/config/ptwiki_api.yaml +++ /dev/null @@ -1,4 +0,0 @@ - -class: revscores.APIExtractor -url: https://pt.wikipedia.org/w/api.php -language: revscores.language.Portuguese diff --git a/config/ptwiki_svc.yaml b/config/ptwiki_svc.yaml deleted file mode 100644 index 0b46bf9b..00000000 --- a/config/ptwiki_svc.yaml +++ /dev/null @@ -1,4 +0,0 @@ -/* This is just thinking out loud */ - -class: revscores.scorers.LinearSVC -file: ptwiki_svc.model From e944636b99963bc115d72ae5ac5034ff5d32c4dc Mon Sep 17 00:00:00 2001 From: halfak Date: Sat, 28 Nov 2015 19:44:07 -0600 Subject: [PATCH 05/12] Cleanup to tuning utility and add config files for each classifier's param space. --- config/gradient_boost.params.yaml | 7 ++ config/linear_svc.params.yaml | 8 --- config/logistic_regression.params.yaml | 6 ++ config/naive_bayes.params.yaml | 11 +++ config/random_forest.params.yaml | 7 ++ config/sklearn_classifiers.params.yaml | 49 -------------- config/svc.params.yaml | 12 ++++ revscoring/utilities/tune.py | 92 ++++++++++++++------------ 8 files changed, 92 insertions(+), 100 deletions(-) create mode 100644 config/gradient_boost.params.yaml delete mode 100644 config/linear_svc.params.yaml create mode 100644 config/logistic_regression.params.yaml create mode 100644 config/naive_bayes.params.yaml create mode 100644 config/random_forest.params.yaml delete mode 100644 config/sklearn_classifiers.params.yaml create mode 100644 config/svc.params.yaml diff --git a/config/gradient_boost.params.yaml b/config/gradient_boost.params.yaml new file mode 100644 index 00000000..3645d5c8 --- /dev/null +++ b/config/gradient_boost.params.yaml @@ -0,0 +1,7 @@ +GradientBoostingClassifier: + class: sklearn.ensemble.GradientBoostingClassifier + params: + n_estimators: [100, 300, 500, 700] + max_depth: [1, 3, 5, 7] + max_features: ["log2"] + learning_rate: [0.01, 0.1, 0.5, 1] diff --git a/config/linear_svc.params.yaml b/config/linear_svc.params.yaml deleted file mode 100644 index b6c91bc9..00000000 --- a/config/linear_svc.params.yaml +++ /dev/null @@ -1,8 +0,0 @@ - -SVC: - class: sklearn.svm.SVC - params: - - - kernel: ['linear'] - probability: [true] - C: [0.1, 1] diff --git a/config/logistic_regression.params.yaml b/config/logistic_regression.params.yaml new file mode 100644 index 00000000..3116dec0 --- /dev/null +++ b/config/logistic_regression.params.yaml @@ -0,0 +1,6 @@ + +LogisticRegression: + class: sklearn.linear_model.LogisticRegression + params: + penalty: ["l1", "l2"] + C: [0.1, 1, 10] diff --git a/config/naive_bayes.params.yaml b/config/naive_bayes.params.yaml new file mode 100644 index 00000000..fb67b1aa --- /dev/null +++ b/config/naive_bayes.params.yaml @@ -0,0 +1,11 @@ +GaussianNB: + class: sklearn.naive_bayes.GaussianNB + params: + alpha: [0.1, 1, 10] +BernoulliNB: + class: sklearn.naive_bayes.BernouliNB + params: {} +MultinomialNB: + class: sklearn.naive_bayes.MultinomialNB + params: + alpha: [0.1, 1, 10] diff --git a/config/random_forest.params.yaml b/config/random_forest.params.yaml new file mode 100644 index 00000000..31550d48 --- /dev/null +++ b/config/random_forest.params.yaml @@ -0,0 +1,7 @@ +RandomForestClassifier: + class: sklearn.ensemble.RandomForestClassifier + params: + n_estimators: [10, 20, 40, 80, 160, 320, 640] + 
min_samples_leaf: [1, 3, 5, 7, 13] + max_features: ["log2"] + criterion: ["gini", "entropy"] diff --git a/config/sklearn_classifiers.params.yaml b/config/sklearn_classifiers.params.yaml deleted file mode 100644 index aaf1a718..00000000 --- a/config/sklearn_classifiers.params.yaml +++ /dev/null @@ -1,49 +0,0 @@ - -GradientBoostingClassifier: - class: sklearn.ensemble.GradientBoostingClassifier - params: - n_estimators: [150, 250, 500] - max_depth: [4, 5, 6] - max_features: ["log2"] - learning_rate: [0.01] -RandomForestClassifier: - class: sklearn.ensemble.RandomForestClassifier - params: - n_estimators: [10, 20, 40, 80, 160, 320, 640] - min_samples_leaf: [1, 2, 4, 8, 16] - max_features: ["auto", "log2", null] - criterion: ["gini", "entropy"] -LogisticRegression: - class: sklearn.ensemble.GradientBoostingClassifier - params: - - - penalty: ["l1"] - C: [0.1, 1, 10, 100] - solver: ["liblinear"] - - - penalty: ["l2"] - C: [0.1, 1, 10, 100] - solver: ["newton-cg", "lbfgs", "sag"] -SVC: - class: sklearn.svm.SVC - params: - - - kernel: ["rbf"] - probability: [true] - gamma: [0.001, 0.0001, "auto"] - C: [0.1, 1, 10, 100] - - - kernel: ["linear"] - probability: [true] - C: [0.1, 1, 10, 100] -GaussianNB: - class: sklearn.naive_bayes.GaussianNB - params: - alpha: [0.1, 1, 10] -BernoulliNB: - class: sklearn.naive_bayes.BernouliNB - params: {} -MultinomialNB: - class: sklearn.naive_bayes.MultinomialNB - params: - alpha: [0.1, 1, 10] diff --git a/config/svc.params.yaml b/config/svc.params.yaml new file mode 100644 index 00000000..3ab35a2c --- /dev/null +++ b/config/svc.params.yaml @@ -0,0 +1,12 @@ +SVC: + class: sklearn.svm.SVC + params: + - + kernel: ["rbf"] + probability: [true] + gamma: [0.001, 0.0001, "auto"] + C: [0.1, 1, 10] + - + kernel: ["linear"] + probability: [true] + C: [0.1, 1, 10] diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py index ec6ecee2..2ce2175b 100644 --- a/revscoring/utilities/tune.py +++ b/revscoring/utilities/tune.py @@ -42,10 +42,11 @@ import multiprocessing import sys import time +import traceback import docopt import yamlconf -from sklearn import cross_validation, grid_search +from sklearn import grid_search from sklearn.metrics import f1_score, roc_auc_score from tabulate import tabulate @@ -106,48 +107,53 @@ def run(params_config, features, observations, scoring, test_prop, folds, # For each estimator, run gridsearch. for name, config in params_config.items(): - EstimatorClass = yamlconf.import_module(config['class']) - estimator = EstimatorClass() - if not hasattr(estimator, "fit"): - raise RuntimeError("Estimator {0} does not have a fit() method." - .format(config['class'])) - - parameter_grid = grid_search.ParameterGrid(config['params']) - logger.info("Running gridsearch for {0}...".format(name)) - logger.debug("{0} parameter sets:".format(len(parameter_grid))) - for params in parameter_grid: - logger.debug(" - {0}".format(format_params(params))) - logger.debug("{0} folds per parameter set".format(folds)) - - start = time.time() - grid_model = gridsearch(train_set, estimator, config['params'], - scoring=scoring, folds=folds, - processes=processes, verbose=verbose) - - logger.info("Completed gridsearch for {0} in {1} hours." 
- .format(name, round((time.time() - start) / (60 * 60), 3))) - best_params, best_score, _ = max(grid_model.grid_scores_, - key=lambda x: x[1]) - logger.info("\tBest fit: {0}={1} with {2}" - .format(scoring, round(best_score, 3), - format_params(best_params))) - - test_f1, test_auc = test_model(test_set, grid_model) - logger.info("\tTest fit: f1={0}, roc_auc={1}\n" - .format(test_f1, test_auc)) - - best_fits.append((name, best_params, best_score, test_f1, test_auc)) - - logger.info("\tGrid scores:") - table = tabulate( - ((round(mean_score, 3), round(scores.std(), 3), - format_params(params)) - for params, mean_score, scores in - grid_model.grid_scores_), - headers=["mean(score)", "std(score)", "params"] - ) - for line in table.split("\n"): - logger.info("\t\t" + line) + try: + EstimatorClass = yamlconf.import_module(config['class']) + estimator = EstimatorClass() + if not hasattr(estimator, "fit"): + raise RuntimeError("Estimator {0} does not have a fit() method." + .format(config['class'])) + + parameter_grid = grid_search.ParameterGrid(config['params']) + logger.info("Running gridsearch for {0}...".format(name)) + logger.debug("{0} parameter sets:".format(len(parameter_grid))) + for params in parameter_grid: + logger.debug(" - {0}".format(format_params(params))) + logger.debug("{0} folds per parameter set".format(folds)) + + start = time.time() + grid_model = gridsearch(train_set, estimator, config['params'], + scoring=scoring, folds=folds, + processes=processes, verbose=verbose) + + logger.info("Completed gridsearch for {0} in {1} hours." + .format(name, round((time.time() - start) / (60 * 60), 3))) + best_params, best_score, _ = max(grid_model.grid_scores_, + key=lambda x: x[1]) + logger.info("\tBest fit: {0}={1} with {2}" + .format(scoring, round(best_score, 3), + format_params(best_params))) + + test_f1, test_auc = test_model(test_set, grid_model) + logger.info("\tTest fit: f1={0}, roc_auc={1}\n" + .format(test_f1, test_auc)) + + best_fits.append((name, best_params, best_score, test_f1, test_auc)) + + logger.info("\tGrid scores:") + table = tabulate( + ((round(mean_score, 3), round(scores.std(), 3), + format_params(params)) + for params, mean_score, scores in + grid_model.grid_scores_), + headers=["mean(score)", "std(score)", "params"] + ) + for line in table.split("\n"): + logger.info("\t\t" + line) + except Exception: + logger.warn("An error occurred while trying to fit {0}" + .format(name)) + logger.warn("Exception:\n" + traceback.format_exc()) # Sort the results by the best fit best_fits.sort(key=lambda r: r[2], reverse=True) From 1c4787be04a66025a94923bf322ae82d763a928e Mon Sep 17 00:00:00 2001 From: halfak Date: Mon, 30 Nov 2015 19:58:31 -0600 Subject: [PATCH 06/12] Switches tuning utility to use multiprocessing directly. --- config/svc.params.yaml | 2 + revscoring/utilities/tune.py | 211 +++++++++++++++++------------------ 2 files changed, 102 insertions(+), 111 deletions(-) diff --git a/config/svc.params.yaml b/config/svc.params.yaml index 3ab35a2c..2ac68c06 100644 --- a/config/svc.params.yaml +++ b/config/svc.params.yaml @@ -5,8 +5,10 @@ SVC: kernel: ["rbf"] probability: [true] gamma: [0.001, 0.0001, "auto"] + cache_size: [1000] C: [0.1, 1, 10] - kernel: ["linear"] probability: [true] + cache_size: [1000] C: [0.1, 1, 10] diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py index 2ce2175b..dcdaa47b 100644 --- a/revscoring/utilities/tune.py +++ b/revscoring/utilities/tune.py @@ -22,8 +22,6 @@ and test against. 
[default: ] --scoring= The type of scoring strategy to optimize for when choosing parameter sets [default: roc_auc] - --test-prop= The proportion of observations that should be held - asside for testing. [default: 0.25] --folds= The number of cross-validation folds to try [default: 5] --report= Path to a file to write the tuning report to @@ -43,14 +41,15 @@ import sys import time import traceback +from collections import defaultdict import docopt import yamlconf -from sklearn import grid_search -from sklearn.metrics import f1_score, roc_auc_score +from sklearn import cross_validation, grid_search from tabulate import tabulate from . import util +from .. import __version__ logger = logging.getLogger(__name__) @@ -65,7 +64,8 @@ def main(argv=None): params_config = yamlconf.load(open(args[''])) - features = yamlconf.import_module(args['']) + features_path = args[''] + features = yamlconf.import_module(features_path) label_decoder = util.DECODERS[args['--label-type']] if args['--observations'] == "": @@ -77,7 +77,6 @@ def main(argv=None): label_decoder) scoring = args['--scoring'] - test_prop = float(args['--test-prop']) folds = int(args['--folds']) if args['--report'] == "": @@ -92,90 +91,85 @@ def main(argv=None): verbose = args['--verbose'] - run(params_config, features, observations, scoring, test_prop, folds, + run(params_config, features_path, observations, scoring, folds, report, processes, verbose) -def run(params_config, features, observations, scoring, test_prop, folds, +def run(params_config, features_path, observations, scoring, folds, report, processes, verbose): - # Split train and test - train_set, test_set = util.train_test_split(observations, - test_prop=test_prop) + observations = list(observations) - best_fits = [] + # Prepare the worker pool + logger.debug("Starting up multiprocessing pool (processes={0})" + .format(processes)) + pool = multiprocessing.Pool(processes=processes) - # For each estimator, run gridsearch. - for name, config in params_config.items(): - try: - EstimatorClass = yamlconf.import_module(config['class']) - estimator = EstimatorClass() - if not hasattr(estimator, "fit"): - raise RuntimeError("Estimator {0} does not have a fit() method." - .format(config['class'])) - - parameter_grid = grid_search.ParameterGrid(config['params']) - logger.info("Running gridsearch for {0}...".format(name)) - logger.debug("{0} parameter sets:".format(len(parameter_grid))) - for params in parameter_grid: - logger.debug(" - {0}".format(format_params(params))) - logger.debug("{0} folds per parameter set".format(folds)) - - start = time.time() - grid_model = gridsearch(train_set, estimator, config['params'], - scoring=scoring, folds=folds, - processes=processes, verbose=verbose) - - logger.info("Completed gridsearch for {0} in {1} hours." 
- .format(name, round((time.time() - start) / (60 * 60), 3))) - best_params, best_score, _ = max(grid_model.grid_scores_, - key=lambda x: x[1]) - logger.info("\tBest fit: {0}={1} with {2}" - .format(scoring, round(best_score, 3), - format_params(best_params))) - - test_f1, test_auc = test_model(test_set, grid_model) - logger.info("\tTest fit: f1={0}, roc_auc={1}\n" - .format(test_f1, test_auc)) - - best_fits.append((name, best_params, best_score, test_f1, test_auc)) - - logger.info("\tGrid scores:") - table = tabulate( - ((round(mean_score, 3), round(scores.std(), 3), - format_params(params)) - for params, mean_score, scores in - grid_model.grid_scores_), - headers=["mean(score)", "std(score)", "params"] - ) - for line in table.split("\n"): - logger.info("\t\t" + line) - except Exception: - logger.warn("An error occurred while trying to fit {0}" - .format(name)) - logger.warn("Exception:\n" + traceback.format_exc()) - - # Sort the results by the best fit - best_fits.sort(key=lambda r: r[2], reverse=True) - possible_labels = set(label for _, label in train_set) - - # Write out the report + # Start writing the model tuning report + possible_labels = set(label for _, label in observations) report.write("# Model tuning report\n") + report.write("- Revscoring version: {0}\n".format(__version__)) + report.write("- Features: {0}\n".format(features_path)) report.write("- Date: {0}\n".format(datetime.datetime.now().isoformat())) - report.write("- Train set: {0}\n".format(len(train_set))) - report.write("- Test set: {0}\n".format(len(test_set))) + report.write("- Observations: {0}\n".format(len(observations))) report.write("- Labels: {0}\n".format(json.dumps(list(possible_labels)))) report.write("- Scoring: {0}\n".format(scoring)) + report.write("- Folds: {0}\n".format(folds)) report.write("\n") - report.write("# Best fits\n") - report.write(tabulate( - ((name, format_params(par), round(score, 3), round(test_f1, 3), - round(test_auc, 3)) - for name, par, score, test_f1, test_auc in best_fits), - headers=["model", "parameters", "score", "test_f1", "test_auc"] - )) + + # For each estimator and paramset, submit the job. + cv_result_sets = defaultdict(lambda : []) + for name, estimator, param_grid in _estimator_param_grid(params_config): + logger.debug("Submitting jobs for {0}:".format(name)) + for params in param_grid: + logger.debug("\tsubmitting {0}..." + .format(format_params(params))) + result = pool.apply_async(_cross_validate, + [observations, estimator, params], + {'scoring': scoring, 'folds': folds}) + cv_result_sets[name].append((params, result)) + + # Barrier synchronization + logger.info("Running gridsearch for {0} model/params pairs ..." + .format(sum(len(p_r) for p_r in cv_result_sets))) + grid_scores = [] + for name, param_results in cv_result_sets.items(): + for params, result in param_results: + scores = result.get() # This is a line that blocks + grid_scores.append((name, params, scores.mean(), scores.std())) + + # Write the rest of the report! First, print the top 10 combinations + report.write("# Top scoring configurations\n") + grid_scores.sort(key=lambda gs: gs[2], reverse=True) + table = tabulate( + ((name, round(mean_score, 3), round(scores.std(), 3), + format_params(params)) + for name, params, mean_score, scores in + grid_scores[:10]), + headers=["model", "mean(scores)", "std(scores)", "params"] + ) + report.write(table + "\n") report.write("\n") + # Now print out scores for each model. 
+ report.write("# Models\n") + for name, param_results in cv_result_sets.items(): + report.write("## {0}\n".format(name)) + + param_scores = ((p, r.get()) for p, r in param_results) + param_stats = [(p, s.mean(), s.std()) for p, s in param_scores] + param_stats.sort(key=lambda v:v[1], reverse=True) + + table = tabulate( + ((round(mean_score, 3), round(scores.std(), 3), + format_params(params)) + for params, mean_score, scores in + param_stats), + headers=["mean(scores)", "std(scores)", "params"] + ) + report.write(table + "\n") + report.write("\n") + report.close() @@ -184,45 +178,40 @@ def format_params(doc): for k, v in doc.items()) -def gridsearch(observations, estimator, param_grid=None, - scoring='roc_auc', folds=5, processes=None, verbose=False): - """ - Determine the best model via cross validation. This should be run on - training data with test data withheld. - """ - feature_values, labels = (list(vals) for vals in zip(*observations)) - param_grid = param_grid or {} - - processes = processes or multiprocessing.cpu_count() - - grid_model = grid_search.GridSearchCV( - cv=folds, - estimator=estimator, - param_grid=param_grid, - scoring=scoring, - n_jobs=processes - ) - - # To perform the gridsearch, we run fit() - feature_values, labels = (list(vals) for vals in zip(*observations)) - grid_model.fit(feature_values, labels) +def _estimator_param_grid(params_config): + for name, config in params_config.items(): + try: + EstimatorClass = yamlconf.import_module(config['class']) + estimator = EstimatorClass() + except Exception: + logger.warn("Could not load estimator {0}" + .format(config['class'])) + logger.warn("Exception:\n" + traceback.format_exc()) + continue - return grid_model + if not hasattr(estimator, "fit"): + logger.warn("Estimator {0} does not have a fit() method." + .format(config['class'])) + continue + param_grid = grid_search.ParameterGrid(config['params']) -def test_model(observations, grid_model): - feature_values, labels = (list(vals) for vals in zip(*observations)) - predictions = grid_model.predict(feature_values) - scores = get_scores(grid_model, feature_values) + yield name, estimator, param_grid - return f1_score(labels, predictions), roc_auc_score(labels, scores) +def _cross_validate(observations, estimator, params, scoring="roc_auc", + folds=5, verbose=False): -# To compute an ROC score, you need scores for each example, either a class -# probability or a distance from the decision boundary -def get_scores(model, X): - try: - scores = model.decision_function(X) - except: - scores = model.predict_proba(X)[:, 1] + start = time.time() + feature_values, labels = (list(vect) for vect in zip(*observations)) + estimator.set_params(**params) + scores = cross_validation.cross_val_score(estimator, feature_values, + labels, scoring=scoring, + cv=folds) + duration = time.time() - start + logging.debug("Cross-validated {0} with {1} in {2} hours: {3} ({4})" + .format(estimator, format_params(params), + round(duration / (60 * 60), 3), + round(scores.mean(), 3), + round(scores.std(), 3))) return scores From 3cc9f600ecd2a8a53811a7557a8c1f80b984232c Mon Sep 17 00:00:00 2001 From: halfak Date: Mon, 30 Nov 2015 20:07:32 -0600 Subject: [PATCH 07/12] Fixes minor issue in svc params config. 
--- config/svc.params.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/svc.params.yaml b/config/svc.params.yaml index 2ac68c06..04bd0576 100644 --- a/config/svc.params.yaml +++ b/config/svc.params.yaml @@ -4,7 +4,7 @@ SVC: - kernel: ["rbf"] probability: [true] - gamma: [0.001, 0.0001, "auto"] + gamma: [0.0, 0.001, 0.0001] cache_size: [1000] C: [0.1, 1, 10] - From 13369a27d6e2d2b8a81774b501ca7a61ad1c8145 Mon Sep 17 00:00:00 2001 From: Aaron Halfaker Date: Wed, 2 Dec 2015 00:37:21 +0000 Subject: [PATCH 08/12] Minor fix in Bernoulli spelling --- config/naive_bayes.params.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config/naive_bayes.params.yaml b/config/naive_bayes.params.yaml index fb67b1aa..2bb89610 100644 --- a/config/naive_bayes.params.yaml +++ b/config/naive_bayes.params.yaml @@ -1,7 +1,6 @@ GaussianNB: class: sklearn.naive_bayes.GaussianNB - params: - alpha: [0.1, 1, 10] + params: {} BernoulliNB: class: sklearn.naive_bayes.BernouliNB params: {} From 3ddd956e5bf45171d9148a614f2357748c05ed6d Mon Sep 17 00:00:00 2001 From: Aaron Halfaker Date: Wed, 2 Dec 2015 00:37:33 +0000 Subject: [PATCH 09/12] Increments version to 0.7.4 --- revscoring/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/revscoring/__init__.py b/revscoring/__init__.py index 7bef0ea7..e20cdcc7 100644 --- a/revscoring/__init__.py +++ b/revscoring/__init__.py @@ -108,6 +108,6 @@ from .languages import Language from .scorer_models import ScorerModel -__version__ = "0.7.3" +__version__ = "0.7.4" __all__ = [Datasource, Dependent, Extractor, Feature, Language, ScorerModel] diff --git a/setup.py b/setup.py index e09fde9c..90adff80 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def requirements(fname): setup( name="revscoring", - version="0.7.3", # change in revscoring/__init__.py + version="0.7.4", # change in revscoring/__init__.py author="Aaron Halfaker", author_email="ahalfaker@wikimedia.org", description=("A set of utilities for generating quality scores for " + \ From 102be517adb5eec06652c6134e167ccf24a14389 Mon Sep 17 00:00:00 2001 From: halfak Date: Wed, 2 Dec 2015 11:40:36 -0600 Subject: [PATCH 10/12] Updates revscoring utility to list model_info and tune utilities. --- revscoring/revscoring.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/revscoring/revscoring.py b/revscoring/revscoring.py index 4e7cfbe1..7be1b0e5 100644 --- a/revscoring/revscoring.py +++ b/revscoring/revscoring.py @@ -1,11 +1,15 @@ """ Provides access to a set of utilities for working with revision scorer models. -Utilities +Utilities: -* score Scores a set of revisions +* score Scores a set of revisions using a trained model * extract_features Extracts a list of features for a set of revisions -* train_test Trains and tests a MLScorerModel with extracted features. +* model_info Reads a model-file and reports metadata and testing + statistics +* train_test Trains and tests a MLScorerModel with extracted features +* tune Tunes a set of models against a training set to identify + the best model/configuration Usage: revscoring (-h | --help) From c3105c1df107a34de35181bd7219889d32f8e83f Mon Sep 17 00:00:00 2001 From: halfak Date: Wed, 2 Dec 2015 11:48:51 -0600 Subject: [PATCH 11/12] Fixes test for sklearn classifier. 
---
 revscoring/scorer_models/tests/test_scorer_model.py | 9 +--------
 .../scorer_models/tests/test_sklearn_classifier.py | 11 +++++++++++
 2 files changed, 12 insertions(+), 8 deletions(-)
 create mode 100644 revscoring/scorer_models/tests/test_sklearn_classifier.py

diff --git a/revscoring/scorer_models/tests/test_scorer_model.py b/revscoring/scorer_models/tests/test_scorer_model.py
index b7fbc4d1..bb76ac68 100644
--- a/revscoring/scorer_models/tests/test_scorer_model.py
+++ b/revscoring/scorer_models/tests/test_scorer_model.py
@@ -1,17 +1,10 @@
 from nose.tools import eq_
 
 from ...features import Feature
-from ..scorer_model import ScikitLearnClassifier, ScorerModel
+from ..scorer_model import ScorerModel
 
 
 def test_scorer_model():
     sm = ScorerModel([Feature("foo")], version="0.0.1")
 
     eq_(sm.version, "0.0.1")
-
-
-def test_sklean_classifier():
-    skc = ScikitLearnClassifier([Feature("foo")], classifier_model=None,
-                                version="0.0.1")
-
-    eq_(skc.version, "0.0.1")
diff --git a/revscoring/scorer_models/tests/test_sklearn_classifier.py b/revscoring/scorer_models/tests/test_sklearn_classifier.py
new file mode 100644
index 00000000..1ea44d29
--- /dev/null
+++ b/revscoring/scorer_models/tests/test_sklearn_classifier.py
@@ -0,0 +1,11 @@
+from nose.tools import eq_
+
+from ...features import Feature
+from ..sklearn_classifier import ScikitLearnClassifier
+
+
+def test_sklean_classifier():
+    skc = ScikitLearnClassifier([Feature("foo")], classifier_model=None,
+                                version="0.0.1")
+
+    eq_(skc.version, "0.0.1")

From 5840b9176acf4db258b3bdb86c7e80b0b4494a4f Mon Sep 17 00:00:00 2001
From: Aaron Halfaker
Date: Thu, 3 Dec 2015 21:03:13 +0000
Subject: [PATCH 12/12] Adds error handling to cross-validation

---
 revscoring/utilities/tune.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py
index dcdaa47b..66f2def6 100644
--- a/revscoring/utilities/tune.py
+++ b/revscoring/utilities/tune.py
@@ -205,13 +205,23 @@ def _cross_validate(observations, estimator, params, scoring="roc_auc",
     start = time.time()
     feature_values, labels = (list(vect) for vect in zip(*observations))
     estimator.set_params(**params)
-    scores = cross_validation.cross_val_score(estimator, feature_values,
-                                              labels, scoring=scoring,
-                                              cv=folds)
-    duration = time.time() - start
-    logging.debug("Cross-validated {0} with {1} in {2} hours: {3} ({4})"
-                  .format(estimator, format_params(params),
-                          round(duration / (60 * 60), 3),
-                          round(scores.mean(), 3),
-                          round(scores.std(), 3)))
-    return scores
+
+    try:
+        scores = cross_validation.cross_val_score(
+            estimator, feature_values, labels, scoring=scoring, cv=folds)
+
+        duration = time.time() - start
+        logging.debug("Cross-validated {0} with {1} in {2} hours: {3} ({4})"
+                      .format(estimator.__class__.__name__,
+                              format_params(params),
+                              round(duration / (60 * 60), 3),
+                              round(scores.mean(), 3),
+                              round(scores.std(), 3)))
+        return scores
+
+    except Exception:
+        logger.warn("Could not cross-validate {0} with {1}"
+                    .format(estimator.__class__.__name__,
+                            format_params(params)))
+        logger.warn("Exception:\n" + traceback.format_exc())
+        return [0]*folds
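
The error handling added in PATCH 12 matters because of how the tuning loop collects its results: each estimator/parameter combination is cross-validated in a `multiprocessing` pool via `apply_async()`, and an exception raised inside a worker only surfaces when `.get()` is called on its `AsyncResult`. Without a try/except inside the worker, one bad configuration would abort the whole barrier-synchronization loop and discard every other result. The sketch below illustrates that behaviour in isolation; `cross_validate_safely`, its fake fold scores, and the simulated failure are illustrative stand-ins, not revscoring code.

```python
import multiprocessing
import traceback


def cross_validate_safely(name, params):
    """Stand-in for _cross_validate(): handle the worker's own exception so
    one broken configuration degrades to a zero score instead of killing
    the whole tuning run."""
    try:
        if params.get("gamma") == "bogus":   # simulate an invalid parameter
            raise ValueError("unsupported gamma value: bogus")
        return [0.91, 0.89, 0.90]            # pretend per-fold scores
    except Exception:
        print("Could not cross-validate {0}:\n{1}"
              .format(name, traceback.format_exc()))
        return [0.0] * 3


if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=2)
    results = [
        pool.apply_async(cross_validate_safely, ("SVC", {"gamma": 0.001})),
        pool.apply_async(cross_validate_safely, ("SVC", {"gamma": "bogus"})),
    ]
    # .get() blocks, like the barrier-synchronization loop in tune.py, but it
    # never re-raises here because the worker already handled its exception.
    for result in results:
        print(result.get())
    pool.close()
    pool.join()
```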
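Zooming out, the patches above converge on a simple overall flow for the `tune` utility: read a params config, expand each estimator's grid with `ParameterGrid`, cross-validate every combination, and report the configurations ranked by mean score. The snippet below is a minimal, self-contained sketch of that flow under two stated assumptions: it substitutes a toy `make_classification()` dataset and an inline grid for revscoring's extracted observations and YAML config, and it uses the scikit-learn 0.17-era modules (`sklearn.grid_search`, `sklearn.cross_validation`) that `tune.py` imports; on scikit-learn 0.18 and later the same helpers live in `sklearn.model_selection`.

```python
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import ParameterGrid

# Toy stand-ins for revscoring's feature values/labels and for one
# estimator's entry in a *.params.yaml config.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
param_grid = ParameterGrid({
    "n_estimators": [50, 100],
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
})

# Cross-validate every parameter combination, keeping mean and std as plain
# floats -- the same shape of record the report tables are built from.
grid_scores = []
for params in param_grid:
    estimator = GradientBoostingClassifier(**params)
    scores = cross_val_score(estimator, X, y, scoring="roc_auc", cv=5)
    grid_scores.append((params, scores.mean(), scores.std()))

# Rank by mean score, best first, like the "Top scoring configurations" table.
grid_scores.sort(key=lambda gs: gs[1], reverse=True)
for params, mean_score, std_score in grid_scores:
    print("{0:.3f} (+/- {1:.3f})  {2}".format(mean_score, std_score, params))
```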