
Merge pull request #219 from wiki-ai/tune
Adds hyperparameter tuning utility
Ladsgroup committed Dec 4, 2015
2 parents fc03250 + 5840b91 commit 3d16a58
Showing 21 changed files with 551 additions and 274 deletions.
7 changes: 7 additions & 0 deletions config/gradient_boost.params.yaml
@@ -0,0 +1,7 @@
GradientBoostingClassifier:
  class: sklearn.ensemble.GradientBoostingClassifier
  params:
    n_estimators: [100, 300, 500, 700]
    max_depth: [1, 3, 5, 7]
    max_features: ["log2"]
    learning_rate: [0.01, 0.1, 0.5, 1]
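
Each of the new *.params.yaml files maps a classifier class to a grid of candidate hyperparameter values. As a rough sketch of how such a file can be consumed (this is not the revscoring tune utility itself, and the load_param_grids helper and file path are assumptions), a grid like the one above plugs straight into scikit-learn's grid search, which lived under sklearn.grid_search in 2015-era releases:

import yaml
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search in older releases

def load_param_grids(path):
    # Parse a *.params.yaml file into {model_name: param_grid}.
    with open(path) as f:
        config = yaml.safe_load(f)
    return {name: spec["params"] for name, spec in config.items()}

grids = load_param_grids("config/gradient_boost.params.yaml")
search = GridSearchCV(GradientBoostingClassifier(),
                      param_grid=grids["GradientBoostingClassifier"],
                      cv=5, scoring="roc_auc")
# search.fit(feature_values, labels)  # feature matrix and labels from extracted observations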
6 changes: 6 additions & 0 deletions config/logistic_regression.params.yaml
@@ -0,0 +1,6 @@

LogisticRegression:
  class: sklearn.linear_model.LogisticRegression
  params:
    penalty: ["l1", "l2"]
    C: [0.1, 1, 10]
10 changes: 10 additions & 0 deletions config/naive_bayes.params.yaml
@@ -0,0 +1,10 @@
GaussianNB:
  class: sklearn.naive_bayes.GaussianNB
  params: {}
BernoulliNB:
  class: sklearn.naive_bayes.BernoulliNB
  params: {}
MultinomialNB:
  class: sklearn.naive_bayes.MultinomialNB
  params:
    alpha: [0.1, 1, 10]
4 changes: 0 additions & 4 deletions config/ptwiki_api.yaml

This file was deleted.

4 changes: 0 additions & 4 deletions config/ptwiki_svc.yaml

This file was deleted.

7 changes: 7 additions & 0 deletions config/random_forest.params.yaml
@@ -0,0 +1,7 @@
RandomForestClassifier:
  class: sklearn.ensemble.RandomForestClassifier
  params:
    n_estimators: [10, 20, 40, 80, 160, 320, 640]
    min_samples_leaf: [1, 3, 5, 7, 13]
    max_features: ["log2"]
    criterion: ["gini", "entropy"]
14 changes: 14 additions & 0 deletions config/svc.params.yaml
@@ -0,0 +1,14 @@
SVC:
  class: sklearn.svm.SVC
  params:
    -
      kernel: ["rbf"]
      probability: [true]
      gamma: [0.0, 0.001, 0.0001]
      cache_size: [1000]
      C: [0.1, 1, 10]
    -
      kernel: ["linear"]
      probability: [true]
      cache_size: [1000]
      C: [0.1, 1, 10]
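
Unlike the flat grids above, params here is a list of two separate grids, so gamma is only searched for the RBF kernel. scikit-learn's GridSearchCV accepts exactly that shape, a list of parameter dicts; the snippet below is illustrative rather than revscoring code, and gamma: 0.0 was the "use 1/n_features" default sentinel in scikit-learn releases of this era.

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Two grids, mirroring the two list entries under `params` above.
param_grid = [
    {"kernel": ["rbf"], "probability": [True], "gamma": [0.0, 0.001, 0.0001],
     "cache_size": [1000], "C": [0.1, 1, 10]},
    {"kernel": ["linear"], "probability": [True],
     "cache_size": [1000], "C": [0.1, 1, 10]},
]
search = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
# search.fit(feature_values, labels)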
2 changes: 1 addition & 1 deletion revscoring/__init__.py
@@ -108,6 +108,6 @@
from .languages import Language
from .scorer_models import ScorerModel

__version__ = "0.7.3"
__version__ = "0.7.4"

__all__ = [Datasource, Dependent, Extractor, Feature, Language, ScorerModel]
10 changes: 7 additions & 3 deletions revscoring/revscoring.py
@@ -1,11 +1,15 @@
"""
Provides access to a set of utilities for working with revision scorer models.
Utilities
Utilities:
* score Scores a set of revisions
* score Scores a set of revisions using a trained model
* extract_features Extracts a list of features for a set of revisions
* train_test Trains and tests a MLScorerModel with extracted features.
* model_info Reads a model-file and reports metadata and testing
statistics
* train_test Trains and tests a MLScorerModel with extracted features
* tune Tunes a set of models against a training set to identify
the best model/configuration
Usage:
revscoring (-h | --help)
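
The new tune entry above describes cross-validating each model/parameter combination against a training set and reporting the best one. The loop below is only an illustration of that idea using scikit-learn's cross_val_score (found under sklearn.cross_validation in 2015-era releases); it is not the actual utility added by this commit.

from itertools import product
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation in older releases

def tune(estimator_class, param_grid, values, labels, folds=5):
    # Cross-validate every parameter combination; return (best_score, best_params).
    best = None
    keys = sorted(param_grid)
    for combo in product(*(param_grid[k] for k in keys)):
        params = dict(zip(keys, combo))
        scores = cross_val_score(estimator_class(**params), values, labels, cv=folds)
        if best is None or scores.mean() > best[0]:
            best = (scores.mean(), params)
    return best

# from sklearn.ensemble import GradientBoostingClassifier
# best_score, best_params = tune(GradientBoostingClassifier,
#                                {"n_estimators": [100, 300], "max_depth": [1, 3]},
#                                feature_values, labels)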
3 changes: 2 additions & 1 deletion revscoring/scorer_models/__init__.py
@@ -27,7 +27,8 @@
from .svc import SVC, SVCModel, LinearSVC, LinearSVCModel, RBFSVC, RBFSVCModel
from .nb import (NB, NBModel, GaussianNB, GaussianNBModel, MultinomialNB,
MultinomialNBModel, BernoulliNB, BernoulliNBModel)
from .scorer_model import ScorerModel, MLScorerModel, ScikitLearnClassifier
from .scorer_model import ScorerModel, MLScorerModel
from .sklearn_classifier import ScikitLearnClassifier
from .rf import RF, RFModel

__all__ = [
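
The practical effect of this refactor, read from the import lines above: the abstract model classes stay in scorer_model while the scikit-learn wrapper moves to its own module, so both import paths below should resolve after this commit.

from revscoring.scorer_models.scorer_model import ScorerModel, MLScorerModel
from revscoring.scorer_models.sklearn_classifier import ScikitLearnClassifier

Because the package __init__ re-exports all three names, `from revscoring.scorer_models import ScikitLearnClassifier` also keeps working for existing callers.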
2 changes: 1 addition & 1 deletion revscoring/scorer_models/nb.py
@@ -17,7 +17,7 @@

from sklearn import naive_bayes

from .scorer_model import ScikitLearnClassifier
from .sklearn_classifier import ScikitLearnClassifier

logger = logging.getLogger("revscoring.scorers.nb")

2 changes: 1 addition & 1 deletion revscoring/scorer_models/rf.py
@@ -9,7 +9,7 @@

from sklearn.ensemble import RandomForestClassifier

from .scorer_model import ScikitLearnClassifier
from .sklearn_classifier import ScikitLearnClassifier

logger = logging.getLogger("revscoring.scorers.rf")

184 changes: 0 additions & 184 deletions revscoring/scorer_models/scorer_model.py
@@ -184,187 +184,3 @@ def from_config(cls, config, name, section_key="scorer_models"):
            return cls.load(open(section['model_file'], 'rb'))
        else:
            return cls(**{k: v for k, v in section.items() if k != "class"})


class ScikitLearnClassifier(MLScorerModel):

    def __init__(self, features, classifier_model, version=None):
        super().__init__(features, version=version)
        self.classifier_model = classifier_model
        self.stats = None

    def __getattr__(self, attr):
        if attr == "stats":
            return None
        else:
            raise AttributeError(attr)

    def train(self, values_labels):
        """
        :Returns:
            A dictionary with the fields:
            * seconds_elapsed -- Time in seconds spent fitting the model
        """
        start = time.time()

        values, labels = zip(*values_labels)

        # Fit the classifier model
        self.classifier_model.fit(values, labels)
        self.trained = time.time()

        return {
            'seconds_elapsed': time.time() - start
        }

    def score(self, feature_values):
        """
        Generates a score for a single revision based on a set of extracted
        feature_values.
        :Parameters:
            feature_values : collection(`mixed`)
                an ordered collection of values that correspond to the
                `Feature` s provided to the constructor
        :Returns:
            A dict with the fields:
            * prediction -- The most likely class
            * probability -- A mapping of probabilities for input classes
                             corresponding to the classes the classifier was
                             trained on. Generating this probability is
                             slower than a simple prediction.
        """
        prediction = self.classifier_model.predict([feature_values])[0]
        labels = self.classifier_model.classes_
        probas = self.classifier_model.predict_proba([feature_values])[0]
        probability = {label: proba for label, proba in zip(labels, probas)}

        doc = {
            'prediction': prediction,
            'probability': probability
        }
        return normalize_json(doc)

    def test(self, values_labels):
        """
        :Returns:
            A dictionary of test statistics with the fields:
            * accuracy -- The mean accuracy of classification
            * table -- A truth table for classification
            * roc
                * auc -- The area under the ROC curve
        """
        values, labels = zip(*values_labels)

        scores = [self.score(feature_values) for feature_values in values]

        self.stats = {
            'table': self._label_table(scores, labels),
            'accuracy': self.classifier_model.score(values, labels),
            'roc': self._roc_stats(scores, labels,
                                   self.classifier_model.classes_)
        }
        return self.stats

    def info(self):
        return normalize_json({
            'type': self.__class__.__name__,
            'version': self.version,
            'trained': self.trained,
            'stats': self.stats
        })

    def format_info(self):
        info = self.info()
        formatted = io.StringIO()
        formatted.write("ScikitLearnClassifier\n")
        formatted.write(" - type: {0}\n".format(info.get('type')))
        formatted.write(" - version: {0}\n".format(info.get('version')))
        if isinstance(info['trained'], float):
            date_string = datetime.fromtimestamp(info['trained']).isoformat()
            formatted.write(" - trained: {0}\n".format(date_string))
        else:
            formatted.write(" - trained: {0}\n".format(info.get('trained')))

        formatted.write("\n")
        formatted.write(self.format_stats())
        return formatted.getvalue()

    def format_stats(self):
        if self.stats is None:
            return "No stats available"
        else:
            formatted = io.StringIO()
            predicted_actuals = self.stats['table'].keys()
            possible = list(set(actual for _, actual in predicted_actuals))
            possible.sort()

            formatted.write("Accuracy: {0}\n\n".format(self.stats['accuracy']))
            if 'auc' in self.stats['roc']:
                formatted.write("ROC-AUC: {0}\n\n"
                                .format(self.stats['roc']['auc']))
            else:
                formatted.write("ROC-AUC:\n")

                table_data = [[comparison_label,
                               self.stats['roc'][comparison_label]['auc']]
                              for comparison_label in possible]
                formatted.write(tabulate(table_data))
                formatted.write("\n\n")

            table_data = []

            for actual in possible:
                table_data.append(
                    [(str(actual))] +
                    [self.stats['table'].get((predicted, actual), 0)
                     for predicted in possible]
                )
            formatted.write(tabulate(
                table_data,
                headers=["~{0}".format(p) for p in possible]))

            return formatted.getvalue()

    @classmethod
    def _roc_stats(cls, scores, labels, possible_labels):

        if len(possible_labels) <= 2:
            # Binary classification, class choice doesn't matter.
            comparison_label = possible_labels[0]
            return cls._roc_single_class(scores, labels, comparison_label)
        else:
            roc_stats = {}
            for comparison_label in possible_labels:
                roc_stats[comparison_label] = \
                    cls._roc_single_class(scores, labels, comparison_label)

            return roc_stats

    @classmethod
    def _roc_single_class(cls, scores, labels, comparison_label):
        probabilities = [s['probability'][comparison_label]
                         for s in scores]

        true_positives = [l == comparison_label for l in labels]
        fpr, tpr, thresholds = roc_curve(true_positives, probabilities)

        return {
            'auc': auc(fpr, tpr)
        }

    @staticmethod
    def _label_table(scores, labels):

        predicteds = [s['prediction'] for s in scores]

        table = {}
        for pair in zip(labels, predicteds):
            table[pair] = table.get(pair, 0) + 1

        return table
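
For reference, below is a hypothetical usage sketch of the relocated ScikitLearnClassifier wrapper, constructed directly around a scikit-learn estimator rather than through one of its subclasses (RFModel, SVCModel, the NB models). The feature names and observation values are invented for illustration.

from sklearn.ensemble import GradientBoostingClassifier
from revscoring.scorer_models import ScikitLearnClassifier

features = ["chars_added", "badwords_ratio"]  # placeholders; real code passes Feature objects
model = ScikitLearnClassifier(
    features,
    classifier_model=GradientBoostingClassifier(n_estimators=300, max_depth=5),
    version="0.0.1")

train_set = [([450, 0.02], False), ([3, 0.41], True), ([88, 0.10], False)]
model.train(train_set)            # returns {'seconds_elapsed': ...}
print(model.score([120, 0.05]))   # {'prediction': ..., 'probability': {...}}
print(model.format_info())        # type, version, trained timestamp and stats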
