From 8fe45ae6eefabd04a1867e42a40d076c4ddd505d Mon Sep 17 00:00:00 2001 From: halfak Date: Fri, 27 Nov 2015 11:20:18 -0600 Subject: [PATCH 01/12] (WIP) work towards a grid model tuning script. --- config/all_classifiers.yaml | 49 +++++ revscoring/scorer_models/scorer_model.py | 184 ----------------- .../scorer_models/sklearn_classifier.py | 194 ++++++++++++++++++ revscoring/utilities/tune.py | 129 ++++++++++++ 4 files changed, 372 insertions(+), 184 deletions(-) create mode 100644 config/all_classifiers.yaml create mode 100644 revscoring/scorer_models/sklearn_classifier.py create mode 100644 revscoring/utilities/tune.py diff --git a/config/all_classifiers.yaml b/config/all_classifiers.yaml new file mode 100644 index 00000000..6474d1c3 --- /dev/null +++ b/config/all_classifiers.yaml @@ -0,0 +1,49 @@ + +GradientBoostingClassifier: + class: sklearn.ensemble.GradientBoostingClassifier + params: + n_estimators: [150, 250, 500] + max_depth: [4, 5, 6] + max_features: [log2] + learning_rate: [0.01] +RandomForestClassifier: + class: sklearn.ensemble.RandomForestClassifier + params: + n_estimators: [10, 20, 40, 80, 160, 320, 640], + min_samples_leaf: [1, 2, 4, 8, 16], + max_features: [auto, log2, None], + criterion: [gini, entropy] +LogisticRegression: + class: sklearn.ensemble.GradientBoostingClassifier + params: + - + penalty: "l1", + C: [0.1, 1, 10, 100], + solver: [liblinear] + - + penalty: "l2", + C: [0.1, 1, 10, 100], + solver: [newton-cg, lbfgs, sag] +SVC: + class: sklearn.svm.SVC + params: + - + kernel: [rbf], + probability: True, + gamma: [1e-3, 1e-4, auto], + C: [0.1, 1, 10, 100] + - + kernel: [linear], + probability: True, + C: [0.1, 1, 10, 100] +GaussianNB: + class: sklearn.naive_bayes.GaussianNB + params: + alpha: [0.1, 1, 10] +BernoulliNB: + class: sklearn.naive_bayes.BernouliNB + params: {} +MultinomialNB: + class: sklearn.naive_bayes.MultinomialNB + params: + alpha: [0.1, 1, 10] diff --git a/revscoring/scorer_models/scorer_model.py b/revscoring/scorer_models/scorer_model.py index 278903b0..df6f8269 100644 --- a/revscoring/scorer_models/scorer_model.py +++ b/revscoring/scorer_models/scorer_model.py @@ -184,187 +184,3 @@ def from_config(cls, config, name, section_key="scorer_models"): return cls.load(open(section['model_file'], 'rb')) else: return cls(**{k: v for k, v in section.items() if k != "class"}) - - -class ScikitLearnClassifier(MLScorerModel): - - def __init__(self, features, classifier_model, version=None): - super().__init__(features, version=version) - self.classifier_model = classifier_model - self.stats = None - - def __getattr__(self, attr): - if attr is "stats": - return None - else: - raise AttributeError(attr) - - def train(self, values_labels): - """ - - :Returns: - A dictionary with the fields: - - * seconds_elapsed -- Time in seconds spent fitting the model - """ - start = time.time() - - values, labels = zip(*values_labels) - - # Fit SVC model - self.classifier_model.fit(values, labels) - self.trained = time.time() - - return { - 'seconds_elapsed': time.time() - start - } - - def score(self, feature_values): - """ - Generates a score for a single revision based on a set of extracted - feature_values. - - :Parameters: - feature_values : collection(`mixed`) - an ordered collection of values that correspond to the - `Feature` s provided to the constructor - - :Returns: - A dict with the fields: - - * predicion -- The most likely class - * probability -- A mapping of probabilities for input classes - corresponding to the classes the classifier was - trained on. 
Generating this probability is - slower than a simple prediction. - """ - prediction = self.classifier_model.predict([feature_values])[0] - labels = self.classifier_model.classes_ - probas = self.classifier_model.predict_proba([feature_values])[0] - probability = {label: proba for label, proba in zip(labels, probas)} - - doc = { - 'prediction': prediction, - 'probability': probability - } - return normalize_json(doc) - - def test(self, values_labels): - """ - :Returns: - A dictionary of test statistics with the fields: - - * accuracy -- The mean accuracy of classification - * table -- A truth table for classification - * roc - * auc -- The area under the ROC curve - """ - values, labels = zip(*values_labels) - - scores = [self.score(feature_values) for feature_values in values] - - self.stats = { - 'table': self._label_table(scores, labels), - 'accuracy': self.classifier_model.score(values, labels), - 'roc': self._roc_stats(scores, labels, - self.classifier_model.classes_) - } - return self.stats - - def info(self): - return normalize_json({ - 'type': self.__class__.__name__, - 'version': self.version, - 'trained': self.trained, - 'stats': self.stats - }) - - def format_info(self): - info = self.info() - formatted = io.StringIO() - formatted.write("ScikitLearnClassifier\n") - formatted.write(" - type: {0}\n".format(info.get('type'))) - formatted.write(" - version: {0}\n".format(info.get('version'))) - if isinstance(info['trained'], float): - date_string = datetime.fromtimestamp(info['trained']).isoformat() - formatted.write(" - trained: {0}\n".format(date_string)) - else: - formatted.write(" - trained: {0}\n".format(info.get('trained'))) - - formatted.write("\n") - formatted.write(self.format_stats()) - return formatted.getvalue() - - def format_stats(self): - if self.stats is None: - return "No stats available" - else: - formatted = io.StringIO() - predicted_actuals = self.stats['table'].keys() - possible = list(set(actual for _, actual in predicted_actuals)) - possible.sort() - - formatted.write("Accuracy: {0}\n\n".format(self.stats['accuracy'])) - if 'auc' in self.stats['roc']: - formatted.write("ROC-AUC: {0}\n\n" - .format(self.stats['roc']['auc'])) - else: - formatted.write("ROC-AUC:\n") - - table_data = [[comparison_label, - self.stats['roc'][comparison_label]['auc']] - for comparison_label in possible] - formatted.write(tabulate(table_data)) - formatted.write("\n\n") - - table_data = [] - - for actual in possible: - table_data.append( - [(str(actual))] + - [self.stats['table'].get((predicted, actual), 0) - for predicted in possible] - ) - formatted.write(tabulate( - table_data, - headers=["~{0}".format(p) for p in possible])) - - return formatted.getvalue() - - @classmethod - def _roc_stats(cls, scores, labels, possible_labels): - - if len(possible_labels) <= 2: - # Binary classification, class choice doesn't matter. 
- comparison_label = possible_labels[0] - return cls._roc_single_class(scores, labels, comparison_label) - else: - roc_stats = {} - for comparison_label in possible_labels: - roc_stats[comparison_label] = \ - cls._roc_single_class(scores, labels, comparison_label) - - return roc_stats - - @classmethod - def _roc_single_class(cls, scores, labels, comparison_label): - probabilities = [s['probability'][comparison_label] - for s in scores] - - true_positives = [l == comparison_label for l in labels] - fpr, tpr, thresholds = roc_curve(true_positives, probabilities) - - return { - 'auc': auc(fpr, tpr) - } - - @staticmethod - def _label_table(scores, labels): - - predicteds = [s['prediction'] for s in scores] - - table = {} - for pair in zip(labels, predicteds): - table[pair] = table.get(pair, 0) + 1 - - return table diff --git a/revscoring/scorer_models/sklearn_classifier.py b/revscoring/scorer_models/sklearn_classifier.py new file mode 100644 index 00000000..47fdd212 --- /dev/null +++ b/revscoring/scorer_models/sklearn_classifier.py @@ -0,0 +1,194 @@ +import io +import time +from datetime import datetime + +from sklearn.grid_search import GridSearchCV +from sklearn.metrics import auc, roc_curve +from tabulate import tabulate + +from .scorer_model import MLScorerModel +from .util import normalize_json + + +class ScikitLearnClassifier(MLScorerModel): + + def __init__(self, features, classifier_model, version=None): + super().__init__(features, version=version) + self.classifier_model = classifier_model + self.stats = None + + def __getattr__(self, attr): + if attr is "stats": + return None + else: + raise AttributeError(attr) + + def train(self, values_labels): + """ + + :Returns: + A dictionary with the fields: + + * seconds_elapsed -- Time in seconds spent fitting the model + """ + start = time.time() + + values, labels = zip(*values_labels) + + # Fit SVC model + self.classifier_model.fit(values, labels) + self.trained = time.time() + + return { + 'seconds_elapsed': time.time() - start + } + + def score(self, feature_values): + """ + Generates a score for a single revision based on a set of extracted + feature_values. + + :Parameters: + feature_values : collection(`mixed`) + an ordered collection of values that correspond to the + `Feature` s provided to the constructor + + :Returns: + A dict with the fields: + + * predicion -- The most likely class + * probability -- A mapping of probabilities for input classes + corresponding to the classes the classifier was + trained on. Generating this probability is + slower than a simple prediction. 
+ """ + prediction = self.classifier_model.predict([feature_values])[0] + labels = self.classifier_model.classes_ + probas = self.classifier_model.predict_proba([feature_values])[0] + probability = {label: proba for label, proba in zip(labels, probas)} + + doc = { + 'prediction': prediction, + 'probability': probability + } + return normalize_json(doc) + + def test(self, values_labels): + """ + :Returns: + A dictionary of test statistics with the fields: + + * accuracy -- The mean accuracy of classification + * table -- A truth table for classification + * roc + * auc -- The area under the ROC curve + """ + values, labels = zip(*values_labels) + + scores = [self.score(feature_values) for feature_values in values] + + self.stats = { + 'table': self._label_table(scores, labels), + 'accuracy': self.classifier_model.score(values, labels), + 'roc': self._roc_stats(scores, labels, + self.classifier_model.classes_) + } + return self.stats + + def info(self): + return normalize_json({ + 'type': self.__class__.__name__, + 'version': self.version, + 'trained': self.trained, + 'stats': self.stats + }) + + def format_info(self): + info = self.info() + formatted = io.StringIO() + formatted.write("ScikitLearnClassifier\n") + formatted.write(" - type: {0}\n".format(info.get('type'))) + formatted.write(" - version: {0}\n".format(info.get('version'))) + if isinstance(info['trained'], float): + date_string = datetime.fromtimestamp(info['trained']).isoformat() + formatted.write(" - trained: {0}\n".format(date_string)) + else: + formatted.write(" - trained: {0}\n".format(info.get('trained'))) + + formatted.write("\n") + formatted.write(self.format_stats()) + return formatted.getvalue() + + def format_stats(self): + if self.stats is None: + return "No stats available" + else: + formatted = io.StringIO() + predicted_actuals = self.stats['table'].keys() + possible = list(set(actual for _, actual in predicted_actuals)) + possible.sort() + + formatted.write("Accuracy: {0}\n\n".format(self.stats['accuracy'])) + if 'auc' in self.stats['roc']: + formatted.write("ROC-AUC: {0}\n\n" + .format(self.stats['roc']['auc'])) + else: + formatted.write("ROC-AUC:\n") + + table_data = [[comparison_label, + self.stats['roc'][comparison_label]['auc']] + for comparison_label in possible] + formatted.write(tabulate(table_data)) + formatted.write("\n\n") + + table_data = [] + + for actual in possible: + table_data.append( + [(str(actual))] + + [self.stats['table'].get((predicted, actual), 0) + for predicted in possible] + ) + formatted.write(tabulate( + table_data, + headers=["~{0}".format(p) for p in possible])) + + return formatted.getvalue() + + @classmethod + def _roc_stats(cls, scores, labels, possible_labels): + + if len(possible_labels) <= 2: + # Binary classification, class choice doesn't matter. 
+ comparison_label = possible_labels[0] + return cls._roc_single_class(scores, labels, comparison_label) + else: + roc_stats = {} + for comparison_label in possible_labels: + roc_stats[comparison_label] = \ + cls._roc_single_class(scores, labels, comparison_label) + + return roc_stats + + @classmethod + def _roc_single_class(cls, scores, labels, comparison_label): + probabilities = [s['probability'][comparison_label] + for s in scores] + + true_positives = [l == comparison_label for l in labels] + fpr, tpr, thresholds = roc_curve(true_positives, probabilities) + + return { + 'auc': auc(fpr, tpr) + } + + @staticmethod + def _label_table(scores, labels): + + predicteds = [s['prediction'] for s in scores] + + table = {} + for pair in zip(labels, predicteds): + table[pair] = table.get(pair, 0) + 1 + + return table diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py new file mode 100644 index 00000000..89e3a893 --- /dev/null +++ b/revscoring/utilities/tune.py @@ -0,0 +1,129 @@ +""" +Tunes a set of models against a training set to identify the best +model/configuration. + +Usage: + tune [--observations=] + [--scoring=] + [--test-prop=] + [--folds=] + [--report=] + [--label-type=] + [--verbose] + [--debug] + +""" +import logging +import multiprocessing +import random + +import numpy as np +import yamlconf +from sklearn import cross_validation, grid_search +from sklearn.cross_validation import StratifiedKFold +from sklearn.metrics import (accuracy_score, auc, f1_score, + precision_recall_curve, + precision_recall_fscore_support, precision_score, + recall_score, roc_auc_score, roc_curve) + +logger = logging.getLogger(__name__) + + +def run(params_config, observations, scoring, test_prop, folds, report, + processes, verbose): + + # Split train and test + train_set, test_set = train_test_split(observations, test_prop=test_prop) + + best_fits = [] + + # For each estimator, run gridsearch. + for name, config in params_config: + logger.info("Running gridsearch for {0}".format(name)) + EstimatorClass = yamlconf.import_module(config['class']) + estimator = EstimatorClass() + if not hasattr(estimator, "fit"): + raise RuntimeError("Estimator {0} does not have a fit() method." + .format(config['class'])) + + logger.info("Running gridsearch for {0}...".format(name)) + grid_model = gridsearch(train_set, estimator, config['params'], + scoring=scoring, folds=folds, + processes=processes) + + logger.info("Completed gridsearch for {0}.".format(name)) + best_params, best_score, _ = max(grid_model.grid_scores_, + key=lambda x: x[1]) + logger.info("\tBest fit: {0}={1} with {2}" + .format(scoring, best_score, best_params)) + + f1, roc_auc = test_model(test_set, grid_model) + logger.info("\tTest set fit: f1={0}, roc_auc={1}\n" + .format(f1, roc_auc)) + + best_fits.append((name, best_params, best_score, f1, roc_auc)) + + # TODO: should be tabular + logger.info("\tGrid scores:") + for params, mean_score, scores in grid_model.grid_scores_: + logger.info("\t - %0.3f (+/-%0.03f) for %r" + % (mean_score, scores.std(), params)) + + +def train_test_split(observations, test_prop=0.25): + # Split train and test set from obs. 
+ observations = list(observations) + random.shuffle(observations) + + test_set_size = int(len(observations) * test_prop) + test_set = observations[:test_set_size] + logger.debug("Test set: {0}".format(len(test_set))) + + train_set = observations[test_set_size:] + logger.debug("Train set: {0}".format(len(train_set))) + + return train_set, test_set + + +def gridsearch(observations, estimator, param_grid=None, + scoring='roc_auc', folds=5, processes=None): + """ + Determine the best model via cross validation. This should be run on + training data with test data withheld. + """ + param_grid = param_grid or {} + + processes = processes or multiprocessing.cpu_count() + + stratified_cv = cross_validation.StratifiedKFold(labels, n_folds=folds) + + grid_model = grid_search.GridSearchCV( + cv=stratified_cv, + estimator=estimator, + param_grid=param_grid, + scoring=scoring, + n_jobs=processes + ) + + # This line actually performs the gridsearch + feature_values, labels = (list(vals) for vals in zip(*observations)) + grid_model.fit(feature_values, labels) + + return grid_model + +def test_model(observations, grid_model): + + feature_values, labels = (list(vals) for vals in zip(*observations)) + predictions = model_grid.predict(feature_values) + scores = get_scores(model_grid, feature_values) + + return f1_score(labels, predictions), roc_auc_score(labels, scores) + +# To compute an ROC score, you need scores for each example, either a class probability +# of a distance from the decision boundary +def get_scores(model, X): + try: + scores = model.decision_function(X) + except: + scores = model.predict_proba(X)[:, 1] + return scores From b922e30e3f61d7fcea8380a066c3d4d90bbf8b7f Mon Sep 17 00:00:00 2001 From: halfak Date: Sat, 28 Nov 2015 13:42:33 -0600 Subject: [PATCH 02/12] (WIP) Working tune utility. Still testing for performance. 
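
For context, the gridsearch() helper introduced in PATCH 01 is a thin wrapper
around scikit-learn's grid search over one entry of a params config file. A
minimal, self-contained sketch of that pattern (not taken from this series:
the toy dataset, the parameter values and n_jobs=2 are illustrative, and it
assumes the pre-0.18 sklearn.grid_search API used throughout these patches):

    from sklearn.datasets import make_classification
    from sklearn.svm import SVC
    # GridSearchCV moved to sklearn.model_selection in scikit-learn 0.18+;
    # this sketch sticks to the sklearn.grid_search API used in the patch.
    from sklearn.grid_search import GridSearchCV

    # Stand-in for the (feature_values, label) observations read from the
    # observations file.
    X, y = make_classification(n_samples=200, n_features=5, random_state=0)

    # Roughly the linear-kernel SVC grid from config/linear_svc.params.yaml.
    param_grid = {"kernel": ["linear"], "probability": [True],
                  "C": [0.1, 1, 10]}

    grid_model = GridSearchCV(estimator=SVC(), param_grid=param_grid,
                              scoring="roc_auc", cv=5, n_jobs=2)
    grid_model.fit(X, y)

    # tune.py selects its "best fit" the same way:
    best_params, best_score, _ = max(grid_model.grid_scores_,
                                     key=lambda score: score[1])
    print(best_params, round(best_score, 3))
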
--- config/linear_svc.params.yaml | 8 + ...s.yaml => sklearn_classifiers.params.yaml} | 24 +-- revscoring/scorer_models/__init__.py | 3 +- revscoring/scorer_models/nb.py | 2 +- revscoring/scorer_models/rf.py | 2 +- revscoring/scorer_models/svc.py | 2 +- revscoring/utilities/train_test.py | 65 ++---- revscoring/utilities/tune.py | 185 +++++++++++++----- revscoring/utilities/util.py | 60 ++++-- 9 files changed, 219 insertions(+), 132 deletions(-) create mode 100644 config/linear_svc.params.yaml rename config/{all_classifiers.yaml => sklearn_classifiers.params.yaml} (70%) diff --git a/config/linear_svc.params.yaml b/config/linear_svc.params.yaml new file mode 100644 index 00000000..8c4d2e93 --- /dev/null +++ b/config/linear_svc.params.yaml @@ -0,0 +1,8 @@ + +SVC: + class: sklearn.svm.SVC + params: + - + kernel: ['linear'] + probability: [true] + C: [0.1, 1, 10] diff --git a/config/all_classifiers.yaml b/config/sklearn_classifiers.params.yaml similarity index 70% rename from config/all_classifiers.yaml rename to config/sklearn_classifiers.params.yaml index 6474d1c3..277d65a5 100644 --- a/config/all_classifiers.yaml +++ b/config/sklearn_classifiers.params.yaml @@ -9,32 +9,32 @@ GradientBoostingClassifier: RandomForestClassifier: class: sklearn.ensemble.RandomForestClassifier params: - n_estimators: [10, 20, 40, 80, 160, 320, 640], - min_samples_leaf: [1, 2, 4, 8, 16], - max_features: [auto, log2, None], + n_estimators: [10, 20, 40, 80, 160, 320, 640] + min_samples_leaf: [1, 2, 4, 8, 16] + max_features: [auto, log2, None] criterion: [gini, entropy] LogisticRegression: class: sklearn.ensemble.GradientBoostingClassifier params: - - penalty: "l1", - C: [0.1, 1, 10, 100], + penalty: "l1" + C: [0.1, 1, 10, 100] solver: [liblinear] - - penalty: "l2", - C: [0.1, 1, 10, 100], + penalty: "l2" + C: [0.1, 1, 10, 100] solver: [newton-cg, lbfgs, sag] SVC: class: sklearn.svm.SVC params: - - kernel: [rbf], - probability: True, - gamma: [1e-3, 1e-4, auto], + kernel: [rbf] + probability: True + gamma: [1e-3, 1e-4, auto] C: [0.1, 1, 10, 100] - - kernel: [linear], - probability: True, + kernel: [linear] + probability: True C: [0.1, 1, 10, 100] GaussianNB: class: sklearn.naive_bayes.GaussianNB diff --git a/revscoring/scorer_models/__init__.py b/revscoring/scorer_models/__init__.py index deb40379..55565735 100644 --- a/revscoring/scorer_models/__init__.py +++ b/revscoring/scorer_models/__init__.py @@ -27,7 +27,8 @@ from .svc import SVC, SVCModel, LinearSVC, LinearSVCModel, RBFSVC, RBFSVCModel from .nb import (NB, NBModel, GaussianNB, GaussianNBModel, MultinomialNB, MultinomialNBModel, BernoulliNB, BernoulliNBModel) -from .scorer_model import ScorerModel, MLScorerModel, ScikitLearnClassifier +from .scorer_model import ScorerModel, MLScorerModel +from .sklearn_classifier import ScikitLearnClassifier from .rf import RF, RFModel __all__ = [ diff --git a/revscoring/scorer_models/nb.py b/revscoring/scorer_models/nb.py index 2aeb6216..10bf3afa 100644 --- a/revscoring/scorer_models/nb.py +++ b/revscoring/scorer_models/nb.py @@ -17,7 +17,7 @@ from sklearn import naive_bayes -from .scorer_model import ScikitLearnClassifier +from .sklearn_classifier import ScikitLearnClassifier logger = logging.getLogger("revscoring.scorers.nb") diff --git a/revscoring/scorer_models/rf.py b/revscoring/scorer_models/rf.py index d97b09f9..185074b6 100644 --- a/revscoring/scorer_models/rf.py +++ b/revscoring/scorer_models/rf.py @@ -9,7 +9,7 @@ from sklearn.ensemble import RandomForestClassifier -from .scorer_model import ScikitLearnClassifier 
+from .sklearn_classifier import ScikitLearnClassifier logger = logging.getLogger("revscoring.scorers.rf") diff --git a/revscoring/scorer_models/svc.py b/revscoring/scorer_models/svc.py index d8a588e3..bb4e6c51 100644 --- a/revscoring/scorer_models/svc.py +++ b/revscoring/scorer_models/svc.py @@ -20,7 +20,7 @@ from sklearn import svm -from .scorer_model import ScikitLearnClassifier +from .sklearn_classifier import ScikitLearnClassifier class SVC(ScikitLearnClassifier): diff --git a/revscoring/utilities/train_test.py b/revscoring/utilities/train_test.py index 8175a4a2..7e82dde5 100644 --- a/revscoring/utilities/train_test.py +++ b/revscoring/utilities/train_test.py @@ -40,8 +40,9 @@ import sys import docopt +import yamlconf -from .util import import_from_path +from . import util logger = logging.getLogger(__name__) @@ -54,8 +55,8 @@ def main(argv=None): format='%(asctime)s %(levelname)s:%(name)s -- %(message)s' ) - ScorerModel = import_from_path(args['']) - features = import_from_path(args['']) + ScorerModel = yamlconf.import_module(args['']) + features = yamlconf.import_module(args['']) version = args['--version'] @@ -67,55 +68,29 @@ def main(argv=None): scorer_model = ScorerModel(features, version=version, **model_kwargs) if args['--values-labels'] == "": - values_labels_file = sys.stdin + observations_f = sys.stdin else: - values_labels_file = open(args['--values-labels'], 'r') + observations_f = open(args['--values-labels'], 'r') if args['--model-file'] == "": model_file = sys.stdout.buffer else: model_file = open(args['--model-file'], 'wb') - decode_label = DECODERS[args['--label-type']] + decode_label = util.DECODERS[args['--label-type']] - feature_labels = read_value_labels(values_labels_file, - scorer_model.features, - decode_label) + observations = util.read_observations(observations_f, + scorer_model.features, + decode_label) test_prop = float(args['--test-prop']) - run(feature_labels, model_file, scorer_model, test_prop) + run(observations, model_file, scorer_model, test_prop) -DECODERS = { - 'int': lambda v: int(v), - 'float': lambda v: float(v), - 'str': lambda v: str(v), - 'bool': lambda v: v in ("True", "true", "1", "T", "y", "Y") -} +def run(observations, model_file, scorer_model, test_prop): -def read_value_labels(f, features, decode_label): - for line in f: - parts = line.strip().split("\t") - values = parts[:-1] - label = parts[-1] - - label = decode_label(label) - - feature_values = [] - for feature, value in zip(features, values): - - if feature.returns == bool: - feature_values.append(value == "True") - else: - feature_values.append(feature.returns(value)) - - yield feature_values, label - - -def run(feature_labels, model_file, scorer_model, test_prop): - - scorer_model = _train_test(scorer_model, feature_labels, test_prop) + scorer_model = _train_test(scorer_model, observations, test_prop) sys.stderr.write(scorer_model.format_info()) @@ -124,19 +99,17 @@ def run(feature_labels, model_file, scorer_model, test_prop): scorer_model.dump(model_file) -def _train_test(scorer_model, feature_labels, test_prop): - feature_labels = list(feature_labels) - random.shuffle(feature_labels) - - test_set_size = int(len(feature_labels) * test_prop) - test_set = feature_labels[:test_set_size] +def _train_test(scorer_model, observations, test_prop): + train_set, test_set = util.train_test_split(observations, + test_prop=test_prop) + logger.debug("Test set: {0}".format(len(test_set))) - - train_set = feature_labels[test_set_size:] logger.debug("Train set: {0}".format(len(train_set))) + 
logger.info("Training model...") scorer_model.train(train_set) + logger.info("Testing model...") scorer_model.test(test_set) return scorer_model diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py index 89e3a893..bfd72afb 100644 --- a/revscoring/utilities/tune.py +++ b/revscoring/utilities/tune.py @@ -3,43 +3,108 @@ model/configuration. Usage: - tune [--observations=] - [--scoring=] - [--test-prop=] - [--folds=] - [--report=] - [--label-type=] - [--verbose] - [--debug] + tune [--observations=] + [--scoring=] + [--test-prop=] + [--folds=] + [--report=] + [--label-type=] + [--processes=] + [--verbose] + [--debug] + +Options: + The path to a YAML configuration file containing the + models and parameter values to search when tuning + The classpath to a feature_list to use when + interpreting the feature values of the observations + --observations= The path to a file containing observations to train + and test against. [default: ] + --scoring= The type of scoring strategy to optimize for when + choosing parameter sets [default: roc_auc] + --test-prop= The proportion of observations that should be held + asside for testing. [default: 0.25] + --folds= The number of cross-validation folds to try + [default: 5] + --report= Path to a file to write the tuning report to + [default: ] + --label-type= A type describing the value to expect as a label + [default: str] + --processes= The number of parallel processes to start for + model building [default: ] + --verbose Print progress information to stderr + --debug Print debug information to stderr """ +import datetime +import json import logging import multiprocessing -import random +import sys -import numpy as np +import docopt import yamlconf from sklearn import cross_validation, grid_search -from sklearn.cross_validation import StratifiedKFold -from sklearn.metrics import (accuracy_score, auc, f1_score, - precision_recall_curve, - precision_recall_fscore_support, precision_score, - recall_score, roc_auc_score, roc_curve) +from sklearn.metrics import f1_score, roc_auc_score +from tabulate import tabulate + +from . 
import util logger = logging.getLogger(__name__) -def run(params_config, observations, scoring, test_prop, folds, report, - processes, verbose): +def main(argv=None): + args = docopt.docopt(__doc__, argv=argv) + + logging.basicConfig( + level=logging.INFO if not args['--debug'] else logging.DEBUG, + format='%(asctime)s %(levelname)s:%(name)s -- %(message)s' + ) + + params_config = yamlconf.load(open(args[''])) + + features = yamlconf.import_module(args['']) + + label_decoder = util.DECODERS[args['--label-type']] + if args['--observations'] == "": + observations_f = sys.stdin + else: + observations_f = open(args['--observations']) + + observations = util.read_observations(observations_f, features, + label_decoder) + + scoring = args['--scoring'] + test_prop = float(args['--test-prop']) + folds = int(args['--folds']) + + if args['--report'] == "": + report = sys.stdout + else: + report = open(args['--report'], "w") + + if args['--processes'] == "": + processes = multiprocessing.cpu_count() + else: + processes = int(args['--processes']) + + verbose = args['--verbose'] + + run(params_config, features, observations, scoring, test_prop, folds, + report, processes, verbose) + + +def run(params_config, features, observations, scoring, test_prop, folds, + report, processes, verbose): # Split train and test - train_set, test_set = train_test_split(observations, test_prop=test_prop) + train_set, test_set = util.train_test_split(observations, + test_prop=test_prop) best_fits = [] # For each estimator, run gridsearch. - for name, config in params_config: - logger.info("Running gridsearch for {0}".format(name)) + for name, config in params_config.items(): EstimatorClass = yamlconf.import_module(config['class']) estimator = EstimatorClass() if not hasattr(estimator, "fit"): @@ -49,7 +114,7 @@ def run(params_config, observations, scoring, test_prop, folds, report, logger.info("Running gridsearch for {0}...".format(name)) grid_model = gridsearch(train_set, estimator, config['params'], scoring=scoring, folds=folds, - processes=processes) + processes=processes, verbose=verbose) logger.info("Completed gridsearch for {0}.".format(name)) best_params, best_score, _ = max(grid_model.grid_scores_, @@ -57,40 +122,56 @@ def run(params_config, observations, scoring, test_prop, folds, report, logger.info("\tBest fit: {0}={1} with {2}" .format(scoring, best_score, best_params)) - f1, roc_auc = test_model(test_set, grid_model) - logger.info("\tTest set fit: f1={0}, roc_auc={1}\n" - .format(f1, roc_auc)) + test_f1, test_auc = test_model(test_set, grid_model) + logger.info("\tTest fit: f1={0}, roc_auc={1}\n" + .format(test_f1, test_auc)) - best_fits.append((name, best_params, best_score, f1, roc_auc)) + best_fits.append((name, best_params, best_score, test_f1, test_auc)) - # TODO: should be tabular logger.info("\tGrid scores:") - for params, mean_score, scores in grid_model.grid_scores_: - logger.info("\t - %0.3f (+/-%0.03f) for %r" - % (mean_score, scores.std(), params)) - - -def train_test_split(observations, test_prop=0.25): - # Split train and test set from obs. 
- observations = list(observations) - random.shuffle(observations) - - test_set_size = int(len(observations) * test_prop) - test_set = observations[:test_set_size] - logger.debug("Test set: {0}".format(len(test_set))) - - train_set = observations[test_set_size:] - logger.debug("Train set: {0}".format(len(train_set))) - - return train_set, test_set + table = tabulate( + ((round(mean_score, 3), round(scores.std(), 3), + format_params(params)) + for params, mean_score, scores in + grid_model.grid_scores_), + headers=["mean(score)", "std(score)", "params"] + ) + for line in table.split("\n"): + logger.info("\t\t" + line) + + # Sort the results by the best fit + best_fits.sort(key=lambda r: r[2]) + possible_labels = set(label for _, label in observations) + + # Write out the report + report.write("# Model tuning report\n") + report.write("- Date: {0}\n".format(datetime.datetime().isoformat())) + report.write("- Train set: {0}\n".format(len(train_set))) + report.write("- Test set: {0}\n".format(len(test_set))) + report.write("- Labels: {0}\n".format(tuple(possible_labels))) + report.write("\n") + report.write("# Best fits\n") + report.write(tabulate( + ((name, format_params(par), round(score, 3), round(test_f1, 3), + round(test_auc, 3)) + for name, par, score, test_f1, test_auc in best_fits), + headers=["model", "parameters", "score", "test_f1", "test_auc"] + )) + + report.close() + + +def format_params(doc): + return ", ".join("{0}={1}".format(k, json.dumps(v)) for k, v in doc) def gridsearch(observations, estimator, param_grid=None, - scoring='roc_auc', folds=5, processes=None): + scoring='roc_auc', folds=5, processes=None, verbose=False): """ Determine the best model via cross validation. This should be run on training data with test data withheld. """ + feature_values, labels = (list(vals) for vals in zip(*observations)) param_grid = param_grid or {} processes = processes or multiprocessing.cpu_count() @@ -102,7 +183,8 @@ def gridsearch(observations, estimator, param_grid=None, estimator=estimator, param_grid=param_grid, scoring=scoring, - n_jobs=processes + n_jobs=processes, + verbose=verbose ) # This line actually performs the gridsearch @@ -111,16 +193,17 @@ def gridsearch(observations, estimator, param_grid=None, return grid_model -def test_model(observations, grid_model): +def test_model(observations, grid_model): feature_values, labels = (list(vals) for vals in zip(*observations)) - predictions = model_grid.predict(feature_values) - scores = get_scores(model_grid, feature_values) + predictions = grid_model.predict(feature_values) + scores = get_scores(grid_model, feature_values) return f1_score(labels, predictions), roc_auc_score(labels, scores) -# To compute an ROC score, you need scores for each example, either a class probability -# of a distance from the decision boundary + +# To compute an ROC score, you need scores for each example, either a class +# probability or a distance from the decision boundary def get_scores(model, X): try: scores = model.decision_function(X) diff --git a/revscoring/utilities/util.py b/revscoring/utilities/util.py index 077daa3a..d41e3693 100644 --- a/revscoring/utilities/util.py +++ b/revscoring/utilities/util.py @@ -1,23 +1,5 @@ +import random import sys -from importlib import import_module - -sys.path.insert(0, ".") # Necessary for working in other modules - - -def import_from_path(path): - try: - module = import_module(path) - return module - except ImportError: - parts = path.split(".") - module_path = ".".join(parts[:-1]) - attribute_name = 
parts[-1] - - module = import_module(module_path) - - attribute = getattr(module, attribute_name) - - return attribute def encode(val, none_val="NULL"): @@ -29,3 +11,43 @@ def encode(val, none_val="NULL"): val = str(val) return val.replace("\t", "\\t").replace("\n", "\\n") + + +DECODERS = { + 'int': lambda v: int(v), + 'float': lambda v: float(v), + 'str': lambda v: str(v), + 'bool': lambda v: v in ("True", "true", "1", "T", "y", "Y") +} + + +def read_observations(f, features, decode_label): + for line in f: + parts = line.strip().split("\t") + values = parts[:-1] + label = parts[-1] + + label = decode_label(label) + + feature_values = [] + for feature, value in zip(features, values): + + if feature.returns == bool: + feature_values.append(value == "True") + else: + feature_values.append(feature.returns(value)) + + yield feature_values, label + + +def train_test_split(observations, test_prop=0.25): + # Split train and test set from obs. + observations = list(observations) + random.shuffle(observations) + + test_set_size = int(len(observations) * test_prop) + + test_set = observations[:test_set_size] + train_set = observations[test_set_size:] + + return train_set, test_set From f5c9ee963f5df044477f32559e8633baaf04f251 Mon Sep 17 00:00:00 2001 From: halfak Date: Sat, 28 Nov 2015 16:22:32 -0600 Subject: [PATCH 03/12] Minor fixes to tuning. Complete test run. --- config/linear_svc.params.yaml | 2 +- config/sklearn_classifiers.params.yaml | 24 ++++++++--------- revscoring/utilities/tune.py | 36 ++++++++++++++++---------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/config/linear_svc.params.yaml b/config/linear_svc.params.yaml index 8c4d2e93..b6c91bc9 100644 --- a/config/linear_svc.params.yaml +++ b/config/linear_svc.params.yaml @@ -5,4 +5,4 @@ SVC: - kernel: ['linear'] probability: [true] - C: [0.1, 1, 10] + C: [0.1, 1] diff --git a/config/sklearn_classifiers.params.yaml b/config/sklearn_classifiers.params.yaml index 277d65a5..aaf1a718 100644 --- a/config/sklearn_classifiers.params.yaml +++ b/config/sklearn_classifiers.params.yaml @@ -4,37 +4,37 @@ GradientBoostingClassifier: params: n_estimators: [150, 250, 500] max_depth: [4, 5, 6] - max_features: [log2] + max_features: ["log2"] learning_rate: [0.01] RandomForestClassifier: class: sklearn.ensemble.RandomForestClassifier params: n_estimators: [10, 20, 40, 80, 160, 320, 640] min_samples_leaf: [1, 2, 4, 8, 16] - max_features: [auto, log2, None] - criterion: [gini, entropy] + max_features: ["auto", "log2", null] + criterion: ["gini", "entropy"] LogisticRegression: class: sklearn.ensemble.GradientBoostingClassifier params: - - penalty: "l1" + penalty: ["l1"] C: [0.1, 1, 10, 100] - solver: [liblinear] + solver: ["liblinear"] - - penalty: "l2" + penalty: ["l2"] C: [0.1, 1, 10, 100] - solver: [newton-cg, lbfgs, sag] + solver: ["newton-cg", "lbfgs", "sag"] SVC: class: sklearn.svm.SVC params: - - kernel: [rbf] - probability: True - gamma: [1e-3, 1e-4, auto] + kernel: ["rbf"] + probability: [true] + gamma: [0.001, 0.0001, "auto"] C: [0.1, 1, 10, 100] - - kernel: [linear] - probability: True + kernel: ["linear"] + probability: [true] C: [0.1, 1, 10, 100] GaussianNB: class: sklearn.naive_bayes.GaussianNB diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py index bfd72afb..ec6ecee2 100644 --- a/revscoring/utilities/tune.py +++ b/revscoring/utilities/tune.py @@ -41,6 +41,7 @@ import logging import multiprocessing import sys +import time import docopt import yamlconf @@ -111,16 +112,25 @@ def run(params_config, 
features, observations, scoring, test_prop, folds, raise RuntimeError("Estimator {0} does not have a fit() method." .format(config['class'])) + parameter_grid = grid_search.ParameterGrid(config['params']) logger.info("Running gridsearch for {0}...".format(name)) + logger.debug("{0} parameter sets:".format(len(parameter_grid))) + for params in parameter_grid: + logger.debug(" - {0}".format(format_params(params))) + logger.debug("{0} folds per parameter set".format(folds)) + + start = time.time() grid_model = gridsearch(train_set, estimator, config['params'], scoring=scoring, folds=folds, processes=processes, verbose=verbose) - logger.info("Completed gridsearch for {0}.".format(name)) + logger.info("Completed gridsearch for {0} in {1} hours." + .format(name, round((time.time() - start) / (60 * 60), 3))) best_params, best_score, _ = max(grid_model.grid_scores_, key=lambda x: x[1]) logger.info("\tBest fit: {0}={1} with {2}" - .format(scoring, best_score, best_params)) + .format(scoring, round(best_score, 3), + format_params(best_params))) test_f1, test_auc = test_model(test_set, grid_model) logger.info("\tTest fit: f1={0}, roc_auc={1}\n" @@ -140,15 +150,16 @@ def run(params_config, features, observations, scoring, test_prop, folds, logger.info("\t\t" + line) # Sort the results by the best fit - best_fits.sort(key=lambda r: r[2]) - possible_labels = set(label for _, label in observations) + best_fits.sort(key=lambda r: r[2], reverse=True) + possible_labels = set(label for _, label in train_set) # Write out the report report.write("# Model tuning report\n") - report.write("- Date: {0}\n".format(datetime.datetime().isoformat())) + report.write("- Date: {0}\n".format(datetime.datetime.now().isoformat())) report.write("- Train set: {0}\n".format(len(train_set))) report.write("- Test set: {0}\n".format(len(test_set))) - report.write("- Labels: {0}\n".format(tuple(possible_labels))) + report.write("- Labels: {0}\n".format(json.dumps(list(possible_labels)))) + report.write("- Scoring: {0}\n".format(scoring)) report.write("\n") report.write("# Best fits\n") report.write(tabulate( @@ -157,12 +168,14 @@ def run(params_config, features, observations, scoring, test_prop, folds, for name, par, score, test_f1, test_auc in best_fits), headers=["model", "parameters", "score", "test_f1", "test_auc"] )) + report.write("\n") report.close() def format_params(doc): - return ", ".join("{0}={1}".format(k, json.dumps(v)) for k, v in doc) + return ", ".join("{0}={1}".format(k, json.dumps(v)) + for k, v in doc.items()) def gridsearch(observations, estimator, param_grid=None, @@ -176,18 +189,15 @@ def gridsearch(observations, estimator, param_grid=None, processes = processes or multiprocessing.cpu_count() - stratified_cv = cross_validation.StratifiedKFold(labels, n_folds=folds) - grid_model = grid_search.GridSearchCV( - cv=stratified_cv, + cv=folds, estimator=estimator, param_grid=param_grid, scoring=scoring, - n_jobs=processes, - verbose=verbose + n_jobs=processes ) - # This line actually performs the gridsearch + # To perform the gridsearch, we run fit() feature_values, labels = (list(vals) for vals in zip(*observations)) grid_model.fit(feature_values, labels) From fc37462bcf6155f55f2e092445f32b066d081b00 Mon Sep 17 00:00:00 2001 From: halfak Date: Sat, 28 Nov 2015 19:06:05 -0600 Subject: [PATCH 04/12] Removes old config ptwiki config files that were never used. 
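
As a reference for how the corrected *.params.yaml files from the previous
patch are consumed: tune.py hands each model's params entry to sklearn's
ParameterGrid, which accepts either a single mapping of parameter names to
candidate values or a list of such mappings (the SVC entry uses the list form
so that rbf-only options like gamma stay out of the linear grid). A small
sketch, with the YAML already loaded into the plain Python structures that
yamlconf.load() would produce; the value lists are trimmed for brevity:

    # ParameterGrid moved to sklearn.model_selection in scikit-learn 0.18+.
    from sklearn.grid_search import ParameterGrid

    # A trimmed-down "params" value for the SVC entry: two sub-grids,
    # one per kernel.
    svc_params = [
        {"kernel": ["rbf"], "probability": [True],
         "gamma": [0.001, 0.0001], "C": [0.1, 1, 10]},
        {"kernel": ["linear"], "probability": [True], "C": [0.1, 1, 10]},
    ]

    grid = ParameterGrid(svc_params)
    print(len(grid))   # 9 sets: 6 rbf (2 gammas x 3 Cs) + 3 linear
    for params in grid:
        print(params)  # e.g. {'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf', ...}
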
--- config/ptwiki_api.yaml | 4 ---- config/ptwiki_svc.yaml | 4 ---- 2 files changed, 8 deletions(-) delete mode 100644 config/ptwiki_api.yaml delete mode 100644 config/ptwiki_svc.yaml diff --git a/config/ptwiki_api.yaml b/config/ptwiki_api.yaml deleted file mode 100644 index cdb94539..00000000 --- a/config/ptwiki_api.yaml +++ /dev/null @@ -1,4 +0,0 @@ - -class: revscores.APIExtractor -url: https://pt.wikipedia.org/w/api.php -language: revscores.language.Portuguese diff --git a/config/ptwiki_svc.yaml b/config/ptwiki_svc.yaml deleted file mode 100644 index 0b46bf9b..00000000 --- a/config/ptwiki_svc.yaml +++ /dev/null @@ -1,4 +0,0 @@ -/* This is just thinking out loud */ - -class: revscores.scorers.LinearSVC -file: ptwiki_svc.model From e944636b99963bc115d72ae5ac5034ff5d32c4dc Mon Sep 17 00:00:00 2001 From: halfak Date: Sat, 28 Nov 2015 19:44:07 -0600 Subject: [PATCH 05/12] Cleanup to tuning utility and add config files for each classifier's param space. --- config/gradient_boost.params.yaml | 7 ++ config/linear_svc.params.yaml | 8 --- config/logistic_regression.params.yaml | 6 ++ config/naive_bayes.params.yaml | 11 +++ config/random_forest.params.yaml | 7 ++ config/sklearn_classifiers.params.yaml | 49 -------------- config/svc.params.yaml | 12 ++++ revscoring/utilities/tune.py | 92 ++++++++++++++------------ 8 files changed, 92 insertions(+), 100 deletions(-) create mode 100644 config/gradient_boost.params.yaml delete mode 100644 config/linear_svc.params.yaml create mode 100644 config/logistic_regression.params.yaml create mode 100644 config/naive_bayes.params.yaml create mode 100644 config/random_forest.params.yaml delete mode 100644 config/sklearn_classifiers.params.yaml create mode 100644 config/svc.params.yaml diff --git a/config/gradient_boost.params.yaml b/config/gradient_boost.params.yaml new file mode 100644 index 00000000..3645d5c8 --- /dev/null +++ b/config/gradient_boost.params.yaml @@ -0,0 +1,7 @@ +GradientBoostingClassifier: + class: sklearn.ensemble.GradientBoostingClassifier + params: + n_estimators: [100, 300, 500, 700] + max_depth: [1, 3, 5, 7] + max_features: ["log2"] + learning_rate: [0.01, 0.1, 0.5, 1] diff --git a/config/linear_svc.params.yaml b/config/linear_svc.params.yaml deleted file mode 100644 index b6c91bc9..00000000 --- a/config/linear_svc.params.yaml +++ /dev/null @@ -1,8 +0,0 @@ - -SVC: - class: sklearn.svm.SVC - params: - - - kernel: ['linear'] - probability: [true] - C: [0.1, 1] diff --git a/config/logistic_regression.params.yaml b/config/logistic_regression.params.yaml new file mode 100644 index 00000000..3116dec0 --- /dev/null +++ b/config/logistic_regression.params.yaml @@ -0,0 +1,6 @@ + +LogisticRegression: + class: sklearn.linear_model.LogisticRegression + params: + penalty: ["l1", "l2"] + C: [0.1, 1, 10] diff --git a/config/naive_bayes.params.yaml b/config/naive_bayes.params.yaml new file mode 100644 index 00000000..fb67b1aa --- /dev/null +++ b/config/naive_bayes.params.yaml @@ -0,0 +1,11 @@ +GaussianNB: + class: sklearn.naive_bayes.GaussianNB + params: + alpha: [0.1, 1, 10] +BernoulliNB: + class: sklearn.naive_bayes.BernouliNB + params: {} +MultinomialNB: + class: sklearn.naive_bayes.MultinomialNB + params: + alpha: [0.1, 1, 10] diff --git a/config/random_forest.params.yaml b/config/random_forest.params.yaml new file mode 100644 index 00000000..31550d48 --- /dev/null +++ b/config/random_forest.params.yaml @@ -0,0 +1,7 @@ +RandomForestClassifier: + class: sklearn.ensemble.RandomForestClassifier + params: + n_estimators: [10, 20, 40, 80, 160, 320, 640] + 
min_samples_leaf: [1, 3, 5, 7, 13] + max_features: ["log2"] + criterion: ["gini", "entropy"] diff --git a/config/sklearn_classifiers.params.yaml b/config/sklearn_classifiers.params.yaml deleted file mode 100644 index aaf1a718..00000000 --- a/config/sklearn_classifiers.params.yaml +++ /dev/null @@ -1,49 +0,0 @@ - -GradientBoostingClassifier: - class: sklearn.ensemble.GradientBoostingClassifier - params: - n_estimators: [150, 250, 500] - max_depth: [4, 5, 6] - max_features: ["log2"] - learning_rate: [0.01] -RandomForestClassifier: - class: sklearn.ensemble.RandomForestClassifier - params: - n_estimators: [10, 20, 40, 80, 160, 320, 640] - min_samples_leaf: [1, 2, 4, 8, 16] - max_features: ["auto", "log2", null] - criterion: ["gini", "entropy"] -LogisticRegression: - class: sklearn.ensemble.GradientBoostingClassifier - params: - - - penalty: ["l1"] - C: [0.1, 1, 10, 100] - solver: ["liblinear"] - - - penalty: ["l2"] - C: [0.1, 1, 10, 100] - solver: ["newton-cg", "lbfgs", "sag"] -SVC: - class: sklearn.svm.SVC - params: - - - kernel: ["rbf"] - probability: [true] - gamma: [0.001, 0.0001, "auto"] - C: [0.1, 1, 10, 100] - - - kernel: ["linear"] - probability: [true] - C: [0.1, 1, 10, 100] -GaussianNB: - class: sklearn.naive_bayes.GaussianNB - params: - alpha: [0.1, 1, 10] -BernoulliNB: - class: sklearn.naive_bayes.BernouliNB - params: {} -MultinomialNB: - class: sklearn.naive_bayes.MultinomialNB - params: - alpha: [0.1, 1, 10] diff --git a/config/svc.params.yaml b/config/svc.params.yaml new file mode 100644 index 00000000..3ab35a2c --- /dev/null +++ b/config/svc.params.yaml @@ -0,0 +1,12 @@ +SVC: + class: sklearn.svm.SVC + params: + - + kernel: ["rbf"] + probability: [true] + gamma: [0.001, 0.0001, "auto"] + C: [0.1, 1, 10] + - + kernel: ["linear"] + probability: [true] + C: [0.1, 1, 10] diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py index ec6ecee2..2ce2175b 100644 --- a/revscoring/utilities/tune.py +++ b/revscoring/utilities/tune.py @@ -42,10 +42,11 @@ import multiprocessing import sys import time +import traceback import docopt import yamlconf -from sklearn import cross_validation, grid_search +from sklearn import grid_search from sklearn.metrics import f1_score, roc_auc_score from tabulate import tabulate @@ -106,48 +107,53 @@ def run(params_config, features, observations, scoring, test_prop, folds, # For each estimator, run gridsearch. for name, config in params_config.items(): - EstimatorClass = yamlconf.import_module(config['class']) - estimator = EstimatorClass() - if not hasattr(estimator, "fit"): - raise RuntimeError("Estimator {0} does not have a fit() method." - .format(config['class'])) - - parameter_grid = grid_search.ParameterGrid(config['params']) - logger.info("Running gridsearch for {0}...".format(name)) - logger.debug("{0} parameter sets:".format(len(parameter_grid))) - for params in parameter_grid: - logger.debug(" - {0}".format(format_params(params))) - logger.debug("{0} folds per parameter set".format(folds)) - - start = time.time() - grid_model = gridsearch(train_set, estimator, config['params'], - scoring=scoring, folds=folds, - processes=processes, verbose=verbose) - - logger.info("Completed gridsearch for {0} in {1} hours." 
- .format(name, round((time.time() - start) / (60 * 60), 3))) - best_params, best_score, _ = max(grid_model.grid_scores_, - key=lambda x: x[1]) - logger.info("\tBest fit: {0}={1} with {2}" - .format(scoring, round(best_score, 3), - format_params(best_params))) - - test_f1, test_auc = test_model(test_set, grid_model) - logger.info("\tTest fit: f1={0}, roc_auc={1}\n" - .format(test_f1, test_auc)) - - best_fits.append((name, best_params, best_score, test_f1, test_auc)) - - logger.info("\tGrid scores:") - table = tabulate( - ((round(mean_score, 3), round(scores.std(), 3), - format_params(params)) - for params, mean_score, scores in - grid_model.grid_scores_), - headers=["mean(score)", "std(score)", "params"] - ) - for line in table.split("\n"): - logger.info("\t\t" + line) + try: + EstimatorClass = yamlconf.import_module(config['class']) + estimator = EstimatorClass() + if not hasattr(estimator, "fit"): + raise RuntimeError("Estimator {0} does not have a fit() method." + .format(config['class'])) + + parameter_grid = grid_search.ParameterGrid(config['params']) + logger.info("Running gridsearch for {0}...".format(name)) + logger.debug("{0} parameter sets:".format(len(parameter_grid))) + for params in parameter_grid: + logger.debug(" - {0}".format(format_params(params))) + logger.debug("{0} folds per parameter set".format(folds)) + + start = time.time() + grid_model = gridsearch(train_set, estimator, config['params'], + scoring=scoring, folds=folds, + processes=processes, verbose=verbose) + + logger.info("Completed gridsearch for {0} in {1} hours." + .format(name, round((time.time() - start) / (60 * 60), 3))) + best_params, best_score, _ = max(grid_model.grid_scores_, + key=lambda x: x[1]) + logger.info("\tBest fit: {0}={1} with {2}" + .format(scoring, round(best_score, 3), + format_params(best_params))) + + test_f1, test_auc = test_model(test_set, grid_model) + logger.info("\tTest fit: f1={0}, roc_auc={1}\n" + .format(test_f1, test_auc)) + + best_fits.append((name, best_params, best_score, test_f1, test_auc)) + + logger.info("\tGrid scores:") + table = tabulate( + ((round(mean_score, 3), round(scores.std(), 3), + format_params(params)) + for params, mean_score, scores in + grid_model.grid_scores_), + headers=["mean(score)", "std(score)", "params"] + ) + for line in table.split("\n"): + logger.info("\t\t" + line) + except Exception: + logger.warn("An error occurred while trying to fit {0}" + .format(name)) + logger.warn("Exception:\n" + traceback.format_exc()) # Sort the results by the best fit best_fits.sort(key=lambda r: r[2], reverse=True) From 1c4787be04a66025a94923bf322ae82d763a928e Mon Sep 17 00:00:00 2001 From: halfak Date: Mon, 30 Nov 2015 19:58:31 -0600 Subject: [PATCH 06/12] Switches tuning utility to use multiprocessing directly. --- config/svc.params.yaml | 2 + revscoring/utilities/tune.py | 211 +++++++++++++++++------------------ 2 files changed, 102 insertions(+), 111 deletions(-) diff --git a/config/svc.params.yaml b/config/svc.params.yaml index 3ab35a2c..2ac68c06 100644 --- a/config/svc.params.yaml +++ b/config/svc.params.yaml @@ -5,8 +5,10 @@ SVC: kernel: ["rbf"] probability: [true] gamma: [0.001, 0.0001, "auto"] + cache_size: [1000] C: [0.1, 1, 10] - kernel: ["linear"] probability: [true] + cache_size: [1000] C: [0.1, 1, 10] diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py index 2ce2175b..dcdaa47b 100644 --- a/revscoring/utilities/tune.py +++ b/revscoring/utilities/tune.py @@ -22,8 +22,6 @@ and test against. 
[default: ] --scoring= The type of scoring strategy to optimize for when choosing parameter sets [default: roc_auc] - --test-prop= The proportion of observations that should be held - asside for testing. [default: 0.25] --folds= The number of cross-validation folds to try [default: 5] --report= Path to a file to write the tuning report to @@ -43,14 +41,15 @@ import sys import time import traceback +from collections import defaultdict import docopt import yamlconf -from sklearn import grid_search -from sklearn.metrics import f1_score, roc_auc_score +from sklearn import cross_validation, grid_search from tabulate import tabulate from . import util +from .. import __version__ logger = logging.getLogger(__name__) @@ -65,7 +64,8 @@ def main(argv=None): params_config = yamlconf.load(open(args[''])) - features = yamlconf.import_module(args['']) + features_path = args[''] + features = yamlconf.import_module(features_path) label_decoder = util.DECODERS[args['--label-type']] if args['--observations'] == "": @@ -77,7 +77,6 @@ def main(argv=None): label_decoder) scoring = args['--scoring'] - test_prop = float(args['--test-prop']) folds = int(args['--folds']) if args['--report'] == "": @@ -92,90 +91,85 @@ def main(argv=None): verbose = args['--verbose'] - run(params_config, features, observations, scoring, test_prop, folds, + run(params_config, features_path, observations, scoring, folds, report, processes, verbose) -def run(params_config, features, observations, scoring, test_prop, folds, +def run(params_config, features_path, observations, scoring, folds, report, processes, verbose): - # Split train and test - train_set, test_set = util.train_test_split(observations, - test_prop=test_prop) + observations = list(observations) - best_fits = [] + # Prepare the worker pool + logger.debug("Starting up multiprocessing pool (processes={0})" + .format(processes)) + pool = multiprocessing.Pool(processes=processes) - # For each estimator, run gridsearch. - for name, config in params_config.items(): - try: - EstimatorClass = yamlconf.import_module(config['class']) - estimator = EstimatorClass() - if not hasattr(estimator, "fit"): - raise RuntimeError("Estimator {0} does not have a fit() method." - .format(config['class'])) - - parameter_grid = grid_search.ParameterGrid(config['params']) - logger.info("Running gridsearch for {0}...".format(name)) - logger.debug("{0} parameter sets:".format(len(parameter_grid))) - for params in parameter_grid: - logger.debug(" - {0}".format(format_params(params))) - logger.debug("{0} folds per parameter set".format(folds)) - - start = time.time() - grid_model = gridsearch(train_set, estimator, config['params'], - scoring=scoring, folds=folds, - processes=processes, verbose=verbose) - - logger.info("Completed gridsearch for {0} in {1} hours." 
- .format(name, round((time.time() - start) / (60 * 60), 3))) - best_params, best_score, _ = max(grid_model.grid_scores_, - key=lambda x: x[1]) - logger.info("\tBest fit: {0}={1} with {2}" - .format(scoring, round(best_score, 3), - format_params(best_params))) - - test_f1, test_auc = test_model(test_set, grid_model) - logger.info("\tTest fit: f1={0}, roc_auc={1}\n" - .format(test_f1, test_auc)) - - best_fits.append((name, best_params, best_score, test_f1, test_auc)) - - logger.info("\tGrid scores:") - table = tabulate( - ((round(mean_score, 3), round(scores.std(), 3), - format_params(params)) - for params, mean_score, scores in - grid_model.grid_scores_), - headers=["mean(score)", "std(score)", "params"] - ) - for line in table.split("\n"): - logger.info("\t\t" + line) - except Exception: - logger.warn("An error occurred while trying to fit {0}" - .format(name)) - logger.warn("Exception:\n" + traceback.format_exc()) - - # Sort the results by the best fit - best_fits.sort(key=lambda r: r[2], reverse=True) - possible_labels = set(label for _, label in train_set) - - # Write out the report + # Start writing the model tuning report + possible_labels = set(label for _, label in observations) report.write("# Model tuning report\n") + report.write("- Revscoring version: {0}\n".format(__version__)) + report.write("- Features: {0}\n".format(features_path)) report.write("- Date: {0}\n".format(datetime.datetime.now().isoformat())) - report.write("- Train set: {0}\n".format(len(train_set))) - report.write("- Test set: {0}\n".format(len(test_set))) + report.write("- Observations: {0}\n".format(len(observations))) report.write("- Labels: {0}\n".format(json.dumps(list(possible_labels)))) report.write("- Scoring: {0}\n".format(scoring)) + report.write("- Folds: {0}\n".format(folds)) report.write("\n") - report.write("# Best fits\n") - report.write(tabulate( - ((name, format_params(par), round(score, 3), round(test_f1, 3), - round(test_auc, 3)) - for name, par, score, test_f1, test_auc in best_fits), - headers=["model", "parameters", "score", "test_f1", "test_auc"] - )) + + # For each estimator and paramset, submit the job. + cv_result_sets = defaultdict(lambda : []) + for name, estimator, param_grid in _estimator_param_grid(params_config): + logger.debug("Submitting jobs for {0}:".format(name)) + for params in param_grid: + logger.debug("\tsubmitting {0}..." + .format(format_params(params))) + result = pool.apply_async(_cross_validate, + [observations, estimator, params], + {'scoring': scoring, 'folds': folds}) + cv_result_sets[name].append((params, result)) + + # Barrier synchronization + logger.info("Running gridsearch for {0} model/params pairs ..." + .format(sum(len(p_r) for p_r in cv_result_sets))) + grid_scores = [] + for name, param_results in cv_result_sets.items(): + for params, result in param_results: + scores = result.get() # This is a line that blocks + grid_scores.append((name, params, scores.mean(), scores.std())) + + # Write the rest of the report! First, print the top 10 combinations + report.write("# Top scoring configurations\n") + grid_scores.sort(key=lambda gs: gs[2], reverse=True) + table = tabulate( + ((name, round(mean_score, 3), round(scores.std(), 3), + format_params(params)) + for name, params, mean_score, scores in + grid_scores[:10]), + headers=["model", "mean(scores)", "std(scores)", "params"] + ) + report.write(table + "\n") report.write("\n") + # Now print out scores for each model. 
+ report.write("# Models\n") + for name, param_results in cv_result_sets.items(): + report.write("## {0}\n".format(name)) + + param_scores = ((p, r.get()) for p, r in param_results) + param_stats = [(p, s.mean(), s.std()) for p, s in param_scores] + param_stats.sort(key=lambda v:v[1], reverse=True) + + table = tabulate( + ((round(mean_score, 3), round(scores.std(), 3), + format_params(params)) + for params, mean_score, scores in + param_stats), + headers=["mean(scores)", "std(scores)", "params"] + ) + report.write(table + "\n") + report.write("\n") + report.close() @@ -184,45 +178,40 @@ def format_params(doc): for k, v in doc.items()) -def gridsearch(observations, estimator, param_grid=None, - scoring='roc_auc', folds=5, processes=None, verbose=False): - """ - Determine the best model via cross validation. This should be run on - training data with test data withheld. - """ - feature_values, labels = (list(vals) for vals in zip(*observations)) - param_grid = param_grid or {} - - processes = processes or multiprocessing.cpu_count() - - grid_model = grid_search.GridSearchCV( - cv=folds, - estimator=estimator, - param_grid=param_grid, - scoring=scoring, - n_jobs=processes - ) - - # To perform the gridsearch, we run fit() - feature_values, labels = (list(vals) for vals in zip(*observations)) - grid_model.fit(feature_values, labels) +def _estimator_param_grid(params_config): + for name, config in params_config.items(): + try: + EstimatorClass = yamlconf.import_module(config['class']) + estimator = EstimatorClass() + except Exception: + logger.warn("Could not load estimator {0}" + .format(config['class'])) + logger.warn("Exception:\n" + traceback.format_exc()) + continue - return grid_model + if not hasattr(estimator, "fit"): + logger.warn("Estimator {0} does not have a fit() method." + .format(config['class'])) + continue + param_grid = grid_search.ParameterGrid(config['params']) -def test_model(observations, grid_model): - feature_values, labels = (list(vals) for vals in zip(*observations)) - predictions = grid_model.predict(feature_values) - scores = get_scores(grid_model, feature_values) + yield name, estimator, param_grid - return f1_score(labels, predictions), roc_auc_score(labels, scores) +def _cross_validate(observations, estimator, params, scoring="roc_auc", + folds=5, verbose=False): -# To compute an ROC score, you need scores for each example, either a class -# probability or a distance from the decision boundary -def get_scores(model, X): - try: - scores = model.decision_function(X) - except: - scores = model.predict_proba(X)[:, 1] + start = time.time() + feature_values, labels = (list(vect) for vect in zip(*observations)) + estimator.set_params(**params) + scores = cross_validation.cross_val_score(estimator, feature_values, + labels, scoring=scoring, + cv=folds) + duration = time.time() - start + logging.debug("Cross-validated {0} with {1} in {2} hours: {3} ({4})" + .format(estimator, format_params(params), + round(duration / (60 * 60), 3), + round(scores.mean(), 3), + round(scores.std(), 3))) return scores From 3cc9f600ecd2a8a53811a7557a8c1f80b984232c Mon Sep 17 00:00:00 2001 From: halfak Date: Mon, 30 Nov 2015 20:07:32 -0600 Subject: [PATCH 07/12] Fixes minor issue in svc params config. 
--- config/svc.params.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/svc.params.yaml b/config/svc.params.yaml index 2ac68c06..04bd0576 100644 --- a/config/svc.params.yaml +++ b/config/svc.params.yaml @@ -4,7 +4,7 @@ SVC: - kernel: ["rbf"] probability: [true] - gamma: [0.001, 0.0001, "auto"] + gamma: [0.0, 0.001, 0.0001] cache_size: [1000] C: [0.1, 1, 10] - From 13369a27d6e2d2b8a81774b501ca7a61ad1c8145 Mon Sep 17 00:00:00 2001 From: Aaron Halfaker Date: Wed, 2 Dec 2015 00:37:21 +0000 Subject: [PATCH 08/12] Minor fix in Bernoulli spelling --- config/naive_bayes.params.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config/naive_bayes.params.yaml b/config/naive_bayes.params.yaml index fb67b1aa..2bb89610 100644 --- a/config/naive_bayes.params.yaml +++ b/config/naive_bayes.params.yaml @@ -1,7 +1,6 @@ GaussianNB: class: sklearn.naive_bayes.GaussianNB - params: - alpha: [0.1, 1, 10] + params: {} BernoulliNB: class: sklearn.naive_bayes.BernouliNB params: {} From 3ddd956e5bf45171d9148a614f2357748c05ed6d Mon Sep 17 00:00:00 2001 From: Aaron Halfaker Date: Wed, 2 Dec 2015 00:37:33 +0000 Subject: [PATCH 09/12] Increments version to 0.7.4 --- revscoring/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/revscoring/__init__.py b/revscoring/__init__.py index 7bef0ea7..e20cdcc7 100644 --- a/revscoring/__init__.py +++ b/revscoring/__init__.py @@ -108,6 +108,6 @@ from .languages import Language from .scorer_models import ScorerModel -__version__ = "0.7.3" +__version__ = "0.7.4" __all__ = [Datasource, Dependent, Extractor, Feature, Language, ScorerModel] diff --git a/setup.py b/setup.py index e09fde9c..90adff80 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def requirements(fname): setup( name="revscoring", - version="0.7.3", # change in revscoring/__init__.py + version="0.7.4", # change in revscoring/__init__.py author="Aaron Halfaker", author_email="ahalfaker@wikimedia.org", description=("A set of utilities for generating quality scores for " + \ From 102be517adb5eec06652c6134e167ccf24a14389 Mon Sep 17 00:00:00 2001 From: halfak Date: Wed, 2 Dec 2015 11:40:36 -0600 Subject: [PATCH 10/12] Updates revscoring utility to list model_info and tune utilities. --- revscoring/revscoring.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/revscoring/revscoring.py b/revscoring/revscoring.py index 4e7cfbe1..7be1b0e5 100644 --- a/revscoring/revscoring.py +++ b/revscoring/revscoring.py @@ -1,11 +1,15 @@ """ Provides access to a set of utilities for working with revision scorer models. -Utilities +Utilities: -* score Scores a set of revisions +* score Scores a set of revisions using a trained model * extract_features Extracts a list of features for a set of revisions -* train_test Trains and tests a MLScorerModel with extracted features. +* model_info Reads a model-file and reports metadata and testing + statistics +* train_test Trains and tests a MLScorerModel with extracted features +* tune Tunes a set of models against a training set to identify + the best model/configuration Usage: revscoring (-h | --help) From c3105c1df107a34de35181bd7219889d32f8e83f Mon Sep 17 00:00:00 2001 From: halfak Date: Wed, 2 Dec 2015 11:48:51 -0600 Subject: [PATCH 11/12] Fixes test for sklearn classifier. 
---
 revscoring/scorer_models/tests/test_scorer_model.py | 9 +--------
 .../scorer_models/tests/test_sklearn_classifier.py | 11 +++++++++++
 2 files changed, 12 insertions(+), 8 deletions(-)
 create mode 100644 revscoring/scorer_models/tests/test_sklearn_classifier.py

diff --git a/revscoring/scorer_models/tests/test_scorer_model.py b/revscoring/scorer_models/tests/test_scorer_model.py
index b7fbc4d1..bb76ac68 100644
--- a/revscoring/scorer_models/tests/test_scorer_model.py
+++ b/revscoring/scorer_models/tests/test_scorer_model.py
@@ -1,17 +1,10 @@
 from nose.tools import eq_
 
 from ...features import Feature
-from ..scorer_model import ScikitLearnClassifier, ScorerModel
+from ..scorer_model import ScorerModel
 
 
 def test_scorer_model():
     sm = ScorerModel([Feature("foo")], version="0.0.1")
 
     eq_(sm.version, "0.0.1")
-
-
-def test_sklean_classifier():
-    skc = ScikitLearnClassifier([Feature("foo")], classifier_model=None,
-                                version="0.0.1")
-
-    eq_(skc.version, "0.0.1")
diff --git a/revscoring/scorer_models/tests/test_sklearn_classifier.py b/revscoring/scorer_models/tests/test_sklearn_classifier.py
new file mode 100644
index 00000000..1ea44d29
--- /dev/null
+++ b/revscoring/scorer_models/tests/test_sklearn_classifier.py
@@ -0,0 +1,11 @@
+from nose.tools import eq_
+
+from ...features import Feature
+from ..sklearn_classifier import ScikitLearnClassifier
+
+
+def test_sklean_classifier():
+    skc = ScikitLearnClassifier([Feature("foo")], classifier_model=None,
+                                version="0.0.1")
+
+    eq_(skc.version, "0.0.1")

From 5840b9176acf4db258b3bdb86c7e80b0b4494a4f Mon Sep 17 00:00:00 2001
From: Aaron Halfaker
Date: Thu, 3 Dec 2015 21:03:13 +0000
Subject: [PATCH 12/12] Adds error handling to cross-validation

---
 revscoring/utilities/tune.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/revscoring/utilities/tune.py b/revscoring/utilities/tune.py
index dcdaa47b..66f2def6 100644
--- a/revscoring/utilities/tune.py
+++ b/revscoring/utilities/tune.py
@@ -205,13 +205,23 @@ def _cross_validate(observations, estimator, params, scoring="roc_auc",
     start = time.time()
     feature_values, labels = (list(vect) for vect in zip(*observations))
     estimator.set_params(**params)
-    scores = cross_validation.cross_val_score(estimator, feature_values,
-                                              labels, scoring=scoring,
-                                              cv=folds)
-    duration = time.time() - start
-    logging.debug("Cross-validated {0} with {1} in {2} hours: {3} ({4})"
-                  .format(estimator, format_params(params),
-                          round(duration / (60 * 60), 3),
-                          round(scores.mean(), 3),
-                          round(scores.std(), 3)))
-    return scores
+
+    try:
+        scores = cross_validation.cross_val_score(
+            estimator, feature_values, labels, scoring=scoring, cv=folds)
+
+        duration = time.time() - start
+        logging.debug("Cross-validated {0} with {1} in {2} hours: {3} ({4})"
+                      .format(estimator.__class__.__name__,
+                              format_params(params),
+                              round(duration / (60 * 60), 3),
+                              round(scores.mean(), 3),
+                              round(scores.std(), 3)))
+        return scores
+
+    except Exception:
+        logger.warn("Could not cross-validate {0} with {1}"
+                    .format(estimator.__class__.__name__,
+                            format_params(params)))
+        logger.warn("Exception:\n" + traceback.format_exc())
+        return [0]*folds
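
The error handling added in PATCH 12 matters because of how the tuning loop collects its results: each estimator/parameter combination is cross-validated in a `multiprocessing` pool via `apply_async()`, and an exception raised inside a worker only surfaces when `.get()` is called on its `AsyncResult`. Without a try/except inside the worker, one bad configuration would abort the whole barrier-synchronization loop and discard every other result. The sketch below illustrates that behaviour in isolation; `cross_validate_safely`, its fake fold scores, and the simulated failure are illustrative stand-ins, not revscoring code.

```python
import multiprocessing
import traceback


def cross_validate_safely(name, params):
    """Stand-in for _cross_validate(): handle the worker's own exception so
    one broken configuration degrades to a zero score instead of killing
    the whole tuning run."""
    try:
        if params.get("gamma") == "bogus":   # simulate an invalid parameter
            raise ValueError("unsupported gamma value: bogus")
        return [0.91, 0.89, 0.90]            # pretend per-fold scores
    except Exception:
        print("Could not cross-validate {0}:\n{1}"
              .format(name, traceback.format_exc()))
        return [0.0] * 3


if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=2)
    results = [
        pool.apply_async(cross_validate_safely, ("SVC", {"gamma": 0.001})),
        pool.apply_async(cross_validate_safely, ("SVC", {"gamma": "bogus"})),
    ]
    # .get() blocks, like the barrier-synchronization loop in tune.py, but it
    # never re-raises here because the worker already handled its exception.
    for result in results:
        print(result.get())
    pool.close()
    pool.join()
```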
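Zooming out, the patches above converge on a simple overall flow for the `tune` utility: read a params config, expand each estimator's grid with `ParameterGrid`, cross-validate every combination, and report the configurations ranked by mean score. The snippet below is a minimal, self-contained sketch of that flow under two stated assumptions: it substitutes a toy `make_classification()` dataset and an inline grid for revscoring's extracted observations and YAML config, and it uses the scikit-learn 0.17-era modules (`sklearn.grid_search`, `sklearn.cross_validation`) that `tune.py` imports; on scikit-learn 0.18 and later the same helpers live in `sklearn.model_selection`.

```python
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import ParameterGrid

# Toy stand-ins for revscoring's feature values/labels and for one
# estimator's entry in a *.params.yaml config.
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
param_grid = ParameterGrid({
    "n_estimators": [50, 100],
    "max_depth": [3, 5],
    "learning_rate": [0.01, 0.1],
})

# Cross-validate every parameter combination, keeping mean and std as plain
# floats -- the same shape of record the report tables are built from.
grid_scores = []
for params in param_grid:
    estimator = GradientBoostingClassifier(**params)
    scores = cross_val_score(estimator, X, y, scoring="roc_auc", cv=5)
    grid_scores.append((params, scores.mean(), scores.std()))

# Rank by mean score, best first, like the "Top scoring configurations" table.
grid_scores.sort(key=lambda gs: gs[1], reverse=True)
for params, mean_score, std_score in grid_scores:
    print("{0:.3f} (+/- {1:.3f})  {2}".format(mean_score, std_score, params))
```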